Generic JNI implementation for x86_64

Initial implementation of generic JNI on x86_64. Frames currently have a
large static size (>4K) and should be compacted later. Passes all of
jni_compiler_test.

Change-Id: I88ac3e13a534afe7568d62a1ef97cb766e8260e4
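
Reviewer note: the frame reserved below by art_quick_generic_jni_trampoline
is a fixed 4336 bytes, which is where the ">4K" above comes from. A minimal
C++ sketch of that arithmetic, mirroring the accounting comment in the
assembly (constant names here are illustrative only):

    #include <cstdio>

    int main() {
      const int cookie      = 4;        // local reference cookie
      const int padding     = 4;        // padding next to the cookie
      const int scratch     = 4196;     // ~4K scratch/SIRT space
      const int sirt_fields = 16;       // SIRT member fields
      const int reg_stack   = 14 * 8;   // 14 x 8-byte register-passing slots
      const int total   = cookie + padding + scratch + sirt_fields + reg_stack;  // 4332
      const int aligned = (total + 15) & ~15;                                    // 4336
      std::printf("reserved: %d bytes (raw %d)\n", aligned, total);
      return 0;
    }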
diff --git a/runtime/arch/x86/jni_entrypoints_x86.S b/runtime/arch/x86/jni_entrypoints_x86.S
index ebd82b5..45d7356 100644
--- a/runtime/arch/x86/jni_entrypoints_x86.S
+++ b/runtime/arch/x86/jni_entrypoints_x86.S
@@ -29,8 +29,8 @@
     UNDO_SETUP_GOT
     addl LITERAL(8), %esp         // restore the stack
     CFI_ADJUST_CFA_OFFSET(-12)
-    cmpl LITERAL(0), %eax         // check if returned method code is null
-    je .Lno_native_code_found     // if null, jump to return to handle
+    testl %eax, %eax         // check if returned method code is null
+    jz .Lno_native_code_found     // if null, jump to return to handle
     jmp *%eax                     // otherwise, tail call to intended method
 .Lno_native_code_found:
     ret
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index b24bfd5..4bde8b7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -44,10 +44,10 @@
 
 MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
     addl MACRO_LITERAL(16), %esp  // Unwind stack up to return address
+    CFI_ADJUST_CFA_OFFSET(-16)
     POP ebp  // Restore callee saves (ebx is saved/restored by the upcall)
     POP esi
     POP edi
-    CFI_ADJUST_CFA_OFFSET(-28)
 END_MACRO
 
     /*
diff --git a/runtime/arch/x86_64/context_x86_64.cc b/runtime/arch/x86_64/context_x86_64.cc
index 1310402..3f1f86d 100644
--- a/runtime/arch/x86_64/context_x86_64.cc
+++ b/runtime/arch/x86_64/context_x86_64.cc
@@ -26,8 +26,11 @@
 static const uintptr_t gZero = 0;
 
 void X86_64Context::Reset() {
-  for (size_t i = 0; i < kNumberOfCpuRegisters; i++) {
-    gprs_[i] = NULL;
+  for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
+    gprs_[i] = nullptr;
+  }
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs_[i] = nullptr;
   }
   gprs_[RSP] = &rsp_;
   // Initialize registers with easy to spot debug values.
@@ -38,19 +41,30 @@
 void X86_64Context::FillCalleeSaves(const StackVisitor& fr) {
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
+  uint32_t fp_core_spills = method->GetFpSpillMask();
   size_t spill_count = __builtin_popcount(core_spills);
-  DCHECK_EQ(method->GetFpSpillMask(), 0u);
+  size_t fp_spill_count = __builtin_popcount(fp_core_spills);
   size_t frame_size = method->GetFrameSizeInBytes();
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
-    int j = 2;  // Offset j to skip return address spill.
-    for (int i = 0; i < kNumberOfCpuRegisters; i++) {
+    size_t j = 2;  // Offset j to skip return address spill.
+    for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
       if (((core_spills >> i) & 1) != 0) {
         gprs_[i] = fr.CalleeSaveAddress(spill_count - j, frame_size);
         j++;
       }
     }
   }
+  if (fp_spill_count > 0) {
+    // Lowest number spill is farthest away, walk registers and fill into context.
+    size_t j = 2;  // Offset j to skip return address spill.
+    for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+      if (((fp_core_spills >> i) & 1) != 0) {
+        fprs_[i] = fr.CalleeSaveAddress(spill_count + fp_spill_count - j, frame_size);
+        j++;
+      }
+    }
+  }
 }
 
 void X86_64Context::SmashCallerSaves() {
@@ -58,7 +72,12 @@
   gprs_[RAX] = const_cast<uintptr_t*>(&gZero);
   gprs_[RDX] = const_cast<uintptr_t*>(&gZero);
   gprs_[RCX] = nullptr;
-  gprs_[RBX] = nullptr;
+  gprs_[RSI] = nullptr;
+  gprs_[RDI] = nullptr;
+  gprs_[R8] = nullptr;
+  gprs_[R9] = nullptr;
+  gprs_[R10] = nullptr;
+  gprs_[R11] = nullptr;
 }
 
 void X86_64Context::SetGPR(uint32_t reg, uintptr_t value) {
@@ -69,7 +88,43 @@
 }
 
 void X86_64Context::DoLongJump() {
+#if defined(__x86_64__)
+  // Array of GPR values, filled from the context backward for the long jump pop. We add a slot at
+  // the top for the stack pointer that doesn't get popped in a pop-all.
+  volatile uintptr_t gprs[kNumberOfCpuRegisters + 1];
+  for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
+    gprs[kNumberOfCpuRegisters - i - 1] = gprs_[i] != NULL ? *gprs_[i] : X86_64Context::kBadGprBase + i;
+  }
+  // We want to load the stack pointer one slot below so that the ret will pop rip.
+  uintptr_t rsp = gprs[kNumberOfCpuRegisters - RSP - 1] - kWordSize;
+  gprs[kNumberOfCpuRegisters] = rsp;
+  *(reinterpret_cast<uintptr_t*>(rsp)) = rip_;
+  __asm__ __volatile__(
+      "movq %0, %%rsp\n\t"  // RSP points to gprs.
+      "popq %%r15\n\t"       // Load all registers except RSP and RIP with values in gprs.
+      "popq %%r14\n\t"
+      "popq %%r13\n\t"
+      "popq %%r12\n\t"
+      "popq %%r11\n\t"
+      "popq %%r10\n\t"
+      "popq %%r9\n\t"
+      "popq %%r8\n\t"
+      "popq %%rdi\n\t"
+      "popq %%rsi\n\t"
+      "popq %%rbp\n\t"
+      "addq $8, %%rsp\n\t"
+      "popq %%rbx\n\t"
+      "popq %%rdx\n\t"
+      "popq %%rcx\n\t"
+      "popq %%rax\n\t"
+      "popq %%rsp\n\t"      // Load stack pointer.
+      "ret\n\t"             // From higher in the stack pop rip.
+      :  // output.
+      : "g"(&gprs[0])  // input.
+      :);  // clobber.
+#else
   UNIMPLEMENTED(FATAL);
+#endif
 }
 
 }  // namespace x86_64
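
Reviewer note on DoLongJump: gprs[] is filled in reverse register-number
order so that the pop sequence in the inline assembly (r15 first, rax last)
restores each register from consecutive slots, with one extra slot appended
for the new stack pointer. A tiny standalone sketch of that index reversal
(values and names are illustrative, not ART code):

    #include <cstdio>

    int main() {
      const int kNumRegs = 16;            // x86-64 general purpose registers
      int values[kNumRegs];               // stand-in for the dereferenced gprs_[i]
      int pop_order[kNumRegs + 1];        // +1 slot for the stack pointer itself
      for (int i = 0; i < kNumRegs; ++i) {
        values[i] = i;                    // pretend register i currently holds i
        pop_order[kNumRegs - i - 1] = values[i];  // reverse into pop order
      }
      pop_order[kNumRegs] = 0;            // last slot: the value loaded by "popq %rsp"
      std::printf("first pop restores reg %d, last GPR pop restores reg %d\n",
                  pop_order[0], pop_order[kNumRegs - 1]);
      return 0;
    }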
diff --git a/runtime/arch/x86_64/context_x86_64.h b/runtime/arch/x86_64/context_x86_64.h
index 78ef89c..055df61 100644
--- a/runtime/arch/x86_64/context_x86_64.h
+++ b/runtime/arch/x86_64/context_x86_64.h
@@ -59,9 +59,9 @@
   virtual void DoLongJump();
 
  private:
-  // Pointers to register locations, floating point registers are all caller save. Values are
-  // initialized to NULL or the special registers below.
+  // Pointers to register locations. Values are initialized to NULL or the special registers below.
   uintptr_t* gprs_[kNumberOfCpuRegisters];
+  uint64_t* fprs_[kNumberOfFloatRegisters];
   // Hold values for rsp and rip if they are not located within a stack frame. RIP is somewhat
   // special in that it cannot be encoded normally as a register operand to an instruction (except
   // in 64bit addressing modes).
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index 35fcccb..10f39b7 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -19,4 +19,50 @@
     /*
      * Jni dlsym lookup stub.
      */
-UNIMPLEMENTED art_jni_dlsym_lookup_stub
+DEFINE_FUNCTION art_jni_dlsym_lookup_stub
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r9   // Arg.
+    PUSH r8   // Arg.
+    PUSH rdi  //
+    PUSH rsi  // Arg.
+    PUSH rdx  // Arg.
+    PUSH rcx  // Arg.
+    // Create space for FPR args, plus padding for alignment
+    subq LITERAL(72), %rsp
+    CFI_ADJUST_CFA_OFFSET(72)
+    // Save FPRs.
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+    // prepare call
+    movq %gs:THREAD_SELF_OFFSET, %rsi      // RSI := Thread::Current()
+    // call
+    call PLT_SYMBOL(artFindNativeMethod)  // (Thread*)
+    // restore arguments
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    addq LITERAL(72), %rsp
+    CFI_ADJUST_CFA_OFFSET(-72)
+    POP rcx  // Arg.
+    POP rdx  // Arg.
+    POP rsi  // Arg.
+    POP rdi  //
+    POP r8   // Arg.
+    POP r9   // Arg.
+    testq %rax, %rax         // check if returned method code is null
+    jz .Lno_native_code_found     // if null, jump to return to handle
+    jmp *%rax                     // otherwise, tail call to intended method
+.Lno_native_code_found:
+    ret
+END_FUNCTION art_jni_dlsym_lookup_stub
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 32e8434..4cd7880 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -23,8 +23,22 @@
      * Runtime::CreateCalleeSaveMethod(kSaveAll)
      */
 MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME)
-    int3
-    int3
+    // R10 := Runtime::Current()
+    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
+    movq (%r10), %r10
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r15  // Callee save.
+    PUSH r14  // Callee save.
+    PUSH r13  // Callee save.
+    PUSH r12  // Callee save.
+    PUSH rbp  // Callee save.
+    PUSH rbx  // Callee save.
+    subq LITERAL(8), %rsp  // Space for Method* (also aligns the frame).
+    CFI_ADJUST_CFA_OFFSET(8)
+    // R10 := ArtMethod* for ref and args callee save frame method.
+    movq RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    // Store ArtMethod* to bottom of stack.
+    movq %r10, 0(%rsp)
 END_MACRO
 
     /*
@@ -54,13 +68,13 @@
     PUSH r14  // Callee save.
     PUSH r13  // Callee save.
     PUSH r12  // Callee save.
-    PUSH r9   // Arg.
-    PUSH r8   // Arg.
-    PUSH rsi  // Arg.
+    PUSH r9   // Quick arg 5.
+    PUSH r8   // Quick arg 4.
+    PUSH rsi  // Quick arg 1.
     PUSH rbp  // Callee save.
     PUSH rbx  // Callee save.
-    PUSH rdx  // Arg.
-    PUSH rcx  // Arg.
+    PUSH rdx  // Quick arg 2.
+    PUSH rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*.
     subq LITERAL(80), %rsp
     CFI_ADJUST_CFA_OFFSET(80)
@@ -105,13 +119,18 @@
     POP r15
 END_MACRO
 
+
     /*
      * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
      * exception is Thread::Current()->exception_.
      */
 MACRO0(DELIVER_PENDING_EXCEPTION)
-    int3
-    int3
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME         // save callee saves for throw
+    // (Thread*, SP) setup
+    movq %gs:THREAD_SELF_OFFSET, %rdi
+    movq %rsp, %rsi
+    call PLT_SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*, SP)
+    int3                                     // unreached
 END_MACRO
 
 MACRO2(NO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
@@ -635,11 +654,188 @@
 UNIMPLEMENTED art_quick_imt_conflict_trampoline
 UNIMPLEMENTED art_quick_resolution_trampoline
 
-
+/* Proposed Generic JNI setup
+ *
+ * #-------------------#
+ * |                   |
+ * | caller method...  |
+ * #-------------------#    <--- SP on entry
+ * | Return            |
+ * | R15               |    callee save
+ * | R14               |    callee save
+ * | R13               |    callee save
+ * | R12               |    callee save
+ * | R9                |    arg5
+ * | R8                |    arg4
+ * | RSI/R6            |    arg1
+ * | RBP/R5            |    callee save
+ * | RBX/R3            |    callee save
+ * | RDX/R2            |    arg2
+ * | RCX/R1            |    arg3
+ * | XMM7              |    float arg 8
+ * | XMM6              |    float arg 7
+ * | XMM5              |    float arg 6
+ * | XMM4              |    float arg 5
+ * | XMM3              |    float arg 4
+ * | XMM2              |    float arg 3
+ * | XMM1              |    float arg 2
+ * | XMM0              |    float arg 1
+ * | Padding           |
+ * | RDI/Method*       |  <- sp
+ * #-------------------#
+ * | local ref cookie  | // 4B
+ * |   padding         | // 4B
+ * #----------#--------#
+ * |          |      | |
+ * | Temp/    | SIRT | |    Scratch frame is 4k
+ * | Scratch  |      v |
+ * | Frame    #--------|
+ * |                   |
+ * |          #--------|
+ * |          |      ^ |
+ * |          | JNI  | |
+ * |          | Stack| |
+ * #----------#--------#    <--- SP on native call (needs alignment?)
+ * |                   |
+ * | Stack for Regs    |    The trampoline assembly will pop these values
+ * |                   |    into registers for native call
+ * #---------#---------#
+ * |         | sp*     |
+ * | Tramp.  #---------#
+ * | args    | thread  |
+ * | Tramp.  #---------#
+ * |         | method  |
+ * #-------------------#    <--- SP on artQuickGenericJniTrampoline
+ */
     /*
      * Called to do a generic JNI down-call
      */
-UNIMPLEMENTED art_quick_generic_jni_trampoline
+DEFINE_FUNCTION art_quick_generic_jni_trampoline
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r15  // Callee save.
+    PUSH r14  // Callee save.
+    PUSH r13  // Callee save.
+    PUSH r12  // Callee save.
+    PUSH r9   // Quick arg 5.
+    PUSH r8   // Quick arg 4.
+    PUSH rsi  // Quick arg 1.
+    PUSH rbp  // Callee save.
+    PUSH rbx  // Callee save.
+    PUSH rdx  // Quick arg 2.
+    PUSH rcx  // Quick arg 3.
+    // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*.
+    subq LITERAL(80), %rsp
+    CFI_ADJUST_CFA_OFFSET(80)
+    // Save FPRs.
+    movq %xmm0, 16(%rsp)
+    movq %xmm1, 24(%rsp)
+    movq %xmm2, 32(%rsp)
+    movq %xmm3, 40(%rsp)
+    movq %xmm4, 48(%rsp)
+    movq %xmm5, 56(%rsp)
+    movq %xmm6, 64(%rsp)
+    movq %xmm7, 72(%rsp)
+    // Store native ArtMethod* to bottom of stack.
+    movq %rdi, 0(%rsp)
+    movq %rsp, %rbp                 // save SP at callee-save frame
+    CFI_DEF_CFA_REGISTER(rbp)
+    //
+    // reserve a lot of space
+    //
+    //      4    local ref cookie
+    //      4    padding
+    //   4196    4k scratch space, enough for 2x 256 8-byte parameters (TODO: SIRT overhead?)
+    //     16    SIRT member fields ?
+    // +  112    14x 8-byte stack-2-register space
+    // ------
+    //   4332
+    // 16-byte aligned: 4336
+    // Note: 14x8 = 7*16, so the stack stays aligned for the native call...
+    //       Also means: the padding is somewhere in the middle
+    subq LITERAL(4336), %rsp
+    // prepare for artQuickGenericJniTrampoline call
+    // (Thread*,  SP)
+    //    rdi    rsi      <= C calling convention
+    //  gs:...   rbp      <= where they are
+    movq %gs:THREAD_SELF_OFFSET, %rdi
+    movq %rbp, %rsi
+    call PLT_SYMBOL(artQuickGenericJniTrampoline)
+    test %rax, %rax                 // check whether code pointer is NULL, also indicates exception
+    jz 1f
+    // pop from the register-passing alloca
+    // what's the right layout?
+    popq %rdi
+    popq %rsi
+    popq %rdx
+    popq %rcx
+    popq %r8
+    popq %r9
+    // TODO: skip floating point if unused, some flag.
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    addq LITERAL(64), %rsp          // floating-point done
+    // native call
+    call *%rax                      // Q: is the stack aligned 16B with or without the return addr?
+    // result sign extension is handled in C code
+    // prepare for artQuickGenericJniEndTrampoline call
+    // (Thread*,  SP, result, result_f)
+    //   rdi      rsi   rdx   rcx       <= C calling convention
+    //  gs:...    rbp   rax   xmm0      <= where they are
+    movq %gs:THREAD_SELF_OFFSET, %rdi
+    movq %rbp, %rsi
+    movq %rax, %rdx
+    movq %xmm0, %rcx
+    call PLT_SYMBOL(artQuickGenericJniEndTrampoline)
+    // tear down the alloca already
+    movq %rbp, %rsp
+    CFI_DEF_CFA_REGISTER(rsp)
+    // Exceptions possible.
+    // TODO: use cmpq, needs direct encoding because of gas bug
+    movq %gs:THREAD_EXCEPTION_OFFSET, %rbx
+    test %rbx, %rbx
+    jnz 2f
+    // Tear down the callee-save frame
+    // Load FPRs. xmm0 is deliberately left alone: it may hold the floating-point return value.
+    // TODO: restoring the remaining caller-save FPR args is likely unnecessary.
+    movq 24(%rsp), %xmm1
+    movq 32(%rsp), %xmm2
+    movq 40(%rsp), %xmm3
+    movq 48(%rsp), %xmm4
+    movq 56(%rsp), %xmm5
+    movq 64(%rsp), %xmm6
+    movq 72(%rsp), %xmm7
+    // Pop the 80 bytes reserved for FPR args, padding, and the ArtMethod*.
+    addq LITERAL(80), %rsp
+    CFI_ADJUST_CFA_OFFSET(-80)
+    // Restore callee saves and GPR args, mixed together to agree with core spills bitmap.
+    POP rcx  // Arg.
+    POP rdx  // Arg.
+    POP rbx  // Callee save.
+    POP rbp  // Callee save.
+    POP rsi  // Arg.
+    POP r8   // Arg.
+    POP r9   // Arg.
+    POP r12  // Callee save.
+    POP r13  // Callee save.
+    POP r14  // Callee save.
+    POP r15  // Callee save.
+    // Also store the result into xmm0 in case this is a floating-point return.
+    movq %rax, %xmm0
+    ret
+1:
+    // tear down the _whole_ scratch space, assumes SIRT is empty, cookie not valid etc.
+    movq %rbp, %rsp
+    CFI_DEF_CFA_REGISTER(rsp)
+2:  RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
+    DELIVER_PENDING_EXCEPTION
+END_FUNCTION art_quick_generic_jni_trampoline
 
     /*
      * Called to bridge from the quick to interpreter ABI. On entry the arguments match those
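
Reviewer note on the return path: artQuickGenericJniEndTrampoline hands back
a single uint64_t in RAX, and the trampoline then copies RAX into XMM0 as
well, so the same 64-bit pattern serves both integer and floating-point
returns. A small standalone illustration of that bit-level packing
(PackDoubleResult is a hypothetical helper, not ART code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Pack a double's bits into the uint64_t that travels back in RAX.
    uint64_t PackDoubleResult(double d) {
      uint64_t bits;
      std::memcpy(&bits, &d, sizeof(bits));  // bit-exact copy, no value conversion
      return bits;
    }

    int main() {
      uint64_t raw = PackDoubleResult(2.5);
      double back;
      std::memcpy(&back, &raw, sizeof(back));  // what "movq %rax, %xmm0" achieves
      std::printf("%f\n", back);               // prints 2.500000
      return 0;
    }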
diff --git a/runtime/arch/x86_64/registers_x86_64.h b/runtime/arch/x86_64/registers_x86_64.h
index c1a9942..8b0dc07 100644
--- a/runtime/arch/x86_64/registers_x86_64.h
+++ b/runtime/arch/x86_64/registers_x86_64.h
@@ -65,6 +65,7 @@
   XMM13 = 13,
   XMM14 = 14,
   XMM15 = 15,
+  kNumberOfFloatRegisters = 16
 };
 std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs);
 
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 7e43994..6255c8c 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1747,7 +1747,7 @@
   bool enter_interpreter = NeedsInterpreter(method.get(),
                                             method->GetEntryPointFromQuickCompiledCode(),
                                             method->GetEntryPointFromPortableCompiledCode());
-  if (enter_interpreter) {
+  if (enter_interpreter && !method->IsNative()) {
     method->SetEntryPointFromInterpreter(interpreter::artInterpreterToInterpreterBridge);
   } else {
     method->SetEntryPointFromInterpreter(artInterpreterToCompiledCodeBridge);
@@ -1767,9 +1767,14 @@
     method->SetEntryPointFromQuickCompiledCode(GetQuickResolutionTrampoline(runtime->GetClassLinker()));
     method->SetEntryPointFromPortableCompiledCode(GetPortableResolutionTrampoline(runtime->GetClassLinker()));
   } else if (enter_interpreter) {
-    // Set entry point from compiled code if there's no code or in interpreter only mode.
-    method->SetEntryPointFromQuickCompiledCode(GetQuickToInterpreterBridge());
-    method->SetEntryPointFromPortableCompiledCode(GetPortableToInterpreterBridge());
+    if (!method->IsNative()) {
+      // Set entry point from compiled code if there's no code or in interpreter only mode.
+      method->SetEntryPointFromQuickCompiledCode(GetQuickToInterpreterBridge());
+      method->SetEntryPointFromPortableCompiledCode(GetPortableToInterpreterBridge());
+    } else {
+      method->SetEntryPointFromQuickCompiledCode(GetQuickGenericJniTrampoline());
+      method->SetEntryPointFromPortableCompiledCode(GetPortableToQuickBridge());
+    }
   } else if (method->GetEntryPointFromPortableCompiledCode() != nullptr) {
     DCHECK(method->GetEntryPointFromQuickCompiledCode() == nullptr);
     have_portable_code = true;
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 63e0d42..bf8b8ba 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -57,9 +57,9 @@
   // | R1         |    arg1
   // | R0         |    padding
   // | Method*    |  <- sp
-  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
-  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
   static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 8;  // Offset of first GPR arg.
@@ -83,9 +83,9 @@
   // | A2         |    arg2
   // | A1         |    arg1
   // | A0/Method* |  <- sp
-  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
-  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
   static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
@@ -109,9 +109,9 @@
   // | EDX         |    arg2
   // | ECX         |    arg1
   // | EAX/Method* |  <- sp
-  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
-  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
   static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
@@ -148,9 +148,9 @@
   // | XMM0            |    float arg 1
   // | Padding         |
   // | RDI/Method*     |  <- sp
-  static constexpr bool kSoftFloatAbi = false;  // This is a hard float ABI.
-  static constexpr size_t kNumGprArgs = 5;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumFprArgs = 8;  // 0 arguments passed in FPRs.
+  static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr size_t kNumQuickGprArgs = 5;  // 5 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
   static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg.
@@ -211,15 +211,15 @@
   }
 
   byte* GetParamAddress() const {
-    if (!kSoftFloatAbi) {
+    if (!kQuickSoftFloatAbi) {
       Primitive::Type type = GetParamPrimitiveType();
       if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) {
-        if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+        if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
           return fpr_args_ + (fpr_index_ * kBytesPerFprSpillLocation);
         }
       }
     }
-    if (gpr_index_ < kNumGprArgs) {
+    if (gpr_index_ < kNumQuickGprArgs) {
       return gpr_args_ + GprIndexToGprOffset(gpr_index_);
     }
     return stack_args_ + (stack_index_ * kBytesStackArgLocation);
@@ -257,7 +257,7 @@
       cur_type_ = Primitive::kPrimNot;
       is_split_long_or_double_ = false;
       Visit();
-      if (kNumGprArgs > 0) {
+      if (kNumQuickGprArgs > 0) {
         gpr_index_++;
       } else {
         stack_index_++;
@@ -274,7 +274,7 @@
         case Primitive::kPrimInt:
           is_split_long_or_double_ = false;
           Visit();
-          if (gpr_index_ < kNumGprArgs) {
+          if (gpr_index_ < kNumQuickGprArgs) {
             gpr_index_++;
           } else {
             stack_index_++;
@@ -283,14 +283,14 @@
         case Primitive::kPrimFloat:
           is_split_long_or_double_ = false;
           Visit();
-          if (kSoftFloatAbi) {
-            if (gpr_index_ < kNumGprArgs) {
+          if (kQuickSoftFloatAbi) {
+            if (gpr_index_ < kNumQuickGprArgs) {
               gpr_index_++;
             } else {
               stack_index_++;
             }
           } else {
-            if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+            if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
               fpr_index_++;
             } else {
               stack_index_++;
@@ -299,14 +299,14 @@
           break;
         case Primitive::kPrimDouble:
         case Primitive::kPrimLong:
-          if (kSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
+          if (kQuickSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
             is_split_long_or_double_ = (kBytesPerGprSpillLocation == 4) &&
-                ((gpr_index_ + 1) == kNumGprArgs);
+                ((gpr_index_ + 1) == kNumQuickGprArgs);
             Visit();
-            if (gpr_index_ < kNumGprArgs) {
+            if (gpr_index_ < kNumQuickGprArgs) {
               gpr_index_++;
               if (kBytesPerGprSpillLocation == 4) {
-                if (gpr_index_ < kNumGprArgs) {
+                if (gpr_index_ < kNumQuickGprArgs) {
                   gpr_index_++;
                 } else {
                   stack_index_++;
@@ -322,12 +322,12 @@
             }
           } else {
             is_split_long_or_double_ = (kBytesPerFprSpillLocation == 4) &&
-                ((fpr_index_ + 1) == kNumFprArgs);
+                ((fpr_index_ + 1) == kNumQuickFprArgs);
             Visit();
-            if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+            if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
               fpr_index_++;
               if (kBytesPerFprSpillLocation == 4) {
-                if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+                if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
                   fpr_index_++;
                 } else {
                   stack_index_++;
@@ -352,14 +352,14 @@
  private:
   static size_t StackArgumentStartFromShorty(bool is_static, const char* shorty,
                                              uint32_t shorty_len) {
-    if (kSoftFloatAbi) {
-      CHECK_EQ(kNumFprArgs, 0U);
-      return (kNumGprArgs * kBytesPerGprSpillLocation) + kBytesPerGprSpillLocation /* ArtMethod* */;
+    if (kQuickSoftFloatAbi) {
+      CHECK_EQ(kNumQuickFprArgs, 0U);
+      return (kNumQuickGprArgs * kBytesPerGprSpillLocation) + kBytesPerGprSpillLocation /* ArtMethod* */;
     } else {
       size_t offset = kBytesPerGprSpillLocation;  // Skip Method*.
       size_t gprs_seen = 0;
       size_t fprs_seen = 0;
-      if (!is_static && (gprs_seen < kNumGprArgs)) {
+      if (!is_static && (gprs_seen < kNumQuickGprArgs)) {
         gprs_seen++;
         offset += kBytesStackArgLocation;
       }
@@ -371,34 +371,34 @@
           case 'S':
           case 'I':
           case 'L':
-            if (gprs_seen < kNumGprArgs) {
+            if (gprs_seen < kNumQuickGprArgs) {
               gprs_seen++;
               offset += kBytesStackArgLocation;
             }
             break;
           case 'J':
-            if (gprs_seen < kNumGprArgs) {
+            if (gprs_seen < kNumQuickGprArgs) {
               gprs_seen++;
               offset += 2 * kBytesStackArgLocation;
               if (kBytesPerGprSpillLocation == 4) {
-                if (gprs_seen < kNumGprArgs) {
+                if (gprs_seen < kNumQuickGprArgs) {
                   gprs_seen++;
                 }
               }
             }
             break;
           case 'F':
-            if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+            if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
               fprs_seen++;
               offset += kBytesStackArgLocation;
             }
             break;
           case 'D':
-            if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+            if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
               fprs_seen++;
               offset += 2 * kBytesStackArgLocation;
               if (kBytesPerFprSpillLocation == 4) {
-                if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+                if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
                   fprs_seen++;
                 }
               }
@@ -428,13 +428,13 @@
 };
 
 // Visits arguments on the stack placing them into the shadow frame.
-class BuildQuickShadowFrameVisitor : public QuickArgumentVisitor {
+class BuildQuickShadowFrameVisitor FINAL : public QuickArgumentVisitor {
  public:
   BuildQuickShadowFrameVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
                                uint32_t shorty_len, ShadowFrame* sf, size_t first_arg_reg) :
     QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {}
 
-  virtual void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE {
     Primitive::Type type = GetParamPrimitiveType();
     switch (type) {
       case Primitive::kPrimLong:  // Fall-through.
@@ -525,14 +525,14 @@
 
 // Visits arguments on the stack placing them into the args vector, Object* arguments are converted
 // to jobjects.
-class BuildQuickArgumentVisitor : public QuickArgumentVisitor {
+class BuildQuickArgumentVisitor FINAL : public QuickArgumentVisitor {
  public:
   BuildQuickArgumentVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
                             uint32_t shorty_len, ScopedObjectAccessUnchecked* soa,
                             std::vector<jvalue>* args) :
     QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa), args_(args) {}
 
-  virtual void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE {
     jvalue val;
     Primitive::Type type = GetParamPrimitiveType();
     switch (type) {
@@ -638,13 +638,13 @@
 
 // Read object references held in arguments from quick frames and place in a JNI local references,
 // so they don't get garbage collected.
-class RememberForGcArgumentVisitor : public QuickArgumentVisitor {
+class RememberForGcArgumentVisitor FINAL : public QuickArgumentVisitor {
  public:
   RememberForGcArgumentVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
                                uint32_t shorty_len, ScopedObjectAccessUnchecked* soa) :
     QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa) {}
 
-  virtual void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE {
     if (IsParamAReference()) {
       StackReference<mirror::Object>* stack_ref =
           reinterpret_cast<StackReference<mirror::Object>*>(GetParamAddress());
@@ -671,14 +671,14 @@
 // Lazily resolve a method for quick. Called by stub code.
 extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called,
                                                     mirror::Object* receiver,
-                                                    Thread* thread, mirror::ArtMethod** sp)
+                                                    Thread* self, mirror::ArtMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  FinishCalleeSaveFrameSetup(thread, sp, Runtime::kRefsAndArgs);
+  FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs);
   // Start new JNI local reference state
-  JNIEnvExt* env = thread->GetJniEnv();
+  JNIEnvExt* env = self->GetJniEnv();
   ScopedObjectAccessUnchecked soa(env);
   ScopedJniEnvLocalRefState env_state(env);
-  const char* old_cause = thread->StartAssertNoThreadSuspension("Quick method resolution set up");
+  const char* old_cause = self->StartAssertNoThreadSuspension("Quick method resolution set up");
 
   // Compute details about the called method (avoid GCs)
   ClassLinker* linker = Runtime::Current()->GetClassLinker();
@@ -757,7 +757,7 @@
       dex_file->GetMethodShorty(dex_file->GetMethodId(dex_method_idx), &shorty_len);
   RememberForGcArgumentVisitor visitor(sp, invoke_type == kStatic, shorty, shorty_len, &soa);
   visitor.VisitArguments();
-  thread->EndAssertNoThreadSuspension(old_cause);
+  self->EndAssertNoThreadSuspension(old_cause);
   bool virtual_or_interface = invoke_type == kVirtual || invoke_type == kInterface;
   // Resolve method filling in dex cache.
   if (called->IsRuntimeMethod()) {
@@ -766,7 +766,7 @@
     receiver = sirt_receiver.get();
   }
   const void* code = NULL;
-  if (LIKELY(!thread->IsExceptionPending())) {
+  if (LIKELY(!self->IsExceptionPending())) {
     // Incompatible class change should have been handled in resolve method.
     CHECK(!called->CheckIncompatibleClassChange(invoke_type))
         << PrettyMethod(called) << " " << invoke_type;
@@ -812,7 +812,7 @@
       DCHECK(called_class->IsErroneous());
     }
   }
-  CHECK_EQ(code == NULL, thread->IsExceptionPending());
+  CHECK_EQ(code == NULL, self->IsExceptionPending());
   // Fixup any locally saved objects may have moved during a GC.
   visitor.FixupReferences();
   // Place called method in callee-save frame to be placed as first argument to quick method.
@@ -820,13 +820,375 @@
   return code;
 }
 
-extern "C" const void* artQuickGenericJniTrampoline(mirror::ArtMethod* called,
-                                                    mirror::Object* receiver,
-                                                    Thread* thread, mirror::ArtMethod** sp)
+// Visits arguments on the stack placing them into a region lower down the stack for the benefit
+// of transitioning into native code.
+class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor {
+#if defined(__arm__)
+  // TODO: These are all dummy values!
+  static constexpr bool kNativeSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr size_t kNumNativeGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
+
+  static constexpr size_t kGprStackOffset = 4336;
+  static constexpr size_t kFprStackOffset = 4336 - 6*8;
+  static constexpr size_t kCallStackStackOffset = 4336 - 112;
+
+  static constexpr size_t kRegistersNeededForLong = 2;
+  static constexpr size_t kRegistersNeededForDouble = 2;
+#elif defined(__mips__)
+  // TODO: These are all dummy values!
+  static constexpr bool kNativeSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumNativeGprArgs = 0;  // 0 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
+
+  // update these
+  static constexpr size_t kGprStackOffset = 4336;
+  static constexpr size_t kFprStackOffset = 4336 - 6*8;
+  static constexpr size_t kCallStackStackOffset = 4336 - 112;
+
+  static constexpr size_t kRegistersNeededForLong = 2;
+  static constexpr size_t kRegistersNeededForDouble = 2;
+#elif defined(__i386__)
+  // TODO: Check these!
+  static constexpr bool kNativeSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumNativeGprArgs = 0;  // 0 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
+
+  // update these
+  static constexpr size_t kGprStackOffset = 4336;
+  static constexpr size_t kFprStackOffset = 4336 - 6*8;
+  static constexpr size_t kCallStackStackOffset = 4336 - 112;
+
+  static constexpr size_t kRegistersNeededForLong = 2;
+  static constexpr size_t kRegistersNeededForDouble = 2;
+#elif defined(__x86_64__)
+  static constexpr bool kNativeSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr size_t kNumNativeGprArgs = 6;  // 6 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 8;  // 8 arguments passed in FPRs.
+
+  static constexpr size_t kGprStackOffset = 4336;
+  static constexpr size_t kFprStackOffset = 4336 - 6*8;
+  static constexpr size_t kCallStackStackOffset = 4336 - 112;
+
+  static constexpr size_t kRegistersNeededForLong = 1;
+  static constexpr size_t kRegistersNeededForDouble = 1;
+#else
+#error "Unsupported architecture"
+#endif
+
+
+ public:
+  BuildGenericJniFrameVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
+                              uint32_t shorty_len, Thread* self) :
+      QuickArgumentVisitor(sp, is_static, shorty, shorty_len) {
+    // Reserve 8 bytes below the managed frame: local reference cookie plus padding.
+    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
+    top_of_sirt_ =  sp8 - 8;
+    cur_sirt_entry_ = reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1;
+    sirt_number_of_references_ = 0;
+    gpr_index_ = kNumNativeGprArgs;
+    fpr_index_ = kNumNativeFprArgs;
+
+    cur_gpr_reg_ = reinterpret_cast<uintptr_t*>(sp8 - kGprStackOffset);
+    cur_fpr_reg_ = reinterpret_cast<uint32_t*>(sp8 - kFprStackOffset);
+    cur_stack_arg_ = reinterpret_cast<uintptr_t*>(sp8 - kCallStackStackOffset);
+
+    // jni environment is always first argument
+    PushPointer(self->GetJniEnv());
+
+    if (is_static) {
+      PushArgumentInSirt((*sp)->GetDeclaringClass());
+    }
+  }
+
+  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE {
+    Primitive::Type type = GetParamPrimitiveType();
+    switch (type) {
+      case Primitive::kPrimLong: {
+        jlong long_arg;
+        if (IsSplitLongOrDouble()) {
+          long_arg = ReadSplitLongParam();
+        } else {
+          long_arg = *reinterpret_cast<jlong*>(GetParamAddress());
+        }
+        PushLongArgument(long_arg);
+        break;
+      }
+      case Primitive::kPrimDouble: {
+        uint64_t double_arg;
+        if (IsSplitLongOrDouble()) {
+          // Read the raw bits into a uint64_t so that we don't cast to a double.
+          double_arg = ReadSplitLongParam();
+        } else {
+          double_arg = *reinterpret_cast<uint64_t*>(GetParamAddress());
+        }
+        PushDoubleArgument(double_arg);
+        break;
+      }
+      case Primitive::kPrimNot: {
+        StackReference<mirror::Object>* stack_ref =
+            reinterpret_cast<StackReference<mirror::Object>*>(GetParamAddress());
+        PushArgumentInSirt(stack_ref->AsMirrorPtr());
+        break;
+      }
+      case Primitive::kPrimFloat:
+        PushFloatArgument(*reinterpret_cast<int32_t*>(GetParamAddress()));
+        break;
+      case Primitive::kPrimBoolean:  // Fall-through.
+      case Primitive::kPrimByte:     // Fall-through.
+      case Primitive::kPrimChar:     // Fall-through.
+      case Primitive::kPrimShort:    // Fall-through.
+      case Primitive::kPrimInt:      // Fall-through.
+        PushIntArgument(*reinterpret_cast<jint*>(GetParamAddress()));
+        break;
+      case Primitive::kPrimVoid:
+        LOG(FATAL) << "UNREACHABLE";
+        break;
+    }
+  }
+
+  void FinalizeSirt(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (!IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_))) {
+      sirt_number_of_references_++;
+      *cur_sirt_entry_ = StackReference<mirror::Object>();
+      cur_sirt_entry_--;
+    }
+    CHECK(IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_)));
+    StackIndirectReferenceTable* sirt = reinterpret_cast<StackIndirectReferenceTable*>(
+        top_of_sirt_ - StackIndirectReferenceTable::SizeOf(sirt_number_of_references_));
+
+    sirt->SetNumberOfReferences(sirt_number_of_references_);
+    self->PushSirt(sirt);
+  }
+
+  jobject GetFirstSirtEntry() {
+    return reinterpret_cast<jobject>(reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1);
+  }
+
+ private:
+  void PushArgumentInSirt(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // Do something to push into the SIRT.
+    uintptr_t sirt_or_null;
+    if (obj != nullptr) {
+      sirt_number_of_references_++;
+      *cur_sirt_entry_ = StackReference<mirror::Object>::FromMirrorPtr(obj);
+      sirt_or_null = reinterpret_cast<uintptr_t>(cur_sirt_entry_);
+      cur_sirt_entry_--;
+    } else {
+      sirt_or_null = reinterpret_cast<uintptr_t>(nullptr);
+    }
+    // Push the GPR or stack arg.
+    if (gpr_index_ > 0) {
+      *cur_gpr_reg_ = sirt_or_null;
+      cur_gpr_reg_++;
+      gpr_index_--;
+    } else {
+      *cur_stack_arg_ = sirt_or_null;
+      cur_stack_arg_++;
+    }
+  }
+
+  void PushPointer(void* val) {
+    if (gpr_index_ > 0) {
+      *cur_gpr_reg_ = reinterpret_cast<uintptr_t>(val);
+      cur_gpr_reg_++;
+      gpr_index_--;
+    } else {
+      *cur_stack_arg_ = reinterpret_cast<uintptr_t>(val);
+      cur_stack_arg_++;
+    }
+  }
+
+  void PushIntArgument(jint val) {
+    if (gpr_index_ > 0) {
+      *cur_gpr_reg_ = val;
+      cur_gpr_reg_++;
+      gpr_index_--;
+    } else {
+      *cur_stack_arg_ = val;
+      cur_stack_arg_++;
+    }
+  }
+
+  void PushLongArgument(jlong val) {
+    // This is an ugly hack for the following problem:
+    //  with an odd number of 32-bit registers, having exactly kRegistersNeededForLong left still forces a spill.
+    if (gpr_index_ >= kRegistersNeededForLong + (kNumNativeGprArgs % kRegistersNeededForLong)) {
+      if (kRegistersNeededForLong > 1 && ((kNumNativeGprArgs - gpr_index_) & 1) == 1) {
+        // Pad.
+        gpr_index_--;
+        cur_gpr_reg_++;
+      }
+      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_gpr_reg_);
+      *tmp = val;
+      cur_gpr_reg_ += kRegistersNeededForLong;
+      gpr_index_ -= kRegistersNeededForLong;
+    } else {
+      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_);
+      *tmp = val;
+      cur_stack_arg_ += kRegistersNeededForLong;
+
+      gpr_index_ = 0;                   // can't use GPRs anymore
+    }
+  }
+
+  void PushFloatArgument(int32_t val) {
+    if (kNativeSoftFloatAbi) {
+      PushIntArgument(val);
+    } else {
+      if (fpr_index_ > 0) {
+        *cur_fpr_reg_ = val;
+        cur_fpr_reg_++;
+        if (kRegistersNeededForDouble == 1) {
+          // The trampoline pops 64 bits per FPR slot, so skip the upper 32-bit half.
+          // TODO: extend/clear the upper bits?
+          cur_fpr_reg_++;
+        }
+        fpr_index_--;
+      } else {
+        // TODO: Check ABI for floats.
+        *cur_stack_arg_ = val;
+        cur_stack_arg_++;
+      }
+    }
+  }
+
+  void PushDoubleArgument(uint64_t val) {
+    // See PushLongArgument for explanation
+    if (fpr_index_ >= kRegistersNeededForDouble + (kNumNativeFprArgs % kRegistersNeededForDouble)) {
+      if (kRegistersNeededForDouble > 1 && ((kNumNativeFprArgs - fpr_index_) & 1) == 1) {
+        // Pad.
+        fpr_index_--;
+        cur_fpr_reg_++;
+      }
+      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_fpr_reg_);
+      *tmp = val;
+      // TODO: the whole thing doesn't make sense if we take uint32_t*...
+      cur_fpr_reg_ += 2;        // kRegistersNeededForDouble;
+      fpr_index_ -= kRegistersNeededForDouble;
+    } else {
+      if (!IsAligned<8>(cur_stack_arg_)) {
+        cur_stack_arg_++;  // Pad.
+      }
+      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_);
+      *tmp = val;
+      cur_stack_arg_ += kRegistersNeededForDouble;
+
+      fpr_index_ = 0;                   // can't use FPRs anymore
+    }
+  }
+
+  uint32_t sirt_number_of_references_;
+  StackReference<mirror::Object>* cur_sirt_entry_;
+  uint32_t gpr_index_;           // Number of free GPR argument slots remaining.
+  uintptr_t* cur_gpr_reg_;
+  uint32_t fpr_index_;           // Number of free FPR argument slots remaining.
+  uint32_t* cur_fpr_reg_;
+  uintptr_t* cur_stack_arg_;
+  uint8_t* top_of_sirt_;
+
+  DISALLOW_COPY_AND_ASSIGN(BuildGenericJniFrameVisitor);
+};
+
+extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  LOG(FATAL) << "artQuickGenericJniTrampoline not implemented: "
-      << PrettyMethod(called);
-  return NULL;
+  uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
+  mirror::ArtMethod* called = *sp;
+  DCHECK(called->IsNative());
+
+  // run the visitor
+  MethodHelper mh(called);
+  BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(), mh.GetShortyLength(),
+                                      self);
+  visitor.VisitArguments();
+  visitor.FinalizeSirt(self);
+
+  // fix up managed-stack things in Thread
+  self->SetTopOfStack(sp, 0);
+
+  // start JNI, save the cookie
+  uint32_t cookie;
+  if (called->IsSynchronized()) {
+    cookie = JniMethodStartSynchronized(visitor.GetFirstSirtEntry(), self);
+    // TODO: error checking.
+    if (self->IsExceptionPending()) {
+      self->PopSirt();
+      return nullptr;
+    }
+  } else {
+    cookie = JniMethodStart(self);
+  }
+  *(sp32-1) = cookie;
+
+  // retrieve native code
+  const void* nativeCode = called->GetNativeMethod();
+  if (nativeCode == nullptr) {
+    // TODO: is this really an error, or do we need to try to find native code?
+    LOG(FATAL) << "Finding native code not implemented yet.";
+  }
+
+  return nativeCode;
+}
+
+/*
+ * Is called after the native JNI code. Responsible for cleanup (SIRT, saved state) and
+ * unlocking.
+ */
+extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, mirror::ArtMethod** sp,
+                                                    jvalue result, uint64_t result_f)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
+  mirror::ArtMethod* called = *sp;
+  uint32_t cookie = *(sp32-1);
+
+  // TODO: synchronized.
+  MethodHelper mh(called);
+  char return_shorty_char = mh.GetShorty()[0];
+
+  if (return_shorty_char == 'L') {
+    // 'L' is the only return type that needs a special JNI end call (reference decoding).
+    if (called->IsSynchronized()) {
+      BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(),
+                                          mh.GetShortyLength(), self);
+      return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie,
+                                                                              visitor.GetFirstSirtEntry(),
+                                                                              self));
+    } else {
+      return reinterpret_cast<uint64_t>(JniMethodEndWithReference(result.l, cookie, self));
+    }
+  } else {
+    if (called->IsSynchronized()) {
+      // run the visitor
+      BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(),
+                                          mh.GetShortyLength(), self);
+      JniMethodEndSynchronized(cookie, visitor.GetFirstSirtEntry(), self);
+    } else {
+      JniMethodEnd(cookie, self);
+    }
+
+    switch (return_shorty_char) {
+      case 'F':  // Fall-through.
+      case 'D':
+        return result_f;
+      case 'Z':
+        return result.z;
+      case 'B':
+        return result.b;
+      case 'C':
+        return result.c;
+      case 'S':
+        return result.s;
+      case 'I':
+        return result.i;
+      case 'J':
+        return result.j;
+      case 'V':
+        return 0;
+      default:
+        LOG(FATAL) << "Unexpected return shorty character " << return_shorty_char;
+        return 0;
+    }
+  }
 }
 
 }  // namespace art
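
Reviewer note on BuildGenericJniFrameVisitor: integer and pointer arguments
go into the remaining native GPR slots and spill to the stack area once
those run out; 64-bit values may occupy two slots when registers are 32 bits
wide. A self-contained sketch of that placement rule (simplified, omits the
odd-register-count padding hack; names are illustrative, not ART code):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ArgPacker {
      explicit ArgPacker(unsigned gprs) : gprs_left(gprs) {}

      void PushWord(uintptr_t v) {
        if (gprs_left > 0) { gpr_slots.push_back(v); --gprs_left; }
        else               { stack_slots.push_back(v); }
      }

      void PushLong(uint64_t v, unsigned regs_needed) {
        std::vector<uintptr_t>* dest;
        if (gprs_left >= regs_needed) {
          dest = &gpr_slots;
          gprs_left -= regs_needed;
        } else {
          dest = &stack_slots;
          gprs_left = 0;  // once a long spills, stop using GPRs (as the visitor does)
        }
        for (unsigned i = 0; i < regs_needed; ++i) {
          dest->push_back(static_cast<uintptr_t>(v >> (32 * i)));  // 1 or 2 slots
        }
      }

      unsigned gprs_left;
      std::vector<uintptr_t> gpr_slots;
      std::vector<uintptr_t> stack_slots;
    };

    int main() {
      ArgPacker p(6);                          // x86-64: 6 integer argument registers
      p.PushWord(0x1);                         // e.g. the JNIEnv*
      p.PushLong(0x123456789abcdef0ULL, 1);    // a jlong fits a single x86-64 GPR
      std::printf("in regs: %zu, on stack: %zu\n",
                  p.gpr_slots.size(), p.stack_slots.size());
      return 0;
    }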
diff --git a/runtime/exception_test.cc b/runtime/exception_test.cc
index 3653b37..5e3f504 100644
--- a/runtime/exception_test.cc
+++ b/runtime/exception_test.cc
@@ -179,7 +179,7 @@
     fake_stack.push_back(0);
 
     // Set up thread to appear as if we called out of method_g_ at pc dex 3
-    thread->SetTopOfStack(&fake_stack[0], method_g_->ToNativePc(dex_pc));  // return pc
+    thread->SetTopOfStack(reinterpret_cast<mirror::ArtMethod**>(&fake_stack[0]), method_g_->ToNativePc(dex_pc));  // return pc
   } else {
     // Create/push fake 20-byte shadow frame for method g
     fake_stack.push_back(0);
diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h
index e17dc5f..a18e1719 100644
--- a/runtime/mirror/art_method.h
+++ b/runtime/mirror/art_method.h
@@ -360,7 +360,7 @@
   }
 
   const void* GetNativeMethod() {
-    return reinterpret_cast<const void*>(GetField32(NativeMethodOffset(), false));
+    return GetFieldPtr<const void*>(NativeMethodOffset(), false);
   }
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
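
Reviewer note on the GetNativeMethod() change: the native code pointer is a
full machine word, so reading it with a 32-bit field accessor truncates it
on 64-bit targets; GetFieldPtr reads pointer-width data. A tiny illustration
of the truncation (not ART code):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t addr = 0x123456789ULL;               // example pointer value needing > 32 bits
      uint32_t as32 = static_cast<uint32_t>(addr);  // what a 32-bit field read keeps
      std::printf("full: 0x%llx  truncated: 0x%x\n",
                  static_cast<unsigned long long>(addr), as32);
      return 0;
    }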
diff --git a/runtime/stack.h b/runtime/stack.h
index 6a62922..f840f67 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -532,7 +532,7 @@
     DCHECK(GetMethod() != nullptr);
     byte* save_addr =
         reinterpret_cast<byte*>(cur_quick_frame_) + frame_size - ((num + 1) * kPointerSize);
-#if defined(__i386__)
+#if defined(__i386__) || defined(__x86_64__)
     save_addr -= kPointerSize;  // account for return address
 #endif
     return reinterpret_cast<uintptr_t*>(save_addr);
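
Reviewer note on CalleeSaveAddress: spill slot num counts down from the top
of the frame, and on x86/x86_64 one extra word is subtracted because the
frame size includes the pushed return address. A small worked example with
illustrative numbers (8-byte pointers, 64-byte frame):

    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t kPointerSize = 8;
      const size_t frame_size = 64;   // example frame size in bytes
      const size_t num = 0;           // slot 0 is the spill closest to the top of the frame
      size_t offset = frame_size - (num + 1) * kPointerSize;  // 56
      offset -= kPointerSize;         // x86/x86_64: skip the return address -> 48
      std::printf("offset of spill slot %zu from the frame base: %zu\n", num, offset);
      return 0;
    }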
diff --git a/runtime/stack_indirect_reference_table.h b/runtime/stack_indirect_reference_table.h
index d22650b..c2d6a59 100644
--- a/runtime/stack_indirect_reference_table.h
+++ b/runtime/stack_indirect_reference_table.h
@@ -33,17 +33,24 @@
 class StackIndirectReferenceTable {
  public:
   explicit StackIndirectReferenceTable(mirror::Object* object) :
-      number_of_references_(1), link_(NULL) {
+      link_(NULL), number_of_references_(1) {
     references_[0].Assign(object);
   }
 
   ~StackIndirectReferenceTable() {}
 
   // Number of references contained within this SIRT
-  size_t NumberOfReferences() const {
+  uint32_t NumberOfReferences() const {
     return number_of_references_;
   }
 
+  // Returns the size of a StackIndirectReferenceTable containing num_references sirts.
+  static size_t SizeOf(uint32_t num_references) {
+    size_t header_size = OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
+    size_t data_size = sizeof(StackReference<mirror::Object>) * num_references;
+    return header_size + data_size;
+  }
+
   // Link to previous SIRT or NULL
   StackIndirectReferenceTable* GetLink() const {
     return link_;
@@ -54,6 +61,12 @@
     link_ = sirt;
   }
 
+  // Sets the number_of_references_ field for constructing tables out of raw memory. Warning: will
+  // not resize anything.
+  void SetNumberOfReferences(uint32_t num_references) {
+    number_of_references_ = num_references;
+  }
+
   mirror::Object* GetReference(size_t i) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK_LT(i, number_of_references_);
     return references_[i].AsMirrorPtr();
@@ -74,7 +87,7 @@
   }
 
   // Offset of length within SIRT, used by generated code
-  static size_t NumberOfReferencesOffset() {
+  static uint32_t NumberOfReferencesOffset() {
     return OFFSETOF_MEMBER(StackIndirectReferenceTable, number_of_references_);
   }
 
@@ -86,8 +99,8 @@
  private:
   StackIndirectReferenceTable() {}
 
-  size_t number_of_references_;
   StackIndirectReferenceTable* link_;
+  uint32_t number_of_references_;
 
   // number_of_references_ are available if this is allocated and filled in by jni_compiler.
   StackReference<mirror::Object> references_[1];
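
Reviewer note on SizeOf()/FinalizeSirt: a SIRT built in raw memory is the
header (everything up to references_) plus one StackReference per entry, and
the generic JNI visitor appends one null reference whenever that total is
not 8-byte aligned. A standalone sketch of the same arithmetic (the 16-byte
header and 4-byte reference sizes are assumed, illustrative values only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t header_size = 16;  // assumed offsetof(SIRT, references_)
      const uint32_t ref_size = 4;      // assumed sizeof(StackReference<mirror::Object>)
      uint32_t num_refs = 3;
      uint32_t size = header_size + ref_size * num_refs;   // 28 bytes, not 8-byte aligned
      if (size % 8 != 0) {
        ++num_refs;                                         // pad with one null reference
        size = header_size + ref_size * num_refs;           // 32 bytes, aligned
      }
      std::printf("refs=%u size=%u\n", num_refs, size);
      return 0;
    }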
diff --git a/runtime/thread.h b/runtime/thread.h
index 6df2b1c..c7ab735 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -320,8 +320,7 @@
 
   ThrowLocation GetCurrentLocationForThrow() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void SetTopOfStack(void* stack, uintptr_t pc) {
-    mirror::ArtMethod** top_method = reinterpret_cast<mirror::ArtMethod**>(stack);
+  void SetTopOfStack(mirror::ArtMethod** top_method, uintptr_t pc) {
     managed_stack_.SetTopQuickFrame(top_method);
     managed_stack_.SetTopQuickFramePc(pc);
   }