Merge "Annotate used fields."
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 5fa4596..35d193c 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1257,17 +1257,13 @@
   } else {
     LoadValueDirectFixed(rl_start, reg_start);
   }
-  int r_tgt = (cu_->instruction_set != kX86) ? LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf)) : 0;
+  int r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf));
   GenNullCheck(rl_obj.s_reg_low, reg_ptr, info->opt_flags);
   LIR* launch_pad = RawLIR(0, kPseudoIntrinsicRetry, WrapPointer(info));
   intrinsic_launchpads_.Insert(launch_pad);
   OpCmpImmBranch(kCondGt, reg_char, 0xFFFF, launch_pad);
   // NOTE: not a safepoint
-  if (cu_->instruction_set != kX86) {
-    OpReg(kOpBlx, r_tgt);
-  } else {
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pIndexOf));
-  }
+  OpReg(kOpBlx, r_tgt);
   LIR* resume_tgt = NewLIR0(kPseudoTargetLabel);
   launch_pad->operands[2] = WrapPointer(resume_tgt);
   // Record that we've already inlined & null checked
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index c36013f..8eb67aa 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -676,7 +676,7 @@
     bool GenInlinedAbsDouble(CallInfo* info);
     bool GenInlinedFloatCvt(CallInfo* info);
     bool GenInlinedDoubleCvt(CallInfo* info);
-    bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
+    virtual bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
     bool GenInlinedStringCompareTo(CallInfo* info);
     bool GenInlinedCurrentThread(CallInfo* info);
     bool GenInlinedUnsafeGet(CallInfo* info, bool is_long, bool is_volatile);
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 6481589..538ce0d 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -175,6 +175,8 @@
   { kX86Mov32AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32AI", "[!0r+!1r<<!2d+!3d],!4d" },
   { kX86Mov32TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32TI", "fs:[!0d],!1d" },
 
+  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
+
   { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
 
   { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
@@ -354,6 +356,7 @@
   { kX86Jmp8,  kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xEB, 0,    0, 0, 0, 0 }, "Jmp8",  "!0t" },
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
   { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
+  { kX86Jecxz8, kJmp, NO_OPERAND   | IS_BRANCH | NEEDS_FIXUP | REG_USEC,    { 0,             0, 0xE3, 0,    0, 0, 0, 0 }, "Jecxz", "!0t" },
   { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
   { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD | REG_USE0,        { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
   { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD | REG_USE01,       { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
@@ -364,6 +367,7 @@
   { kX86StartOfMethod, kMacro,  IS_UNARY_OP | SETS_CCODES,             { 0, 0, 0,    0, 0, 0, 0, 0 }, "StartOfMethod", "!0r" },
   { kX86PcRelLoadRA,   kPcRel,  IS_LOAD | IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8B, 0, 0, 0, 0, 0 }, "PcRelLoadRA",   "!0r,[!1r+!2r<<!3d+!4p]" },
   { kX86PcRelAdr,      kPcRel,  IS_LOAD | IS_BINARY_OP | REG_DEF0,     { 0, 0, 0xB8, 0, 0, 0, 0, 4 }, "PcRelAdr",      "!0r,!1d" },
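+  // 0x66 (operand-size prefix) selects the 16-bit scasw form; 0xF2 is the REPNE prefix.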
+  { kX86RepneScasw, kPrefix2Nullary, NO_OPERAND | SETS_CCODES,         { 0x66, 0xF2, 0xAF, 0, 0, 0, 0, 0 }, "RepNE ScasW", "" },
 };
 
 static size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib) {
@@ -407,6 +411,8 @@
       return lir->operands[0];  // length of nop is sole operand
     case kNullary:
       return 1;  // 1 byte of opcode
+    case kPrefix2Nullary:
+      return 3;  // 1 byte of opcode + 2 prefixes
     case kRegOpcode:  // lir operands - 0: reg
      return ComputeSize(entry, 0, 0, false) - 1;  // subtract 1 for modrm
     case kReg:  // lir operands - 0: reg
@@ -489,7 +495,7 @@
         return 6;  // 2 byte opcode + rel32
       }
     case kJmp:
-      if (lir->opcode == kX86Jmp8) {
+      if (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jecxz8) {
         return 2;  // opcode + rel8
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
@@ -957,6 +963,10 @@
     code_buffer_.push_back((rel >> 8) & 0xFF);
     code_buffer_.push_back((rel >> 16) & 0xFF);
     code_buffer_.push_back((rel >> 24) & 0xFF);
+  } else if (entry->opcode == kX86Jecxz8) {
+    DCHECK(IS_SIMM8(rel));
+    code_buffer_.push_back(0xE3);
+    code_buffer_.push_back(rel & 0xFF);
   } else {
     DCHECK(entry->opcode == kX86JmpR);
     code_buffer_.push_back(entry->skeleton.opcode);
@@ -1148,6 +1158,17 @@
           lir->operands[0] = delta;
           break;
         }
+        case kX86Jecxz8: {
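+          // jecxz has only a rel8 form, so the target must lie within the signed 8-bit range.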
+          LIR *target_lir = lir->target;
+          DCHECK(target_lir != NULL);
+          CodeOffset pc = lir->offset + 2;  // opcode + rel8
+          CodeOffset target = target_lir->offset;
+          int delta = target - pc;
+          lir->operands[0] = delta;
+          DCHECK(IS_SIMM8(delta));
+          break;
+        }
         case kX86Jmp8: {
           LIR *target_lir = lir->target;
           DCHECK(target_lir != NULL);
@@ -1226,6 +1247,14 @@
         DCHECK_EQ(0, entry->skeleton.ax_opcode);
         DCHECK_EQ(0, entry->skeleton.immediate_bytes);
         break;
+      case kPrefix2Nullary:  // 1 byte of opcode + 2 prefixes.
+        DCHECK_NE(0, entry->skeleton.prefix1);
+        DCHECK_NE(0, entry->skeleton.prefix2);
+        EmitPrefixAndOpcode(entry);
+        DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+        DCHECK_EQ(0, entry->skeleton.ax_opcode);
+        DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+        break;
       case kRegOpcode:  // lir operands - 0: reg
         EmitOpRegOpcode(entry, lir->operands[0]);
         break;
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6100a1d..421d51e 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -359,6 +359,15 @@
     void GenConstWide(RegLocation rl_dest, int64_t value);
 
     /*
+     * @brief Generate inline code for the fast path of String.indexOf.
+     * @param info Call parameters
+     * @param zero_based 'true' if the search starts at index 0.
+     * @returns 'true' if the call was inlined, 'false' if a regular call needs to be
+     * generated.
+     */
+    bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
+
+    /*
      * @brief Return the correct x86 opcode for the Dex operation
      * @param op Dex opcode for the operation
      * @param loc Register location of the operand
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index fa9a944..ad5b154 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -185,6 +185,14 @@
   if (flags & REG_USEB) {
     SetupRegMask(&lir->u.m.use_mask, rBX);
   }
+
+  // Fix up a hard-to-describe instruction: uses rAX, rCX, rDI; sets rDI.
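+  // (repne scasw compares AX with the word at [EDI], advancing EDI and
+  // decrementing ECX after each comparison, until a match or ECX == 0.)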
+  if (lir->opcode == kX86RepneScasw) {
+    SetupRegMask(&lir->u.m.use_mask, rAX);
+    SetupRegMask(&lir->u.m.use_mask, rCX);
+    SetupRegMask(&lir->u.m.use_mask, rDI);
+    SetupRegMask(&lir->u.m.def_mask, rDI);
+  }
 }
 
 /* For dumping instructions */
@@ -936,4 +944,174 @@
   Mir2Lir::InstallLiteralPools();
 }
 
+// Offsets within java.lang.String.
+#define STRING_VALUE_OFFSET 8
+#define STRING_COUNT_OFFSET 12
+#define STRING_OFFSET_OFFSET 20
+#define STRING_DATA_OFFSET 12
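+// (value_ holds the backing char[], offset_/count_ select the live range within
+// it, and STRING_DATA_OFFSET is the offset of element 0 inside the array object.)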
+
+/*
+ * Fast String.indexOf(I) & (II).  Inline check for the simple case of char <= 0xffff,
+ * otherwise bails to standard library code.
+ */
+bool X86Mir2Lir::GenInlinedIndexOf(CallInfo* info, bool zero_based) {
+  ClobberCallerSave();
+  LockCallTemps();  // Using fixed registers
+
+  // EAX: 16-bit character being searched for.
+  // ECX: count, the number of 16-bit units left to search.
+  // EDI: pointer into the string being searched.
+  // EDX: temporary during execution.
+  // EBX: temporary during execution.
+
+  RegLocation rl_obj = info->args[0];
+  RegLocation rl_char = info->args[1];
+  RegLocation rl_start = info->args[2];
+
+  uint32_t char_value =
+    rl_char.is_const ? mir_graph_->ConstantValue(rl_char.orig_sreg) : 0;
+
+  if (char_value > 0xFFFF) {
+    // We have to punt to the real String.indexOf.
+    return false;
+  }
+
+  // Okay, we are committed to inlining this.
+  RegLocation rl_return = GetReturn(false);
+  RegLocation rl_dest = InlineTarget(info);
+
+  // Is the string non-NULL?
+  LoadValueDirectFixed(rl_obj, rDX);
+  GenNullCheck(rl_obj.s_reg_low, rDX, info->opt_flags);
+
+  // Record that we have inlined & null checked the object.
+  info->opt_flags |= (MIR_INLINED | MIR_IGNORE_NULL_CHECK);
+
+  // Does the character fit in 16 bits?
+  LIR* launch_pad = nullptr;
+  if (rl_char.is_const) {
+    // We need the value in EAX.
+    LoadConstantNoClobber(rAX, char_value);
+  } else {
+    // Character is not a constant; compare at runtime.
+    LoadValueDirectFixed(rl_char, rAX);
+    launch_pad = RawLIR(0, kPseudoIntrinsicRetry, WrapPointer(info));
+    intrinsic_launchpads_.Insert(launch_pad);
+    OpCmpImmBranch(kCondGt, rAX, 0xFFFF, launch_pad);
+  }
+
+  // From here down, we know that we are looking for a char that fits in 16 bits.
+
+  // Character is in EAX.
+  // Object pointer is in EDX.
+
+  // We need to preserve EDI, but have no spare registers, so push it on the stack.
+  // We have to remember that all stack addresses after this are offset by sizeof(EDI).
+  NewLIR1(kX86Push32R, rDI);
+
+  // Compute into rCX the number of 16-bit units to search.
+  LoadWordDisp(rDX, STRING_COUNT_OFFSET, rCX);
+  LIR *length_compare = nullptr;
+  int start_value = 0;
+  if (zero_based) {
+    // We have to handle an empty string.  Use special instruction JECXZ.
+    length_compare = NewLIR0(kX86Jecxz8);
+  } else {
+    // We have to offset by the start index.
+    if (rl_start.is_const) {
+      start_value = mir_graph_->ConstantValue(rl_start.orig_sreg);
+      start_value = std::max(start_value, 0);
+
+      // Is the start > count?
+      length_compare = OpCmpImmBranch(kCondLe, rCX, start_value, nullptr);
+
+      if (start_value != 0) {
+        OpRegImm(kOpSub, rCX, start_value);
+      }
+    } else {
+      // Runtime start index.
+      rl_start = UpdateLoc(rl_start);
+      if (rl_start.location == kLocPhysReg) {
+        length_compare = OpCmpBranch(kCondLe, rCX, rl_start.low_reg, nullptr);
+        OpRegReg(kOpSub, rCX, rl_start.low_reg);
+      } else {
+        // Compare to memory to avoid a register load.  Handle pushed EDI.
+        int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
+        OpRegMem(kOpCmp, rCX, rX86_SP, displacement);
+        length_compare = NewLIR2(kX86Jcc8, 0, kX86CondLe);
+        OpRegMem(kOpSub, rCX, rX86_SP, displacement);
+      }
+    }
+  }
+  DCHECK(length_compare != nullptr);
+
+  // ECX now contains the count in words to be searched.
+
+  // Load the address of the string into EBX.
+  // The string starts at VALUE(String) + 2 * OFFSET(String) + STRING_DATA_OFFSET.
+  LoadWordDisp(rDX, STRING_VALUE_OFFSET, rDI);
+  LoadWordDisp(rDX, STRING_OFFSET_OFFSET, rBX);
+  OpLea(rBX, rDI, rBX, 1, STRING_DATA_OFFSET);
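+  // i.e. rBX = value + 2 * offset + STRING_DATA_OFFSET, the address of the first char.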
+
+  // Now compute into EDI where the search will start.
+  if (zero_based || rl_start.is_const) {
+    if (start_value == 0) {
+      OpRegCopy(rDI, rBX);
+    } else {
+      NewLIR3(kX86Lea32RM, rDI, rBX, 2 * start_value);
+    }
+  } else {
+    if (rl_start.location == kLocPhysReg) {
+      if (rl_start.low_reg == rDI) {
+        // We have a slight problem here.  We are already using RDI!
+        // Grab the value from the stack.
+        LoadWordDisp(rX86_SP, 0, rDX);
+        OpLea(rDI, rBX, rDX, 1, 0);
+      } else {
+        OpLea(rDI, rBX, rl_start.low_reg, 1, 0);
+      }
+    } else {
+      OpRegCopy(rDI, rBX);
+      // Load the start index from stack, remembering that we pushed EDI.
+      int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
+      LoadWordDisp(rX86_SP, displacement, rDX);
+      OpLea(rDI, rBX, rDX, 1, 0);
+    }
+  }
+
+  // EDI now contains the start of the string to be searched.
+  // We are all prepared to do the search for the character.
+  NewLIR0(kX86RepneScasw);
+
+  // Did we find a match?
+  LIR* failed_branch = OpCondBranch(kCondNe, nullptr);
+
+  // Yes, we matched.  Compute the index of the result.
+  // index = ((curr_ptr - orig_ptr) / 2) - 1.
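+  // (repne scasw stops with EDI one word past the match, hence the -1.)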
+  OpRegReg(kOpSub, rDI, rBX);
+  OpRegImm(kOpAsr, rDI, 1);
+  NewLIR3(kX86Lea32RM, rl_return.low_reg, rDI, -1);
+  LIR *all_done = NewLIR1(kX86Jmp8, 0);
+
+  // Failed to match; return -1.
+  LIR *not_found = NewLIR0(kPseudoTargetLabel);
+  length_compare->target = not_found;
+  failed_branch->target = not_found;
+  LoadConstantNoClobber(rl_return.low_reg, -1);
+
+  // And join up at the end.
+  all_done->target = NewLIR0(kPseudoTargetLabel);
+  // Restore EDI from the stack.
+  NewLIR1(kX86Pop32R, rDI);
+
+  // Out of line code returns here.
+  if (launch_pad != nullptr) {
+    LIR *return_point = NewLIR0(kPseudoTargetLabel);
+    launch_pad->operands[2] = WrapPointer(return_point);
+  }
+
+  StoreValue(rl_dest, rl_return);
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 480d5f5..4064bd6 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -277,6 +277,7 @@
   kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
   kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
   kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
+  kX86Lea32RM,
   kX86Lea32RA,
   // RRC - Register Register ConditionCode - cond_opcode reg1, reg2
   //             - lir operands - 0: reg1, 1: reg2, 2: CC
@@ -384,6 +385,7 @@
   kX86Jcc8, kX86Jcc32,  // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned
   kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
   kX86JmpR,             // jmp reg; lir operands - 0: reg
+  kX86Jecxz8,           // jecxz rel8; jump relative if ECX is zero.
   kX86CallR,            // call reg; lir operands - 0: reg
   kX86CallM,            // call [base + disp]; lir operands - 0: base, 1: disp
   kX86CallA,            // call [base + index * scale + disp]
@@ -396,6 +398,7 @@
   kX86PcRelLoadRA,      // mov reg, [base + index * scale + PC relative displacement]
                         // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
   kX86PcRelAdr,         // mov reg, PC relative displacement; lir operands - 0: reg, 1: table
+  kX86RepneScasw,       // repne scasw
   kX86Last
 };
 
@@ -404,6 +407,7 @@
   kData,                                   // Special case for raw data.
   kNop,                                    // Special case for variable length nop.
   kNullary,                                // Opcode that takes no arguments.
+  kPrefix2Nullary,                         // Opcode that takes no arguments, but 2 prefixes.
   kRegOpcode,                              // Shorter form of R instruction kind (opcode+rd)
   kReg, kMem, kArray,                      // R, M and A instruction kinds.
   kMemReg, kArrayReg, kThreadReg,          // MR, AR and TR instruction kinds.
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index dab419f..ce339bf 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -1043,6 +1043,21 @@
       }
     }
 
+    const bool kSaveDexInput = false;
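+    // kSaveDexInput: flip to true to write each input dex file to /data/local/tmp
+    // so a failing compilation can be reproduced offline.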
+    if (kSaveDexInput) {
+      for (size_t i = 0; i < dex_files.size(); ++i) {
+        const DexFile* dex_file = dex_files[i];
+        std::string tmp_file_name(StringPrintf("/data/local/tmp/dex2oat.%d.%zu.dex", getpid(), i));
+        UniquePtr<File> tmp_file(OS::CreateEmptyFile(tmp_file_name.c_str()));
+        if (tmp_file.get() == nullptr) {
+          PLOG(ERROR) << "Failed to open file " << tmp_file_name << ". Try: adb shell chmod 777 /data/local/tmp";
+          continue;
+        }
+        tmp_file->WriteFully(dex_file->Begin(), dex_file->Size());
+        LOG(INFO) << "Wrote input to " << tmp_file_name;
+      }
+    }
+
     // Ensure opened dex files are writable for dex-to-dex transformations.
     for (const auto& dex_file : dex_files) {
       if (!dex_file->EnableWrite()) {
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index b6ddc95..d86ba7b 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -633,6 +633,9 @@
   case 0x99:
     opcode << "cdq";
     break;
+  case 0xAF:
+    opcode << (prefix[2] == 0x66 ? "scasw" : "scasl");
+    break;
   case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7:
     opcode << "mov";
     immediate_bytes = 1;
@@ -693,6 +696,7 @@
     has_modrm = true;
     reg_is_opcode = true;
     break;
+  case 0xE3: opcode << "jecxz"; branch_bytes = 1; break;
   case 0xE8: opcode << "call"; branch_bytes = 4; break;
   case 0xE9: opcode << "jmp"; branch_bytes = 4; break;
   case 0xEB: opcode << "jmp"; branch_bytes = 1; break;
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 53b07f9..2a0d826 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -808,6 +808,10 @@
     {
       WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
       heap->FlushAllocStack();
+      // Since FlushAllocStack() above resets the (active) allocation stack,
+      // we need to revoke the thread-local allocation stacks that point
+      // into it.
+      heap->RevokeAllThreadLocalAllocationStacks(self);
     }
     {
       std::ostream* saved_os = os_;
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 589c7d9..888310a 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -84,7 +84,6 @@
 
 // Intrinsic entrypoints.
 extern "C" int32_t art_quick_memcmp16(void*, void*, int32_t);
-extern "C" int32_t art_quick_indexof(void*, uint32_t, uint32_t, uint32_t);
 extern "C" int32_t art_quick_string_compareto(void*, void*);
 extern "C" void* art_quick_memcpy(void*, const void*, size_t);
 
@@ -193,7 +192,7 @@
   qpoints->pUshrLong = art_quick_lushr;
 
   // Intrinsics
-  qpoints->pIndexOf = art_quick_indexof;
+  // qpoints->pIndexOf = nullptr;  // Not needed on x86
   qpoints->pMemcmp16 = art_quick_memcmp16;
   qpoints->pStringCompareTo = art_quick_string_compareto;
   qpoints->pMemcpy = art_quick_memcpy;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index c76c6b2..7597a4e 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1277,56 +1277,6 @@
 END_FUNCTION art_quick_deoptimize
 
     /*
-     * String's indexOf.
-     *
-     * On entry:
-     *    eax:   string object (known non-null)
-     *    ecx:   char to match (known <= 0xFFFF)
-     *    edx:   Starting offset in string data
-     */
-DEFINE_FUNCTION art_quick_indexof
-    PUSH edi                      // push callee save reg
-    mov STRING_COUNT_OFFSET(%eax), %ebx
-    mov STRING_VALUE_OFFSET(%eax), %edi
-    mov STRING_OFFSET_OFFSET(%eax), %eax
-    testl %edx, %edx              // check if start < 0
-    jl   clamp_min
-clamp_done:
-    cmpl %ebx, %edx               // check if start >= count
-    jge  not_found
-    lea  STRING_DATA_OFFSET(%edi, %eax, 2), %edi  // build a pointer to the start of string data
-    mov  %edi, %eax               // save a copy in eax to later compute result
-    lea  (%edi, %edx, 2), %edi    // build pointer to start of data to compare
-    subl  %edx, %ebx              // compute iteration count
-    /*
-     * At this point we have:
-     *   eax: original start of string data
-     *   ecx: char to compare
-     *   ebx: length to compare
-     *   edi: start of data to test
-     */
-    mov  %eax, %edx
-    mov  %ecx, %eax               // put char to match in %eax
-    mov  %ebx, %ecx               // put length to compare in %ecx
-    repne scasw                   // find %ax, starting at [%edi], up to length %ecx
-    jne  not_found
-    subl %edx, %edi
-    sar  LITERAL(1), %edi
-    decl %edi                     // index = ((curr_ptr - orig_ptr) / 2) - 1
-    mov  %edi, %eax
-    POP edi                       // pop callee save reg
-    ret
-    .balign 16
-not_found:
-    mov  LITERAL(-1), %eax        // return -1 (not found)
-    POP edi                       // pop callee save reg
-    ret
-clamp_min:
-    xor  %edx, %edx               // clamp start to 0
-    jmp  clamp_done
-END_FUNCTION art_quick_indexof
-
-    /*
      * String's compareTo.
      *
      * On entry:
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index d425ed8..444fa22 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -19,6 +19,13 @@
 
 #include "asm_support.h"
 
+// Offset of field Runtime::callee_save_methods_[kSaveAll]
+#define RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET 208
+// Offset of field Runtime::callee_save_methods_[kRefsOnly]
+#define RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET 216
+// Offset of field Runtime::callee_save_methods_[kRefsAndArgs]
+#define RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET 224
+
 // Offset of field Thread::self_ verified in InitCpu
 #define THREAD_SELF_OFFSET 72
 // Offset of field Thread::card_table_ verified in InitCpu
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 44bc7a2..ac238f0 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -46,12 +46,63 @@
      * Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
      */
 MACRO0(SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME)
-    int3
-    int3
+    // R10 := Runtime::Current()
+    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
+    movq (%r10), %r10
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r15  // Callee save.
+    PUSH r14  // Callee save.
+    PUSH r13  // Callee save.
+    PUSH r12  // Callee save.
+    PUSH r9   // Arg.
+    PUSH r8   // Arg.
+    PUSH rsi  // Arg.
+    PUSH rbp  // Callee save.
+    PUSH rbx  // Callee save.
+    PUSH rdx  // Arg.
+    PUSH rcx  // Arg.
+    // Create space for FPR args plus 2 extra slots: 1 of padding and 1 for the ArtMethod*.
+    subq LITERAL(80), %rsp
+    CFI_ADJUST_CFA_OFFSET(80)
+    // R10 := ArtMethod* for ref and args callee save frame method.
+    movq RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    // Save FPRs.
+    movq %xmm0, 16(%rsp)
+    movq %xmm1, 24(%rsp)
+    movq %xmm2, 32(%rsp)
+    movq %xmm3, 40(%rsp)
+    movq %xmm4, 48(%rsp)
+    movq %xmm5, 56(%rsp)
+    movq %xmm6, 64(%rsp)
+    movq %xmm7, 72(%rsp)
+    // Store ArtMethod* to bottom of stack.
+    movq %r10, 0(%rsp)
 END_MACRO
 
 MACRO0(RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME)
-    int3
+    // Restore FPRs.
+    movq 16(%rsp), %xmm0
+    movq 24(%rsp), %xmm1
+    movq 32(%rsp), %xmm2
+    movq 40(%rsp), %xmm3
+    movq 48(%rsp), %xmm4
+    movq 56(%rsp), %xmm5
+    movq 64(%rsp), %xmm6
+    movq 72(%rsp), %xmm7
+    addq LITERAL(80), %rsp
+    CFI_ADJUST_CFA_OFFSET(-80)
+    // Restore callee and GPR args, mixed together to agree with core spills bitmap.
+    POP rcx
+    POP rdx
+    POP rbx
+    POP rbp
+    POP rsi
+    POP r8
+    POP r9
+    POP r12
+    POP r13
+    POP r14
+    POP r15
 END_MACRO
 
     /*
@@ -147,20 +198,210 @@
 INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
+
+    /*
+     * Helper for quick invocation stub to set up XMM registers. Assumes r10 == shorty,
+     * r11 == arg_array. Clobbers r10, r11 and al. Branches to xmm_setup_finished if it encounters
+     * the end of the shorty.
+     */
+MACRO2(LOOP_OVER_SHORTY_LOADING_XMMS, xmm_reg, finished)
+1: // LOOP
+    movb (%r10), %al              // al := *shorty
+    addq LITERAL(1), %r10         // shorty++
+    cmpb LITERAL(0), %al          // if (al == '\0') goto xmm_setup_finished
+    je VAR(finished, 1)
+    cmpb LITERAL(68), %al         // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb LITERAL(70), %al         // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addq LITERAL(4), %r11         // arg_array++
+    //  Handle extra space in arg array taken by a long.
+    cmpb LITERAL(74), %al         // if (al != 'J') goto LOOP
+    jne 1b
+    addq LITERAL(4), %r11         // arg_array++
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    movsd (%r11), REG_VAR(xmm_reg, 0)
+    addq LITERAL(8), %r11         // arg_array+=2
+    jmp 4f
+3:  // FOUND_FLOAT
+    movss (%r11), REG_VAR(xmm_reg, 0)
+    addq LITERAL(4), %r11         // arg_array++
+4:
+END_MACRO
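+
+    /*
+     * Worked example (illustrative): for shorty "IDF" (int return, then a double
+     * and a float argument), the first expansion loads the double into xmm0 and
+     * the second loads the float into xmm1; integer entries only advance arg_array.
+     */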
+
+    /*
+     * Helper for quick invocation stub to set up GPR registers. Assumes r10 == shorty,
+     * r11 == arg_array. Clobbers r10, r11 and al. Branches to gpr_setup_finished if it encounters
+     * the end of the shorty.
+     */
+MACRO3(LOOP_OVER_SHORTY_LOADING_GPRS, gpr_reg64, gpr_reg32, finished)
+1: // LOOP
+    movb (%r10), %al              // al := *shorty
+    addq LITERAL(1), %r10         // shorty++
+    cmpb LITERAL(0), %al          // if (al == '\0') goto gpr_setup_finished
+    je  VAR(finished, 2)
+    cmpb LITERAL(74), %al         // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb LITERAL(70), %al         // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb LITERAL(68), %al         // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    movl (%r11), REG_VAR(gpr_reg32, 1)
+    addq LITERAL(4), %r11         // arg_array++
+    jmp 5f
+2:  // FOUND_LONG
+    movq (%r11), REG_VAR(gpr_reg64, 0)
+    addq LITERAL(8), %r11         // arg_array+=2
+    jmp 5f
+3:  // SKIP_FLOAT
+    addq LITERAL(4), %r11         // arg_array++
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addq LITERAL(8), %r11         // arg_array+=2
+    jmp 1b
+5:
+END_MACRO
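+
+    /*
+     * Illustrative: for shorty "VJI" (void return, a long then an int argument),
+     * the first expansion loads the long into the 64-bit register and the second
+     * loads the int into the 32-bit register; F/D entries only advance arg_array.
+     */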
+
     /*
      * Quick invocation stub.
      * On entry:
      *   [sp] = return address
      *   rdi = method pointer
-     *   rsi = argument array or NULL for no argument methods
+     *   rsi = argument array that must at least contain the this pointer.
      *   rdx = size of argument array in bytes
      *   rcx = (managed) thread pointer
      *   r8 = JValue* result
      *   r9 = char* shorty
      */
 DEFINE_FUNCTION art_quick_invoke_stub
-    int3
-    int3
+    // Set up argument XMM registers.
+    leaq 1(%r9), %r10             // R10 := shorty + 1  ; ie skip return arg character.
+    leaq 4(%rsi), %r11            // R11 := arg_array + 4 ; ie skip this pointer.
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm4, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm5, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm6, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm7, .Lxmm_setup_finished
+    .balign 16
+.Lxmm_setup_finished:
+    PUSH rbp                      // Save rbp.
+    PUSH r8                       // Save r8/result*.
+    PUSH r9                       // Save r9/shorty*.
+    mov %rsp, %rbp                // Copy value of stack pointer into base pointer.
+    CFI_DEF_CFA_REGISTER(rbp)
+    movl %edx, %r10d
+    addl LITERAL(64), %edx        // Reserve space for return addr, method*, rbp, r8 and r9 in frame.
+    andl LITERAL(0xFFFFFFF0), %edx    // Align frame size to 16 bytes.
+    subl LITERAL(32), %edx        // Remove space for return address, rbp, r8 and r9.
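+    // (e.g. 20 bytes of args: 20 + 64 = 84, aligned down to 80, minus 32 leaves
+    // 48 bytes for the method* slot, the copied arguments and padding.)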
+    subq %rdx, %rsp               // Reserve stack space for argument array.
+    movq LITERAL(0), (%rsp)       // Store NULL for method*
+    movl %r10d, %ecx              // Place size of args in rcx.
+    movq %rdi, %rax               // RAX := method to be called
+    movq %rsi, %r11               // R11 := arg_array
+    leaq 8(%rsp), %rdi            // RDI points just above the method* in the stack arguments.
+    // Copy arg array into stack.
+    rep movsb                     // while (rcx--) { *rdi++ = *rsi++ }
+    leaq 1(%r9), %r10             // R10 := shorty + 1  ; ie skip return arg character
+    movq %rax, %rdi               // RDI := method to be called
+    movl (%r11), %esi             // RSI := this pointer
+    addq LITERAL(4), %r11         // arg_array++
+    LOOP_OVER_SHORTY_LOADING_GPRS rdx, edx, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_LOADING_GPRS rcx, ecx, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_LOADING_GPRS r8, r8d, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_LOADING_GPRS r9, r9d, .Lgpr_setup_finished
+.Lgpr_setup_finished:
+    call *METHOD_QUICK_CODE_OFFSET(%rdi) // Call the method.
+    movq %rbp, %rsp               // Restore stack pointer.
+    CFI_DEF_CFA_REGISTER(rsp)
+    POP r9                        // Pop r9 - shorty*.
+    POP r8                        // Pop r8 - result*.
+    POP rbp                       // Pop rbp
+    cmpb LITERAL(68), (%r9)       // Test if result type char == 'D'.
+    je .Lreturn_double_quick
+    cmpb LITERAL(70), (%r9)       // Test if result type char == 'F'.
+    je .Lreturn_float_quick
+    movq %rax, (%r8)              // Store the result assuming it's a long, int or Object*
+    ret
+.Lreturn_double_quick:
+    movsd %xmm0, (%r8)           // Store the double floating point result.
+    ret
+.Lreturn_float_quick:
+    movss %xmm0, (%r8)           // Store the floating point result.
+    ret
+END_FUNCTION art_quick_invoke_stub
+
+    /*
+     * Quick invocation stub for static methods.
+     * On entry:
+     *   [sp] = return address
+     *   rdi = method pointer
+     *   rsi = argument array or NULL if no arguments.
+     *   rdx = size of argument array in bytes
+     *   rcx = (managed) thread pointer
+     *   r8 = JValue* result
+     *   r9 = char* shorty
+     */
+DEFINE_FUNCTION art_quick_invoke_static_stub
+    // Set up argument XMM registers.
+    leaq 1(%r9), %r10             // R10 := shorty + 1  ; ie skip return arg character
+    movq %rsi, %r11               // R11 := arg_array
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm4, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm5, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm6, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm7, .Lxmm_setup_finished2
+    .balign 16
+.Lxmm_setup_finished2:
+    PUSH rbp                      // Save rbp.
+    PUSH r8                       // Save r8/result*.
+    PUSH r9                       // Save r9/shorty*.
+    mov %rsp, %rbp                // Copy value of stack pointer into base pointer.
+    CFI_DEF_CFA_REGISTER(rbp)
+    movl %edx, %r10d
+    addl LITERAL(64), %edx        // Reserve space for return addr, method*, rbp, r8 and r9 in frame.
+    andl LITERAL(0xFFFFFFF0), %edx    // Align frame size to 16 bytes.
+    subl LITERAL(32), %edx        // Remove space for return address, rbp, r8 and r9.
+    subq %rdx, %rsp               // Reserve stack space for argument array.
+    movq LITERAL(0), (%rsp)       // Store NULL for method*
+    movl %r10d, %ecx              // Place size of args in rcx.
+    movq %rdi, %rax               // RAX := method to be called
+    movq %rsi, %r11               // R11 := arg_array
+    leaq 8(%rsp), %rdi            // RDI points just above the method* in the stack arguments.
+    // Copy arg array into stack.
+    rep movsb                     // while (rcx--) { *rdi++ = *rsi++ }
+    leaq 1(%r9), %r10             // R10 := shorty + 1  ; ie skip return arg character
+    movq %rax, %rdi               // RDI := method to be called
+    LOOP_OVER_SHORTY_LOADING_GPRS rsi, esi, .Lgpr_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_GPRS rdx, edx, .Lgpr_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_GPRS rcx, ecx, .Lgpr_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_GPRS r8, r8d, .Lgpr_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_GPRS r9, r9d, .Lgpr_setup_finished2
+.Lgpr_setup_finished2:
+    call *METHOD_QUICK_CODE_OFFSET(%rdi) // Call the method.
+    movq %rbp, %rsp               // Restore stack pointer.
+    CFI_DEF_CFA_REGISTER(rsp)
+    POP r9                        // Pop r9 - shorty*.
+    POP r8                        // Pop r8 - result*.
+    POP rbp                       // Pop rbp
+    cmpb LITERAL(68), (%r9)       // Test if result type char == 'D'.
+    je .Lreturn_double_quick2
+    cmpb LITERAL(70), (%r9)       // Test if result type char == 'F'.
+    je .Lreturn_float_quick2
+    movq %rax, (%r8)              // Store the result assuming it's a long, int or Object*
+    ret
+.Lreturn_double_quick2:
+    movsd %xmm0, (%r8)           // Store the double floating point result.
+    ret
+.Lreturn_float_quick2:
+    movss %xmm0, (%r8)           // Store the floating point result.
+    ret
 END_FUNCTION art_quick_invoke_stub
 
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
@@ -210,8 +451,11 @@
 END_MACRO
 
 MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION)
-    int3
-    int3
+    movq %gs:THREAD_EXCEPTION_OFFSET, %rcx // get exception field
+    testq %rcx, %rcx               // rcx == 0 ?
+    jnz 1f                         // if rcx != 0 goto 1
+    ret                            // return
+1:                                 // deliver exception on current thread
     DELIVER_PENDING_EXCEPTION
 END_MACRO
 
@@ -390,7 +634,22 @@
      */
 UNIMPLEMENTED art_quick_imt_conflict_trampoline
 UNIMPLEMENTED art_quick_resolution_trampoline
-UNIMPLEMENTED art_quick_to_interpreter_bridge
+
+    /*
+     * Called to bridge from the quick to interpreter ABI. On entry the arguments match those
+     * of a quick call:
+     * RDI = method being called / to bridge to.
+     * RSI, RDX, RCX, R8, R9 are arguments to that method.
+     */
+DEFINE_FUNCTION art_quick_to_interpreter_bridge
+    SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME   // Set up frame and save arguments.
+    movq %gs:THREAD_SELF_OFFSET, %rsi      // RSI := Thread::Current()
+    movq %rsp, %rdx                        // RDX := sp
+    call PLT_SYMBOL(artQuickToInterpreterBridge)  // (method, Thread*, SP)
+    RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME  // TODO: no need to restore arguments in this case.
+    movq %rax, %xmm0                   // Also place the return value into the floating point return register.
+    RETURN_OR_DELIVER_PENDING_EXCEPTION    // return or deliver exception
+END_FUNCTION art_quick_to_interpreter_bridge
 
     /*
      * Routine that intercepts method calls and returns.
diff --git a/runtime/arch/x86_64/registers_x86_64.h b/runtime/arch/x86_64/registers_x86_64.h
index 9808d91..c1a9942 100644
--- a/runtime/arch/x86_64/registers_x86_64.h
+++ b/runtime/arch/x86_64/registers_x86_64.h
@@ -48,6 +48,26 @@
 };
 std::ostream& operator<<(std::ostream& os, const Register& rhs);
 
+enum FloatRegister {
+  XMM0 = 0,
+  XMM1 = 1,
+  XMM2 = 2,
+  XMM3 = 3,
+  XMM4 = 4,
+  XMM5 = 5,
+  XMM6 = 6,
+  XMM7 = 7,
+  XMM8 = 8,
+  XMM9 = 9,
+  XMM10 = 10,
+  XMM11 = 11,
+  XMM12 = 12,
+  XMM13 = 13,
+  XMM14 = 14,
+  XMM15 = 15,
+};
+std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs);
+
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index 9e45a72..b74fc5d 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -48,6 +48,12 @@
   CHECK_EQ(self_check, this);
 
   // Sanity check other offsets.
+  CHECK_EQ(static_cast<size_t>(RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET),
+           Runtime::GetCalleeSaveMethodOffset(Runtime::kSaveAll));
+  CHECK_EQ(static_cast<size_t>(RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET),
+           Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsOnly));
+  CHECK_EQ(static_cast<size_t>(RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET),
+           Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsAndArgs));
   CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
   CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
   CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
diff --git a/runtime/atomic.h b/runtime/atomic.h
index 2a47e46..795f917 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -96,7 +96,7 @@
 // quasiatomic operations that are performed on partially-overlapping
 // memory.
 class QuasiAtomic {
-#if !defined(__arm__) && !defined(__i386__)
+#if defined(__mips__) && !defined(__LP64__)
   static constexpr bool kNeedSwapMutexes = true;
 #else
   static constexpr bool kNeedSwapMutexes = false;
@@ -141,7 +141,7 @@
   }
 
   static void MembarLoadStore() {
-  #if defined(__arm__)
+  #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("" : : : "memory");
@@ -153,7 +153,7 @@
   }
 
   static void MembarLoadLoad() {
-  #if defined(__arm__)
+  #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("" : : : "memory");
@@ -165,7 +165,7 @@
   }
 
   static void MembarStoreStore() {
-  #if defined(__arm__)
+  #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ishst" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("" : : : "memory");
@@ -177,7 +177,7 @@
   }
 
   static void MembarStoreLoad() {
-  #if defined(__arm__)
+  #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("mfence" : : : "memory");
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 3c238d6..89f841e 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -594,12 +594,12 @@
     MutexLock mu(Thread::Current(), *Locks::deoptimization_lock_);
     gDeoptimizationRequests.clear();
   }
-  runtime->GetInstrumentation()->DisableDeoptimization();
   runtime->GetInstrumentation()->RemoveListener(&gDebugInstrumentationListener,
                                                 instrumentation::Instrumentation::kMethodEntered |
                                                 instrumentation::Instrumentation::kMethodExited |
                                                 instrumentation::Instrumentation::kDexPcMoved |
                                                 instrumentation::Instrumentation::kExceptionCaught);
+  runtime->GetInstrumentation()->DisableDeoptimization();
   gDebuggerActive = false;
   gRegistry->Clear();
   gDebuggerConnected = false;
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index 5e2b9ff..a3415d3 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -721,7 +721,8 @@
   }
 
   if (it.HasNext()) {
-    LOG(ERROR) << "invalid stream - problem with parameter iterator in " << GetLocation();
+    LOG(ERROR) << "invalid stream - problem with parameter iterator in " << GetLocation()
+               << " for method " << PrettyMethod(method_idx, *this);
     return;
   }
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 012dabb..b3fce5a 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -35,10 +35,14 @@
 
 // Visits the arguments as saved to the stack by a Runtime::kRefAndArgs callee save frame.
 class QuickArgumentVisitor {
- public:
-// Offset to first (not the Method*) argument in a Runtime::kRefAndArgs callee save frame.
-// Size of Runtime::kRefAndArgs callee save frame.
-// Size of Method* and register parameters in out stack arguments.
+  // Size of each spilled GPR.
+#ifdef __LP64__
+  static constexpr size_t kBytesPerGprSpillLocation = 8;
+#else
+  static constexpr size_t kBytesPerGprSpillLocation = 4;
+#endif
+  // Number of bytes for each out register in the caller method's frame.
+  static constexpr size_t kBytesStackArgLocation = 4;
 #if defined(__arm__)
   // The callee save frame is pointed to by SP.
   // | argN       |  |
@@ -53,12 +57,19 @@
   // | R3         |    arg3
   // | R2         |    arg2
   // | R1         |    arg1
-  // | R0         |
+  // | R0         |    padding
   // | Method*    |  <- sp
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET 8
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET 44
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE 48
-#define QUICK_STACK_ARG_SKIP 16
+  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 8;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 44;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 48;  // Frame size.
+  static size_t GprIndexToGprOffset(uint32_t gpr_index) {
+    return gpr_index * kBytesPerGprSpillLocation;
+  }
 #elif defined(__mips__)
   // The callee save frame is pointed to by SP.
   // | argN       |  |
@@ -74,10 +85,17 @@
   // | A2         |    arg2
   // | A1         |    arg1
   // | A0/Method* |  <- sp
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET 4
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET 60
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE 64
-#define QUICK_STACK_ARG_SKIP 16
+  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 60;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 64;  // Frame size.
+  static size_t GprIndexToGprOffset(uint32_t gpr_index) {
+    return gpr_index * kBytesPerGprSpillLocation;
+  }
 #elif defined(__i386__)
   // The callee save frame is pointed to by SP.
   // | argN        |  |
@@ -93,49 +111,96 @@
   // | EDX         |    arg2
   // | ECX         |    arg1
   // | EAX/Method* |  <- sp
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET 4
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET 28
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE 32
-#define QUICK_STACK_ARG_SKIP 16
+  static constexpr bool kSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr size_t kNumGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr size_t kNumFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 32;  // Frame size.
+  static size_t GprIndexToGprOffset(uint32_t gpr_index) {
+    return gpr_index * kBytesPerGprSpillLocation;
+  }
 #elif defined(__x86_64__)
-// TODO: implement and check these.
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET 8
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET 56
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE 64
-#define QUICK_STACK_ARG_SKIP 32
+  // The callee save frame is pointed to by SP.
+  // | argN            |  |
+  // | ...             |  |
+  // | reg. arg spills |  |  Caller's frame
+  // | Method*         | ---
+  // | Return          |
+  // | R15             |    callee save
+  // | R14             |    callee save
+  // | R13             |    callee save
+  // | R12             |    callee save
+  // | R9              |    arg5
+  // | R8              |    arg4
+  // | RSI/R6          |    arg1
+  // | RBP/R5          |    callee save
+  // | RBX/R3          |    callee save
+  // | RDX/R2          |    arg2
+  // | RCX/R1          |    arg3
+  // | XMM7            |    float arg 8
+  // | XMM6            |    float arg 7
+  // | XMM5            |    float arg 6
+  // | XMM4            |    float arg 5
+  // | XMM3            |    float arg 4
+  // | XMM2            |    float arg 3
+  // | XMM1            |    float arg 2
+  // | XMM0            |    float arg 1
+  // | Padding         |
+  // | RDI/Method*     |  <- sp
+  static constexpr bool kSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr size_t kNumGprArgs = 5;  // 5 arguments passed in GPRs.
+  static constexpr size_t kNumFprArgs = 8;  // 8 arguments passed in FPRs.
+  static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 176;  // Frame size.
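+  // gpr_index 0..4 map to the RSI, RDX, RCX, R8 and R9 spill slots above;
+  // the callee saves interleaved between them are skipped.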
+  static size_t GprIndexToGprOffset(uint32_t gpr_index) {
+    switch (gpr_index) {
+      case 0: return (4 * kBytesPerGprSpillLocation);
+      case 1: return (1 * kBytesPerGprSpillLocation);
+      case 2: return (0 * kBytesPerGprSpillLocation);
+      case 3: return (5 * kBytesPerGprSpillLocation);
+      case 4: return (6 * kBytesPerGprSpillLocation);
+      default:
+        LOG(FATAL) << "Unexpected GPR index: " << gpr_index;
+        return 0;
+    }
+  }
 #else
 #error "Unsupported architecture"
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET 0
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET 0
-#define QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE 0
-#define QUICK_STACK_ARG_SKIP 0
 #endif
 
-  static mirror::ArtMethod* GetCallingMethod(mirror::ArtMethod** sp) {
-    byte* previous_sp = reinterpret_cast<byte*>(sp) +
-        QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE;
+ public:
+  static mirror::ArtMethod* GetCallingMethod(mirror::ArtMethod** sp)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    DCHECK((*sp)->IsCalleeSaveMethod());
+    byte* previous_sp = reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize;
     return *reinterpret_cast<mirror::ArtMethod**>(previous_sp);
   }
 
-  static uintptr_t GetCallingPc(mirror::ArtMethod** sp) {
-    byte* lr = reinterpret_cast<byte*>(sp) + QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__LR_OFFSET;
+  // For the given quick ref and args quick frame, return the caller's PC.
+  static uintptr_t GetCallingPc(mirror::ArtMethod** sp)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    DCHECK((*sp)->IsCalleeSaveMethod());
+    byte* lr = reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_LrOffset;
     return *reinterpret_cast<uintptr_t*>(lr);
   }
 
   QuickArgumentVisitor(mirror::ArtMethod** sp, bool is_static,
                        const char* shorty, uint32_t shorty_len)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) :
-    is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len),
-    args_in_regs_(ComputeArgsInRegs(is_static, shorty, shorty_len)),
-    num_params_((is_static ? 0 : 1) + shorty_len - 1),  // +1 for this, -1 for return type
-    reg_args_(reinterpret_cast<byte*>(sp) + QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__R1_OFFSET),
-    stack_args_(reinterpret_cast<byte*>(sp) + QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE
-                + QUICK_STACK_ARG_SKIP),
-    cur_args_(reg_args_),
-    cur_arg_index_(0),
-    param_index_(0),
-    is_split_long_or_double_(false) {
-    DCHECK_EQ(static_cast<size_t>(QUICK_CALLEE_SAVE_FRAME__REF_AND_ARGS__FRAME_SIZE),
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) :
+      is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len),
+      gpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset),
+      fpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset),
+      stack_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize
+                  + StackArgumentStartFromShorty(is_static, shorty, shorty_len)),
+      gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid),
+      is_split_long_or_double_(false) {
+    DCHECK_EQ(kQuickCalleeSaveFrame_RefAndArgs_FrameSize,
               Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes());
   }
 
@@ -143,30 +208,38 @@
 
   virtual void Visit() = 0;
 
-  Primitive::Type GetParamPrimitiveType() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    size_t index = param_index_;
-    if (is_static_) {
-      index++;  // 0th argument must skip return value at start of the shorty
-    } else if (index == 0) {
-      return Primitive::kPrimNot;
-    }
-    CHECK_LT(index, shorty_len_);
-    return Primitive::GetType(shorty_[index]);
+  Primitive::Type GetParamPrimitiveType() const {
+    return cur_type_;
   }
 
   byte* GetParamAddress() const {
-    return cur_args_ + (cur_arg_index_ * kPointerSize);
+    if (!kSoftFloatAbi) {
+      Primitive::Type type = GetParamPrimitiveType();
+      if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) {
+        if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+          return fpr_args_ + (fpr_index_ * kBytesPerFprSpillLocation);
+        }
+      }
+    }
+    if (gpr_index_ < kNumGprArgs) {
+      return gpr_args_ + GprIndexToGprOffset(gpr_index_);
+    }
+    return stack_args_ + (stack_index_ * kBytesStackArgLocation);
   }
 
   bool IsSplitLongOrDouble() const {
-    return is_split_long_or_double_;
+    if ((kBytesPerGprSpillLocation == 4) || (kBytesPerFprSpillLocation == 4)) {
+      return is_split_long_or_double_;
+    } else {
+      return false;  // An optimization for when GPR and FPRs are 64bit.
+    }
   }
 
-  bool IsParamAReference() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  bool IsParamAReference() const {
     return GetParamPrimitiveType() == Primitive::kPrimNot;
   }
 
-  bool IsParamALongOrDouble() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  bool IsParamALongOrDouble() const {
     Primitive::Type type = GetParamPrimitiveType();
     return type == Primitive::kPrimLong || type == Primitive::kPrimDouble;
   }
@@ -179,51 +252,179 @@
   }
 
   void VisitArguments() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    for (cur_arg_index_ = 0;  cur_arg_index_ < args_in_regs_ && param_index_ < num_params_; ) {
-      is_split_long_or_double_ = (cur_arg_index_ == 2) && IsParamALongOrDouble();
+    gpr_index_ = 0;
+    fpr_index_ = 0;
+    stack_index_ = 0;
+    if (!is_static_) {  // Handle this.
+      cur_type_ = Primitive::kPrimNot;
+      is_split_long_or_double_ = false;
       Visit();
-      cur_arg_index_ += (IsParamALongOrDouble() ? 2 : 1);
-      param_index_++;
+      if (kNumGprArgs > 0) {
+        gpr_index_++;
+      } else {
+        stack_index_++;
+      }
     }
-    cur_args_ = stack_args_;
-    cur_arg_index_ = is_split_long_or_double_ ? 1 : 0;
-    is_split_long_or_double_ = false;
-    while (param_index_ < num_params_) {
-      Visit();
-      cur_arg_index_ += (IsParamALongOrDouble() ? 2 : 1);
-      param_index_++;
+    for (uint32_t shorty_index = 1; shorty_index < shorty_len_; ++shorty_index) {
+      cur_type_ = Primitive::GetType(shorty_[shorty_index]);
+      switch (cur_type_) {
+        case Primitive::kPrimNot:
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+        case Primitive::kPrimInt:
+          is_split_long_or_double_ = false;
+          Visit();
+          if (gpr_index_ < kNumGprArgs) {
+            gpr_index_++;
+          } else {
+            stack_index_++;
+          }
+          break;
+        case Primitive::kPrimFloat:
+          is_split_long_or_double_ = false;
+          Visit();
+          if (kSoftFloatAbi) {
+            if (gpr_index_ < kNumGprArgs) {
+              gpr_index_++;
+            } else {
+              stack_index_++;
+            }
+          } else {
+            if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+              fpr_index_++;
+            } else {
+              stack_index_++;
+            }
+          }
+          break;
+        case Primitive::kPrimDouble:
+        case Primitive::kPrimLong:
+          if (kSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
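+            // A wide value is split when its low half lands in the last 4-byte
+            // GPR slot, leaving the high half on the stack.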
+            is_split_long_or_double_ = (kBytesPerGprSpillLocation == 4) &&
+                ((gpr_index_ + 1) == kNumGprArgs);
+            Visit();
+            if (gpr_index_ < kNumGprArgs) {
+              gpr_index_++;
+              if (kBytesPerGprSpillLocation == 4) {
+                if (gpr_index_ < kNumGprArgs) {
+                  gpr_index_++;
+                } else {
+                  stack_index_++;
+                }
+              }
+            } else {
+              if (kBytesStackArgLocation == 4) {
+                stack_index_ += 2;
+              } else {
+                CHECK_EQ(kBytesStackArgLocation, 8U);
+                stack_index_++;
+              }
+            }
+          } else {
+            is_split_long_or_double_ = (kBytesPerFprSpillLocation == 4) &&
+                ((fpr_index_ + 1) == kNumFprArgs);
+            Visit();
+            if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+              fpr_index_++;
+              if (kBytesPerFprSpillLocation == 4) {
+                if ((kNumFprArgs != 0) && (fpr_index_ + 1 < kNumFprArgs + 1)) {
+                  fpr_index_++;
+                } else {
+                  stack_index_++;
+                }
+              }
+            } else {
+              if (kBytesStackArgLocation == 4) {
+                stack_index_ += 2;
+              } else {
+                CHECK_EQ(kBytesStackArgLocation, 8U);
+                stack_index_++;
+              }
+            }
+          }
+          break;
+        default:
+          LOG(FATAL) << "Unexpected type: " << cur_type_ << " in " << shorty_;
+      }
     }
   }
 
  private:
-  static size_t ComputeArgsInRegs(bool is_static, const char* shorty, uint32_t shorty_len)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    size_t args_in_regs = (is_static ? 0 : 1);
-    for (size_t i = 0; i < shorty_len; i++) {
-      char s = shorty[i];
-      if (s == 'J' || s == 'D') {
-        args_in_regs += 2;
-      } else {
-        args_in_regs++;
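+  // Computes the offset at which stack-passed arguments start, derived from the shorty.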
+  static size_t StackArgumentStartFromShorty(bool is_static, const char* shorty,
+                                             uint32_t shorty_len) {
+    if (kSoftFloatAbi) {
+      CHECK_EQ(kNumFprArgs, 0U);
+      return (kNumGprArgs * kBytesPerGprSpillLocation) + kBytesPerGprSpillLocation /* ArtMethod* */;
+    } else {
+      size_t offset = kBytesPerGprSpillLocation;  // Skip Method*.
+      size_t gprs_seen = 0;
+      size_t fprs_seen = 0;
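+      // The implicit 'this' of a non-static method consumes the first GPR.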
+      if (!is_static && (gprs_seen < kNumGprArgs)) {
+        gprs_seen++;
+        offset += kBytesStackArgLocation;
       }
-      if (args_in_regs > 3) {
-        args_in_regs = 3;
-        break;
+      for (uint32_t i = 1; i < shorty_len; ++i) {
+        switch (shorty[i]) {
+          case 'Z':
+          case 'B':
+          case 'C':
+          case 'S':
+          case 'I':
+          case 'L':
+            if (gprs_seen < kNumGprArgs) {
+              gprs_seen++;
+              offset += kBytesStackArgLocation;
+            }
+            break;
+          case 'J':
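+            // A long consumes a second GPR when GPR spill slots are only 4 bytes wide.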
+            if (gprs_seen < kNumGprArgs) {
+              gprs_seen++;
+              offset += 2 * kBytesStackArgLocation;
+              if (kBytesPerGprSpillLocation == 4) {
+                if (gprs_seen < kNumGprArgs) {
+                  gprs_seen++;
+                }
+              }
+            }
+            break;
+          case 'F':
+            if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+              fprs_seen++;
+              offset += kBytesStackArgLocation;
+            }
+            break;
+          case 'D':
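+            // Doubles mirror longs on the FPR side: a second FPR when FPR spill slots are 4 bytes.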
+            if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+              fprs_seen++;
+              offset += 2 * kBytesStackArgLocation;
+              if (kBytesPerFprSpillLocation == 4) {
+                if ((kNumFprArgs != 0) && (fprs_seen + 1 < kNumFprArgs + 1)) {
+                  fprs_seen++;
+                }
+              }
+            }
+            break;
+          default:
+            LOG(FATAL) << "Unexpected shorty character: " << shorty[i] << " in " << shorty;
+        }
       }
+      return offset;
     }
-    return args_in_regs;
   }
 
   const bool is_static_;
   const char* const shorty_;
   const uint32_t shorty_len_;
-  const size_t args_in_regs_;
-  const size_t num_params_;
-  byte* const reg_args_;
-  byte* const stack_args_;
-  byte* cur_args_;
-  size_t cur_arg_index_;
-  size_t param_index_;
+  byte* const gpr_args_;  // Address of GPR arguments in callee save frame.
+  byte* const fpr_args_;  // Address of FPR arguments in callee save frame.
+  byte* const stack_args_;  // Address of stack arguments in caller's frame.
+  uint32_t gpr_index_;  // Index into spilled GPRs.
+  uint32_t fpr_index_;  // Index into spilled FPRs.
+  uint32_t stack_index_;  // Index into arguments on the stack.
+  // The current type of argument during VisitArguments.
+  Primitive::Type cur_type_;
   // Does a 64bit parameter straddle the register and stack arguments?
   bool is_split_long_or_double_;
 };
@@ -231,9 +432,8 @@
 // Visits arguments on the stack placing them into the shadow frame.
 class BuildQuickShadowFrameVisitor : public QuickArgumentVisitor {
  public:
-  BuildQuickShadowFrameVisitor(mirror::ArtMethod** sp,
-      bool is_static, const char* shorty,
-       uint32_t shorty_len, ShadowFrame& sf, size_t first_arg_reg) :
+  BuildQuickShadowFrameVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
+                               uint32_t shorty_len, ShadowFrame* sf, size_t first_arg_reg) :
     QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {}
 
   virtual void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -242,14 +442,14 @@
       case Primitive::kPrimLong:  // Fall-through.
       case Primitive::kPrimDouble:
         if (IsSplitLongOrDouble()) {
-          sf_.SetVRegLong(cur_reg_, ReadSplitLongParam());
+          sf_->SetVRegLong(cur_reg_, ReadSplitLongParam());
         } else {
-          sf_.SetVRegLong(cur_reg_, *reinterpret_cast<jlong*>(GetParamAddress()));
+          sf_->SetVRegLong(cur_reg_, *reinterpret_cast<jlong*>(GetParamAddress()));
         }
         ++cur_reg_;
         break;
       case Primitive::kPrimNot:
-        sf_.SetVRegReference(cur_reg_, *reinterpret_cast<mirror::Object**>(GetParamAddress()));
+        sf_->SetVRegReference(cur_reg_, *reinterpret_cast<mirror::Object**>(GetParamAddress()));
         break;
       case Primitive::kPrimBoolean:  // Fall-through.
       case Primitive::kPrimByte:     // Fall-through.
@@ -257,7 +457,7 @@
       case Primitive::kPrimShort:    // Fall-through.
       case Primitive::kPrimInt:      // Fall-through.
       case Primitive::kPrimFloat:
-        sf_.SetVReg(cur_reg_, *reinterpret_cast<jint*>(GetParamAddress()));
+        sf_->SetVReg(cur_reg_, *reinterpret_cast<jint*>(GetParamAddress()));
         break;
       case Primitive::kPrimVoid:
         LOG(FATAL) << "UNREACHABLE";
@@ -267,8 +467,8 @@
   }
 
  private:
-  ShadowFrame& sf_;
-  size_t cur_reg_;
+  ShadowFrame* const sf_;
+  uint32_t cur_reg_;
 
   DISALLOW_COPY_AND_ASSIGN(BuildQuickShadowFrameVisitor);
 };
@@ -293,8 +493,8 @@
                                                   method, 0, memory));
     size_t first_arg_reg = code_item->registers_size_ - code_item->ins_size_;
     BuildQuickShadowFrameVisitor shadow_frame_builder(sp, mh.IsStatic(), mh.GetShorty(),
-                                                 mh.GetShortyLength(),
-                                                 *shadow_frame, first_arg_reg);
+                                                      mh.GetShortyLength(),
+                                                      shadow_frame, first_arg_reg);
     shadow_frame_builder.VisitArguments();
     // Push a transition back into managed code onto the linked list in thread.
     ManagedStack fragment;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 006c271..7b9d675 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -179,11 +179,11 @@
   TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->ProcessReferences(timings_, clear_soft_references_, &IsMarkedCallback,
-                               &RecursiveMarkObjectCallback, this);
+                               &MarkObjectCallback, &ProcessMarkStackPausedCallback, this);
 }
 
 bool MarkSweep::HandleDirtyObjectsPhase() {
-  TimingLogger::ScopedSplit split("HandleDirtyObjectsPhase", &timings_);
+  TimingLogger::ScopedSplit split("(Paused)HandleDirtyObjectsPhase", &timings_);
   Thread* self = Thread::Current();
   Locks::mutator_lock_->AssertExclusiveHeld(self);
 
@@ -400,10 +400,9 @@
   }
 }
 
-mirror::Object* MarkSweep::RecursiveMarkObjectCallback(mirror::Object* obj, void* arg) {
+mirror::Object* MarkSweep::MarkObjectCallback(mirror::Object* obj, void* arg) {
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
   mark_sweep->MarkObject(obj);
-  mark_sweep->ProcessMarkStack(true);
   return obj;
 }
 
@@ -546,13 +545,6 @@
   reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNull(*root);
 }
 
-mirror::Object* MarkSweep::MarkObjectCallback(mirror::Object* object, void* arg) {
-  DCHECK(object != nullptr);
-  DCHECK(arg != nullptr);
-  reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNull(object);
-  return object;
-}
-
 void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
                                    const StackVisitor* visitor) {
   reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor);
@@ -957,7 +949,7 @@
 }
 
 void MarkSweep::ReMarkRoots() {
-  timings_.StartSplit("ReMarkRoots");
+  timings_.StartSplit("(Paused)ReMarkRoots");
   Runtime::Current()->VisitRoots(MarkRootCallback, this, true, true);
   timings_.EndSplit();
 }
@@ -1208,6 +1200,11 @@
   ScanObjectVisit(obj, visitor);
 }
 
+void MarkSweep::ProcessMarkStackPausedCallback(void* arg) {
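+  // Recursively marks everything reachable from the mark stack; callers must have suspended the mutators.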
+  DCHECK(arg != nullptr);
+  reinterpret_cast<MarkSweep*>(arg)->ProcessMarkStack(true);
+}
+
 void MarkSweep::ProcessMarkStackParallel(size_t thread_count) {
   Thread* self = Thread::Current();
   ThreadPool* thread_pool = GetHeap()->GetThreadPool();
@@ -1231,7 +1228,7 @@
 
 // Scan anything that's on the mark stack.
 void MarkSweep::ProcessMarkStack(bool paused) {
-  timings_.StartSplit("ProcessMarkStack");
+  timings_.StartSplit(paused ? "(Paused)ProcessMarkStack" : "ProcessMarkStack");
   size_t thread_count = GetThreadCount(paused);
   if (kParallelProcessMarkStack && thread_count > 1 &&
       mark_stack_->Size() >= kMinimumParallelMarkStackSize) {
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 6a48cf7..963b9ea 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -176,7 +176,7 @@
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
                             Locks::mutator_lock_);
 
-  static mirror::Object* RecursiveMarkObjectCallback(mirror::Object* obj, void* arg)
+  static mirror::Object* MarkObjectCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -185,9 +185,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static mirror::Object* MarkObjectCallback(mirror::Object* object, void* arg)
-        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-        EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  static void ProcessMarkStackPausedCallback(void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   static void MarkRootParallelCallback(mirror::Object** root, void* arg, uint32_t thread_id,
                                        RootType root_type)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index d64ec61..882867b 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -163,7 +163,7 @@
   TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->ProcessReferences(timings_, clear_soft_references_, &MarkedForwardingAddressCallback,
-                               &RecursiveMarkObjectCallback, this);
+                               &MarkObjectCallback, &ProcessMarkStackCallback, this);
 }
 
 void SemiSpace::MarkingPhase() {
@@ -310,7 +310,7 @@
   }
 
   // Recursively process the mark stack.
-  ProcessMarkStack(true);
+  ProcessMarkStack();
 }
 
 void SemiSpace::ReclaimPhase() {
@@ -571,13 +571,15 @@
   return forward_address;
 }
 
-mirror::Object* SemiSpace::RecursiveMarkObjectCallback(mirror::Object* root, void* arg) {
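+// Drains the mark stack; supplied to Heap::ProcessReferences so that marking can recurse between phases.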
+void SemiSpace::ProcessMarkStackCallback(void* arg) {
+  DCHECK(arg != nullptr);
+  reinterpret_cast<SemiSpace*>(arg)->ProcessMarkStack();
+}
+
+mirror::Object* SemiSpace::MarkObjectCallback(mirror::Object* root, void* arg) {
   DCHECK(root != nullptr);
   DCHECK(arg != nullptr);
-  SemiSpace* semi_space = reinterpret_cast<SemiSpace*>(arg);
-  mirror::Object* ret = semi_space->MarkObject(root);
-  semi_space->ProcessMarkStack(true);
-  return ret;
+  return reinterpret_cast<SemiSpace*>(arg)->MarkObject(root);
 }
 
 void SemiSpace::MarkRootCallback(Object** root, void* arg, uint32_t /*thread_id*/,
@@ -587,12 +589,6 @@
   *root = reinterpret_cast<SemiSpace*>(arg)->MarkObject(*root);
 }
 
-Object* SemiSpace::MarkObjectCallback(Object* object, void* arg) {
-  DCHECK(object != nullptr);
-  DCHECK(arg != nullptr);
-  return reinterpret_cast<SemiSpace*>(arg)->MarkObject(object);
-}
-
 // Marks all objects in the root set.
 void SemiSpace::MarkRoots() {
   timings_.StartSplit("MarkRoots");
@@ -680,7 +676,7 @@
 }
 
 // Scan anything that's on the mark stack.
-void SemiSpace::ProcessMarkStack(bool paused) {
+void SemiSpace::ProcessMarkStack() {
   space::MallocSpace* promo_dest_space = NULL;
   accounting::SpaceBitmap* live_bitmap = NULL;
   if (generational_ && !whole_heap_collection_) {
@@ -694,7 +690,7 @@
     DCHECK(mark_bitmap != nullptr);
     DCHECK_EQ(live_bitmap, mark_bitmap);
   }
-  timings_.StartSplit(paused ? "(paused)ProcessMarkStack" : "ProcessMarkStack");
+  timings_.StartSplit("ProcessMarkStack");
   while (!mark_stack_->IsEmpty()) {
     Object* obj = mark_stack_->PopBack();
     if (generational_ && !whole_heap_collection_ && promo_dest_space->HasAddress(obj)) {
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 89fe326..ba97376 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -146,12 +146,12 @@
                                RootType /*root_type*/)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  static mirror::Object* MarkObjectCallback(mirror::Object* objecgt, void* arg)
-        EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static mirror::Object* RecursiveMarkObjectCallback(mirror::Object* root, void* arg)
+  static mirror::Object* MarkObjectCallback(mirror::Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
+  static void ProcessMarkStackCallback(void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
   virtual mirror::Object* MarkNonForwardedObject(mirror::Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
@@ -174,10 +174,6 @@
   // Returns true if we should sweep the space.
   virtual bool ShouldSweepSpace(space::ContinuousSpace* space) const;
 
-  // Returns how many threads we should use for the current GC phase based on if we are paused,
-  // whether or not we care about pauses.
-  size_t GetThreadCount(bool paused) const;
-
   // Returns true if an object is inside of the immune region (assumed to be marked).
   bool IsImmune(const mirror::Object* obj) const ALWAYS_INLINE {
     return obj >= immune_begin_ && obj < immune_end_;
@@ -237,7 +233,7 @@
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   // Recursively blackens objects on the mark stack.
-  void ProcessMarkStack(bool paused)
+  void ProcessMarkStack()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
   void EnqueueFinalizerReferences(mirror::Object** ref)
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 9e5e3ab..8c89cdc 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -610,41 +610,44 @@
 
 struct SoftReferenceArgs {
   IsMarkedCallback* is_marked_callback_;
-  MarkObjectCallback* recursive_mark_callback_;
+  MarkObjectCallback* mark_callback_;
   void* arg_;
 };
 
 mirror::Object* Heap::PreserveSoftReferenceCallback(mirror::Object* obj, void* arg) {
   SoftReferenceArgs* args = reinterpret_cast<SoftReferenceArgs*>(arg);
   // TODO: Not preserve all soft references.
-  return args->recursive_mark_callback_(obj, args->arg_);
+  return args->mark_callback_(obj, args->arg_);
 }
 
 // Process reference class instances and schedule finalizations.
 void Heap::ProcessReferences(TimingLogger& timings, bool clear_soft,
                              IsMarkedCallback* is_marked_callback,
-                             MarkObjectCallback* recursive_mark_object_callback, void* arg) {
+                             MarkObjectCallback* mark_object_callback,
+                             ProcessMarkStackCallback* process_mark_stack_callback, void* arg) {
   // Unless we are in the zygote or required to clear soft references with white references,
   // preserve some white referents.
   if (!clear_soft && !Runtime::Current()->IsZygote()) {
     SoftReferenceArgs soft_reference_args;
     soft_reference_args.is_marked_callback_ = is_marked_callback;
-    soft_reference_args.recursive_mark_callback_ = recursive_mark_object_callback;
+    soft_reference_args.mark_callback_ = mark_object_callback;
     soft_reference_args.arg_ = arg;
     soft_reference_queue_.PreserveSomeSoftReferences(&PreserveSoftReferenceCallback,
                                                      &soft_reference_args);
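+    // Recursively mark the referents preserved above before any white references are cleared.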
+    process_mark_stack_callback(arg);
   }
-  timings.StartSplit("ProcessReferences");
+  timings.StartSplit("(Paused)ProcessReferences");
   // Clear all remaining soft and weak references with white referents.
   soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
   weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
   timings.EndSplit();
   // Preserve all white objects with finalize methods and schedule them for finalization.
-  timings.StartSplit("EnqueueFinalizerReferences");
+  timings.StartSplit("(Paused)EnqueueFinalizerReferences");
   finalizer_reference_queue_.EnqueueFinalizerReferences(cleared_references_, is_marked_callback,
-                                                        recursive_mark_object_callback, arg);
+                                                        mark_object_callback, arg);
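+  // Recursively mark objects that became reachable via the enqueued finalizer references.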
+  process_mark_stack_callback(arg);
   timings.EndSplit();
-  timings.StartSplit("ProcessReferences");
+  timings.StartSplit("(Paused)ProcessReferences");
   // Clear all f-reachable soft and weak references with white referents.
   soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
   weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 80a5a1a..21a2365 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -339,7 +339,9 @@
   static mirror::Object* PreserveSoftReferenceCallback(mirror::Object* obj, void* arg);
   void ProcessReferences(TimingLogger& timings, bool clear_soft,
                          IsMarkedCallback* is_marked_callback,
-                         MarkObjectCallback* recursive_mark_object_callback, void* arg)
+                         MarkObjectCallback* mark_object_callback,
+                         ProcessMarkStackCallback* process_mark_stack_callback,
+                         void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -474,6 +476,9 @@
   void FlushAllocStack()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  // Revoke all the thread-local allocation stacks.
+  void RevokeAllThreadLocalAllocationStacks(Thread* self);
+
   // Mark all the objects in the allocation stack in the specified bitmap.
   void MarkAllocStack(accounting::SpaceBitmap* bitmap1, accounting::SpaceBitmap* bitmap2,
                       accounting::ObjectSet* large_objects, accounting::ObjectStack* stack)
@@ -670,9 +675,6 @@
   // Swap the allocation stack with the live stack.
   void SwapStacks(Thread* self);
 
-  // Revoke all the thread-local allocation stacks.
-  void RevokeAllThreadLocalAllocationStacks(Thread* self);
-
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger& timings);
 
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 59ffdc1..9d05169 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -145,12 +145,10 @@
 static void InstrumentationInstallStack(Thread* thread, void* arg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   struct InstallStackVisitor : public StackVisitor {
-    InstallStackVisitor(Thread* thread, Context* context, uintptr_t instrumentation_exit_pc,
-                        bool is_deoptimization_enabled)
+    InstallStackVisitor(Thread* thread, Context* context, uintptr_t instrumentation_exit_pc)
         : StackVisitor(thread, context),  instrumentation_stack_(thread->GetInstrumentationStack()),
           existing_instrumentation_frames_count_(instrumentation_stack_->size()),
           instrumentation_exit_pc_(instrumentation_exit_pc),
-          is_deoptimization_enabled_(is_deoptimization_enabled),
           reached_existing_instrumentation_frames_(false), instrumentation_stack_depth_(0),
           last_return_pc_(0) {
     }
@@ -218,7 +216,6 @@
     const size_t existing_instrumentation_frames_count_;
     std::vector<uint32_t> dex_pcs_;
     const uintptr_t instrumentation_exit_pc_;
-    const bool is_deoptimization_enabled_;
     bool reached_existing_instrumentation_frames_;
     size_t instrumentation_stack_depth_;
     uintptr_t last_return_pc_;
@@ -232,12 +229,11 @@
   Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
   UniquePtr<Context> context(Context::Create());
   uintptr_t instrumentation_exit_pc = GetQuickInstrumentationExitPc();
-  InstallStackVisitor visitor(thread, context.get(), instrumentation_exit_pc,
-                              instrumentation->IsDeoptimizationEnabled());
+  InstallStackVisitor visitor(thread, context.get(), instrumentation_exit_pc);
   visitor.WalkStack(true);
   CHECK_EQ(visitor.dex_pcs_.size(), thread->GetInstrumentationStack()->size());
 
-  if (!instrumentation->IsDeoptimizationEnabled()) {
+  if (!instrumentation->ShouldNotifyMethodEnterExitEvents()) {
     // Create method enter events for all methods currently on the thread's stack. We only do this
     // if no debugger is attached to prevent from posting events twice.
     typedef std::deque<InstrumentationStackFrame>::const_reverse_iterator It;
@@ -295,7 +291,7 @@
             CHECK(m == instrumentation_frame.method_) << PrettyMethod(m);
           }
           SetReturnPc(instrumentation_frame.return_pc_);
-          if (!instrumentation_->IsDeoptimizationEnabled()) {
+          if (!instrumentation_->ShouldNotifyMethodEnterExitEvents()) {
             // Create the method exit events. As the methods didn't really exit the result is 0.
             // We only do this if no debugger is attached to prevent from posting events twice.
             instrumentation_->MethodExitEvent(thread_, instrumentation_frame.this_object_, m,
@@ -586,9 +582,12 @@
 
 void Instrumentation::EnableDeoptimization() {
   CHECK(deoptimized_methods_.empty());
+  CHECK(!deoptimization_enabled_);
+  deoptimization_enabled_ = true;
 }
 
 void Instrumentation::DisableDeoptimization() {
+  CHECK(deoptimization_enabled_);
   // If we deoptimized everything, undo it.
   if (interpreter_stubs_installed_) {
     UndeoptimizeEverything();
@@ -599,10 +598,12 @@
     Undeoptimize(*it_begin);
   }
   CHECK(deoptimized_methods_.empty());
+  deoptimization_enabled_ = false;
 }
 
-bool Instrumentation::IsDeoptimizationEnabled() const {
-  return interpreter_stubs_installed_ || !deoptimized_methods_.empty();
+// Indicates whether instrumentation should notify the listeners of method enter/exit events.
+bool Instrumentation::ShouldNotifyMethodEnterExitEvents() const {
+  return deoptimization_enabled_ || interpreter_stubs_installed_;
 }
 
 void Instrumentation::DeoptimizeEverything() {
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index f01add1..1ce72bd 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -105,6 +105,7 @@
       have_method_entry_listeners_(false), have_method_exit_listeners_(false),
       have_method_unwind_listeners_(false), have_dex_pc_listeners_(false),
       have_exception_caught_listeners_(false),
+      deoptimization_enabled_(false),
       interpreter_handler_table_(kMainHandlerTable),
       quick_alloc_entry_points_instrumentation_counter_(0) {}
 
@@ -124,7 +125,7 @@
   // Deoptimization.
   void EnableDeoptimization() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   void DisableDeoptimization() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
-  bool IsDeoptimizationEnabled() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool ShouldNotifyMethodEnterExitEvents() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Executes everything with interpreter.
   void DeoptimizeEverything()
@@ -345,6 +346,7 @@
   // only.
   // TODO we need to visit these methods as roots.
   std::set<mirror::ArtMethod*> deoptimized_methods_;
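+  // Set while deoptimization support is enabled, i.e. between EnableDeoptimization() and
+  // DisableDeoptimization().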
+  bool deoptimization_enabled_;
 
   // Current interpreter handler table. This is updated each time the thread state flags are
   // modified.
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 362df8c..a0665b5 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2509,8 +2509,7 @@
       JniAbortF("NewDirectByteBuffer", "non-zero capacity for nullptr pointer: %" PRId64, capacity);
     }
 
-    // At the moment, the Java side is limited to 32 bits.
-    CHECK_LE(reinterpret_cast<uintptr_t>(address), 0xffffffff);
+    // At the moment, the capacity is limited to 32 bits.
     CHECK_LE(capacity, 0xffffffff);
     jlong address_arg = reinterpret_cast<jlong>(address);
     jint capacity_arg = static_cast<jint>(capacity);
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 4c58c84..2dd7d96 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -127,7 +127,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "nop", "()V");
 
-    ArgArray arg_array(NULL, 0);
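+    // Shorty "V": void return, no arguments.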
+    ArgArray arg_array("V", 1);
     JValue result;
 
     if (!is_static) {
@@ -143,7 +143,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "identity", "(B)B");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("BB", 2);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -179,7 +179,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "identity", "(I)I");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("II", 2);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -215,7 +215,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "identity", "(D)D");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("DD", 2);
     uint32_t* args = arg_array.GetArray();
     JValue value;
     JValue result;
@@ -259,7 +259,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(II)I");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("III", 3);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -305,7 +305,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(III)I");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("IIII", 4);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -361,7 +361,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(IIII)I");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("IIIII", 5);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -422,7 +422,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(IIIII)I");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("IIIIII", 6);
     uint32_t* args = arg_array.GetArray();
     JValue result;
 
@@ -488,7 +488,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(DD)D");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("DDD", 3);
     uint32_t* args = arg_array.GetArray();
     JValue value;
     JValue value2;
@@ -559,7 +559,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(DDD)D");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("DDDD", 4);
     uint32_t* args = arg_array.GetArray();
     JValue value;
     JValue value2;
@@ -617,7 +617,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(DDDD)D");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("DDDDD", 5);
     uint32_t* args = arg_array.GetArray();
     JValue value;
     JValue value2;
@@ -684,7 +684,7 @@
     mirror::Object* receiver;
     JniInternalTestMakeExecutable(&method, &receiver, is_static, "sum", "(DDDDD)D");
 
-    ArgArray arg_array(NULL, 0);
+    ArgArray arg_array("DDDDDD", 6);
     uint32_t* args = arg_array.GetArray();
     JValue value;
     JValue value2;
@@ -1784,7 +1784,7 @@
   mirror::ArtMethod* method = klass->FindDirectMethod("main", "([Ljava/lang/String;)V");
   ASSERT_TRUE(method != NULL);
 
-  ArgArray arg_array(NULL, 0);
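+  // Shorty "VL": void return, one reference argument.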
+  ArgArray arg_array("VL", 2);
   arg_array.Append(0U);
   JValue result;
 
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index d5f7597..20d2b18 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -37,6 +37,10 @@
 extern "C" void art_portable_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*, char);
 extern "C" void art_quick_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
                                       const char*);
+#ifdef __x86_64__
+extern "C" void art_quick_invoke_static_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
+                                             const char*);
+#endif
 
 // TODO: get global references for these
 Class* ArtMethod::java_lang_reflect_ArtMethod_ = NULL;
@@ -276,7 +280,15 @@
                                                   : GetEntryPointFromPortableCompiledCode());
       }
       if (!IsPortableCompiled()) {
+#ifdef __x86_64__
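+        // x86-64 uses separate quick invoke stubs for static and non-static methods.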
+        if (!IsStatic()) {
+          (*art_quick_invoke_stub)(this, args, args_size, self, result, shorty);
+        } else {
+          (*art_quick_invoke_static_stub)(this, args, args_size, self, result, shorty);
+        }
+#else
         (*art_quick_invoke_stub)(this, args, args_size, self, result, shorty);
+#endif
       } else {
         (*art_portable_invoke_stub)(this, args, args_size, self, result, shorty[0]);
       }
diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h
index 71cc7af..86f5348 100644
--- a/runtime/mirror/art_method.h
+++ b/runtime/mirror/art_method.h
@@ -313,9 +313,9 @@
   void SetOatNativeGcMapOffset(uint32_t gc_map_offset);
   uint32_t GetOatNativeGcMapOffset();
 
-  size_t GetFrameSizeInBytes() {
-    DCHECK_EQ(sizeof(size_t), sizeof(uint32_t));
-    size_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(ArtMethod, quick_frame_size_in_bytes_), false);
+  uint32_t GetFrameSizeInBytes() {
+    uint32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(ArtMethod, quick_frame_size_in_bytes_),
+                                 false);
     DCHECK_LE(static_cast<size_t>(kStackAlignment), result);
     return result;
   }
diff --git a/runtime/object_callbacks.h b/runtime/object_callbacks.h
index 6af338b..468ba08 100644
--- a/runtime/object_callbacks.h
+++ b/runtime/object_callbacks.h
@@ -60,6 +60,7 @@
 // address the object (if the object didn't move, returns the object input parameter).
 typedef mirror::Object* (IsMarkedCallback)(mirror::Object* object, void* arg)
     __attribute__((warn_unused_result));
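+// A callback used to recursively drain the GC mark stack.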
+typedef void (ProcessMarkStackCallback)(void* arg);
 
 }  // namespace art
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index e66e5af..3ccea36 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1486,12 +1486,18 @@
         (1 << art::x86_64::RSI) | (1 << art::x86_64::RDX) | (1 << art::x86_64::RCX) |
         (1 << art::x86_64::R8) | (1 << art::x86_64::R9);
     uint32_t core_spills = ref_spills | (type == kRefsAndArgs ? arg_spills : 0) |
-                         (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
+                         (1 << art::x86_64::kNumberOfCpuRegisters);  // fake return address callee save
+    uint32_t fp_arg_spills =
+        (1 << art::x86_64::XMM0) | (1 << art::x86_64::XMM1) | (1 << art::x86_64::XMM2) |
+        (1 << art::x86_64::XMM3) | (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) |
+        (1 << art::x86_64::XMM6) | (1 << art::x86_64::XMM7);
+    uint32_t fp_spills = (type == kRefsAndArgs ? fp_arg_spills : 0);
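+    // The frame holds the spilled GPRs, the spilled FP argument registers and the Method*.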
     size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
+                                 __builtin_popcount(fp_spills) /* fprs */ +
                                  1 /* Method* */) * kPointerSize, kStackAlignment);
     method->SetFrameSizeInBytes(frame_size);
     method->SetCoreSpillMask(core_spills);
-    method->SetFpSpillMask(0);
+    method->SetFpSpillMask(fp_spills);
   } else {
     UNIMPLEMENTED(FATAL) << instruction_set;
   }
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 159de2e..223b8d5 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -429,6 +429,10 @@
     return callee_save_methods_[type];
   }
 
+  static size_t GetCalleeSaveMethodOffset(CalleeSaveType type) {
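+    // Byte offset of callee_save_methods_[type] within the Runtime object.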
+    return OFFSETOF_MEMBER(Runtime, callee_save_methods_[type]);
+  }
+
   void SetCalleeSaveMethod(mirror::ArtMethod* method, CalleeSaveType type);
 
   mirror::ArtMethod* CreateCalleeSaveMethod(InstructionSet instruction_set,
diff --git a/test/030-bad-finalizer/expected.txt b/test/030-bad-finalizer/expected.txt
index 88b1896..ee9cfff 100644
--- a/test/030-bad-finalizer/expected.txt
+++ b/test/030-bad-finalizer/expected.txt
@@ -1,7 +1,4 @@
-Constructed object.
-Nulled. Requestion gc.
+About to null reference and request GC.
 Finalizer started and spinning...
 Finalizer done spinning.
 Finalizer sleeping forever now.
-Requesting another GC.
-Requesting another GC.
diff --git a/test/030-bad-finalizer/src/BadFinalizer.java b/test/030-bad-finalizer/src/BadFinalizer.java
deleted file mode 100644
index 6911a02..0000000
--- a/test/030-bad-finalizer/src/BadFinalizer.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2007 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Class with a bad finalizer.
- */
-public class BadFinalizer {
-    public static void snooze(int ms) {
-        try {
-            Thread.sleep(ms);
-        } catch (InterruptedException ie) {
-            System.out.println("Snooze: " + ie.getMessage());
-        }
-    }
-
-    protected void finalize() {
-        System.out.println("Finalizer started and spinning...");
-        int j = 0;
-
-        /* spin for a bit */
-        long start, end;
-        start = System.nanoTime();
-        for (int i = 0; i < 1000000; i++)
-            j++;
-        end = System.nanoTime();
-        System.out.println("Finalizer done spinning.");
-
-        System.out.println("Finalizer sleeping forever now.");
-        while (true) {
-            snooze(10000);
-        }
-    }
-}
diff --git a/test/030-bad-finalizer/src/Main.java b/test/030-bad-finalizer/src/Main.java
index 330e344..942ee25 100644
--- a/test/030-bad-finalizer/src/Main.java
+++ b/test/030-bad-finalizer/src/Main.java
@@ -21,19 +21,47 @@
     public static void main(String[] args) {
         BadFinalizer bf = new BadFinalizer();
 
-        System.out.println("Constructed object.");
+        System.out.println("About to null reference and request GC.");
         bf = null;
-
-        System.out.println("Nulled. Requestion gc.");
         Runtime.getRuntime().gc();
 
         for (int i = 0; i < 8; i++) {
-            BadFinalizer.snooze(4000);
-            System.out.println("Requesting another GC.");
+            snooze(4000);
             Runtime.getRuntime().gc();
         }
 
-        System.out.println("Done waiting.");
+        System.out.println("UNREACHABLE");
         System.exit(0);
     }
+
+    public static void snooze(int ms) {
+        try {
+            Thread.sleep(ms);
+        } catch (InterruptedException ie) {
+        }
+    }
+
+    /**
+     * Class with a bad finalizer.
+     */
+    public static class BadFinalizer {
+        protected void finalize() {
+            System.out.println("Finalizer started and spinning...");
+            int j = 0;
+
+            /* spin for a bit */
+            long start, end;
+            start = System.nanoTime();
+            for (int i = 0; i < 1000000; i++) {
+                j++;
+            }
+            end = System.nanoTime();
+            System.out.println("Finalizer done spinning.");
+
+            System.out.println("Finalizer sleeping forever now.");
+            while (true) {
+                snooze(10000);
+            }
+        }
+    }
 }
diff --git a/test/036-finalizer/expected.txt b/test/036-finalizer/expected.txt
index f9b29b0..a2a74fc 100644
--- a/test/036-finalizer/expected.txt
+++ b/test/036-finalizer/expected.txt
@@ -1,14 +1,13 @@
-wimp: wahoo
+wimp: [FinalizerTest message=wahoo, finalized=false]
 gc
-finalizer executed: wahoo
 wimp: null
 finalize
 wimp: null
 sleep
-reborn: wahoo
+reborn: [FinalizerTest message=wahoo, finalized=true]
 wimp: null
 reset reborn
 gc + finalize
 sleep
-reborn: nothing
+reborn: [FinalizerTest message=nothing, finalized=false]
 wimp: null
diff --git a/test/036-finalizer/src/FinalizerTest.java b/test/036-finalizer/src/FinalizerTest.java
deleted file mode 100644
index b0d014d..0000000
--- a/test/036-finalizer/src/FinalizerTest.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.lang.ref.WeakReference;
-
-public class FinalizerTest {
-    public static FinalizerTest mNothing = new FinalizerTest("nothing");
-    public static FinalizerTest mReborn = mNothing;
-
-    public String mMsg = "default";
-
-    public FinalizerTest(String msg) {
-        mMsg = msg;
-    }
-
-    public String toString() {
-        return mMsg;
-    }
-
-    protected void finalize() {
-        System.out.println("finalizer executed: " + mMsg);
-        mReborn = this;
-    }
-}
diff --git a/test/036-finalizer/src/Main.java b/test/036-finalizer/src/Main.java
index 6195aff..328425f 100644
--- a/test/036-finalizer/src/Main.java
+++ b/test/036-finalizer/src/Main.java
@@ -15,6 +15,8 @@
  */
 
 import java.lang.ref.WeakReference;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * Some finalizer tests.
@@ -31,18 +33,19 @@
         }
     }
 
-    public static WeakReference makeRef() {
+    public static WeakReference<FinalizerTest> makeRef() {
         /*
          * Make ft in another thread, so there is no danger of
          * a conservative reference leaking onto the main thread's
          * stack.
          */
 
-        final WeakReference[] wimp = new WeakReference[1];
+        final List<WeakReference<FinalizerTest>> wimp =
+                new ArrayList<WeakReference<FinalizerTest>>();
         Thread t = new Thread() {
                 public void run() {
                     FinalizerTest ft = new FinalizerTest("wahoo");
-                    wimp[0] = new WeakReference(ft);
+                    wimp.add(new WeakReference<FinalizerTest>(ft));
                     ft = null;
                 }
             };
@@ -55,10 +58,10 @@
             throw new RuntimeException(ie);
         }
 
-        return wimp[0];
+        return wimp.get(0);
     }
 
-    public static String wimpString(final WeakReference wimp) {
+    public static String wimpString(final WeakReference<FinalizerTest> wimp) {
         /*
          * Do the work in another thread, so there is no danger of a
          * conservative reference to ft leaking onto the main thread's
@@ -68,7 +71,7 @@
         final String[] s = new String[1];
         Thread t = new Thread() {
                 public void run() {
-                    Object ref = wimp.get();
+                    FinalizerTest ref = wimp.get();
                     if (ref != null) {
                         s[0] = ref.toString();
                     }
@@ -87,7 +90,7 @@
     }
 
     public static void main(String[] args) {
-        WeakReference wimp = makeRef();
+        WeakReference<FinalizerTest> wimp = makeRef();
 
         System.out.println("wimp: " + wimpString(wimp));
 
@@ -118,4 +121,26 @@
         System.out.println("reborn: " + FinalizerTest.mReborn);
         System.out.println("wimp: " + wimpString(wimp));
     }
+
+    public static class FinalizerTest {
+        public static FinalizerTest mNothing = new FinalizerTest("nothing");
+        public static FinalizerTest mReborn = mNothing;
+
+        private final String message;
+        private boolean finalized = false;
+
+        public FinalizerTest(String message) {
+            this.message = message;
+        }
+
+        public String toString() {
+            return "[FinalizerTest message=" + message +
+                    ", finalized=" + finalized + "]";
+        }
+
+        protected void finalize() {
+            finalized = true;
+            mReborn = this;
+        }
+    }
 }
diff --git a/test/048-server-socket/expected.txt b/test/048-server-socket/expected.txt
deleted file mode 100644
index 23c3e84..0000000
--- a/test/048-server-socket/expected.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-opened!
-closed!
-reopened!
-done
diff --git a/test/048-server-socket/info.txt b/test/048-server-socket/info.txt
deleted file mode 100644
index 08127da..0000000
--- a/test/048-server-socket/info.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-This is a miscellaneous test that was imported into the new-at-the-time
-runtime test framework. The test is intended to exercise basic features,
-and as such cannot be build on top of junit, since failure of such basic
-features might disrupt junit.
-
-TODO: Real description goes here.
diff --git a/test/048-server-socket/src/Main.java b/test/048-server-socket/src/Main.java
deleted file mode 100644
index 5b287ca..0000000
--- a/test/048-server-socket/src/Main.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2007 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.net.ServerSocket;
-import java.io.IOException;
-
-
-/**
- * Quick server socket test.
- */
-public class Main {
-    private static void snooze(int sec) {
-        try {
-            Thread.sleep(sec * 1000);
-        } catch (InterruptedException ie) {
-            ie.printStackTrace();
-        }
-    }
-
-    public static void main(String[] args) {
-        ServerSocket socket;
-
-        try {
-            socket = new ServerSocket(7890);
-        } catch (IOException ioe) {
-            System.out.println("couldn't open socket " + ioe.getMessage());
-            return;
-        }
-
-        System.out.println("opened!");
-        snooze(1);
-
-        try {
-            socket.close();
-        } catch (IOException ioe) {
-            System.out.println("couldn't close socket " + ioe.getMessage());
-            return;
-        }
-
-        System.out.println("closed!");
-        snooze(1);
-
-        try {
-            socket = new ServerSocket(7890);
-        } catch (IOException ioe) {
-            System.out.println("couldn't reopen socket " + ioe.getMessage());
-            return;
-        }
-
-        System.out.println("reopened!");
-        System.out.println("done");
-    }
-}