Merge "art_quick_lock_object uses registers incorrectly"
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index 8dbc2bb..c0068b2 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -215,17 +215,13 @@
     case Instruction::CONST_STRING_JUMBO:
     case Instruction::CONST_CLASS:
     case Instruction::NEW_ARRAY:
-      if ((mir->optimization_flags & MIR_INLINED) == 0) {
-        // 1 result, treat as unique each time, use result s_reg - will be unique.
-        res = MarkNonAliasingNonNull(mir);
-      }
+      // 1 result, treat as unique each time, use result s_reg - will be unique.
+      res = MarkNonAliasingNonNull(mir);
       break;
     case Instruction::MOVE_RESULT_WIDE:
-      if ((mir->optimization_flags & MIR_INLINED) == 0) {
-        // 1 wide result, treat as unique each time, use result s_reg - will be unique.
-        res = GetOperandValueWide(mir->ssa_rep->defs[0]);
-        SetOperandValueWide(mir->ssa_rep->defs[0], res);
-      }
+      // 1 wide result, treat as unique each time, use result s_reg - will be unique.
+      res = GetOperandValueWide(mir->ssa_rep->defs[0]);
+      SetOperandValueWide(mir->ssa_rep->defs[0], res);
       break;
 
     case kMirOpPhi:
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index 1784af3..c9acd66 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -454,8 +454,6 @@
   kThumb2Vcmps,      // vcmp [111011101] D [11010] rd[15-12] [1011] E [1] M [0] rm[3-0].
   kThumb2LdrPcRel12,  // ldr rd,[pc,#imm12] [1111100011011111] rt[15-12] imm12[11-0].
   kThumb2BCond,      // b<c> [1110] S cond[25-22] imm6[21-16] [10] J1 [0] J2 imm11[10..0].
-  kThumb2Vmovd_RR,   // vmov [111011101] D [110000] vd[15-12 [101101] M [0] vm[3-0].
-  kThumb2Vmovs_RR,   // vmov [111011101] D [110000] vd[15-12 [101001] M [0] vm[3-0].
   kThumb2Fmrs,       // vmov [111011100000] vn[19-16] rt[15-12] [1010] N [0010000].
   kThumb2Fmsr,       // vmov [111011100001] vn[19-16] rt[15-12] [1010] N [0010000].
   kThumb2Fmrrd,      // vmov [111011000100] rt2[19-16] rt[15-12] [101100] M [1] vm[3-0].
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index 1c35018..f77b0a6 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -848,14 +848,6 @@
                  kFmtUnused, -1, -1,
                  IS_BINARY_OP | IS_BRANCH | USES_CCODES | NEEDS_FIXUP,
                  "b!1c", "!0t", 4, kFixupCondBranch),
-    ENCODING_MAP(kThumb2Vmovd_RR,       0xeeb00b40,
-                 kFmtDfp, 22, 12, kFmtDfp, 5, 0, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
-                 "vmov.f64", "!0S, !1S", 4, kFixupNone),
-    ENCODING_MAP(kThumb2Vmovs_RR,       0xeeb00a40,
-                 kFmtSfp, 22, 12, kFmtSfp, 5, 0, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
-                 "vmov.f32", "!0s, !1s", 4, kFixupNone),
     ENCODING_MAP(kThumb2Fmrs,       0xee100a10,
                  kFmtBitBlt, 15, 12, kFmtSfp, 7, 16, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 1c563bb..1abb91d 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -434,10 +434,6 @@
   if (pattern == DivideNone) {
     return false;
   }
-  // Tuning: add rem patterns
-  if (!is_div) {
-    return false;
-  }
 
   RegStorage r_magic = AllocTemp();
   LoadConstant(r_magic, magic_table[lit].magic);
@@ -445,25 +441,45 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage r_hi = AllocTemp();
   RegStorage r_lo = AllocTemp();
+
+  // rl_dest and rl_src might overlap.
+  // Reuse r_hi to save the div result for the remainder case.
+  RegStorage r_div_result = is_div ? rl_result.reg : r_hi;
+
   NewLIR4(kThumb2Smull, r_lo.GetReg(), r_hi.GetReg(), r_magic.GetReg(), rl_src.reg.GetReg());
   switch (pattern) {
     case Divide3:
-      OpRegRegRegShift(kOpSub, rl_result.reg, r_hi, rl_src.reg, EncodeShift(kArmAsr, 31));
+      OpRegRegRegShift(kOpSub, r_div_result, r_hi, rl_src.reg, EncodeShift(kArmAsr, 31));
       break;
     case Divide5:
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+      OpRegRegRegShift(kOpRsub, r_div_result, r_lo, r_hi,
                        EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     case Divide7:
       OpRegReg(kOpAdd, r_hi, rl_src.reg);
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+      OpRegRegRegShift(kOpRsub, r_div_result, r_lo, r_hi,
                        EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     default:
       LOG(FATAL) << "Unexpected pattern: " << pattern;
   }
+
+  if (!is_div) {
+    // div_result = src / lit
+    // tmp1 = div_result * lit
+    // dest = src - tmp1
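+    // e.g. for lit == 7: tmp1 = (div_result << 3) - div_result, dest = src - tmp1.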
+    RegStorage tmp1 = r_lo;
+    EasyMultiplyOp ops[2];
+
+    bool canEasyMultiply = GetEasyMultiplyTwoOps(lit, ops);
+    DCHECK_NE(canEasyMultiply, false);
+
+    GenEasyMultiplyTwoOps(tmp1, r_div_result, ops);
+    OpRegRegReg(kOpSub, rl_result.reg, rl_src.reg, tmp1);
+  }
+
   StoreValue(rl_dest, rl_result);
   return true;
 }
@@ -489,6 +505,7 @@
   }
 
   op->op = kOpInvalid;
+  op->shift = 0;
   return false;
 }
 
@@ -497,6 +514,7 @@
   GetEasyMultiplyOp(lit, &ops[0]);
   if (GetEasyMultiplyOp(lit, &ops[0])) {
     ops[1].op = kOpInvalid;
+    ops[1].shift = 0;
     return true;
   }
 
@@ -527,31 +545,52 @@
   return false;
 }
 
+// Generate instructions to do the multiply.
+// An additional temporary register is required
+// if two instructions are needed and src/dest overlap.
 void ArmMir2Lir::GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops) {
-  // dest = ( src << shift1) + [ src | -src | 0 ]
-  // dest = (dest << shift2) + [ src | -src | 0 ]
-  for (int i = 0; i < 2; i++) {
-    RegStorage r_src2;
-    if (i == 0) {
-      r_src2 = r_src;
-    } else {
-      r_src2 = r_dest;
-    }
-    switch (ops[i].op) {
+  // tmp1 = ( src << shift1) + [ src | -src | 0 ]
+  // dest = (tmp1 << shift2) + [ src | -src | 0 ]
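+  // e.g. lit == 10 can be lowered as tmp1 = (src << 2) + src, dest = tmp1 << 1.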
+
+  RegStorage r_tmp1;
+  if (ops[1].op == kOpInvalid) {
+    r_tmp1 = r_dest;
+  } else if (r_dest.GetReg() != r_src.GetReg()) {
+    r_tmp1 = r_dest;
+  } else {
+    r_tmp1 = AllocTemp();
+  }
+
+  switch (ops[0].op) {
     case kOpLsl:
-      OpRegRegImm(kOpLsl, r_dest, r_src2, ops[i].shift);
+      OpRegRegImm(kOpLsl, r_tmp1, r_src, ops[0].shift);
       break;
     case kOpAdd:
-      OpRegRegRegShift(kOpAdd, r_dest, r_src, r_src2, EncodeShift(kArmLsl, ops[i].shift));
+      OpRegRegRegShift(kOpAdd, r_tmp1, r_src, r_src, EncodeShift(kArmLsl, ops[0].shift));
       break;
     case kOpRsub:
-      OpRegRegRegShift(kOpRsub, r_dest, r_src, r_src2, EncodeShift(kArmLsl, ops[i].shift));
+      OpRegRegRegShift(kOpRsub, r_tmp1, r_src, r_src, EncodeShift(kArmLsl, ops[0].shift));
       break;
     default:
-      DCHECK_NE(i, 0);
-      DCHECK_EQ(ops[i].op, kOpInvalid);
+      DCHECK_EQ(ops[0].op, kOpInvalid);
       break;
-    }
+  }
+
+  switch (ops[1].op) {
+    case kOpInvalid:
+      return;
+    case kOpLsl:
+      OpRegRegImm(kOpLsl, r_dest, r_tmp1, ops[1].shift);
+      break;
+    case kOpAdd:
+      OpRegRegRegShift(kOpAdd, r_dest, r_src, r_tmp1, EncodeShift(kArmLsl, ops[1].shift));
+      break;
+    case kOpRsub:
+      OpRegRegRegShift(kOpRsub, r_dest, r_src, r_tmp1, EncodeShift(kArmLsl, ops[1].shift));
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode passed to GenEasyMultiplyTwoOps";
+      break;
   }
 }
 
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 5e9a8b0..1053a8f 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -562,7 +562,8 @@
 
   // Keep special registers from being allocated
   // Don't reserve the r4 if we are doing implicit suspend checks.
-  bool no_suspend = NO_SUSPEND || !Runtime::Current()->ExplicitSuspendChecks();
+  // TODO: re-enable this when we can safely save r4 over the suspension code path.
+  bool no_suspend = NO_SUSPEND;  // || !Runtime::Current()->ExplicitSuspendChecks();
   for (int i = 0; i < num_reserved; i++) {
     if (no_suspend && (ReservedRegs[i] == rARM_SUSPEND)) {
       // Don't reserve the suspend register.
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index fa6de96..06eff4e 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -47,6 +47,22 @@
   return insn;
 }
 
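+// Map an argument index of the INVOKE to the dalvik register that holds it,
+// e.g. for invoke-virtual {v4, v5, v6}, argument 1 is in v5.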
+uint32_t GetInvokeReg(MIR* invoke, uint32_t arg) {
+  DCHECK_LT(arg, invoke->dalvikInsn.vA);
+  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
+    return invoke->dalvikInsn.vC + arg;  // Range invoke.
+  } else {
+    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
+    return invoke->dalvikInsn.arg[arg];  // Non-range invoke.
+  }
+}
+
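+// Check that both halves of a wide argument live in consecutive dalvik registers
+// in the INVOKE, so that the replacement insn can refer to them via the first one.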
+bool WideArgIsInConsecutiveDalvikRegs(MIR* invoke, uint32_t arg) {
+  DCHECK_LT(arg + 1, invoke->dalvikInsn.vA);
+  return Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc ||
+      invoke->dalvikInsn.arg[arg + 1u] == invoke->dalvikInsn.arg[arg] + 1u;
+}
+
 }  // anonymous namespace
 
 const uint32_t DexFileMethodInliner::kIndexUnresolved;
@@ -396,7 +412,8 @@
       result = GenInlineIGet(mir_graph, bb, invoke, move_result, method, method_idx);
       break;
     case kInlineOpIPut:
-      result = GenInlineIPut(mir_graph, bb, invoke, method, method_idx);
+      move_result = mir_graph->FindMoveResult(bb, invoke);
+      result = GenInlineIPut(mir_graph, bb, invoke, move_result, method, method_idx);
       break;
     default:
       LOG(FATAL) << "Unexpected inline op: " << method.opcode;
@@ -578,25 +595,24 @@
   // Select opcode and argument.
   const InlineReturnArgData& data = method.d.return_data;
   Instruction::Code opcode = Instruction::MOVE_FROM16;
+  uint32_t arg = GetInvokeReg(invoke, data.arg);
   if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT) {
     DCHECK_EQ(data.is_object, 1u);
+    DCHECK_EQ(data.is_wide, 0u);
     opcode = Instruction::MOVE_OBJECT_FROM16;
   } else if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE) {
     DCHECK_EQ(data.is_wide, 1u);
+    DCHECK_EQ(data.is_object, 0u);
     opcode = Instruction::MOVE_WIDE_FROM16;
+    if (!WideArgIsInConsecutiveDalvikRegs(invoke, data.arg)) {
+      // The two halves of the source value are not in consecutive dalvik registers in INVOKE.
+      return false;
+    }
   } else {
     DCHECK(move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT);
     DCHECK_EQ(data.is_wide, 0u);
     DCHECK_EQ(data.is_object, 0u);
   }
-  DCHECK_LT(data.is_wide ? data.arg + 1u : data.arg, invoke->dalvikInsn.vA);
-  int arg;
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k35c) {
-    arg = invoke->dalvikInsn.arg[data.arg];  // Non-range invoke.
-  } else {
-    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k3rc);
-    arg = invoke->dalvikInsn.vC + data.arg;  // Range invoke.
-  }
 
   // Insert the move instruction
   MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
@@ -616,33 +632,39 @@
   }
 
   const InlineIGetIPutData& data = method.d.ifield_data;
-  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
-      data.object_arg != 0) {
-    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
-    return false;
-  }
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IGET + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IGetVariant(opcode), data.op_variant);
+  uint32_t object_reg = GetInvokeReg(invoke, data.object_arg);
 
   if (move_result == nullptr) {
     // Result is unused. If volatile, we still need to emit the IGET but we have no destination.
     return !data.is_volatile;
   }
 
-  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IGET + data.op_variant);
-  DCHECK_EQ(InlineMethodAnalyser::IGetVariant(opcode), data.op_variant);
+  DCHECK_EQ(data.method_is_static != 0u,
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
+  if (!object_is_this) {
+    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!InlineMethodAnalyser::IsSyntheticAccessor(
+        mir_graph->GetMethodLoweringInfo(invoke).GetTargetMethod())) {
+      return false;
+    }
+  }
+
+  if (object_is_this) {
+    // Mark invoke as NOP, null-check is done on IGET. No aborts after this.
+    invoke->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+  }
 
   MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
   insn->width += insn->offset - invoke->offset;
   insn->offset = invoke->offset;
   insn->dalvikInsn.opcode = opcode;
   insn->dalvikInsn.vA = move_result->dalvikInsn.vA;
-  DCHECK_LT(data.object_arg, invoke->dalvikInsn.vA);
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
-    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
-  } else {
-    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
-    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
-  }
+  insn->dalvikInsn.vB = object_reg;
   mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
 
   DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
@@ -655,32 +677,55 @@
 }
 
 bool DexFileMethodInliner::GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
-                                         const InlineMethod& method, uint32_t method_idx) {
+                                         MIR* move_result, const InlineMethod& method,
+                                         uint32_t method_idx) {
   CompilationUnit* cu = mir_graph->GetCurrentDexCompilationUnit()->GetCompilationUnit();
   if (cu->enable_debug & (1 << kDebugSlowFieldPath)) {
     return false;
   }
 
   const InlineIGetIPutData& data = method.d.ifield_data;
-  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
-      data.object_arg != 0) {
-    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IPUT + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IPutVariant(opcode), data.op_variant);
+  uint32_t object_reg = GetInvokeReg(invoke, data.object_arg);
+  uint32_t src_reg = GetInvokeReg(invoke, data.src_arg);
+  uint32_t return_reg =
+      data.return_arg_plus1 != 0u ? GetInvokeReg(invoke, data.return_arg_plus1 - 1u) : 0u;
+
+  if (opcode == Instruction::IPUT_WIDE && !WideArgIsInConsecutiveDalvikRegs(invoke, data.src_arg)) {
+    // The two halves of the source value are not in consecutive dalvik registers in INVOKE.
     return false;
   }
 
-  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IPUT + data.op_variant);
-  DCHECK_EQ(InlineMethodAnalyser::IPutVariant(opcode), data.op_variant);
-
-  MIR* insn = AllocReplacementMIR(mir_graph, invoke, nullptr);
-  insn->dalvikInsn.opcode = opcode;
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
-    insn->dalvikInsn.vA = invoke->dalvikInsn.vC + data.src_arg;
-    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
-  } else {
-    insn->dalvikInsn.vA = invoke->dalvikInsn.arg[data.src_arg];
-    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
+  DCHECK(move_result == nullptr || data.return_arg_plus1 != 0u);
+  if (move_result != nullptr && move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE &&
+      !WideArgIsInConsecutiveDalvikRegs(invoke, data.return_arg_plus1 - 1u)) {
+    // The two halves of the return value are not in consecutive dalvik registers in INVOKE.
+    return false;
   }
+
+  DCHECK_EQ(data.method_is_static != 0u,
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
+  if (!object_is_this) {
+    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!InlineMethodAnalyser::IsSyntheticAccessor(
+        mir_graph->GetMethodLoweringInfo(invoke).GetTargetMethod())) {
+      return false;
+    }
+  }
+
+  if (object_is_this) {
+    // Mark invoke as NOP, null-check is done on IPUT. No aborts after this.
+    invoke->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+  }
+
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
+  insn->dalvikInsn.opcode = opcode;
+  insn->dalvikInsn.vA = src_reg;
+  insn->dalvikInsn.vB = object_reg;
   mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
 
   DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
@@ -689,6 +734,23 @@
   DCHECK_EQ(data.is_volatile, mir_graph->GetIFieldLoweringInfo(insn).IsVolatile() ? 1u : 0u);
 
   bb->InsertMIRAfter(invoke, insn);
+
+  if (move_result != nullptr) {
+    MIR* move = AllocReplacementMIR(mir_graph, invoke, move_result);
+    insn->width = invoke->width;
+    move->offset = move_result->offset;
+    move->width = move_result->width;
+    if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT) {
+      move->dalvikInsn.opcode = Instruction::MOVE_FROM16;
+    } else if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT) {
+      move->dalvikInsn.opcode = Instruction::MOVE_OBJECT_FROM16;
+    } else {
+      DCHECK_EQ(move_result->dalvikInsn.opcode, Instruction::MOVE_RESULT_WIDE);
+      move->dalvikInsn.opcode = Instruction::MOVE_WIDE_FROM16;
+    }
+    move->dalvikInsn.vA = move_result->dalvikInsn.vA;
+    move->dalvikInsn.vB = return_reg;
+  }
   return true;
 }
 
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index b4e190a..c03f89c 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -302,7 +302,7 @@
     static bool GenInlineIGet(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
                               MIR* move_result, const InlineMethod& method, uint32_t method_idx);
     static bool GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
-                              const InlineMethod& method, uint32_t method_idx);
+                              MIR* move_result, const InlineMethod& method, uint32_t method_idx);
 
     ReaderWriterMutex lock_;
     /*
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index bfa22da..a3fb420 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1654,9 +1654,8 @@
     StoreValue(rl_dest, rl_result);
     return true;
   }
-  // There is RegRegRegShift on Arm, so check for more special cases.
-  // TODO: disabled, need to handle case of "dest == src" properly.
-  if (false && cu_->instruction_set == kThumb2) {
+  // There is RegRegRegShift on Arm, so check for more special cases.
+  if (cu_->instruction_set == kThumb2) {
     return EasyMultiply(rl_src, rl_dest, lit);
   }
   // Can we simplify this multiplication?
diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc
index 897d86d..208eadd 100644
--- a/compiler/dex/quick/gen_loadstore.cc
+++ b/compiler/dex/quick/gen_loadstore.cc
@@ -211,7 +211,12 @@
     LoadValueDirectWide(rl_src, rl_src.reg);
     rl_src.location = kLocPhysReg;
     MarkLive(rl_src.reg.GetLow(), rl_src.s_reg_low);
-    MarkLive(rl_src.reg.GetHigh(), GetSRegHi(rl_src.s_reg_low));
+    if (rl_src.reg.GetLowReg() != rl_src.reg.GetHighReg()) {
+      MarkLive(rl_src.reg.GetHigh(), GetSRegHi(rl_src.s_reg_low));
+    } else {
+      // This must be an x86 vector register value.
+      DCHECK(IsFpReg(rl_src.reg) && (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64));
+    }
   }
   return rl_src;
 }
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 73fdc82..6fcdf70 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -120,7 +120,7 @@
 bool Mir2Lir::GenSpecialIGet(MIR* mir, const InlineMethod& special) {
   // FastInstance() already checked by DexFileMethodInliner.
   const InlineIGetIPutData& data = special.d.ifield_data;
-  if (data.method_is_static || data.object_arg != 0) {
+  if (data.method_is_static != 0u || data.object_arg != 0u) {
     // The object is not "this" and has to be null-checked.
     return false;
   }
@@ -151,10 +151,14 @@
 bool Mir2Lir::GenSpecialIPut(MIR* mir, const InlineMethod& special) {
   // FastInstance() already checked by DexFileMethodInliner.
   const InlineIGetIPutData& data = special.d.ifield_data;
-  if (data.method_is_static || data.object_arg != 0) {
+  if (data.method_is_static != 0u || data.object_arg != 0u) {
     // The object is not "this" and has to be null-checked.
     return false;
   }
+  if (data.return_arg_plus1 != 0u) {
+    // The setter returns a method argument which we don't support here.
+    return false;
+  }
 
   bool wide = (data.op_variant == InlineMethodAnalyser::IPutVariant(Instruction::IPUT_WIDE));
 
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index b12b6a7..a241d51 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1898,8 +1898,9 @@
       compiled_method = compiler_->Compile(
           *this, code_item, access_flags, invoke_type, class_def_idx,
           method_idx, class_loader, dex_file);
-    } else if (dex_to_dex_compilation_level != kDontDexToDexCompile) {
-      // TODO: add a mode to disable DEX-to-DEX compilation ?
+    }
+    if (compiled_method == nullptr && dex_to_dex_compilation_level != kDontDexToDexCompile) {
+      // TODO: add a command-line option to disable DEX-to-DEX compilation?
       (*dex_to_dex_compiler_)(*this, code_item, access_flags,
                               invoke_type, class_def_idx,
                               method_idx, class_loader, dex_file,
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 0554876..d90405a 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -193,7 +193,8 @@
       break;
     }
 
-    case Instruction::INVOKE_STATIC: {
+    case Instruction::INVOKE_STATIC:
+    case Instruction::INVOKE_DIRECT: {
       uint32_t method_idx = instruction.VRegB_35c();
       const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
       uint32_t return_type_idx = dex_file_->GetProtoId(method_id.proto_idx_).return_type_idx_;
@@ -204,6 +205,7 @@
         return false;
       }
 
+      // Treat invoke-direct like static calls for now.
       HInvokeStatic* invoke = new (arena_) HInvokeStatic(
           arena_, number_of_arguments, dex_offset, method_idx);
 
@@ -221,7 +223,8 @@
       break;
     }
 
-    case Instruction::INVOKE_STATIC_RANGE: {
+    case Instruction::INVOKE_STATIC_RANGE:
+    case Instruction::INVOKE_DIRECT_RANGE: {
       uint32_t method_idx = instruction.VRegB_3rc();
       const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
       uint32_t return_type_idx = dex_file_->GetProtoId(method_id.proto_idx_).return_type_idx_;
@@ -232,6 +235,7 @@
         return false;
       }
 
+      // Treat invoke-direct like static calls for now.
       HInvokeStatic* invoke = new (arena_) HInvokeStatic(
           arena_, number_of_arguments, dex_offset, method_idx);
       int32_t register_index = instruction.VRegC();
@@ -277,6 +281,13 @@
       break;
     }
 
+    case Instruction::NEW_INSTANCE: {
+      current_block_->AddInstruction(
+          new (arena_) HNewInstance(dex_offset, instruction.VRegB_21c()));
+      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      break;
+    }
+
     case Instruction::NOP:
       break;
 
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 05e5d7b..d6295db 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -30,7 +30,7 @@
 namespace art {
 
 void CodeGenerator::Compile(CodeAllocator* allocator) {
-  frame_size_ = GetGraph()->GetMaximumNumberOfOutVRegs() * kWordSize;
+  frame_size_ = GetGraph()->GetMaximumNumberOfOutVRegs() * GetWordSize();
   const GrowableArray<HBasicBlock*>* blocks = GetGraph()->GetBlocks();
   DCHECK(blocks->Get(0) == GetGraph()->GetEntryBlock());
   DCHECK(GoesToNextBlock(GetGraph()->GetEntryBlock(), blocks->Get(1)));
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 01bbcc0..e144733 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -145,6 +145,7 @@
   virtual HGraphVisitor* GetLocationBuilder() = 0;
   virtual HGraphVisitor* GetInstructionVisitor() = 0;
   virtual Assembler* GetAssembler() = 0;
+  virtual size_t GetWordSize() const = 0;
 
   uint32_t GetFrameSize() const { return frame_size_; }
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 09d6f7b..cb77f57 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -39,14 +39,14 @@
   __ PushList((1 << LR));
 
   // Add the current ART method to the frame size and the return PC.
-  SetFrameSize(RoundUp(GetFrameSize() + 2 * kWordSize, kStackAlignment));
+  SetFrameSize(RoundUp(GetFrameSize() + 2 * kArmWordSize, kStackAlignment));
   // The return PC has already been pushed on the stack.
-  __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
+  __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize));
   __ str(R0, Address(SP, 0));
 }
 
 void CodeGeneratorARM::GenerateFrameExit() {
-  __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize);
+  __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize);
   __ PopList((1 << PC));
 }
 
@@ -55,7 +55,7 @@
 }
 
 int32_t CodeGeneratorARM::GetStackSlot(HLocal* local) const {
-  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kWordSize;
+  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kArmWordSize;
 }
 
 void CodeGeneratorARM::Move(HInstruction* instruction, Location location, HInstruction* move_for) {
@@ -134,7 +134,7 @@
 
 void InstructionCodeGeneratorARM::VisitLocal(HLocal* local) {
   DCHECK_EQ(local->GetBlock(), GetGraph()->GetEntryBlock());
-  codegen_->SetFrameSize(codegen_->GetFrameSize() + kWordSize);
+  codegen_->SetFrameSize(codegen_->GetFrameSize() + kArmWordSize);
 }
 
 void LocationsBuilderARM::VisitLoadLocal(HLoadLocal* load) {
@@ -185,7 +185,7 @@
 }
 
 static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
-static constexpr int kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 
 class InvokeStaticCallingConvention : public CallingConvention<Register> {
  public:
@@ -235,7 +235,7 @@
 void InstructionCodeGeneratorARM::VisitInvokeStatic(HInvokeStatic* invoke) {
   Register temp = invoke->GetLocations()->GetTemp(0).reg<Register>();
   size_t index_in_cache = mirror::Array::DataOffset(sizeof(mirror::Object*)).Int32Value() +
-      invoke->GetIndexInDexCache() * kWordSize;
+      invoke->GetIndexInDexCache() * kArmWordSize;
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -287,5 +287,37 @@
   }
 }
 
+static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1 };
+static constexpr size_t kRuntimeParameterCoreRegistersLength =
+    arraysize(kRuntimeParameterCoreRegisters);
+
+class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeRuntimeCallingConvention()
+      : CallingConvention(kRuntimeParameterCoreRegisters,
+                          kRuntimeParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
+};
+
+void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetOut(Location(R0));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) {
+  InvokeRuntimeCallingConvention calling_convention;
+  LoadCurrentMethod(calling_convention.GetRegisterAt(1));
+  __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
+
+  int32_t offset = QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocObjectWithAccessCheck).Int32Value();
+  __ ldr(LR, Address(TR, offset));
+  __ blx(LR);
+
+  codegen_->RecordPcInfo(instruction->GetDexPc());
+}
+
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 52d6b2e..a51d85e 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -22,12 +22,10 @@
 #include "utils/arm/assembler_arm.h"
 
 namespace art {
-
-class Assembler;
-class Label;
-
 namespace arm {
 
+static constexpr size_t kArmWordSize = 4;
+
 class LocationsBuilderARM : public HGraphVisitor {
  public:
   explicit LocationsBuilderARM(HGraph* graph) : HGraphVisitor(graph) { }
@@ -79,6 +77,10 @@
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
 
+  virtual size_t GetWordSize() const OVERRIDE {
+    return kArmWordSize;
+  }
+
   virtual HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7b0a087..c695e26 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -40,14 +40,14 @@
   core_spill_mask_ |= (1 << kFakeReturnRegister);
 
   // Add the current ART method to the frame size and the return PC.
-  SetFrameSize(RoundUp(GetFrameSize() + 2 * kWordSize, kStackAlignment));
+  SetFrameSize(RoundUp(GetFrameSize() + 2 * kX86WordSize, kStackAlignment));
   // The return PC has already been pushed on the stack.
-  __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
+  __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
   __ movl(Address(ESP, 0), EAX);
 }
 
 void CodeGeneratorX86::GenerateFrameExit() {
-  __ addl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
+  __ addl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
 }
 
 void CodeGeneratorX86::Bind(Label* label) {
@@ -59,7 +59,7 @@
 }
 
 int32_t CodeGeneratorX86::GetStackSlot(HLocal* local) const {
-  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kWordSize;
+  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kX86WordSize;
 }
 
 void CodeGeneratorX86::Move(HInstruction* instruction, Location location, HInstruction* move_for) {
@@ -122,7 +122,7 @@
 
 void InstructionCodeGeneratorX86::VisitLocal(HLocal* local) {
   DCHECK_EQ(local->GetBlock(), GetGraph()->GetEntryBlock());
-  codegen_->SetFrameSize(codegen_->GetFrameSize() + kWordSize);
+  codegen_->SetFrameSize(codegen_->GetFrameSize() + kX86WordSize);
 }
 
 void LocationsBuilderX86::VisitLoadLocal(HLoadLocal* local) {
@@ -188,7 +188,7 @@
 }
 
 static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
-static constexpr int kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 
 class InvokeStaticCallingConvention : public CallingConvention<Register> {
  public:
@@ -199,6 +199,20 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeStaticCallingConvention);
 };
 
+static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX };
+static constexpr size_t kRuntimeParameterCoreRegistersLength =
+    arraysize(kRuntimeParameterCoreRegisters);
+
+class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeRuntimeCallingConvention()
+      : CallingConvention(kRuntimeParameterCoreRegisters,
+                          kRuntimeParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
+};
+
 void LocationsBuilderX86::VisitPushArgument(HPushArgument* argument) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(argument);
   InvokeStaticCallingConvention calling_convention;
@@ -236,7 +250,7 @@
 void InstructionCodeGeneratorX86::VisitInvokeStatic(HInvokeStatic* invoke) {
   Register temp = invoke->GetLocations()->GetTemp(0).reg<Register>();
   size_t index_in_cache = mirror::Array::DataOffset(sizeof(mirror::Object*)).Int32Value() +
-      invoke->GetIndexInDexCache() * kWordSize;
+      invoke->GetIndexInDexCache() * kX86WordSize;
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -284,5 +298,23 @@
   }
 }
 
+void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetOut(Location(EAX));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) {
+  InvokeRuntimeCallingConvention calling_convention;
+  LoadCurrentMethod(calling_convention.GetRegisterAt(1));
+  __ movl(calling_convention.GetRegisterAt(0),
+          Immediate(instruction->GetTypeIndex()));
+
+  __ fs()->call(
+      Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocObjectWithAccessCheck)));
+
+  codegen_->RecordPcInfo(instruction->GetDexPc());
+}
+
 }  // namespace x86
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index dd5044f..bba81c0 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -22,9 +22,10 @@
 #include "utils/x86/assembler_x86.h"
 
 namespace art {
-
 namespace x86 {
 
+static constexpr size_t kX86WordSize = 4;
+
 class LocationsBuilderX86 : public HGraphVisitor {
  public:
   explicit LocationsBuilderX86(HGraph* graph) : HGraphVisitor(graph) { }
@@ -77,6 +78,10 @@
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
 
+  virtual size_t GetWordSize() const OVERRIDE {
+    return kX86WordSize;
+  }
+
   virtual HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 2b21905..830d0c7 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -201,6 +201,7 @@
   M(InvokeStatic)                                          \
   M(LoadLocal)                                             \
   M(Local)                                                 \
+  M(NewInstance)                                           \
   M(PushArgument)                                          \
   M(Return)                                                \
   M(ReturnVoid)                                            \
@@ -593,7 +594,7 @@
 
 class HInvoke : public HInstruction {
  public:
-  HInvoke(ArenaAllocator* arena, uint32_t number_of_arguments, int32_t dex_pc)
+  HInvoke(ArenaAllocator* arena, uint32_t number_of_arguments, uint32_t dex_pc)
     : inputs_(arena, number_of_arguments),
       dex_pc_(dex_pc) {
     inputs_.SetSize(number_of_arguments);
@@ -606,11 +607,11 @@
     inputs_.Put(index, argument);
   }
 
-  int32_t GetDexPc() const { return dex_pc_; }
+  uint32_t GetDexPc() const { return dex_pc_; }
 
  protected:
   GrowableArray<HInstruction*> inputs_;
-  const int32_t dex_pc_;
+  const uint32_t dex_pc_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HInvoke);
@@ -620,8 +621,8 @@
  public:
   HInvokeStatic(ArenaAllocator* arena,
                 uint32_t number_of_arguments,
-                int32_t dex_pc,
-                int32_t index_in_dex_cache)
+                uint32_t dex_pc,
+                uint32_t index_in_dex_cache)
       : HInvoke(arena, number_of_arguments, dex_pc), index_in_dex_cache_(index_in_dex_cache) {}
 
   uint32_t GetIndexInDexCache() const { return index_in_dex_cache_; }
@@ -634,6 +635,22 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeStatic);
 };
 
+class HNewInstance : public HTemplateInstruction<0> {
+ public:
+  HNewInstance(uint32_t dex_pc, uint16_t type_index) : dex_pc_(dex_pc), type_index_(type_index) {}
+
+  uint32_t GetDexPc() const { return dex_pc_; }
+  uint16_t GetTypeIndex() const { return type_index_; }
+
+  DECLARE_INSTRUCTION(NewInstance)
+
+ private:
+  const uint32_t dex_pc_;
+  const uint16_t type_index_;
+
+  DISALLOW_COPY_AND_ASSIGN(HNewInstance);
+};
+
 // HPushArgument nodes are inserted after the evaluation of an argument
 // of a call. Their mere purpose is to ease the code generator's work.
 class HPushArgument : public HTemplateInstruction<1> {
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index f665f5c..c6e448e 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -743,19 +743,7 @@
   InstructionSetFeatures instruction_set_features =
       ParseFeatureList(Runtime::GetDefaultInstructionSetFeatures());
 
-#if defined(__arm__)
-  InstructionSet instruction_set = kThumb2;
-#elif defined(__aarch64__)
-  InstructionSet instruction_set = kArm64;
-#elif defined(__i386__)
-  InstructionSet instruction_set = kX86;
-#elif defined(__x86_64__)
-  InstructionSet instruction_set = kX86_64;
-#elif defined(__mips__)
-  InstructionSet instruction_set = kMips;
-#else
-  InstructionSet instruction_set = kNone;
-#endif
+  InstructionSet instruction_set = kRuntimeISA;
 
   // Profile file to use
   std::string profile_file;
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 899aa78..d6d2058 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -115,6 +115,10 @@
   "tst", "rsb", "cmp", "cmn", "orr", "mul", "bic", "mvn",
 };
 
+static const char* const kThumb2ShiftOperations[] = {
+    "lsl", "lsr", "asr", "ror"
+};
+
 static const char* kThumbReverseOperations[] = {
     "rev", "rev16", "rbit", "revsh"
 };
@@ -359,6 +363,61 @@
   }
 }
 
+uint32_t VFPExpand32(uint32_t imm8) {
+  CHECK_EQ(imm8 & 0xffu, imm8);
+  uint32_t bit_a = (imm8 >> 7) & 1;
+  uint32_t bit_b = (imm8 >> 6) & 1;
+  uint32_t slice = imm8 & 0x3f;
+  return (bit_a << 31) | ((1 << 30) - (bit_b << 25)) | (slice << 19);
+}
+
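+// Expand an 8-bit VFP immediate to its 64-bit double bit pattern, following the
+// ARM ARM VFPExpandImm pseudocode: imm64 = a:NOT(b):Replicate(b,8):cdefgh:Zeros(48).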
+uint64_t VFPExpand64(uint32_t imm8) {
+  CHECK_EQ(imm8 & 0xffu, imm8);
+  uint64_t bit_a = (imm8 >> 7) & 1;
+  uint64_t bit_b = (imm8 >> 6) & 1;
+  uint64_t slice = imm8 & 0x3f;
+  return (bit_a << 63) | ((UINT64_C(1) << 62) - (bit_b << 54)) | (slice << 48);
+}
+
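+// Expand an Advanced SIMD modified immediate (op:cmode:imm8) following the ARM ARM
+// AdvSIMDExpandImm pseudocode; UNPREDICTABLE/UNDEFINED encodings yield 0xdeadbeef markers.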
+uint64_t AdvSIMDExpand(uint32_t op, uint32_t cmode, uint32_t imm8) {
+  CHECK_EQ(op & 1, op);
+  CHECK_EQ(cmode & 0xf, cmode);
+  CHECK_EQ(imm8 & 0xff, imm8);
+  int32_t cmode321 = cmode >> 1;
+  if (imm8 == 0 && cmode321 != 0 && cmode321 != 4 && cmode321 != 7) {
+    return INT64_C(0x00000000deadbeef);  // UNPREDICTABLE
+  }
+  uint64_t imm = imm8;
+  switch (cmode321) {
+    case 3: imm <<= 8;  // Fall through.
+    case 2: imm <<= 8;  // Fall through.
+    case 1: imm <<= 8;  // Fall through.
+    case 0: return static_cast<int64_t>((imm << 32) | imm);
+    case 5: imm <<= 8;  // Fall through.
+    case 4: return static_cast<int64_t>((imm << 48) | (imm << 32) | (imm << 16) | imm);
+    case 6:
+      imm = ((imm + 1u) << ((cmode & 1) != 0 ? 16 : 8)) - 1u;  // Add 8 or 16 ones.
+      return static_cast<int64_t>((imm << 32) | imm);
+    default:
+      CHECK_EQ(cmode321, 7);
+      if ((cmode & 1) == 0 && op == 0) {
+        imm = (imm << 8) | imm;
+        return static_cast<int64_t>((imm << 48) | (imm << 32) | (imm << 16) | imm);
+      } else if ((cmode & 1) == 0 && op != 0) {
+        for (int i = 1; i != 8; ++i) {
+          imm |= ((imm >> i) & UINT64_C(1)) << (i * 8);
+        }
+        imm = imm & ~UINT64_C(0xfe);
+        return static_cast<int64_t>((imm << 8) - imm);
+      } else if ((cmode & 1) != 0 && op == 0) {
+        imm = static_cast<uint32_t>(VFPExpand32(imm8));
+        return static_cast<int64_t>((imm << 32) | imm);
+      } else {
+        return INT64_C(0xdeadbeef00000000);  // UNDEFINED
+      }
+  }
+}
+
 size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) {
   uint32_t instr = (ReadU16(instr_ptr) << 16) | ReadU16(instr_ptr + 2);
   // |111|1 1|1000000|0000|1111110000000000|
@@ -757,83 +816,136 @@
             }
           } else if ((op3 >> 4) == 2 && op4 == 0) {     // 10xxxx, op = 0
             // fp data processing
+            // VMLA, VMLS, VMUL, VNMUL, VADD, VSUB, VDIV, VMOV, ...
+            // |1111|1100|0|0|00|0000|1111|110|0|0|0|0|0|0000|
+            // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7|6|5|4|3  0|
+            // |----|----|-|-|--|----|----|---|-|-|-|-|-|----|
+            // |3322|2222|2|2|22|1111|1111|110|0|0|0|0|0|0000|
+            // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7|6|5|4|3  0|
+            // |----|----|-|-|--|----|----|---|-|-|-|-|-|----|
+            // |1110|1110|  op3 | Vn | Vd |101|S|N|Q|M|0| Vm |
+            // |1110|1110|0|D|00| Vn | Vd |101|S|N|0|M|0| Vm | VMLA
+            // |1110|1110|0|D|00| Vn | Vd |101|S|N|1|M|0| Vm | VMLS
+            // |1110|1110|0|D|10| Vn | Vd |101|S|N|0|M|0| Vm | VMUL
+            // |1110|1110|0|D|10| Vn | Vd |101|S|N|1|M|0| Vm | VNMUL
+            // |1110|1110|0|D|11| Vn | Vd |101|S|N|0|M|0| Vm | VADD
+            // |1110|1110|0|D|11| Vn | Vd |101|S|N|1|M|0| Vm | VSUB
+            // |1110|1110|1|D|00| Vn | Vd |101|S|N|0|M|0| Vm | VDIV
+            // |1110|1110|1|D|11| iH | Vd |101|S|0|0|0|0| iL | VMOV (imm)
+            // |1110|1110|1|D|11|op5 | Vd |101|S|.|1|M|0| Vm | ... (see below)
+            uint32_t S = (instr >> 8) & 1;
+            uint32_t Q = (instr >> 6) & 1;
+            FpRegister d(instr, 12, 22);
+            FpRegister n(instr, 16, 7);
+            FpRegister m(instr, 0, 5);
             if ((op3 & 0xB) == 0) {  // 100x00
-              // VMLA, VMLS
-              // |1111|1100|0|0|00|0000|1111|110|0|0|0 |0|0|0000|
-              // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7|6 |5|4|3  0|
-              // |----|----|-|-|--|----|----|---|-|-|- |-|-|----|
-              // |3322|2222|2|2|22|1111|1111|110|0|0|0 |0|0|0000|
-              // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7|6 |5|4|3  0|
-              // |----|----|-|-|--|----|----|---|-|-|- |-|-|----|
-              // |1110|1110|0|D|00| Vn | Vd |101|S|N|op|M|0| Vm |
-              uint32_t op = (instr >> 6) & 1;
-              FpRegister d(instr, 12, 22);
-              FpRegister n(instr, 16, 7);
-              FpRegister m(instr, 0, 5);
-              opcode << (op == 0 ? "vmla" : "vmls");
+              opcode << (Q == 0 ? "vmla" : "vmls") << (S != 0 ? ".f64" : ".f32");
               args << d << ", " << n << ", " << m;
-            } else if ((op3 & 0xB) == 0xB) {  // 101x11
-              uint32_t Q = (instr >> 6) & 1;
-              if (Q == 1) {
-                // VCVT (floating-point conversion)
-                // |1111|1100|0|0|00|0000|1111|110|0|0 |0|0|0|0000|
-                // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7 |6|5|4|3  0|
-                // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
-                // |3322|2222|2|2|22|1111|1111|110|0|0 |0|0|0|0000|
-                // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7 |6|5|4|3  0|
-                // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
-                // |1110|1110|1|D|11|op5 | Vd |101|S|op|1|M|0| Vm |
-                uint32_t op5 = (instr >> 16) & 0xF;
-                uint32_t S = (instr >> 8) & 1;
-                uint32_t op = (instr >> 7) & 1;
-                // Register types in these instructions relies on the combination of op5 and S.
-                FpRegister Dd(instr, 12, 22, 1);
-                FpRegister Sd(instr, 12, 22, 0);
-                FpRegister Dm(instr, 0, 5, 1);
-                FpRegister Sm(instr, 0, 5, 0);
-                if (op5 == 0xD) {
+            } else if ((op3 & 0xB) == 0x2) {  // 100x10
+              opcode << (Q == 0 ? "vmul" : "vnmul") << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0x3) {  // 100x11
+              opcode << (Q == 0 ? "vadd" : "vsub") << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0x8 && Q == 0) {  // 101x00, Q == 0
+              opcode << "vdiv" << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0xB && Q == 0) {  // 101x11, Q == 0
+              uint32_t imm8 = ((instr & 0xf0000u) >> 12) | (instr & 0xfu);
+              opcode << "vmov" << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << (S != 0 ? StringPrintf("0x%016" PRIx64, VFPExpand64(imm8))
+                                           : StringPrintf("0x%08x", VFPExpand32(imm8)));
+              if ((instr & 0xa0) != 0) {
+                args << " (UNPREDICTABLE)";
+              }
+            } else if ((op3 & 0xB) == 0xB && Q == 1) {  // 101x11, Q == 1
+              // VNEG, VSQRT, VCMP, VCMPE, VCVT (floating-point conversion)
+              // |1111|1100|0|0|00|0000|1111|110|0|0 |0|0|0|0000|
+              // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7 |6|5|4|3  0|
+              // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
+              // |3322|2222|2|2|22|1111|1111|110|0|0 |0|0|0|0000|
+              // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7 |6|5|4|3  0|
+              // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
+              // |1110|1110|1|D|11|0000| Vd |101|S|0 |1|M|0| Vm | VMOV (reg)
+              // |1110|1110|1|D|11|0000| Vd |101|S|1 |1|M|0| Vm | VABS
+              // |1110|1110|1|D|11|0001| Vd |101|S|0 |1|M|0| Vm | VNEG
+              // |1110|1110|1|D|11|0001| Vd |101|S|1 |1|M|0| Vm | VSQRT
+              // |1110|1110|1|D|11|0100| Vd |101|S|op|1|M|0| Vm | VCMP
+              // |1110|1110|1|D|11|0101| Vd |101|S|op|1|0|0|0000| VCMPE
+              // |1110|1110|1|D|11|op5 | Vd |101|S|op|1|M|0| Vm | VCVT
+              uint32_t op5 = (instr >> 16) & 0xF;
+              uint32_t op = (instr >> 7) & 1;
+              // Register types in VCVT instructions rely on the combination of op5 and S.
+              FpRegister Dd(instr, 12, 22, 1);
+              FpRegister Sd(instr, 12, 22, 0);
+              FpRegister Dm(instr, 0, 5, 1);
+              FpRegister Sm(instr, 0, 5, 0);
+              if (op5 == 0) {
+                opcode << (op == 0 ? "vmov" : "vabs") << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+              } else if (op5 == 1) {
+                opcode << (op != 0 ? "vsqrt" : "vneg") << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+              } else if (op5 == 4) {
+                opcode << "vcmp" << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+                if (op != 0) {
+                  args << " (quiet nan)";
+                }
+              } else if (op5 == 5) {
+                opcode << "vcmpe" << (S != 0 ? ".f64" : ".f32");
+                args << d << ", #0.0";
+                if (op != 0) {
+                  args << " (quiet nan)";
+                }
+                if ((instr & 0x2f) != 0) {
+                  args << " (UNPREDICTABLE)";
+                }
+              } else if (op5 == 0xD) {
+                if (S == 1) {
+                  // vcvt{r}.s32.f64
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f64";
+                  args << Sd << ", " << Dm;
+                } else {
+                  // vcvt{r}.s32.f32
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0xC) {
+                if (S == 1) {
+                  // vcvt{r}.u32.f64
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f64";
+                  args << Sd << ", " << Dm;
+                } else {
+                  // vcvt{r}.u32.f32
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0x8) {
+                if (S == 1) {
+                  // vcvt.f64.<Tm>
+                  opcode << "vcvt.f64." << (op == 0 ? "u" : "s") << "32";
+                  args << Dd << ", " << Sm;
+                } else {
+                  // vcvt.f32.<Tm>
+                  opcode << "vcvt.f32." << (op == 0 ? "u" : "s") << "32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0x7) {
+                if (op == 1) {
                   if (S == 1) {
-                    // vcvt{r}.s32.f64
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f64";
-                    args << Sd << ", " << Dm;
-                  } else {
-                    // vcvt{r}.s32.f32
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0xC) {
-                  if (S == 1) {
-                    // vcvt{r}.u32.f64
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f64";
-                    args << Sd << ", " << Dm;
-                  } else {
-                    // vcvt{r}.u32.f32
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0x8) {
-                  if (S == 1) {
-                    // vcvt.f64.<Tm>
-                    opcode << "vcvt.f64." << (op == 0 ? "u" : "s") << "32";
+                    // vcvt.f64.f32
+                    opcode << "vcvt.f64.f32";
                     args << Dd << ", " << Sm;
                   } else {
-                    // vcvt.f32.<Tm>
-                    opcode << "vcvt.f32." << (op == 0 ? "u" : "s") << "32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0x7) {
-                  if (op == 1) {
-                    if (S == 1) {
-                      // vcvt.f64.f32
-                      opcode << "vcvt.f64.f32";
-                      args << Dd << ", " << Sm;
-                    } else {
-                      // vcvt.f32.f64
-                      opcode << "vcvt.f32.f64";
-                      args << Sd << ", " << Dm;
-                    }
+                    // vcvt.f32.f64
+                    opcode << "vcvt.f32.f64";
+                    args << Sd << ", " << Dm;
                   }
                 }
+              } else if ((op5 & 0xa) == 0xa) {
+                opcode << "vcvt";
+                args << "[undecoded: floating <-> fixed]";
               }
             }
           } else if ((op3 >> 4) == 2 && op4 == 1) {     // 10xxxx, op = 1
@@ -886,53 +998,6 @@
             }
           }
         }
-
-        if ((op3 & 0x30) == 0x20 && op4 == 0) {  // 10 xxxx ... 0
-          if ((coproc & 0xE) == 0xA) {
-            // VFP data-processing instructions
-            // |111|1|1100|0000|0000|1111|110|0|00  |0|0|0000|
-            // |5 3|2|1098|7654|3  0|54 2|10 |8|76  |5|4|3  0|
-            // |---|-|----|----|----|----|---|-|----|-|-|----|
-            // |332|2|2222|2222|1111|1111|110|0|00  |0|0|0000|
-            // |1 9|8|7654|3210|9  6|54 2|109|8|76  |5|4|3  0|
-            // |---|-|----|----|----|----|---|-|----|-|-|----|
-            // |111|T|1110|opc1|opc2|    |101| |opc3| | |    |
-            //  111 0 1110|1111 0100 1110 101 0 01   1 0 1001 - eef4ea69
-            uint32_t opc1 = (instr >> 20) & 0xF;
-            uint32_t opc2 = (instr >> 16) & 0xF;
-            uint32_t opc3 = (instr >> 6) & 0x3;
-            if ((opc1 & 0xB) == 0xB) {  // 1x11
-              // Other VFP data-processing instructions.
-              uint32_t sz = (instr >> 8) & 1;
-              FpRegister d(instr, 12, 22);
-              FpRegister m(instr, 0, 5);
-              switch (opc2) {
-                case 0x1:  // Vneg/Vsqrt
-                  //  1110 11101 D 11 0001 dddd 101s o1M0 mmmm
-                  opcode << (opc3 == 1 ? "vneg" : "vsqrt") << (sz == 1 ? ".f64" : ".f32");
-                  args << d << ", " << m;
-                  break;
-                case 0x4: case 0x5:  {  // Vector compare
-                  // 1110 11101 D 11 0100 dddd 101 sE1M0 mmmm
-                  opcode << (opc3 == 1 ? "vcmp" : "vcmpe") << (sz == 1 ? ".f64" : ".f32");
-                  args << d << ", " << m;
-                  break;
-                }
-              }
-            }
-          }
-        } else if ((op3 & 0x30) == 0x30) {  // 11 xxxx
-          // Advanced SIMD
-          if ((instr & 0xFFBF0ED0) == 0xeeb10ac0) {  // Vsqrt
-            //  1110 11101 D 11 0001 dddd 101S 11M0 mmmm
-            //  1110 11101 0 11 0001 1101 1011 1100 1000 - eeb1dbc8
-            uint32_t sz = (instr >> 8) & 1;
-            FpRegister d(instr, 12, 22);
-            FpRegister m(instr, 0, 5);
-            opcode << "vsqrt" << (sz == 1 ? ".f64" : ".f32");
-            args << d << ", " << m;
-          }
-        }
       }
       break;
     case 2:
@@ -1388,6 +1453,16 @@
       default:      // more formats
         if ((op2 >> 4) == 2) {      // 010xxxx
           // data processing (register)
+          if ((instr & 0x0080f0f0) == 0x0000f000) {
+            // LSL, LSR, ASR, ROR
+            uint32_t shift_op = (instr >> 21) & 3;
+            uint32_t S = (instr >> 20) & 1;
+            ArmRegister Rd(instr, 8);
+            ArmRegister Rn(instr, 16);
+            ArmRegister Rm(instr, 0);
+            opcode << kThumb2ShiftOperations[shift_op] << (S != 0 ? "s" : "");
+            args << Rd << ", " << Rn << ", " << Rm;
+          }
         } else if ((op2 >> 3) == 6) {       // 0110xxx
           // Multiply, multiply accumulate, and absolute difference
           op1 = (instr >> 20) & 0x7;
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index aaba598..3bbec71 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -35,7 +35,7 @@
 
 extern "C" void art_quick_throw_null_pointer_exception();
 extern "C" void art_quick_throw_stack_overflow(void*);
-extern "C" void art_quick_test_suspend();
+extern "C" void art_quick_implicit_suspend();
 
 // Get the size of a thumb2 instruction in bytes.
 static uint32_t GetInstructionSize(uint8_t* pc) {
@@ -142,7 +142,7 @@
   if (found) {
     LOG(DEBUG) << "suspend check match";
     // This is a suspend check.  Arrange for the signal handler to return to
-    // art_quick_test_suspend.  Also set LR so that after the suspend check it
+    // art_quick_implicit_suspend.  Also set LR so that after the suspend check it
     // will resume the instruction (current PC + 2).  PC points to the
     // ldr r0,[r0,#0] instruction (r0 will be 0, set by the trigger).
 
@@ -151,7 +151,7 @@
     LOG(DEBUG) << "arm lr: " << std::hex << sc->arm_lr;
     LOG(DEBUG) << "arm pc: " << std::hex << sc->arm_pc;
     sc->arm_lr = sc->arm_pc + 3;      // +2 + 1 (for thumb)
-    sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_test_suspend);
+    sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_implicit_suspend);
 
     // Now remove the suspend trigger that caused this fault.
     Thread::Current()->RemoveSuspendTrigger();
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 71dcd7f..4903732 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -888,6 +888,14 @@
     RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
 END art_quick_test_suspend
 
+ENTRY art_quick_implicit_suspend
+    mov    r0, rSELF
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME          @ save callee saves for stack crawl
+    mov    r1, sp
+    bl     artTestSuspendFromCode             @ (Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+END art_quick_implicit_suspend
+
     /*
      * Called by managed code that is attempting to call a method on a proxy class. On entry
      * r0 holds the proxy method and r1 holds the receiver; r2 and r3 may contain arguments. The
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 8ef407d..62f3593 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -17,7 +17,7 @@
 #ifndef ART_RUNTIME_ASM_SUPPORT_H_
 #define ART_RUNTIME_ASM_SUPPORT_H_
 
-#include "read_barrier.h"
+#include "read_barrier_c.h"
 
 // Value loaded into rSUSPEND for quick. When this value is counted down to zero we do a suspend
 // check.
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 6c5406e..78b7cc0 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -202,7 +202,7 @@
   // The GC can't handle an object with a null class since we can't get the size of this object.
   heap->IncrementDisableMovingGC(self);
   SirtRef<mirror::Class> java_lang_Class(self, down_cast<mirror::Class*>(
-      heap->AllocNonMovableObject<true>(self, nullptr, sizeof(mirror::ClassClass))));
+      heap->AllocNonMovableObject<true>(self, nullptr, sizeof(mirror::ClassClass), VoidFunctor())));
   CHECK(java_lang_Class.get() != NULL);
   mirror::Class::SetClassClass(java_lang_Class.get());
   java_lang_Class->SetClass(java_lang_Class.get());
@@ -1180,7 +1180,8 @@
   SirtRef<mirror::Class> dex_cache_class(self, GetClassRoot(kJavaLangDexCache));
   SirtRef<mirror::DexCache> dex_cache(
       self, down_cast<mirror::DexCache*>(
-          heap->AllocObject<true>(self, dex_cache_class.get(), dex_cache_class->GetObjectSize())));
+          heap->AllocObject<true>(self, dex_cache_class.get(), dex_cache_class->GetObjectSize(),
+                                  VoidFunctor())));
   if (dex_cache.get() == NULL) {
     return NULL;
   }
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 0c8a4f0..01ca60f 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -22,6 +22,7 @@
 #include "base/logging.h"
 #include "base/stl_util.h"
 #include "utils.h"
+#include "instruction_set.h"
 
 namespace art {
 
@@ -773,6 +774,40 @@
 
 bool ElfFile::Load(bool executable, std::string* error_msg) {
   CHECK(program_header_only_) << file_->GetPath();
+
+  if (executable) {
+    InstructionSet elf_ISA = kNone;
+    switch (GetHeader().e_machine) {
+      case EM_ARM: {
+        elf_ISA = kArm;
+        break;
+      }
+      case EM_AARCH64: {
+        elf_ISA = kArm64;
+        break;
+      }
+      case EM_386: {
+        elf_ISA = kX86;
+        break;
+      }
+      case EM_X86_64: {
+        elf_ISA = kX86_64;
+        break;
+      }
+      case EM_MIPS: {
+        elf_ISA = kMips;
+        break;
+      }
+    }
+
+    if (elf_ISA != kRuntimeISA) {
+      std::ostringstream oss;
+      oss << "Expected ISA " << kRuntimeISA << " but found " << elf_ISA;
+      *error_msg = oss.str();
+      return false;
+    }
+  }
+
   for (Elf32_Word i = 0; i < GetProgramHeaderNum(); i++) {
     Elf32_Phdr& program_header = GetProgramHeader(i);
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 9fc173a..963c3d1 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -1634,15 +1634,14 @@
 }
 
 template<InvokeType type, bool access_check>
-uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object,
-                         mirror::ArtMethod* caller_method,
-                         Thread* self, mirror::ArtMethod** sp)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object,
+                                mirror::ArtMethod* caller_method,
+                                Thread* self, mirror::ArtMethod** sp);
 
 template<InvokeType type, bool access_check>
-uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object,
-                         mirror::ArtMethod* caller_method,
-                         Thread* self, mirror::ArtMethod** sp) {
+static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object,
+                                mirror::ArtMethod* caller_method,
+                                Thread* self, mirror::ArtMethod** sp) {
   mirror::ArtMethod* method = FindMethodFast(method_idx, this_object, caller_method, access_check,
                                              type);
   if (UNLIKELY(method == nullptr)) {
@@ -1682,6 +1681,26 @@
 #endif
 }
 
+// Explicit artInvokeCommon template function declarations to please the analysis tool.
+#define EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(type, access_check)                                \
+  template SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)                                          \
+  uint64_t artInvokeCommon<type, access_check>(uint32_t method_idx,                             \
+                                               mirror::Object* this_object,                     \
+                                               mirror::ArtMethod* caller_method,                \
+                                               Thread* self, mirror::ArtMethod** sp)            \
+
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kVirtual, false);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kVirtual, true);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kInterface, false);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kInterface, true);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kDirect, false);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kDirect, true);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kStatic, false);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kStatic, true);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, false);
+EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, true);
+#undef EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL
+
 
 // See comments in runtime_support_asm.S
 extern "C" uint64_t artInvokeInterfaceTrampolineWithAccessCheck(uint32_t method_idx,
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index f5f6f16..920741f 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1997,6 +1997,8 @@
         CHECK_LE(obj_size, kLargeSizeThreshold)
             << "A run slot contains a large object " << Dump();
         CHECK_EQ(SizeToIndex(obj_size), idx)
+            << PrettyTypeOf(obj) << " "
+            << "obj_size=" << obj_size << ", idx=" << idx << " "
             << "A run slot contains an object with wrong size " << Dump();
       }
     }
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index ca2d0bd..944ef8d 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -76,7 +76,7 @@
 
 // Turn off kCheckLocks when profiling the GC since it slows the GC down by up to 40%.
 static constexpr bool kCheckLocks = kDebugLocking;
-static constexpr bool kVerifyRoots = kIsDebugBuild;
+static constexpr bool kVerifyRootsMarked = kIsDebugBuild;
 
 // If true, revoke the rosalloc thread-local buffers at the
 // checkpoint, as opposed to during the pause.
@@ -466,16 +466,17 @@
 }
 
 void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
-                                   const StackVisitor* visitor) {
-  reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor);
+                                   const StackVisitor* visitor, RootType root_type) {
+  reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor, root_type);
 }
 
-void MarkSweep::VerifyRoot(const Object* root, size_t vreg, const StackVisitor* visitor) {
+void MarkSweep::VerifyRoot(const Object* root, size_t vreg, const StackVisitor* visitor,
+                           RootType root_type) {
   // See if the root is on any space bitmap.
-  if (GetHeap()->GetLiveBitmap()->GetContinuousSpaceBitmap(root) == NULL) {
+  if (GetHeap()->GetLiveBitmap()->GetContinuousSpaceBitmap(root) == nullptr) {
     space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
     if (!large_object_space->Contains(root)) {
-      LOG(ERROR) << "Found invalid root: " << root;
+      LOG(ERROR) << "Found invalid root: " << root << " with type " << root_type;
       if (visitor != NULL) {
         LOG(ERROR) << visitor->DescribeLocation() << " in VReg: " << vreg;
       }
@@ -918,7 +919,7 @@
                                                           kVisitRootFlagStopLoggingNewRoots |
                                                           kVisitRootFlagClearRootLog));
   timings_.EndSplit();
-  if (kVerifyRoots) {
+  if (kVerifyRootsMarked) {
     timings_.StartSplit("(Paused)VerifyRoots");
     Runtime::Current()->VisitRoots(VerifyRootMarked, this);
     timings_.EndSplit();
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index f1fd546..d49e427 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -249,10 +249,10 @@
   size_t GetThreadCount(bool paused) const;
 
   static void VerifyRootCallback(const mirror::Object* root, void* arg, size_t vreg,
-                                 const StackVisitor *visitor);
+                                 const StackVisitor *visitor, RootType root_type);
 
-  void VerifyRoot(const mirror::Object* root, size_t vreg, const StackVisitor* visitor)
-      NO_THREAD_SAFETY_ANALYSIS;
+  void VerifyRoot(const mirror::Object* root, size_t vreg, const StackVisitor* visitor,
+                  RootType root_type) NO_THREAD_SAFETY_ANALYSIS;
 
   // Push a single reference on a mark stack.
   void PushOnMarkStack(mirror::Object* obj);
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 25f20d6..a06f272 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -65,7 +65,7 @@
       bool after_is_current_allocator = allocator == GetCurrentAllocator();
       if (is_current_allocator && !after_is_current_allocator) {
         // If the allocator changed, we need to restart the allocation.
-        return AllocObject<kInstrumented>(self, klass, byte_count);
+        return AllocObject<kInstrumented>(self, klass, byte_count, pre_fence_visitor);
       }
       return nullptr;
     }
@@ -111,7 +111,7 @@
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
   if (AllocatorHasAllocationStack(allocator)) {
-    PushOnAllocationStack(self, obj);
+    PushOnAllocationStack(self, &obj);
   }
   if (kInstrumented) {
     if (Dbg::IsAllocTrackingEnabled()) {
@@ -135,28 +135,34 @@
 // The size of a thread-local allocation stack in the number of references.
 static constexpr size_t kThreadLocalAllocationStackSize = 128;
 
-inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object** obj) {
   if (kUseThreadLocalAllocationStack) {
-    bool success = self->PushOnThreadLocalAllocationStack(obj);
+    bool success = self->PushOnThreadLocalAllocationStack(*obj);
     if (UNLIKELY(!success)) {
       // Slow path. Allocate a new thread-local allocation stack.
       mirror::Object** start_address;
       mirror::Object** end_address;
       while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
                                                 &start_address, &end_address)) {
+        // Disable verify object in SirtRef as obj isn't on the alloc stack yet.
+        SirtRefNoVerify<mirror::Object> ref(self, *obj);
         CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+        *obj = ref.get();
       }
       self->SetThreadLocalAllocationStack(start_address, end_address);
       // Retry on the new thread-local allocation stack.
-      success = self->PushOnThreadLocalAllocationStack(obj);
+      success = self->PushOnThreadLocalAllocationStack(*obj);
       // Must succeed.
       CHECK(success);
     }
   } else {
     // This is safe to do since the GC will never free objects which are neither in the allocation
     // stack nor the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(obj)) {
+    while (!allocation_stack_->AtomicPushBack(*obj)) {
+      // Disable verify object in SirtRef as obj isn't on the alloc stack yet.
+      SirtRefNoVerify<mirror::Object> ref(self, *obj);
       CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+      *obj = ref.get();
     }
   }
 }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 5879757..a8989ec 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -115,6 +115,8 @@
 };
 std::ostream& operator<<(std::ostream& os, const ProcessState& process_state);
 
+std::ostream& operator<<(std::ostream& os, const RootType& root_type);
+
 class Heap {
  public:
   // If true, measure the total allocation time.
@@ -158,28 +160,28 @@
   ~Heap();
 
   // Allocates and initializes storage for an object instance.
-  template <bool kInstrumented, typename PreFenceVisitor = VoidFunctor>
+  template <bool kInstrumented, typename PreFenceVisitor>
   mirror::Object* AllocObject(Thread* self, mirror::Class* klass, size_t num_bytes,
-                              const PreFenceVisitor& pre_fence_visitor = VoidFunctor())
+                              const PreFenceVisitor& pre_fence_visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return AllocObjectWithAllocator<kInstrumented, true>(self, klass, num_bytes,
                                                          GetCurrentAllocator(),
                                                          pre_fence_visitor);
   }
 
-  template <bool kInstrumented, typename PreFenceVisitor = VoidFunctor>
+  template <bool kInstrumented, typename PreFenceVisitor>
   mirror::Object* AllocNonMovableObject(Thread* self, mirror::Class* klass, size_t num_bytes,
-                                        const PreFenceVisitor& pre_fence_visitor = VoidFunctor())
+                                        const PreFenceVisitor& pre_fence_visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return AllocObjectWithAllocator<kInstrumented, true>(self, klass, num_bytes,
                                                          GetCurrentNonMovingAllocator(),
                                                          pre_fence_visitor);
   }
 
-  template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor = VoidFunctor>
+  template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor>
   ALWAYS_INLINE mirror::Object* AllocObjectWithAllocator(
       Thread* self, mirror::Class* klass, size_t byte_count, AllocatorType allocator,
-      const PreFenceVisitor& pre_fence_visitor = VoidFunctor())
+      const PreFenceVisitor& pre_fence_visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   AllocatorType GetCurrentAllocator() const {
@@ -691,7 +693,8 @@
   void SignalHeapTrimDaemon(Thread* self);
 
   // Push an object onto the allocation stack.
-  void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+  void PushOnAllocationStack(Thread* self, mirror::Object** obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // What kind of concurrency behavior is the runtime after? Currently true for concurrent mark
   // sweep GC, false for other GC types.
diff --git a/runtime/gc/space/space_test.h b/runtime/gc/space/space_test.h
index 5c735df..9896a48 100644
--- a/runtime/gc/space/space_test.h
+++ b/runtime/gc/space/space_test.h
@@ -85,8 +85,13 @@
     EXPECT_GE(size, SizeOfZeroLengthByteArray());
     EXPECT_TRUE(byte_array_class != nullptr);
     o->SetClass(byte_array_class);
-    if (kUseBrooksReadBarrier) {
-      o->SetReadBarrierPointer(o);
+    if (kUseBakerOrBrooksReadBarrier) {
+      // Like the proper heap object allocation, install and verify
+      // the correct read barrier pointer.
+      if (kUseBrooksReadBarrier) {
+        o->SetReadBarrierPointer(o);
+      }
+      o->AssertReadBarrierPointer();
     }
     mirror::Array* arr = o->AsArray<kVerifyNone>();
     size_t header_size = SizeOfZeroLengthByteArray();
diff --git a/runtime/globals.h b/runtime/globals.h
index ee8dc07..7e85231 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -19,7 +19,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include "read_barrier.h"
+#include "read_barrier_c.h"
 
 namespace art {
 
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index c5a4ec8..f4eecfc 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -35,6 +35,20 @@
 };
 std::ostream& operator<<(std::ostream& os, const InstructionSet& rhs);
 
+#if defined(__arm__)
+static constexpr InstructionSet kRuntimeISA = kArm;
+#elif defined(__aarch64__)
+static constexpr InstructionSet kRuntimeISA = kArm64;
+#elif defined(__mips__)
+static constexpr InstructionSet kRuntimeISA = kMips;
+#elif defined(__i386__)
+static constexpr InstructionSet kRuntimeISA = kX86;
+#elif defined(__x86_64__)
+static constexpr InstructionSet kRuntimeISA = kX86_64;
+#else
+static constexpr InstructionSet kRuntimeISA = kNone;
+#endif
+
 enum InstructionFeatures {
   kHwDiv = 1                  // Supports hardware divide.
 };
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 89d9241..025e62a 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -442,7 +442,14 @@
 }
 
 inline void Class::CheckObjectAlloc() {
-  DCHECK(!IsArrayClass()) << PrettyClass(this);
+  DCHECK(!IsArrayClass())
+      << PrettyClass(this)
+      << "A array shouldn't be allocated through this "
+      << "as it requires a pre-fence visitor that sets the class size.";
+  DCHECK(!IsClassClass())
+      << PrettyClass(this)
+      << "A class object shouldn't be allocated through this "
+      << "as it requires a pre-fence visitor that sets the class size.";
   DCHECK(IsInstantiable()) << PrettyClass(this);
   // TODO: decide whether we want this check. It currently fails during bootstrap.
   // DCHECK(!Runtime::Current()->IsStarted() || IsInitializing()) << PrettyClass(this);
@@ -454,7 +461,7 @@
   CheckObjectAlloc();
   gc::Heap* heap = Runtime::Current()->GetHeap();
   return heap->AllocObjectWithAllocator<kIsInstrumented, false>(self, this, this->object_size_,
-                                                                allocator_type);
+                                                                allocator_type, VoidFunctor());
 }
 
 inline Object* Class::AllocObject(Thread* self) {
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index b6c140d..a6db387 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -26,6 +26,7 @@
 #include "class.h"
 #include "lock_word-inl.h"
 #include "monitor.h"
+#include "read_barrier-inl.h"
 #include "runtime.h"
 #include "reference.h"
 #include "throwable.h"
@@ -96,7 +97,7 @@
 inline Object* Object::GetReadBarrierPointer() {
 #ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
   DCHECK(kUseBakerOrBrooksReadBarrier);
-  return GetFieldObject<Object, kVerifyNone>(OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_), false);
+  return GetFieldObject<Object, kVerifyNone, false>(OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_), false);
 #else
   LOG(FATAL) << "Unreachable";
   return nullptr;
@@ -116,21 +117,19 @@
 }
 
 inline void Object::AssertReadBarrierPointer() const {
-#if defined(USE_BAKER_READ_BARRIER)
-  DCHECK(kUseBakerReadBarrier);
-  Object* obj = const_cast<Object*>(this);
-  DCHECK(obj->GetReadBarrierPointer() == nullptr)
-      << "Bad Baker pointer: obj=" << reinterpret_cast<void*>(obj)
-      << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
-#elif defined(USE_BROOKS_READ_BARRIER)
-  DCHECK(kUseBrooksReadBarrier);
-  Object* obj = const_cast<Object*>(this);
-  DCHECK_EQ(obj, obj->GetReadBarrierPointer())
-      << "Bad Brooks pointer: obj=" << reinterpret_cast<void*>(obj)
-      << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
-#else
-  LOG(FATAL) << "Unreachable";
-#endif
+  if (kUseBakerReadBarrier) {
+    Object* obj = const_cast<Object*>(this);
+    DCHECK(obj->GetReadBarrierPointer() == nullptr)
+        << "Bad Baker pointer: obj=" << reinterpret_cast<void*>(obj)
+        << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
+  } else if (kUseBrooksReadBarrier) {
+    Object* obj = const_cast<Object*>(this);
+    DCHECK_EQ(obj, obj->GetReadBarrierPointer())
+        << "Bad Brooks pointer: obj=" << reinterpret_cast<void*>(obj)
+        << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
+  } else {
+    LOG(FATAL) << "Unreachable";
+  }
 }
 
 template<VerifyObjectFlags kVerifyFlags>
@@ -470,19 +469,17 @@
   return QuasiAtomic::Cas64(old_value, new_value, addr);
 }
 
-template<class T, VerifyObjectFlags kVerifyFlags>
+template<class T, VerifyObjectFlags kVerifyFlags, bool kDoReadBarrier>
 inline T* Object::GetFieldObject(MemberOffset field_offset, bool is_volatile) {
   if (kVerifyFlags & kVerifyThis) {
     VerifyObject(this);
   }
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   HeapReference<T>* objref_addr = reinterpret_cast<HeapReference<T>*>(raw_addr);
-  HeapReference<T> objref = *objref_addr;
-
+  T* result = ReadBarrier::Barrier<T, kDoReadBarrier>(this, field_offset, objref_addr);
   if (UNLIKELY(is_volatile)) {
     QuasiAtomic::MembarLoadLoad();  // Ensure loads don't re-order.
   }
-  T* result = objref.AsMirrorPtr();
   if (kVerifyFlags & kVerifyReads) {
     VerifyObject(result);
   }
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index f1485e5..d9155f5 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -66,6 +66,26 @@
   return dest;
 }
 
+// An allocation pre-fence visitor that copies the object.
+class CopyObjectVisitor {
+ public:
+  explicit CopyObjectVisitor(Thread* self, SirtRef<Object>* orig, size_t num_bytes)
+      : self_(self), orig_(orig), num_bytes_(num_bytes) {
+  }
+
+  void operator()(Object* obj, size_t usable_size) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    UNUSED(usable_size);
+    CopyObject(self_, obj, orig_->get(), num_bytes_);
+  }
+
+ private:
+  Thread* const self_;
+  SirtRef<Object>* const orig_;
+  const size_t num_bytes_;
+  DISALLOW_COPY_AND_ASSIGN(CopyObjectVisitor);
+};
+
 Object* Object::Clone(Thread* self) {
   CHECK(!IsClass()) << "Can't clone classes.";
   // Object::SizeOf gets the right size even if we're an array. Using c->AllocObject() here would
@@ -74,13 +94,11 @@
   size_t num_bytes = SizeOf();
   SirtRef<Object> this_object(self, this);
   Object* copy;
+  CopyObjectVisitor visitor(self, &this_object, num_bytes);
   if (heap->IsMovableObject(this)) {
-    copy = heap->AllocObject<true>(self, GetClass(), num_bytes);
+    copy = heap->AllocObject<true>(self, GetClass(), num_bytes, visitor);
   } else {
-    copy = heap->AllocNonMovableObject<true>(self, GetClass(), num_bytes);
-  }
-  if (LIKELY(copy != nullptr)) {
-    return CopyObject(self, copy, this_object.get(), num_bytes);
+    copy = heap->AllocNonMovableObject<true>(self, GetClass(), num_bytes, visitor);
   }
   return copy;
 }
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 1ac23ce..f652202 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -185,7 +185,7 @@
   bool IsPhantomReferenceInstance() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Accessor for Java type fields.
-  template<class T, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<class T, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags, bool kDoReadBarrier = true>
   T* GetFieldObject(MemberOffset field_offset, bool is_volatile)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   template<bool kTransactionActive, bool kCheckTransaction = true,
diff --git a/runtime/object_callbacks.h b/runtime/object_callbacks.h
index 89ee34e..9198c90 100644
--- a/runtime/object_callbacks.h
+++ b/runtime/object_callbacks.h
@@ -56,7 +56,7 @@
     __attribute__((warn_unused_result));
 // A callback for verifying roots.
 typedef void (VerifyRootCallback)(const mirror::Object* root, void* arg, size_t vreg,
-    const StackVisitor* visitor);
+    const StackVisitor* visitor, RootType root_type);
 
 typedef void (MarkHeapReferenceCallback)(mirror::HeapReference<mirror::Object>* ref, void* arg);
 
diff --git a/runtime/quick/inline_method_analyser.cc b/runtime/quick/inline_method_analyser.cc
index a9072d8..8bd8dba 100644
--- a/runtime/quick/inline_method_analyser.cc
+++ b/runtime/quick/inline_method_analyser.cc
@@ -135,6 +135,12 @@
   }
 }
 
+bool InlineMethodAnalyser::IsSyntheticAccessor(MethodReference ref) {
+  const DexFile::MethodId& method_id = ref.dex_file->GetMethodId(ref.dex_method_index);
+  const char* method_name = ref.dex_file->GetMethodName(method_id);
+  return strncmp(method_name, "access$", strlen("access$")) == 0;
+}
+
 bool InlineMethodAnalyser::AnalyseReturnMethod(const DexFile::CodeItem* code_item,
                                                InlineMethod* result) {
   const Instruction* return_instruction = Instruction::At(code_item->insns_);
@@ -218,13 +224,24 @@
   uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
   DCHECK_GE(object_reg, arg_start);
   DCHECK_LT(object_reg, code_item->registers_size_);
+  uint32_t object_arg = object_reg - arg_start;
+
   DCHECK_LT(opcode == Instruction::IGET_WIDE ? dst_reg + 1 : dst_reg, code_item->registers_size_);
   if (dst_reg != return_reg) {
     return false;  // Not returning the value retrieved by IGET?
   }
 
-  if ((verifier->GetAccessFlags() & kAccStatic) != 0 || object_reg != arg_start) {
-    // TODO: Support inlining IGET on other register than "this".
+  if ((verifier->GetAccessFlags() & kAccStatic) != 0u || object_arg != 0u) {
+    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!IsSyntheticAccessor(verifier->GetMethodReference())) {
+      return false;
+    }
+  }
+
+  // InlineIGetIPutData::object_arg is only 4 bits wide.
+  static constexpr uint16_t kMaxObjectArg = 15u;
+  if (object_arg > kMaxObjectArg) {
     return false;
   }
 
@@ -236,10 +253,10 @@
     result->opcode = kInlineOpIGet;
     result->flags = kInlineSpecial;
     data->op_variant = IGetVariant(opcode);
-    data->object_arg = object_reg - arg_start;  // Allow IGET on any register, not just "this".
-    data->src_arg = 0;
-    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0;
-    data->reserved = 0;
+    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0u ? 1u : 0u;
+    data->object_arg = object_arg;  // Allow IGET on any register, not just "this".
+    data->src_arg = 0u;
+    data->return_arg_plus1 = 0u;
   }
   return true;
 }
@@ -253,26 +270,45 @@
 
   const Instruction* return_instruction = instruction->Next();
   Instruction::Code return_opcode = return_instruction->Opcode();
+  uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
+  uint16_t return_arg_plus1 = 0u;
   if (return_opcode != Instruction::RETURN_VOID) {
-    // TODO: Support returning an argument.
-    // This is needed by builder classes and generated accessor setters.
-    //    builder.setX(value): iput value, this, fieldX; return-object this;
-    //    object.access$nnn(value): iput value, this, fieldX; return value;
-    // Use InlineIGetIPutData::reserved to hold the information.
-    return false;
+    if (return_opcode != Instruction::RETURN &&
+        return_opcode != Instruction::RETURN_OBJECT &&
+        return_opcode != Instruction::RETURN_WIDE) {
+      return false;
+    }
+    // Returning an argument.
+    uint32_t return_reg = return_instruction->VRegA_11x();
+    DCHECK_GE(return_reg, arg_start);
+    DCHECK_LT(return_opcode == Instruction::RETURN_WIDE ? return_reg + 1u : return_reg,
+              code_item->registers_size_);
+    return_arg_plus1 = return_reg - arg_start + 1u;
   }
 
   uint32_t src_reg = instruction->VRegA_22c();
   uint32_t object_reg = instruction->VRegB_22c();
   uint32_t field_idx = instruction->VRegC_22c();
-  uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
   DCHECK_GE(object_reg, arg_start);
   DCHECK_LT(object_reg, code_item->registers_size_);
   DCHECK_GE(src_reg, arg_start);
   DCHECK_LT(opcode == Instruction::IPUT_WIDE ? src_reg + 1 : src_reg, code_item->registers_size_);
+  uint32_t object_arg = object_reg - arg_start;
+  uint32_t src_arg = src_reg - arg_start;
 
-  if ((verifier->GetAccessFlags() & kAccStatic) != 0 || object_reg != arg_start) {
-    // TODO: Support inlining IPUT on other register than "this".
+  if ((verifier->GetAccessFlags() & kAccStatic) != 0u || object_arg != 0u) {
+    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!IsSyntheticAccessor(verifier->GetMethodReference())) {
+      return false;
+    }
+  }
+
+  // InlineIGetIPutData::object_arg/src_arg/return_arg_plus1 are each only 4 bits wide.
+  static constexpr uint16_t kMaxObjectArg = 15u;
+  static constexpr uint16_t kMaxSrcArg = 15u;
+  static constexpr uint16_t kMaxReturnArgPlus1 = 15u;
+  if (object_arg > kMaxObjectArg || src_arg > kMaxSrcArg || return_arg_plus1 > kMaxReturnArgPlus1) {
     return false;
   }
 
@@ -284,10 +320,10 @@
     result->opcode = kInlineOpIPut;
     result->flags = kInlineSpecial;
     data->op_variant = IPutVariant(opcode);
-    data->object_arg = object_reg - arg_start;  // Allow IPUT on any register, not just "this".
-    data->src_arg = src_reg - arg_start;
-    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0;
-    data->reserved = 0;
+    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0u ? 1u : 0u;
+    data->object_arg = object_arg;  // Allow IPUT on any register, not just "this".
+    data->src_arg = src_arg;
+    data->return_arg_plus1 = return_arg_plus1;
   }
   return true;
 }
diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h
index 8e1a408..ddee89b 100644
--- a/runtime/quick/inline_method_analyser.h
+++ b/runtime/quick/inline_method_analyser.h
@@ -21,6 +21,7 @@
 #include "base/mutex.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "method_reference.h"
 
 /*
  * NOTE: This code is part of the quick compiler. It lives in the runtime
@@ -98,10 +99,10 @@
   // opcode-Instruction::IPUT for IPUTs. This is because the runtime
   // doesn't know the OpSize enumeration.
   uint16_t op_variant : 3;
+  uint16_t method_is_static : 1;
   uint16_t object_arg : 4;
   uint16_t src_arg : 4;  // iput only
-  uint16_t method_is_static : 1;
-  uint16_t reserved : 4;
+  uint16_t return_arg_plus1 : 4;  // iput only, method argument to return + 1, 0 = return void.
   uint16_t field_idx;
   uint32_t is_volatile : 1;
   uint32_t field_offset : 31;
@@ -156,6 +157,9 @@
     return opcode - Instruction::IPUT;
   }
 
+  // Determines whether the method is a synthetic accessor (method name starts with "access$").
+  static bool IsSyntheticAccessor(MethodReference ref);
+
  private:
   static bool AnalyseReturnMethod(const DexFile::CodeItem* code_item, InlineMethod* result);
   static bool AnalyseConstMethod(const DexFile::CodeItem* code_item, InlineMethod* result);
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
new file mode 100644
index 0000000..ea2f830
--- /dev/null
+++ b/runtime/read_barrier-inl.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_READ_BARRIER_INL_H_
+#define ART_RUNTIME_READ_BARRIER_INL_H_
+
+#include "read_barrier.h"
+
+#include "mirror/object_reference.h"
+
+namespace art {
+
+template <typename MirrorType, bool kDoReadBarrier>
+inline MirrorType* ReadBarrier::Barrier(
+    mirror::Object* obj, MemberOffset offset, mirror::HeapReference<MirrorType>* ref_addr) {
+  // Unused for now.
+  UNUSED(obj);
+  UNUSED(offset);
+  UNUSED(ref_addr);
+  if (kDoReadBarrier && kUseBakerReadBarrier) {
+    // To be implemented.
+    return ref_addr->AsMirrorPtr();
+  } else if (kDoReadBarrier && kUseBrooksReadBarrier) {
+    // To be implemented.
+    return ref_addr->AsMirrorPtr();
+  } else {
+    // No read barrier.
+    return ref_addr->AsMirrorPtr();
+  }
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_READ_BARRIER_INL_H_
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index ba0d830..6f59004 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -17,21 +17,28 @@
 #ifndef ART_RUNTIME_READ_BARRIER_H_
 #define ART_RUNTIME_READ_BARRIER_H_
 
-// This is in a separate file (from globals.h) because asm_support.h
-// (a C header, not C++) can't include globals.h.
+#include "base/mutex.h"
+#include "base/macros.h"
+#include "offsets.h"
+#include "read_barrier_c.h"
 
-// Uncomment one of the following two and the two fields in
-// Object.java (libcore) to enable baker or brooks pointers.
+// This is a C++ (not C) header file, separate from read_barrier_c.h
+// which needs to be a C header file for asm_support.h.
 
-// #define USE_BAKER_READ_BARRIER
-// #define USE_BROOKS_READ_BARRIER
+namespace art {
+namespace mirror {
+  class Object;
+  template<typename MirrorType> class HeapReference;
+}  // namespace mirror
 
-#if defined(USE_BAKER_READ_BARRIER) || defined(USE_BROOKS_READ_BARRIER)
-#define USE_BAKER_OR_BROOKS_READ_BARRIER
-#endif
+class ReadBarrier {
+ public:
+  template <typename MirrorType, bool kDoReadBarrier = true>
+  ALWAYS_INLINE static MirrorType* Barrier(
+      mirror::Object* obj, MemberOffset offset, mirror::HeapReference<MirrorType>* ref_addr)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+};
 
-#if defined(USE_BAKER_READ_BARRIER) && defined(USE_BROOKS_READ_BARRIER)
-#error "Only one of Baker or Brooks can be enabled at a time."
-#endif
+}  // namespace art
 
 #endif  // ART_RUNTIME_READ_BARRIER_H_
diff --git a/runtime/read_barrier_c.h b/runtime/read_barrier_c.h
new file mode 100644
index 0000000..f4af61f
--- /dev/null
+++ b/runtime/read_barrier_c.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_READ_BARRIER_C_H_
+#define ART_RUNTIME_READ_BARRIER_C_H_
+
+// This is a C (not C++) header file and is in a separate file (from
+// globals.h) because asm_support.h is a C header file and can't
+// include globals.h.
+
+// Uncomment one of the following two and the two fields in
+// Object.java (libcore) to enable baker or brooks pointers.
+
+// #define USE_BAKER_READ_BARRIER
+// #define USE_BROOKS_READ_BARRIER
+
+#if defined(USE_BAKER_READ_BARRIER) || defined(USE_BROOKS_READ_BARRIER)
+#define USE_BAKER_OR_BROOKS_READ_BARRIER
+#endif
+
+#if defined(USE_BAKER_READ_BARRIER) && defined(USE_BROOKS_READ_BARRIER)
+#error "Only one of Baker or Brooks can be enabled at a time."
+#endif
+
+#endif  // ART_RUNTIME_READ_BARRIER_C_H_
diff --git a/runtime/sirt_ref-inl.h b/runtime/sirt_ref-inl.h
index 7f2d847..7de624a 100644
--- a/runtime/sirt_ref-inl.h
+++ b/runtime/sirt_ref-inl.h
@@ -23,8 +23,11 @@
 
 namespace art {
 
-template<class T> inline SirtRef<T>::SirtRef(Thread* self, T* object) : self_(self), sirt_(object) {
-  VerifyObject(object);
+template<class T> inline SirtRef<T>::SirtRef(Thread* self, T* object, bool should_verify)
+  : self_(self), sirt_(object) {
+  if (should_verify) {
+    VerifyObject(object);
+  }
   self_->PushSirt(&sirt_);
 }
 
@@ -33,8 +36,10 @@
   DCHECK_EQ(top_sirt, &sirt_);
 }
 
-template<class T> inline T* SirtRef<T>::reset(T* object) {
-  VerifyObject(object);
+template<class T> inline T* SirtRef<T>::reset(T* object, bool should_verify) {
+  if (should_verify) {
+    VerifyObject(object);
+  }
   T* old_ref = get();
   sirt_.SetReference(0, object);
   return old_ref;
diff --git a/runtime/sirt_ref.h b/runtime/sirt_ref.h
index 2226e17..cf23891 100644
--- a/runtime/sirt_ref.h
+++ b/runtime/sirt_ref.h
@@ -28,7 +28,7 @@
 template<class T>
 class SirtRef {
  public:
-  SirtRef(Thread* self, T* object);
+  SirtRef(Thread* self, T* object, bool should_verify = true);
   ~SirtRef();
 
   T& operator*() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -42,7 +42,8 @@
   }
 
   // Returns the old reference.
-  T* reset(T* object = nullptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  T* reset(T* object = nullptr, bool should_verify = true)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
   Thread* const self_;
@@ -51,6 +52,17 @@
   DISALLOW_COPY_AND_ASSIGN(SirtRef);
 };
 
+// A version of SirtRef which disables object verification.
+template<class T>
+class SirtRefNoVerify : public SirtRef<T> {
+ public:
+  SirtRefNoVerify(Thread* self, T* object) : SirtRef<T>(self, object, false) {}
+  // Returns the old reference.
+  T* reset(T* object = nullptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return SirtRef<T>::reset(object, false);
+  }
+};
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_SIRT_REF_H_
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 7de9433..8dad419 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -823,9 +823,9 @@
 };
 
 static void VerifyRootWrapperCallback(mirror::Object** root, void* arg, uint32_t /*thread_id*/,
-                                      RootType /*root_type*/) {
+                                      RootType root_type) {
   VerifyRootWrapperArg* wrapperArg = reinterpret_cast<VerifyRootWrapperArg*>(arg);
-  wrapperArg->callback_(*root, wrapperArg->arg_, 0, NULL);
+  wrapperArg->callback_(*root, wrapperArg->arg_, 0, NULL, root_type);
 }
 
 void ThreadList::VerifyRoots(VerifyRootCallback* callback, void* arg) const {
diff --git a/test/401-optimizing-compiler/expected.txt b/test/401-optimizing-compiler/expected.txt
index 268da55..a65e544 100644
--- a/test/401-optimizing-compiler/expected.txt
+++ b/test/401-optimizing-compiler/expected.txt
@@ -4,3 +4,6 @@
 In static method with 7 args 1 2 3 4 5 6 7
 Forced GC
 java.lang.Error: Error
+Forced GC
+In static method with object arg class java.lang.Object
+Forced GC
diff --git a/test/401-optimizing-compiler/src/Main.java b/test/401-optimizing-compiler/src/Main.java
index 4031ff1..aa08137 100644
--- a/test/401-optimizing-compiler/src/Main.java
+++ b/test/401-optimizing-compiler/src/Main.java
@@ -26,6 +26,8 @@
       error = e;
     }
     System.out.println(error);
+
+    $opt$TestInvokeNew();
   }
 
   public static void $opt$TestInvokeStatic() {
@@ -37,6 +39,13 @@
     throwStaticMethod();
   }
 
+  public static void $opt$TestInvokeNew() {
+    Object o = new Object();
+    forceGCStaticMethod();
+    printStaticMethodWithObjectArg(o);
+    forceGCStaticMethod();
+  }
+
   public static void printStaticMethod() {
     System.out.println("In static method");
   }
@@ -55,6 +64,10 @@
         + a + " " + b + " " + c + " " + d + " " + e + " " + f + " " + g);
   }
 
+  public static void printStaticMethodWithObjectArg(Object a) {
+    System.out.println("In static method with object arg " + a.getClass());
+  }
+
   public static void forceGCStaticMethod() {
     Runtime.getRuntime().gc();
     Runtime.getRuntime().gc();