Merge "Move quick frame info to OatQuickMethodHeader."
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 6135571..188ddb5 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -110,6 +110,12 @@
 DALVIKVM_FLAGS := -Xcompiler-option --compiler-backend=Optimizing
 endif
 
+#
+# Used to change the default GC. Valid values are CMS, SS, GSS. The default is CMS.
+#
+ART_DEFAULT_GC_TYPE ?= CMS
+ART_DEFAULT_GC_TYPE_CFLAGS := -DART_DEFAULT_GC_TYPE_IS_$(ART_DEFAULT_GC_TYPE)
+
 LLVM_ROOT_PATH := external/llvm
 # Don't fail a dalvik minimal host build.
 -include $(LLVM_ROOT_PATH)/llvm.mk
@@ -237,6 +243,7 @@
 
 ART_HOST_CFLAGS := $(art_cflags) -DANDROID_SMP=1 -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS)
 ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default
+ART_HOST_CFLAGS += $(ART_DEFAULT_GC_TYPE_CFLAGS)
 
 ART_TARGET_CFLAGS := $(art_cflags) -DART_TARGET -DART_BASE_ADDRESS=$(LIBART_IMG_TARGET_BASE_ADDRESS)
 ifeq ($(TARGET_CPU_SMP),true)
@@ -244,6 +251,7 @@
 else
   ART_TARGET_CFLAGS += -DANDROID_SMP=0
 endif
+ART_TARGET_CFLAGS += $(ART_DEFAULT_GC_TYPE_CFLAGS)
 
 # DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES is set in ../build/core/dex_preopt.mk based on
 # the TARGET_CPU_VARIANT
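
A note on consuming the new knob: the hunks above only wire ART_DEFAULT_GC_TYPE through the build as -DART_DEFAULT_GC_TYPE_IS_<value>; runtime code is expected to branch on that symbol when picking the default collector. Below is a minimal standalone sketch of how such a define is typically consumed; the enum and function names are illustrative, not ART's actual selection code.

// Sketch only: compile with e.g. g++ -std=c++11 -DART_DEFAULT_GC_TYPE_IS_CMS gc_default.cc
#include <cstdio>

enum CollectorType { kCollectorTypeCMS, kCollectorTypeSS, kCollectorTypeGSS };

// Illustrative stand-in for wherever the runtime picks its default GC.
static CollectorType DefaultCollectorType() {
#if defined(ART_DEFAULT_GC_TYPE_IS_CMS)
  return kCollectorTypeCMS;
#elif defined(ART_DEFAULT_GC_TYPE_IS_SS)
  return kCollectorTypeSS;
#elif defined(ART_DEFAULT_GC_TYPE_IS_GSS)
  return kCollectorTypeGSS;
#else
#error "No default GC type configured"
#endif
}

int main() {
  std::printf("default collector enum value: %d\n",
              static_cast<int>(DefaultCollectorType()));
  return 0;
}

Passing ART_DEFAULT_GC_TYPE=SS on the make command line would flip the branch taken above without touching any source.
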
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index d4e2cbb..c986c57 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -78,6 +78,7 @@
 	compiler/oat_test.cc \
 	compiler/optimizing/codegen_test.cc \
 	compiler/optimizing/dominator_test.cc \
+	compiler/optimizing/find_loops_test.cc \
 	compiler/optimizing/liveness_test.cc \
 	compiler/optimizing/pretty_printer_test.cc \
 	compiler/optimizing/ssa_test.cc \
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 56f4830..6696cf7 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -31,7 +31,8 @@
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset<4> offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE;
+    RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE;
     LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -174,12 +175,14 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) OVERRIDE;
+    LIR* OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) OVERRIDE;
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset<4> offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val) OVERRIDE;
+    void OpTlsCmp(ThreadOffset<8> offset, int val) OVERRIDE;
 
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
     LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
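
The shape repeated across these backend headers: both ThreadOffset widths become part of the Mir2Lir contract, every backend overrides both, and the width a target cannot encode fails loudly instead of miscompiling. A simplified standalone model of that shape, using stand-in names rather than the real compiler types:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for art::ThreadOffset<pointer_size>.
template <size_t kPointerSize>
class ThreadOffset {
 public:
  explicit ThreadOffset(int32_t value) : value_(value) {}
  int32_t Int32Value() const { return value_; }
 private:
  int32_t value_;
};

// Stand-in for the Mir2Lir interface: both widths are pure virtual, so a
// backend cannot silently forget one of them.
class BackendModel {
 public:
  virtual ~BackendModel() {}
  virtual void OpThreadMem(ThreadOffset<4> offset) = 0;
  virtual void OpThreadMem(ThreadOffset<8> offset) = 0;
};

// A 32-bit backend implements the 4-byte overload and hard-fails on the
// 8-byte one, mirroring the UNIMPLEMENTED(FATAL) stubs in this patch.
class Arm32Backend : public BackendModel {
 public:
  void OpThreadMem(ThreadOffset<4> offset) override {
    std::printf("emit op against Thread+%d\n", offset.Int32Value());
  }
  void OpThreadMem(ThreadOffset<8> /*offset*/) override {
    std::fprintf(stderr, "Should not be called on a 32-bit target.\n");
    std::abort();
  }
};

int main() {
  Arm32Backend backend;
  backend.OpThreadMem(ThreadOffset<4>(120));    // selects the 32-bit overload
  // backend.OpThreadMem(ThreadOffset<8>(240));  // would abort at runtime
  return 0;
}
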
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 0948ce3..2e0e559 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -725,6 +725,10 @@
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
 }
 
+void ArmMir2Lir::OpTlsCmp(ThreadOffset<8> offset, int val) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+}
+
 bool ArmMir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
   DCHECK_EQ(cu_->instruction_set, kThumb2);
   // Unused - RegLocation rl_src_unsafe = info->args[0];
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index d0c81d5..8cf1f86 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -738,6 +738,11 @@
   return rs_rARM_LR;
 }
 
+RegStorage ArmMir2Lir::LoadHelper(ThreadOffset<8> offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return RegStorage::InvalidReg();
+}
+
 LIR* ArmMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = rs_r0;
   Load32Disp(rs_rARM_SELF, Thread::ThreadSuspendTriggerOffset<4>().Int32Value(), tmp);
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 1afd890..fe18ed9 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -1161,6 +1161,11 @@
   return NULL;
 }
 
+LIR* ArmMir2Lir::OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return nullptr;
+}
+
 LIR* ArmMir2Lir::OpMem(OpKind op, RegStorage r_base, int disp) {
   LOG(FATAL) << "Unexpected use of OpMem for Arm";
   return NULL;
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 452c8d7..7ae4b02 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -95,25 +95,6 @@
  * +========================+
  */
 
-#if 1
-#define A64_PTR_SIZE 4
-#define A64_GET_INT_OFFS(offs) ((offs).Int32Value())
-#else
-// Not yet ready for this.
-#define A64_PTR_SIZE 8
-#define A64_GET_INT_OFFS(offs) ((offs).Int32Value())
-#endif
-
-#define A64_QUICK_ENTRYPOINT_OFFSET(name) QUICK_ENTRYPOINT_OFFSET(A64_PTR_SIZE, name)
-#define A64_QUICK_ENTRYPOINT_INT_OFFS(name) A64_GET_INT_OFFS(A64_QUICK_ENTRYPOINT_OFFSET(name))
-#define A64_THREAD_THIN_LOCK_ID_OFFSET A64_GET_INT_OFFS(Thread::ThinLockIdOffset<A64_PTR_SIZE>())
-#define A64_THREAD_EXCEPTION_INT_OFFS A64_GET_INT_OFFS(Thread::ExceptionOffset<A64_PTR_SIZE>())
-#define A64_THREAD_CARD_TABLE_INT_OFFS A64_GET_INT_OFFS(Thread::CardTableOffset<A64_PTR_SIZE>())
-#define A64_THREAD_STACK_END_INT_OFFS A64_GET_INT_OFFS(Thread::StackEndOffset<A64_PTR_SIZE>())
-#define A64_THREAD_SUSPEND_TRIGGER_OFFSET \
-  A64_GET_INT_OFFS(Thread::ThreadSuspendTriggerOffset<A64_PTR_SIZE>())
-typedef ThreadOffset<A64_PTR_SIZE> A64ThreadOffset;
-
 // Offset to distinguish FP regs.
 #define ARM_FP_REG_OFFSET 32
 // First FP callee save.
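
With the A64_* shim layer deleted, arm64 call sites pass the pointer size straight to QUICK_ENTRYPOINT_OFFSET (see call_arm64.cc below). The property being parameterized is that the same logical entrypoint slot lands at a different byte offset per word size. A standalone model of that arithmetic; the start offset and slot layout are invented for illustration, not taken from ART's Thread:

#include <cstddef>
#include <cstdint>
#include <cstdio>

template <size_t kPointerSize>
struct ThreadOffset {
  explicit constexpr ThreadOffset(int32_t v) : value(v) {}
  int32_t value;
};

// Hypothetical layout: the quick entrypoint table is an array of
// pointer-sized slots at a fixed start offset inside Thread.
constexpr int32_t kQuickEntrypointsStart = 128;  // illustrative value

template <size_t kPointerSize>
constexpr ThreadOffset<kPointerSize> QuickEntrypointOffset(size_t slot) {
  return ThreadOffset<kPointerSize>(
      kQuickEntrypointsStart + static_cast<int32_t>(slot * kPointerSize));
}

int main() {
  // Slot 3 sits at a different byte offset per word size, which is why a
  // hard-coded "4" was wrong for a 64-bit target.
  std::printf("slot 3: <4> -> %d, <8> -> %d\n",
              QuickEntrypointOffset<4>(3).value,
              QuickEntrypointOffset<8>(3).value);
  return 0;
}
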
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index c210816..51e97cd 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -180,7 +180,7 @@
   // Making a call - use explicit registers
   FlushAllRegs();   /* Everything to home location */
   LoadValueDirectFixed(rl_src, rs_x0);
-  LoadWordDisp(rs_rA64_SELF, A64_QUICK_ENTRYPOINT_INT_OFFS(pHandleFillArrayData),
+  LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pHandleFillArrayData).Int32Value(),
                rs_rA64_LR);
   // Materialize a pointer to the fill data image
   NewLIR3(kA64Adr2xd, rx1, 0, WrapPointer(tab_rec));
@@ -209,7 +209,7 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, NULL);
       }
     }
-    Load32Disp(rs_rA64_SELF, A64_THREAD_THIN_LOCK_ID_OFFSET, rs_x2);
+    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
     NewLIR3(kA64Ldxr2rX, rx1, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_x1, 0, NULL);
@@ -224,7 +224,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artLockObjectFromCode(obj);
-    LoadWordDisp(rs_rA64_SELF, A64_QUICK_ENTRYPOINT_INT_OFFS(pLockObject), rs_rA64_LR);
+    LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(), rs_rA64_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
     MarkSafepointPC(call_inst);
@@ -235,7 +235,7 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_x0, opt_flags);
-    Load32Disp(rs_rA64_SELF, A64_THREAD_THIN_LOCK_ID_OFFSET, rs_x2);
+    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
     MarkPossibleNullPointerException(opt_flags);
     NewLIR3(kA64Ldxr2rX, rx1, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     OpRegImm(kOpCmp, rs_x1, 0);
@@ -244,7 +244,8 @@
     OpRegImm(kOpCmp, rs_x1, 0);
     OpIT(kCondNe, "T");
     // Go expensive route - artLockObjectFromCode(self, obj);
-    LoadWordDisp/*ne*/(rs_rA64_SELF, A64_QUICK_ENTRYPOINT_INT_OFFS(pLockObject), rs_rA64_LR);
+    LoadWordDisp/*ne*/(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(),
+                       rs_rA64_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rA64_LR);
     MarkSafepointPC(call_inst);
@@ -262,7 +263,7 @@
   LoadValueDirectFixed(rl_src, rs_x0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   LIR* null_check_branch = nullptr;
-  Load32Disp(rs_rA64_SELF, A64_THREAD_THIN_LOCK_ID_OFFSET, rs_x2);
+  Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
   constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
   if (kArchVariantHasGoodBranchPredictor) {
     if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
@@ -287,7 +288,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artUnlockObjectFromCode(obj);
-    LoadWordDisp(rs_rA64_SELF, A64_QUICK_ENTRYPOINT_INT_OFFS(pUnlockObject), rs_rA64_LR);
+    LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(), rs_rA64_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
     MarkSafepointPC(call_inst);
@@ -300,14 +301,15 @@
     GenNullCheck(rs_x0, opt_flags);
     Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x1);  // Get lock
     MarkPossibleNullPointerException(opt_flags);
-    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_x2);
+    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
     LoadConstantNoClobber(rs_x3, 0);
     // Is lock unheld on lock or held by us (==thread_id) on unlock?
     OpRegReg(kOpCmp, rs_x1, rs_x2);
     OpIT(kCondEq, "EE");
     Store32Disp/*eq*/(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x3);
     // Go expensive route - UnlockObjectFromCode(obj);
-    LoadWordDisp/*ne*/(rs_rA64_SELF, A64_QUICK_ENTRYPOINT_INT_OFFS(pUnlockObject), rs_rA64_LR);
+    LoadWordDisp/*ne*/(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(),
+                       rs_rA64_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rA64_LR);
     MarkSafepointPC(call_inst);
@@ -316,7 +318,7 @@
 }
 
 void Arm64Mir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = A64_THREAD_EXCEPTION_INT_OFFS;
+  int ex_offset = Thread::ExceptionOffset<8>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
   Load32Disp(rs_rA64_SELF, ex_offset, rl_result.reg);
@@ -333,7 +335,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  LoadWordDisp(rs_rA64_SELF, A64_THREAD_CARD_TABLE_INT_OFFS, reg_card_base);
+  LoadWordDisp(rs_rA64_SELF, Thread::CardTableOffset<8>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -364,7 +366,7 @@
   NewLIR0(kPseudoMethodEntry);
 
   if (!skip_overflow_check) {
-    LoadWordDisp(rs_rA64_SELF, A64_THREAD_STACK_END_INT_OFFS, rs_x12);
+    LoadWordDisp(rs_rA64_SELF, Thread::StackEndOffset<8>().Int32Value(), rs_x12);
     OpRegImm64(kOpSub, rs_rA64_SP, frame_size_, /*is_wide*/true);
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       /* Load stack limit */
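
The same width sensitivity applies to plain Thread fields such as the thin lock id: any field laid out behind pointer-sized members moves when the word size doubles, which is why the call sites above switch from the <4> to the <8> accessors. A compile-time sketch under an assumed toy layout (two pointer-sized slots ahead of a 32-bit id; not ART's real Thread layout):

#include <cstddef>
#include <cstdint>

// Assumed toy layout: | ptr | ptr | thin_lock_id (u32) | ...
template <size_t kPointerSize>
constexpr int32_t ThinLockIdOffset() {
  return static_cast<int32_t>(2 * kPointerSize);
}

// The 32-bit and 64-bit views of the "same" field disagree, so reading via
// the <4> offset on a 64-bit runtime would hit the wrong word.
static_assert(ThinLockIdOffset<4>() == 8, "32-bit layout");
static_assert(ThinLockIdOffset<8>() == 16, "64-bit layout");

int main() { return 0; }
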
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 26084a2..af0029c 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -31,7 +31,8 @@
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(A64ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE;
+    RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE;
     LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -181,12 +182,14 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, A64ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) OVERRIDE;
+    LIR* OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) OVERRIDE;
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(A64ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val) OVERRIDE;
+    void OpTlsCmp(ThreadOffset<8> offset, int val) OVERRIDE;
 
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
     LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
diff --git a/compiler/dex/quick/arm64/fp_arm64.cc b/compiler/dex/quick/arm64/fp_arm64.cc
index c2a550e..87ab6fe 100644
--- a/compiler/dex/quick/arm64/fp_arm64.cc
+++ b/compiler/dex/quick/arm64/fp_arm64.cc
@@ -49,7 +49,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(A64_QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -92,7 +92,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(A64_QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -310,7 +310,7 @@
   branch = NewLIR2(kA64B2ct, kArmCondEq, 0);
   ClobberCallerSave();
   LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(A64_QUICK_ENTRYPOINT_OFFSET(pSqrt));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pSqrt));
   // NewLIR3(kThumb2Fmrrd, r0, r1, rl_src.reg.GetReg());
   NewLIR1(kA64Blr1x, r_tgt.GetReg());
   // NewLIR3(kThumb2Fmdrr, rl_result.reg.GetReg(), r0, r1);
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 709f583..0465249 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -501,7 +501,11 @@
   LOG(FATAL) << "Unexpected use of OpLea for Arm64";
 }
 
-void Arm64Mir2Lir::OpTlsCmp(A64ThreadOffset offset, int val) {
+void Arm64Mir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+}
+
+void Arm64Mir2Lir::OpTlsCmp(ThreadOffset<8> offset, int val) {
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm64";
 }
 
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index e4764eb..6caacc8 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -731,7 +731,12 @@
   FreeTemp(rs_x3);
 }
 
-RegStorage Arm64Mir2Lir::LoadHelper(A64ThreadOffset offset) {
+RegStorage Arm64Mir2Lir::LoadHelper(ThreadOffset<4> offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return RegStorage::InvalidReg();
+}
+
+RegStorage Arm64Mir2Lir::LoadHelper(ThreadOffset<8> offset) {
   // TODO(Arm64): use LoadWordDisp instead.
   //   e.g. LoadWordDisp(rs_rA64_SELF, offset.Int32Value(), rs_rA64_LR);
   LoadBaseDisp(rs_rA64_SELF, offset.Int32Value(), rs_rA64_LR, k64);
@@ -740,7 +745,7 @@
 
 LIR* Arm64Mir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = rs_x0;
-  LoadWordDisp(rs_rA64_SELF, A64_THREAD_SUSPEND_TRIGGER_OFFSET, tmp);
+  LoadWordDisp(rs_rA64_SELF, Thread::ThreadSuspendTriggerOffset<8>().Int32Value(), tmp);
   LIR* load2 = LoadWordDisp(tmp, 0, tmp);
   return load2;
 }
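
Note that the new LoadHelper(ThreadOffset<8>) above loads with k64: the entrypoint slot is now a full 64-bit pointer, and a 32-bit load would silently truncate it. A small host-side simulation of that width decision, with memcpy standing in for the emitted load and an invented buffer layout:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Simulate loading a pointer-sized entrypoint slot from a thread block;
// kPointerSize plays the role of the k32/k64 load width.
template <size_t kPointerSize>
uint64_t LoadEntrypointSlot(const uint8_t* self, int32_t offset) {
  uint64_t value = 0;
  std::memcpy(&value, self + offset, kPointerSize);
  return value;
}

int main() {
  uint8_t thread_block[32] = {0};
  const uint64_t entrypoint = 0x0000007fdeadbeefULL;  // fake 64-bit address
  std::memcpy(thread_block + 16, &entrypoint, sizeof(entrypoint));

  // On a little-endian host the 4-byte load drops the high half of the
  // address; only the 8-byte load recovers the full pointer.
  std::printf("k32 load: 0x%llx\nk64 load: 0x%llx\n",
              static_cast<unsigned long long>(LoadEntrypointSlot<4>(thread_block, 16)),
              static_cast<unsigned long long>(LoadEntrypointSlot<8>(thread_block, 16)));
  return 0;
}
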
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index ae17711..77e4c3c 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -1056,7 +1056,12 @@
   return NULL;
 }
 
-LIR* Arm64Mir2Lir::OpThreadMem(OpKind op, A64ThreadOffset thread_offset) {
+LIR* Arm64Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return nullptr;
+}
+
+LIR* Arm64Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) {
   LOG(FATAL) << "Unexpected use of OpThreadMem for Arm64";
   return NULL;
 }
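
Everything in the gen_common.cc diff that follows applies one recipe: hoist the width-dependent body into a static helper templated on pointer_size, then dispatch once at the public entry point on Is64BitInstructionSet. A standalone sketch of that recipe, with the instruction-set enum cut down to the cases that matter here:

#include <cstddef>
#include <cstdio>

enum InstructionSet { kThumb2, kMips, kX86, kArm64, kX86_64 };

static bool Is64BitInstructionSet(InstructionSet isa) {
  return isa == kArm64 || isa == kX86_64;
}

// The width-dependent body: everything that names a ThreadOffset or a
// QUICK_ENTRYPOINT_OFFSET moves in here.
template <size_t kPointerSize>
static void GenSomethingImpl() {
  std::printf("lowering with %zu-byte thread offsets\n", kPointerSize);
}

// The public entry point keeps its old signature and only dispatches.
static void GenSomething(InstructionSet isa) {
  if (Is64BitInstructionSet(isa)) {
    GenSomethingImpl<8>();
  } else {
    GenSomethingImpl<4>();
  }
}

int main() {
  GenSomething(kThumb2);  // 4-byte thread offsets
  GenSomething(kArm64);   // 8-byte thread offsets
  return 0;
}
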
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 732e776..350823d 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -73,7 +73,11 @@
       m2l_->ResetRegPool();
       m2l_->ResetDefTracking();
       GenerateTargetLabel(kPseudoThrowTarget);
-      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero), true);
+      if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+        m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(8, pThrowDivZero), true);
+      } else {
+        m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero), true);
+      }
     }
   };
 
@@ -92,8 +96,13 @@
       m2l_->ResetRegPool();
       m2l_->ResetDefTracking();
       GenerateTargetLabel(kPseudoThrowTarget);
-      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
-                                    index_, length_, true);
+      if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(8, pThrowArrayBounds),
+                                      index_, length_, true);
+      } else {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                      index_, length_, true);
+      }
     }
 
    private:
@@ -120,8 +129,13 @@
 
       m2l_->OpRegCopy(m2l_->TargetReg(kArg1), length_);
       m2l_->LoadConstant(m2l_->TargetReg(kArg0), index_);
-      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
-                                    m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true);
+      if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(8, pThrowArrayBounds),
+                                      m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true);
+      } else {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                      m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true);
+      }
     }
 
    private:
@@ -144,7 +158,11 @@
       m2l_->ResetRegPool();
       m2l_->ResetDefTracking();
       GenerateTargetLabel(kPseudoThrowTarget);
-      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer), true);
+      if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+        m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(8, pThrowNullPointer), true);
+      } else {
+        m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer), true);
+      }
     }
   };
 
@@ -314,19 +332,16 @@
   StoreValue(rl_dest, rl_result);
 }
 
-/*
- * Let helper function take care of everything.  Will call
- * Array::AllocFromCode(type_idx, method, count);
- * Note: AllocFromCode will handle checks for errNegativeArraySize.
- */
-void Mir2Lir::GenNewArray(uint32_t type_idx, RegLocation rl_dest,
-                          RegLocation rl_src) {
-  FlushAllRegs();  /* Everything to home location */
-  ThreadOffset<4> func_offset(-1);
-  const DexFile* dex_file = cu_->dex_file;
-  CompilerDriver* driver = cu_->compiler_driver;
-  if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *dex_file,
-                                                       type_idx)) {
+template <size_t pointer_size>
+static void GenNewArrayImpl(Mir2Lir* mir_to_lir, CompilationUnit* cu,
+                            uint32_t type_idx, RegLocation rl_dest,
+                            RegLocation rl_src) {
+  mir_to_lir->FlushAllRegs();  /* Everything to home location */
+  ThreadOffset<pointer_size> func_offset(-1);
+  const DexFile* dex_file = cu->dex_file;
+  CompilerDriver* driver = cu->compiler_driver;
+  if (cu->compiler_driver->CanAccessTypeWithoutChecks(cu->method_idx, *dex_file,
+                                                      type_idx)) {
     bool is_type_initialized;  // Ignored as an array does not have an initializer.
     bool use_direct_type_ptr;
     uintptr_t direct_type_ptr;
@@ -336,27 +351,54 @@
                                    &direct_type_ptr, &is_finalizable)) {
       // The fast path.
       if (!use_direct_type_ptr) {
-        LoadClassType(type_idx, kArg0);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
-        CallRuntimeHelperRegMethodRegLocation(func_offset, TargetReg(kArg0), rl_src, true);
+        mir_to_lir->LoadClassType(type_idx, kArg0);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocArrayResolved);
+        mir_to_lir->CallRuntimeHelperRegMethodRegLocation(func_offset, mir_to_lir->TargetReg(kArg0),
+                                                          rl_src, true);
       } else {
         // Use the direct pointer.
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
-        CallRuntimeHelperImmMethodRegLocation(func_offset, direct_type_ptr, rl_src, true);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocArrayResolved);
+        mir_to_lir->CallRuntimeHelperImmMethodRegLocation(func_offset, direct_type_ptr, rl_src,
+                                                          true);
       }
     } else {
       // The slow path.
-      DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArray);
-      CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocArray);
+      mir_to_lir->CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset= QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayWithAccessCheck);
-    CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocArrayWithAccessCheck);
+    mir_to_lir->CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
   }
-  RegLocation rl_result = GetReturn(false);
-  StoreValue(rl_dest, rl_result);
+  RegLocation rl_result = mir_to_lir->GetReturn(false);
+  mir_to_lir->StoreValue(rl_dest, rl_result);
+}
+
+/*
+ * Let helper function take care of everything.  Will call
+ * Array::AllocFromCode(type_idx, method, count);
+ * Note: AllocFromCode will handle checks for errNegativeArraySize.
+ */
+void Mir2Lir::GenNewArray(uint32_t type_idx, RegLocation rl_dest,
+                          RegLocation rl_src) {
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenNewArrayImpl<8>(this, cu_, type_idx, rl_dest, rl_src);
+  } else {
+    GenNewArrayImpl<4>(this, cu_, type_idx, rl_dest, rl_src);
+  }
+}
+
+template <size_t pointer_size>
+static void GenFilledNewArrayCall(Mir2Lir* mir_to_lir, CompilationUnit* cu, int elems, int type_idx) {
+  ThreadOffset<pointer_size> func_offset(-1);
+  if (cu->compiler_driver->CanAccessTypeWithoutChecks(cu->method_idx, *cu->dex_file,
+                                                      type_idx)) {
+    func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pCheckAndAllocArray);
+  } else {
+    func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pCheckAndAllocArrayWithAccessCheck);
+  }
+  mir_to_lir->CallRuntimeHelperImmMethodImm(func_offset, type_idx, elems, true);
 }
 
 /*
@@ -369,14 +411,11 @@
   int elems = info->num_arg_words;
   int type_idx = info->index;
   FlushAllRegs();  /* Everything to home location */
-  ThreadOffset<4> func_offset(-1);
-  if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *cu_->dex_file,
-                                                       type_idx)) {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArray);
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenFilledNewArrayCall<8>(this, cu_, elems, type_idx);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArrayWithAccessCheck);
+    GenFilledNewArrayCall<4>(this, cu_, elems, type_idx);
   }
-  CallRuntimeHelperImmMethodImm(func_offset, type_idx, elems, true);
   FreeTemp(TargetReg(kArg2));
   FreeTemp(TargetReg(kArg1));
   /*
@@ -482,8 +521,13 @@
   void Compile() {
     LIR* unresolved_target = GenerateTargetLabel();
     uninit_->target = unresolved_target;
-    m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeStaticStorage),
-                               storage_index_, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(8, pInitializeStaticStorage),
+                                 storage_index_, true);
+    } else {
+      m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeStaticStorage),
+                                 storage_index_, true);
+    }
     // Copy helper's result into r_base, a no-op on all but MIPS.
     m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0));
 
@@ -496,6 +540,17 @@
   const RegStorage r_base_;
 };
 
+template <size_t pointer_size>
+static void GenSputCall(Mir2Lir* mir_to_lir, bool is_long_or_double, bool is_object,
+                        const MirSFieldLoweringInfo* field_info, RegLocation rl_src) {
+  ThreadOffset<pointer_size> setter_offset =
+      is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pSet64Static)
+          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pSetObjStatic)
+              : QUICK_ENTRYPOINT_OFFSET(pointer_size, pSet32Static));
+  mir_to_lir->CallRuntimeHelperImmRegLocation(setter_offset, field_info->FieldIndex(), rl_src,
+                                              true);
+}
+
 void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, bool is_long_or_double,
                       bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
@@ -573,14 +628,24 @@
     FreeTemp(r_base);
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset<4> setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Static)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjStatic)
-                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Static));
-    CallRuntimeHelperImmRegLocation(setter_offset, field_info.FieldIndex(), rl_src, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      GenSputCall<8>(this, is_long_or_double, is_object, &field_info, rl_src);
+    } else {
+      GenSputCall<4>(this, is_long_or_double, is_object, &field_info, rl_src);
+    }
   }
 }
 
+template <size_t pointer_size>
+static void GenSgetCall(Mir2Lir* mir_to_lir, bool is_long_or_double, bool is_object,
+                        const MirSFieldLoweringInfo* field_info) {
+  ThreadOffset<pointer_size> getter_offset =
+      is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pGet64Static)
+          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pGetObjStatic)
+              : QUICK_ENTRYPOINT_OFFSET(pointer_size, pGet32Static));
+  mir_to_lir->CallRuntimeHelperImm(getter_offset, field_info->FieldIndex(), true);
+}
+
 void Mir2Lir::GenSget(MIR* mir, RegLocation rl_dest,
                       bool is_long_or_double, bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
@@ -655,11 +720,11 @@
     }
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset<4> getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Static)
-                          :(is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjStatic)
-                                      : QUICK_ENTRYPOINT_OFFSET(4, pGet32Static));
-    CallRuntimeHelperImm(getterOffset, field_info.FieldIndex(), true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      GenSgetCall<8>(this, is_long_or_double, is_object, &field_info);
+    } else {
+      GenSgetCall<4>(this, is_long_or_double, is_object, &field_info);
+    }
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
       StoreValueWide(rl_dest, rl_result);
@@ -680,6 +745,17 @@
   slow_paths_.Reset();
 }
 
+template <size_t pointer_size>
+static void GenIgetCall(Mir2Lir* mir_to_lir, bool is_long_or_double, bool is_object,
+                        const MirIFieldLoweringInfo* field_info, RegLocation rl_obj) {
+  ThreadOffset<pointer_size> getter_offset =
+      is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pGet64Instance)
+          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pGetObjInstance)
+              : QUICK_ENTRYPOINT_OFFSET(pointer_size, pGet32Instance));
+  mir_to_lir->CallRuntimeHelperImmRegLocation(getter_offset, field_info->FieldIndex(), rl_obj,
+                                              true);
+}
+
 void Mir2Lir::GenIGet(MIR* mir, int opt_flags, OpSize size,
                       RegLocation rl_dest, RegLocation rl_obj, bool is_long_or_double,
                       bool is_object) {
@@ -711,11 +787,11 @@
       StoreValue(rl_dest, rl_result);
     }
   } else {
-    ThreadOffset<4> getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(4, pGet32Instance));
-    CallRuntimeHelperImmRegLocation(getterOffset, field_info.FieldIndex(), rl_obj, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      GenIgetCall<8>(this, is_long_or_double, is_object, &field_info, rl_obj);
+    } else {
+      GenIgetCall<4>(this, is_long_or_double, is_object, &field_info, rl_obj);
+    }
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
       StoreValueWide(rl_dest, rl_result);
@@ -726,6 +802,18 @@
   }
 }
 
+template <size_t pointer_size>
+static void GenIputCall(Mir2Lir* mir_to_lir, bool is_long_or_double, bool is_object,
+                        const MirIFieldLoweringInfo* field_info, RegLocation rl_obj,
+                        RegLocation rl_src) {
+  ThreadOffset<pointer_size> setter_offset =
+      is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pSet64Instance)
+          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pSetObjInstance)
+              : QUICK_ENTRYPOINT_OFFSET(pointer_size, pSet32Instance));
+  mir_to_lir->CallRuntimeHelperImmRegLocationRegLocation(setter_offset, field_info->FieldIndex(),
+                                                         rl_obj, rl_src, true);
+}
+
 void Mir2Lir::GenIPut(MIR* mir, int opt_flags, OpSize size,
                       RegLocation rl_src, RegLocation rl_obj, bool is_long_or_double,
                       bool is_object) {
@@ -759,25 +847,35 @@
       MarkGCCard(rl_src.reg, rl_obj.reg);
     }
   } else {
-    ThreadOffset<4> setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Instance));
-    CallRuntimeHelperImmRegLocationRegLocation(setter_offset, field_info.FieldIndex(),
-                                               rl_obj, rl_src, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      GenIputCall<8>(this, is_long_or_double, is_object, &field_info, rl_obj, rl_src);
+    } else {
+      GenIputCall<4>(this, is_long_or_double, is_object, &field_info, rl_obj, rl_src);
+    }
   }
 }
 
+template <size_t pointer_size>
+static void GenArrayObjPutCall(Mir2Lir* mir_to_lir, bool needs_range_check, bool needs_null_check,
+                               RegLocation rl_array, RegLocation rl_index, RegLocation rl_src) {
+  ThreadOffset<pointer_size> helper = needs_range_check
+        ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(pointer_size, pAputObjectWithNullAndBoundCheck)
+                            : QUICK_ENTRYPOINT_OFFSET(pointer_size, pAputObjectWithBoundCheck))
+        : QUICK_ENTRYPOINT_OFFSET(pointer_size, pAputObject);
+  mir_to_lir->CallRuntimeHelperRegLocationRegLocationRegLocation(helper, rl_array, rl_index, rl_src,
+                                                                 true);
+}
+
 void Mir2Lir::GenArrayObjPut(int opt_flags, RegLocation rl_array, RegLocation rl_index,
                              RegLocation rl_src) {
   bool needs_range_check = !(opt_flags & MIR_IGNORE_RANGE_CHECK);
   bool needs_null_check = !((cu_->disable_opt & (1 << kNullCheckElimination)) &&
       (opt_flags & MIR_IGNORE_NULL_CHECK));
-  ThreadOffset<4> helper = needs_range_check
-      ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithNullAndBoundCheck)
-                          : QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithBoundCheck))
-      : QUICK_ENTRYPOINT_OFFSET(4, pAputObject);
-  CallRuntimeHelperRegLocationRegLocationRegLocation(helper, rl_array, rl_index, rl_src, true);
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenArrayObjPutCall<8>(this, needs_range_check, needs_null_check, rl_array, rl_index, rl_src);
+  } else {
+    GenArrayObjPutCall<4>(this, needs_range_check, needs_null_check, rl_array, rl_index, rl_src);
+  }
 }
 
 void Mir2Lir::GenConstClass(uint32_t type_idx, RegLocation rl_dest) {
@@ -789,8 +887,13 @@
                                                    type_idx)) {
     // Call out to helper which resolves type and verifies access.
     // Resolved type returned in kRet0.
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
-                            type_idx, rl_method.reg, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeTypeAndVerifyAccess),
+                              type_idx, rl_method.reg, true);
+    } else {
+      CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
+                              type_idx, rl_method.reg, true);
+    }
     RegLocation rl_result = GetReturn(false);
     StoreValue(rl_dest, rl_result);
   } else {
@@ -819,8 +922,13 @@
         void Compile() {
           GenerateTargetLabel();
 
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
-                                        rl_method_.reg, true);
+          if (Is64BitInstructionSet(cu_->instruction_set)) {
+            m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeType), type_idx_,
+                                          rl_method_.reg, true);
+          } else {
+            m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
+                                          rl_method_.reg, true);
+          }
           m2l_->OpRegCopy(rl_result_.reg,  m2l_->TargetReg(kRet0));
 
           m2l_->OpUnconditionalBranch(cont_);
@@ -883,8 +991,13 @@
 
         void Compile() {
           GenerateTargetLabel();
-          m2l_->CallRuntimeHelperRegImm(QUICK_ENTRYPOINT_OFFSET(4, pResolveString),
-                                        r_method_, string_idx_, true);
+          if (Is64BitInstructionSet(cu_->instruction_set)) {
+            m2l_->CallRuntimeHelperRegImm(QUICK_ENTRYPOINT_OFFSET(8, pResolveString),
+                                          r_method_, string_idx_, true);
+          } else {
+            m2l_->CallRuntimeHelperRegImm(QUICK_ENTRYPOINT_OFFSET(4, pResolveString),
+                                          r_method_, string_idx_, true);
+          }
           m2l_->OpUnconditionalBranch(cont_);
         }
 
@@ -908,19 +1021,17 @@
   }
 }
 
-/*
- * Let helper function take care of everything.  Will
- * call Class::NewInstanceFromCode(type_idx, method);
- */
-void Mir2Lir::GenNewInstance(uint32_t type_idx, RegLocation rl_dest) {
-  FlushAllRegs();  /* Everything to home location */
+template <size_t pointer_size>
+static void GenNewInstanceImpl(Mir2Lir* mir_to_lir, CompilationUnit* cu, uint32_t type_idx,
+                               RegLocation rl_dest) {
+  mir_to_lir->FlushAllRegs();  /* Everything to home location */
   // alloc will always check for resolution, do we also need to verify
   // access because the verifier was unable to?
-  ThreadOffset<4> func_offset(-1);
-  const DexFile* dex_file = cu_->dex_file;
-  CompilerDriver* driver = cu_->compiler_driver;
+  ThreadOffset<pointer_size> func_offset(-1);
+  const DexFile* dex_file = cu->dex_file;
+  CompilerDriver* driver = cu->compiler_driver;
   if (driver->CanAccessInstantiableTypeWithoutChecks(
-      cu_->method_idx, *dex_file, type_idx)) {
+      cu->method_idx, *dex_file, type_idx)) {
     bool is_type_initialized;
     bool use_direct_type_ptr;
     uintptr_t direct_type_ptr;
@@ -931,42 +1042,58 @@
                                    !is_finalizable) {
       // The fast path.
       if (!use_direct_type_ptr) {
-        LoadClassType(type_idx, kArg0);
+        mir_to_lir->LoadClassType(type_idx, kArg0);
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
-          CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObjectResolved);
+          mir_to_lir->CallRuntimeHelperRegMethod(func_offset, mir_to_lir->TargetReg(kArg0), true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
-          CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObjectInitialized);
+          mir_to_lir->CallRuntimeHelperRegMethod(func_offset, mir_to_lir->TargetReg(kArg0), true);
         }
       } else {
         // Use the direct pointer.
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
-          CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObjectResolved);
+          mir_to_lir->CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
-          CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObjectInitialized);
+          mir_to_lir->CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         }
       }
     } else {
       // The slow path.
       DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObject);
-      CallRuntimeHelperImmMethod(func_offset, type_idx, true);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObject);
+      mir_to_lir->CallRuntimeHelperImmMethod(func_offset, type_idx, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectWithAccessCheck);
-    CallRuntimeHelperImmMethod(func_offset, type_idx, true);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pAllocObjectWithAccessCheck);
+    mir_to_lir->CallRuntimeHelperImmMethod(func_offset, type_idx, true);
   }
-  RegLocation rl_result = GetReturn(false);
-  StoreValue(rl_dest, rl_result);
+  RegLocation rl_result = mir_to_lir->GetReturn(false);
+  mir_to_lir->StoreValue(rl_dest, rl_result);
+}
+
+/*
+ * Let helper function take care of everything.  Will
+ * call Class::NewInstanceFromCode(type_idx, method);
+ */
+void Mir2Lir::GenNewInstance(uint32_t type_idx, RegLocation rl_dest) {
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenNewInstanceImpl<8>(this, cu_, type_idx, rl_dest);
+  } else {
+    GenNewInstanceImpl<4>(this, cu_, type_idx, rl_dest);
+  }
 }
 
 void Mir2Lir::GenThrow(RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException), rl_src, true);
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pDeliverException), rl_src, true);
+  } else {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException), rl_src, true);
+  }
 }
 
 // For final classes there are no sub-classes to check and so we can answer the instance-of
@@ -1041,8 +1168,13 @@
   if (needs_access_check) {
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kArg0
-    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
-                         type_idx, true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(8, pInitializeTypeAndVerifyAccess),
+                           type_idx, true);
+    } else {
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
+                           type_idx, true);
+    }
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
     LoadValueDirectFixed(rl_src, TargetReg(kArg0));  // kArg0 <= ref
   } else if (use_declaring_class) {
@@ -1061,7 +1193,11 @@
       LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL);
       // Not resolved
       // Call out to helper, which will return resolved type in kRet0
-      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
+      if (Is64BitInstructionSet(cu_->instruction_set)) {
+        CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(8, pInitializeType), type_idx, true);
+      } else {
+        CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
+      }
       OpRegCopy(TargetReg(kArg2), TargetReg(kRet0));  // Align usage with fast path
       LoadValueDirectFixed(rl_src, TargetReg(kArg0));  /* reload Ref */
       // Rejoin code paths
@@ -1097,7 +1233,9 @@
     }
   } else {
     if (cu_->instruction_set == kThumb2) {
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
+      RegStorage r_tgt = Is64BitInstructionSet(cu_->instruction_set) ?
+          LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pInstanceofNonTrivial)) :
+          LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       LIR* it = nullptr;
       if (!type_known_abstract) {
       /* Uses conditional nullification */
@@ -1117,7 +1255,9 @@
         LoadConstant(rl_result.reg, 1);     // assume true
         branchover = OpCmpBranch(kCondEq, TargetReg(kArg1), TargetReg(kArg2), NULL);
       }
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
+      RegStorage r_tgt = Is64BitInstructionSet(cu_->instruction_set) ?
+          LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pInstanceofNonTrivial)) :
+          LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));    // .ne case - arg0 <= class
       OpReg(kOpBlx, r_tgt);    // .ne case: helper(class, ref->class)
       FreeTemp(r_tgt);
@@ -1178,8 +1318,13 @@
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kRet0
     // InitializeTypeAndVerifyAccess(idx, method)
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
-                            type_idx, TargetReg(kArg1), true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeTypeAndVerifyAccess),
+                              type_idx, TargetReg(kArg1), true);
+    } else {
+      CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
+                              type_idx, TargetReg(kArg1), true);
+    }
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
   } else if (use_declaring_class) {
     LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(),
@@ -1209,11 +1354,17 @@
 
           // Call out to helper, which will return resolved type in kArg0
           // InitializeTypeFromCode(idx, method)
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
-                                        m2l_->TargetReg(kArg1), true);
+          if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+            m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeType), type_idx_,
+                                          m2l_->TargetReg(kArg1), true);
+          } else {
+            m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
+                                          m2l_->TargetReg(kArg1), true);
+          }
           m2l_->OpRegCopy(class_reg_, m2l_->TargetReg(kRet0));  // Align usage with fast path
           m2l_->OpUnconditionalBranch(cont_);
         }
+
        public:
         const int type_idx_;
         const RegStorage class_reg_;
@@ -1240,8 +1391,13 @@
         m2l_->LoadRefDisp(m2l_->TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(),
                           m2l_->TargetReg(kArg1));
       }
-      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pCheckCast), m2l_->TargetReg(kArg2),
-                                    m2l_->TargetReg(kArg1), true);
+      if (Is64BitInstructionSet(m2l_->cu_->instruction_set)) {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(8, pCheckCast), m2l_->TargetReg(kArg2),
+                                      m2l_->TargetReg(kArg1), true);
+      } else {
+        m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pCheckCast), m2l_->TargetReg(kArg2),
+                                      m2l_->TargetReg(kArg1), true);
+      }
 
       m2l_->OpUnconditionalBranch(cont_);
     }
@@ -1323,28 +1479,38 @@
 }
 
 
-void Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
-                             RegLocation rl_src1, RegLocation rl_shift) {
-  ThreadOffset<4> func_offset(-1);
+template <size_t pointer_size>
+static void GenShiftOpLongCall(Mir2Lir* mir_to_lir, Instruction::Code opcode, RegLocation rl_src1,
+                               RegLocation rl_shift) {
+  ThreadOffset<pointer_size> func_offset(-1);
 
   switch (opcode) {
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShlLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pShlLong);
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pShrLong);
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pUshrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pUshrLong);
       break;
     default:
       LOG(FATAL) << "Unexpected case";
   }
-  FlushAllRegs();   /* Send everything to home location */
-  CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_shift, false);
+  mir_to_lir->FlushAllRegs();   /* Send everything to home location */
+  mir_to_lir->CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_shift, false);
+}
+
+void Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
+                             RegLocation rl_src1, RegLocation rl_shift) {
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenShiftOpLongCall<8>(this, opcode, rl_src1, rl_shift);
+  } else {
+    GenShiftOpLongCall<4>(this, opcode, rl_src1, rl_shift);
+  }
   RegLocation rl_result = GetReturnWide(false);
   StoreValueWide(rl_dest, rl_result);
 }
@@ -1471,16 +1637,21 @@
 
     // If we haven't already generated the code use the callout function.
     if (!done) {
-      ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
       FlushAllRegs();   /* Send everything to home location */
       LoadValueDirectFixed(rl_src2, TargetReg(kArg1));
-      RegStorage r_tgt = CallHelperSetup(func_offset);
+      RegStorage r_tgt = Is64BitInstructionSet(cu_->instruction_set) ?
+          CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(8, pIdivmod)) :
+          CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(4, pIdivmod));
       LoadValueDirectFixed(rl_src1, TargetReg(kArg0));
       if (check_zero) {
         GenDivZeroCheck(TargetReg(kArg1));
       }
       // NOTE: callout here is not a safepoint.
-      CallHelper(r_tgt, func_offset, false /* not a safepoint */);
+      if (Is64BitInstructionSet(cu_->instruction_set)) {
+        CallHelper(r_tgt, QUICK_ENTRYPOINT_OFFSET(8, pIdivmod), false /* not a safepoint */);
+      } else {
+        CallHelper(r_tgt, QUICK_ENTRYPOINT_OFFSET(4, pIdivmod), false /* not a safepoint */);
+      }
       if (op == kOpDiv)
         rl_result = GetReturn(false);
       else
@@ -1739,8 +1910,13 @@
         FlushAllRegs();   /* Everything to home location. */
         LoadValueDirectFixed(rl_src, TargetReg(kArg0));
         Clobber(TargetReg(kArg0));
-        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
-        CallRuntimeHelperRegImm(func_offset, TargetReg(kArg0), lit, false);
+        if (Is64BitInstructionSet(cu_->instruction_set)) {
+          CallRuntimeHelperRegImm(QUICK_ENTRYPOINT_OFFSET(8, pIdivmod), TargetReg(kArg0), lit,
+                                  false);
+        } else {
+          CallRuntimeHelperRegImm(QUICK_ENTRYPOINT_OFFSET(4, pIdivmod), TargetReg(kArg0), lit,
+                                  false);
+        }
         if (is_div)
           rl_result = GetReturn(false);
         else
@@ -1763,37 +1939,38 @@
   StoreValue(rl_dest, rl_result);
 }
 
-void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
-                             RegLocation rl_src1, RegLocation rl_src2) {
+template <size_t pointer_size>
+static void GenArithOpLongImpl(Mir2Lir* mir_to_lir, CompilationUnit* cu, Instruction::Code opcode,
+                               RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) {
   RegLocation rl_result;
   OpKind first_op = kOpBkpt;
   OpKind second_op = kOpBkpt;
   bool call_out = false;
   bool check_zero = false;
-  ThreadOffset<4> func_offset(-1);
-  int ret_reg = TargetReg(kRet0).GetReg();
+  ThreadOffset<pointer_size> func_offset(-1);
+  int ret_reg = mir_to_lir->TargetReg(kRet0).GetReg();
 
   switch (opcode) {
     case Instruction::NOT_LONG:
-      rl_src2 = LoadValueWide(rl_src2, kCoreReg);
-      rl_result = EvalLoc(rl_dest, kCoreReg, true);
+      rl_src2 = mir_to_lir->LoadValueWide(rl_src2, kCoreReg);
+      rl_result = mir_to_lir->EvalLoc(rl_dest, kCoreReg, true);
       // Check for destructive overlap
       if (rl_result.reg.GetLowReg() == rl_src2.reg.GetHighReg()) {
-        RegStorage t_reg = AllocTemp();
-        OpRegCopy(t_reg, rl_src2.reg.GetHigh());
-        OpRegReg(kOpMvn, rl_result.reg.GetLow(), rl_src2.reg.GetLow());
-        OpRegReg(kOpMvn, rl_result.reg.GetHigh(), t_reg);
-        FreeTemp(t_reg);
+        RegStorage t_reg = mir_to_lir->AllocTemp();
+        mir_to_lir->OpRegCopy(t_reg, rl_src2.reg.GetHigh());
+        mir_to_lir->OpRegReg(kOpMvn, rl_result.reg.GetLow(), rl_src2.reg.GetLow());
+        mir_to_lir->OpRegReg(kOpMvn, rl_result.reg.GetHigh(), t_reg);
+        mir_to_lir->FreeTemp(t_reg);
       } else {
-        OpRegReg(kOpMvn, rl_result.reg.GetLow(), rl_src2.reg.GetLow());
-        OpRegReg(kOpMvn, rl_result.reg.GetHigh(), rl_src2.reg.GetHigh());
+        mir_to_lir->OpRegReg(kOpMvn, rl_result.reg.GetLow(), rl_src2.reg.GetLow());
+        mir_to_lir->OpRegReg(kOpMvn, rl_result.reg.GetHigh(), rl_src2.reg.GetHigh());
       }
-      StoreValueWide(rl_dest, rl_result);
+      mir_to_lir->StoreValueWide(rl_dest, rl_result);
       return;
     case Instruction::ADD_LONG:
     case Instruction::ADD_LONG_2ADDR:
-      if (cu_->instruction_set != kThumb2) {
-        GenAddLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set != kThumb2) {
+        mir_to_lir->GenAddLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
       first_op = kOpAdd;
@@ -1801,8 +1978,8 @@
       break;
     case Instruction::SUB_LONG:
     case Instruction::SUB_LONG_2ADDR:
-      if (cu_->instruction_set != kThumb2) {
-        GenSubLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set != kThumb2) {
+        mir_to_lir->GenSubLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
       first_op = kOpSub;
@@ -1810,42 +1987,43 @@
       break;
     case Instruction::MUL_LONG:
     case Instruction::MUL_LONG_2ADDR:
-      if (cu_->instruction_set != kMips) {
-        GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set != kMips) {
+        mir_to_lir->GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       } else {
         call_out = true;
-        ret_reg = TargetReg(kRet0).GetReg();
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmul);
+        ret_reg = mir_to_lir->TargetReg(kRet0).GetReg();
+        func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pLmul);
       }
       break;
     case Instruction::DIV_LONG:
     case Instruction::DIV_LONG_2ADDR:
       call_out = true;
       check_zero = true;
-      ret_reg = TargetReg(kRet0).GetReg();
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLdiv);
+      ret_reg = mir_to_lir->TargetReg(kRet0).GetReg();
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pLdiv);
       break;
     case Instruction::REM_LONG:
     case Instruction::REM_LONG_2ADDR:
       call_out = true;
       check_zero = true;
-      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmod);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pLmod);
       /* NOTE - for Arm, result is in kArg2/kArg3 instead of kRet0/kRet1 */
-      ret_reg = (cu_->instruction_set == kThumb2) ? TargetReg(kArg2).GetReg() : TargetReg(kRet0).GetReg();
+      ret_reg = (cu->instruction_set == kThumb2) ? mir_to_lir->TargetReg(kArg2).GetReg() :
+          mir_to_lir->TargetReg(kRet0).GetReg();
       break;
     case Instruction::AND_LONG_2ADDR:
     case Instruction::AND_LONG:
-      if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        return GenAndLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+        return mir_to_lir->GenAndLong(opcode, rl_dest, rl_src1, rl_src2);
       }
       first_op = kOpAnd;
       second_op = kOpAnd;
       break;
     case Instruction::OR_LONG:
     case Instruction::OR_LONG_2ADDR:
-      if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        GenOrLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+        mir_to_lir->GenOrLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
       first_op = kOpOr;
@@ -1853,51 +2031,66 @@
       break;
     case Instruction::XOR_LONG:
     case Instruction::XOR_LONG_2ADDR:
-      if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        GenXorLong(opcode, rl_dest, rl_src1, rl_src2);
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+        mir_to_lir->GenXorLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
       first_op = kOpXor;
       second_op = kOpXor;
       break;
     case Instruction::NEG_LONG: {
-      GenNegLong(rl_dest, rl_src2);
+      mir_to_lir->GenNegLong(rl_dest, rl_src2);
       return;
     }
     default:
       LOG(FATAL) << "Invalid long arith op";
   }
   if (!call_out) {
-    GenLong3Addr(first_op, second_op, rl_dest, rl_src1, rl_src2);
+    mir_to_lir->GenLong3Addr(first_op, second_op, rl_dest, rl_src1, rl_src2);
   } else {
-    FlushAllRegs();   /* Send everything to home location */
+    mir_to_lir->FlushAllRegs();   /* Send everything to home location */
     if (check_zero) {
-      RegStorage r_tmp1 = RegStorage::MakeRegPair(TargetReg(kArg0), TargetReg(kArg1));
-      RegStorage r_tmp2 = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3));
-      LoadValueDirectWideFixed(rl_src2, r_tmp2);
-      RegStorage r_tgt = CallHelperSetup(func_offset);
-      GenDivZeroCheckWide(RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)));
-      LoadValueDirectWideFixed(rl_src1, r_tmp1);
+      RegStorage r_tmp1 = RegStorage::MakeRegPair(mir_to_lir->TargetReg(kArg0),
+                                                  mir_to_lir->TargetReg(kArg1));
+      RegStorage r_tmp2 = RegStorage::MakeRegPair(mir_to_lir->TargetReg(kArg2),
+                                                  mir_to_lir->TargetReg(kArg3));
+      mir_to_lir->LoadValueDirectWideFixed(rl_src2, r_tmp2);
+      RegStorage r_tgt = mir_to_lir->CallHelperSetup(func_offset);
+      mir_to_lir->GenDivZeroCheckWide(RegStorage::MakeRegPair(mir_to_lir->TargetReg(kArg2),
+                                                              mir_to_lir->TargetReg(kArg3)));
+      mir_to_lir->LoadValueDirectWideFixed(rl_src1, r_tmp1);
       // NOTE: callout here is not a safepoint
-      CallHelper(r_tgt, func_offset, false /* not safepoint */);
+      mir_to_lir->CallHelper(r_tgt, func_offset, false /* not safepoint */);
     } else {
-      CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_src2, false);
+      mir_to_lir->CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_src2, false);
     }
     // Adjust return regs to handle the case of rem returning kArg2/kArg3
-    if (ret_reg == TargetReg(kRet0).GetReg())
-      rl_result = GetReturnWide(false);
+    if (ret_reg == mir_to_lir->TargetReg(kRet0).GetReg())
+      rl_result = mir_to_lir->GetReturnWide(false);
     else
-      rl_result = GetReturnWideAlt();
-    StoreValueWide(rl_dest, rl_result);
+      rl_result = mir_to_lir->GetReturnWideAlt();
+    mir_to_lir->StoreValueWide(rl_dest, rl_result);
   }
 }
 
-void Mir2Lir::GenConversionCall(ThreadOffset<4> func_offset,
+void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
+                             RegLocation rl_src1, RegLocation rl_src2) {
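+  // Dispatch on the target's pointer size so entrypoint offsets taken in the shared
+  // implementation match the target Thread layout.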
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    GenArithOpLongImpl<8>(this, cu_, opcode, rl_dest, rl_src1, rl_src2);
+  } else {
+    GenArithOpLongImpl<4>(this, cu_, opcode, rl_dest, rl_src1, rl_src2);
+  }
+}
+
+template <size_t pointer_size>
+void Mir2Lir::GenConversionCall(ThreadOffset<pointer_size> func_offset,
                                 RegLocation rl_dest, RegLocation rl_src) {
   /*
    * Don't optimize the register usage since it calls out to support
    * functions
    */
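+  // Guard against picking a template instantiation that does not match the target.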
+  DCHECK_EQ(pointer_size, GetInstructionSetPointerSize(cu_->instruction_set));
+
   FlushAllRegs();   /* Send everything to home location */
   CallRuntimeHelperRegLocation(func_offset, rl_src, false);
   if (rl_dest.wide) {
@@ -1910,6 +2103,10 @@
     StoreValue(rl_dest, rl_result);
   }
 }
+template void Mir2Lir::GenConversionCall(ThreadOffset<4> func_offset,
+                                         RegLocation rl_dest, RegLocation rl_src);
+template void Mir2Lir::GenConversionCall(ThreadOffset<8> func_offset,
+                                         RegLocation rl_dest, RegLocation rl_src);
 
 class SuspendCheckSlowPath : public Mir2Lir::LIRSlowPath {
  public:
@@ -1921,7 +2118,11 @@
     m2l_->ResetRegPool();
     m2l_->ResetDefTracking();
     GenerateTargetLabel(kPseudoSuspendTarget);
-    m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pTestSuspend), true);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(8, pTestSuspend), true);
+    } else {
+      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pTestSuspend), true);
+    }
     if (cont_ != nullptr) {
       m2l_->OpUnconditionalBranch(cont_);
     }
@@ -1976,13 +2177,21 @@
 /* Call out to helper assembly routine that will null check obj and then lock it. */
 void Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pLockObject), rl_src, true);
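+  // Pick the entrypoint offset matching the target's Thread layout (4- or 8-byte).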
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pLockObject), rl_src, true);
+  } else {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pLockObject), rl_src, true);
+  }
 }
 
 /* Call out to helper assembly routine that will null check obj and then unlock it. */
 void Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject), rl_src, true);
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject), rl_src, true);
+  } else {
+    CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject), rl_src, true);
+  }
 }
 
 /* Generic code for generating a wide constant into a VR. */
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index d321b00..7aaffcb 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -63,19 +63,46 @@
   AddSlowPath(new (arena_) IntrinsicSlowPathPath(this, info, branch, resume));
 }
 
+// Macro to help instantiate the runtime-helper templates for both pointer sizes.
+// TODO: This could be changed to instantiate only <4> on pure 32-bit systems.
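+// Example: INSTANTIATE(void Mir2Lir::CallRuntimeHelper, bool safepoint_pc) expands into
+// explicit instantiations for both ThreadOffset<4> and ThreadOffset<8>.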
+#define INSTANTIATE(sig_part1, ...) \
+  template sig_part1(ThreadOffset<4>, __VA_ARGS__); \
+  template sig_part1(ThreadOffset<8>, __VA_ARGS__);
+
 /*
  * To save scheduling time, helper calls are broken into two parts: generation of
  * the helper target address, and the actual call to the helper.  Because x86
  * has a memory call operation, part 1 is a NOP for x86.  For other targets,
  * load arguments between the two parts.
  */
 RegStorage Mir2Lir::CallHelperSetup(ThreadOffset<4> helper_offset) {
-  return (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) ? RegStorage::InvalidReg() : LoadHelper(helper_offset);
+  // All CallRuntimeHelperXXX call this first, so make a central check here.
+  DCHECK_EQ(4U, GetInstructionSetPointerSize(cu_->instruction_set));
+
+  if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
+    return RegStorage::InvalidReg();
+  } else {
+    return LoadHelper(helper_offset);
+  }
+}
+
+RegStorage Mir2Lir::CallHelperSetup(ThreadOffset<8> helper_offset) {
+  // All CallRuntimeHelperXXX call this first, so make a central check here.
+  DCHECK_EQ(8U, GetInstructionSetPointerSize(cu_->instruction_set));
+
+  if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
+    return RegStorage::InvalidReg();
+  } else {
+    return LoadHelper(helper_offset);
+  }
 }
 
 /* NOTE: if r_tgt is a temp, it will be freed following use */
-LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
-                         bool use_link) {
+template <size_t pointer_size>
+LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<pointer_size> helper_offset,
+                         bool safepoint_pc, bool use_link) {
   LIR* call_inst;
   OpKind op = use_link ? kOpBlx : kOpBx;
   if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
@@ -89,30 +116,41 @@
   }
   return call_inst;
 }
+template LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset,
+                                        bool safepoint_pc, bool use_link);
+template LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<8> helper_offset,
+                                        bool safepoint_pc, bool use_link);
 
-void Mir2Lir::CallRuntimeHelper(ThreadOffset<4> helper_offset, bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelper(ThreadOffset<pointer_size> helper_offset, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelper, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImm(ThreadOffset<pointer_size> helper_offset, int arg0,
+                                   bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImm, int arg0, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperReg(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
                                    bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperReg, RegStorage arg0, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
-                                           bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset<pointer_size> helper_offset,
+                                           RegLocation arg0, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg0.wide == 0) {
     LoadValueDirectFixed(arg0, TargetReg(kArg0));
@@ -121,19 +159,23 @@
     LoadValueDirectWideFixed(arg0, r_tmp);
   }
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegLocation, RegLocation arg0, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset<pointer_size> helper_offset, int arg0, int arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
   LoadConstant(TargetReg(kArg1), arg1);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmImm, int arg0, int arg1, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset<pointer_size> helper_offset, int arg0,
                                               RegLocation arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg1.wide == 0) {
@@ -144,46 +186,58 @@
   }
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmRegLocation, int arg0, RegLocation arg1,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
-                                              int arg1, bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset<pointer_size> helper_offset,
+                                              RegLocation arg0, int arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg0, TargetReg(kArg0));
   LoadConstant(TargetReg(kArg1), arg1);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegLocationImm, RegLocation arg0, int arg1,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
-                                      bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset<pointer_size> helper_offset, int arg0,
+                                      RegStorage arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg1), arg1);
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmReg, int arg0, RegStorage arg1, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
-                                      bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
+                                      int arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
   LoadConstant(TargetReg(kArg1), arg1);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegImm, RegStorage arg0, int arg1, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset<pointer_size> helper_offset, int arg0,
                                          bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmMethod, int arg0, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
                                          bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
@@ -192,11 +246,14 @@
   }
   LoadCurrMethodDirect(TargetReg(kArg1));
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegMethod, RegStorage arg0, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
-                                                    RegLocation arg2, bool safepoint_pc) {
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset<pointer_size> helper_offset,
+                                                    RegStorage arg0, RegLocation arg2,
+                                                    bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
   if (TargetReg(kArg0) != arg0) {
@@ -205,10 +262,13 @@
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadValueDirectFixed(arg2, TargetReg(kArg2));
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegMethodRegLocation, RegStorage arg0, RegLocation arg2,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                       RegLocation arg0, RegLocation arg1,
                                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
@@ -255,8 +315,10 @@
     }
   }
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegLocationRegLocation, RegLocation arg0,
+            RegLocation arg1, bool safepoint_pc)
 
 void Mir2Lir::CopyToArgumentRegs(RegStorage arg0, RegStorage arg1) {
   if (arg1.GetReg() == TargetReg(kArg0).GetReg()) {
@@ -275,48 +337,61 @@
   }
 }
 
-void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
                                       RegStorage arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   CopyToArgumentRegs(arg0, arg1);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegReg, RegStorage arg0, RegStorage arg1,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
                                          RegStorage arg1, int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   CopyToArgumentRegs(arg0, arg1);
   LoadConstant(TargetReg(kArg2), arg2);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegRegImm, RegStorage arg0, RegStorage arg1, int arg2,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                     int arg0, RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg2, TargetReg(kArg2));
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmMethodRegLocation, int arg0, RegLocation arg2,
+            bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset<pointer_size> helper_offset, int arg0,
                                             int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadConstant(TargetReg(kArg2), arg2);
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmMethodImm, int arg0, int arg2, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                          int arg0, RegLocation arg1,
                                                          RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
-  DCHECK_EQ(arg1.wide, 0U);
+  DCHECK_EQ(static_cast<unsigned int>(arg1.wide), 0U);  // The static_cast works around an
+                                                        // instantiation bug in GCC.
   LoadValueDirectFixed(arg1, TargetReg(kArg1));
   if (arg2.wide == 0) {
     LoadValueDirectFixed(arg2, TargetReg(kArg2));
@@ -326,23 +401,28 @@
   }
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation, int arg0, RegLocation arg1,
+            RegLocation arg2, bool safepoint_pc)
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
+template <size_t pointer_size>
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                                  RegLocation arg0, RegLocation arg1,
                                                                  RegLocation arg2,
                                                                  bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
-  DCHECK_EQ(arg0.wide, 0U);
+  DCHECK_EQ(static_cast<unsigned int>(arg0.wide), 0U);
   LoadValueDirectFixed(arg0, TargetReg(kArg0));
-  DCHECK_EQ(arg1.wide, 0U);
+  DCHECK_EQ(static_cast<unsigned int>(arg1.wide), 0U);
   LoadValueDirectFixed(arg1, TargetReg(kArg1));
-  DCHECK_EQ(arg1.wide, 0U);
+  DCHECK_EQ(static_cast<unsigned int>(arg2.wide), 0U);
   LoadValueDirectFixed(arg2, TargetReg(kArg2));
   ClobberCallerSave();
-  CallHelper(r_tgt, helper_offset, safepoint_pc);
+  CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc);
 }
+INSTANTIATE(void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation, RegLocation arg0,
+            RegLocation arg1, RegLocation arg2, bool safepoint_pc)
 
 /*
  * If there are any ins passed in registers that have not been promoted
@@ -627,7 +707,8 @@
   return state + 1;
 }
 
-static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info, ThreadOffset<4> trampoline,
+template <size_t pointer_size>
+static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info,
+                            ThreadOffset<pointer_size> trampoline,
                             int state, const MethodReference& target_method,
                             uint32_t method_idx) {
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
@@ -653,32 +734,52 @@
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
-  return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
+  if (Is64BitInstructionSet(cu->instruction_set)) {
+    ThreadOffset<8> trampoline = QUICK_ENTRYPOINT_OFFSET(8, pInvokeStaticTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<8>(cu, info, trampoline, state, target_method, 0);
+  } else {
+    ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<4>(cu, info, trampoline, state, target_method, 0);
+  }
 }
 
 static int NextDirectCallInsnSP(CompilationUnit* cu, CallInfo* info, int state,
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
-  return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
+  if (Is64BitInstructionSet(cu->instruction_set)) {
+    ThreadOffset<8> trampoline = QUICK_ENTRYPOINT_OFFSET(8, pInvokeDirectTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<8>(cu, info, trampoline, state, target_method, 0);
+  } else {
+    ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<4>(cu, info, trampoline, state, target_method, 0);
+  }
 }
 
 static int NextSuperCallInsnSP(CompilationUnit* cu, CallInfo* info, int state,
                                const MethodReference& target_method,
                                uint32_t unused, uintptr_t unused2,
                                uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
-  return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
+  if (Is64BitInstructionSet(cu->instruction_set)) {
+    ThreadOffset<8> trampoline = QUICK_ENTRYPOINT_OFFSET(8, pInvokeSuperTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<8>(cu, info, trampoline, state, target_method, 0);
+  } else {
+    ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<4>(cu, info, trampoline, state, target_method, 0);
+  }
 }
 
 static int NextVCallInsnSP(CompilationUnit* cu, CallInfo* info, int state,
                            const MethodReference& target_method,
                            uint32_t unused, uintptr_t unused2,
                            uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
-  return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
+  if (Is64BitInstructionSet(cu->instruction_set)) {
+    ThreadOffset<8> trampoline = QUICK_ENTRYPOINT_OFFSET(8, pInvokeVirtualTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<8>(cu, info, trampoline, state, target_method, 0);
+  } else {
+    ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<4>(cu, info, trampoline, state, target_method, 0);
+  }
 }
 
 static int NextInterfaceCallInsnWithAccessCheck(CompilationUnit* cu,
@@ -686,9 +787,13 @@
                                                 const MethodReference& target_method,
                                                 uint32_t unused, uintptr_t unused2,
                                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset<4> trampoline =
-      QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
-  return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
+  if (Is64BitInstructionSet(cu->instruction_set)) {
+    ThreadOffset<8> trampoline =
+        QUICK_ENTRYPOINT_OFFSET(8, pInvokeInterfaceTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<8>(cu, info, trampoline, state, target_method, 0);
+  } else {
+    ThreadOffset<4> trampoline =
+        QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
+    return NextInvokeInsnSP<4>(cu, info, trampoline, state, target_method, 0);
+  }
 }
 
 int Mir2Lir::LoadArgRegs(CallInfo* info, int call_state,
@@ -1010,8 +1115,13 @@
     // Generate memcpy
     OpRegRegImm(kOpAdd, TargetReg(kArg0), TargetReg(kSp), outs_offset);
     OpRegRegImm(kOpAdd, TargetReg(kArg1), TargetReg(kSp), start_offset);
-    CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(4, pMemcpy), TargetReg(kArg0),
-                               TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(8, pMemcpy), TargetReg(kArg0),
+                                 TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
+    } else {
+      CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(4, pMemcpy), TargetReg(kArg0),
+                                 TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
+    }
   }
 
   call_state = LoadArgRegs(info, call_state, next_call_insn,
@@ -1341,7 +1451,9 @@
     RegLocation rl_start = info->args[2];     // 3rd arg only present in III flavor of IndexOf.
     LoadValueDirectFixed(rl_start, reg_start);
   }
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pIndexOf));
+  RegStorage r_tgt = Is64BitInstructionSet(cu_->instruction_set) ?
+      LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pIndexOf)) :
+      LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pIndexOf));
   GenExplicitNullCheck(reg_ptr, info->opt_flags);
   LIR* high_code_point_branch =
       rl_char.is_const ? nullptr : OpCmpImmBranch(kCondGt, reg_char, 0xFFFF, nullptr);
@@ -1378,8 +1490,16 @@
   RegLocation rl_cmp = info->args[1];
   LoadValueDirectFixed(rl_this, reg_this);
   LoadValueDirectFixed(rl_cmp, reg_cmp);
-  RegStorage r_tgt = (cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64) ?
-      LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo)) : RegStorage::InvalidReg();
+  RegStorage r_tgt;
+  if (cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64) {
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pStringCompareTo));
+    } else {
+      r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo));
+    }
+  } else {
+    r_tgt = RegStorage::InvalidReg();
+  }
   GenExplicitNullCheck(reg_this, info->opt_flags);
   info->opt_flags |= MIR_IGNORE_NULL_CHECK;  // Record that we've null checked.
   // TUNING: check if rl_cmp.s_reg_low is already null checked
@@ -1389,7 +1509,11 @@
   if (cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64) {
     OpReg(kOpBlx, r_tgt);
   } else {
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo));
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(8, pStringCompareTo));
+    } else {
+      OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo));
+    }
   }
   RegLocation rl_return = GetReturn(false);
   RegLocation rl_dest = InlineTarget(info);
@@ -1400,12 +1524,32 @@
 bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
   RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  ThreadOffset<4> offset = Thread::PeerOffset<4>();
-  if (cu_->instruction_set == kThumb2 || cu_->instruction_set == kMips) {
-    Load32Disp(TargetReg(kSelf), offset.Int32Value(), rl_result.reg);
-  } else {
-    CHECK(cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64);
-    reinterpret_cast<X86Mir2Lir*>(this)->OpRegThreadMem(kOpMov, rl_result.reg, offset);
+
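+  // Thread::PeerOffset<> depends on the target pointer size, so select per ISA.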
+  switch (cu_->instruction_set) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      // Fall-through.
+    case kMips:
+      Load32Disp(TargetReg(kSelf), Thread::PeerOffset<4>().Int32Value(), rl_result.reg);
+      break;
+
+    case kArm64:
+      Load32Disp(TargetReg(kSelf), Thread::PeerOffset<8>().Int32Value(), rl_result.reg);
+      break;
+
+    case kX86:
+      reinterpret_cast<X86Mir2Lir*>(this)->OpRegThreadMem(kOpMov, rl_result.reg,
+                                                          Thread::PeerOffset<4>());
+      break;
+
+    case kX86_64:
+      reinterpret_cast<X86Mir2Lir*>(this)->OpRegThreadMem(kOpMov, rl_result.reg,
+                                                          Thread::PeerOffset<8>());
+      break;
+
+    default:
+      LOG(FATAL) << "Unexpected isa " << cu_->instruction_set;
   }
   StoreValue(rl_dest, rl_result);
   return true;
@@ -1519,6 +1663,31 @@
   GenInvokeNoInline(info);
 }
 
+template <size_t pointer_size>
+static LIR* GenInvokeNoInlineCall(Mir2Lir* mir_to_lir, InvokeType type) {
+  ThreadOffset<pointer_size> trampoline(-1);
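+  // Sentinel value; each valid invoke type below assigns a real entrypoint.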
+  switch (type) {
+    case kInterface:
+      trampoline = QUICK_ENTRYPOINT_OFFSET(pointer_size, pInvokeInterfaceTrampolineWithAccessCheck);
+      break;
+    case kDirect:
+      trampoline = QUICK_ENTRYPOINT_OFFSET(pointer_size, pInvokeDirectTrampolineWithAccessCheck);
+      break;
+    case kStatic:
+      trampoline = QUICK_ENTRYPOINT_OFFSET(pointer_size, pInvokeStaticTrampolineWithAccessCheck);
+      break;
+    case kSuper:
+      trampoline = QUICK_ENTRYPOINT_OFFSET(pointer_size, pInvokeSuperTrampolineWithAccessCheck);
+      break;
+    case kVirtual:
+      trampoline = QUICK_ENTRYPOINT_OFFSET(pointer_size, pInvokeVirtualTrampolineWithAccessCheck);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected invoke type";
+  }
+  return mir_to_lir->OpThreadMem(kOpBlx, trampoline);
+}
+
 void Mir2Lir::GenInvokeNoInline(CallInfo* info) {
   int call_state = 0;
   LIR* null_ck;
@@ -1586,27 +1755,12 @@
                           mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value());
       }
     } else {
-      ThreadOffset<4> trampoline(-1);
-      switch (info->type) {
-      case kInterface:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
-        break;
-      case kDirect:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
-        break;
-      case kStatic:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
-        break;
-      case kSuper:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
-        break;
-      case kVirtual:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
-        break;
-      default:
-        LOG(FATAL) << "Unexpected invoke type";
+      // TODO: Extract this pointer-size dispatch into a shared helper?
+      if (Is64BitInstructionSet(cu_->instruction_set)) {
+        call_inst = GenInvokeNoInlineCall<8>(this, info->type);
+      } else {
+        call_inst = GenInvokeNoInlineCall<4>(this, info->type);
       }
-      call_inst = OpThreadMem(kOpBlx, trampoline);
     }
   }
   MarkSafepointPC(call_inst);
diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc
index faa9461..8fcb09b 100644
--- a/compiler/dex/quick/gen_loadstore.cc
+++ b/compiler/dex/quick/gen_loadstore.cc
@@ -139,12 +139,25 @@
 }
 
 RegLocation Mir2Lir::LoadValue(RegLocation rl_src, RegisterClass op_kind) {
-  rl_src = EvalLoc(rl_src, op_kind, false);
-  if (IsInexpensiveConstant(rl_src) || rl_src.location != kLocPhysReg) {
-    LoadValueDirect(rl_src, rl_src.reg);
-    rl_src.location = kLocPhysReg;
-    MarkLive(rl_src);
+  rl_src = UpdateLoc(rl_src);
+  if (rl_src.location == kLocPhysReg) {
+    if (!RegClassMatches(op_kind, rl_src.reg)) {
+      // Wrong register class, realloc, copy and transfer ownership.
+      RegStorage new_reg = AllocTypedTemp(rl_src.fp, op_kind);
+      OpRegCopy(new_reg, rl_src.reg);
+      // Associate the old sreg with the new register and clobber the old register.
+      GetRegInfo(new_reg)->SetSReg(GetRegInfo(rl_src.reg)->SReg());
+      Clobber(rl_src.reg);
+      rl_src.reg = new_reg;
+    }
+    return rl_src;
   }
+
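+  // Not in a physical register: allocate a typed temp and load the value into it.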
+  DCHECK_NE(rl_src.s_reg_low, INVALID_SREG);
+  rl_src.reg = AllocTypedTemp(rl_src.fp, op_kind);
+  LoadValueDirect(rl_src, rl_src.reg);
+  rl_src.location = kLocPhysReg;
+  MarkLive(rl_src);
   return rl_src;
 }
 
@@ -203,12 +216,26 @@
 
 RegLocation Mir2Lir::LoadValueWide(RegLocation rl_src, RegisterClass op_kind) {
   DCHECK(rl_src.wide);
-  rl_src = EvalLoc(rl_src, op_kind, false);
-  if (IsInexpensiveConstant(rl_src) || rl_src.location != kLocPhysReg) {
-    LoadValueDirectWide(rl_src, rl_src.reg);
-    rl_src.location = kLocPhysReg;
-    MarkLive(rl_src);
+  rl_src = UpdateLocWide(rl_src);
+  if (rl_src.location == kLocPhysReg) {
+    if (!RegClassMatches(op_kind, rl_src.reg)) {
+      // Wrong register class, realloc, copy and transfer ownership.
+      RegStorage new_regs = AllocTypedTempWide(rl_src.fp, op_kind);
+      OpRegCopyWide(new_regs, rl_src.reg);
+      // Associate the old sreg with the new register and clobber the old register.
+      GetRegInfo(new_regs)->SetSReg(GetRegInfo(rl_src.reg)->SReg());
+      Clobber(rl_src.reg);
+      rl_src.reg = new_regs;
+    }
+    return rl_src;
   }
+
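+  // Not in physical registers: allocate a typed wide temp and load the value into it.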
+  DCHECK_NE(rl_src.s_reg_low, INVALID_SREG);
+  DCHECK_NE(GetSRegHi(rl_src.s_reg_low), INVALID_SREG);
+  rl_src.reg = AllocTypedTempWide(rl_src.fp, op_kind);
+  LoadValueDirectWide(rl_src, rl_src.reg);
+  rl_src.location = kLocPhysReg;
+  MarkLive(rl_src);
   return rl_src;
 }
 
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 90d5a28..b7ea34f 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -31,7 +31,8 @@
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset<4> offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE;
+    RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE;
     LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -171,12 +172,14 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) OVERRIDE;
+    LIR* OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) OVERRIDE;
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset<4> offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val) OVERRIDE;
+    void OpTlsCmp(ThreadOffset<8> offset, int val) OVERRIDE;
 
     // TODO: collapse r_dest.
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index fdfe7fe..55e93d7 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -269,6 +269,10 @@
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
 }
 
+void MipsMir2Lir::OpTlsCmp(ThreadOffset<8> offset, int val) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+}
+
 bool MipsMir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
   DCHECK_NE(cu_->instruction_set, kThumb2);
   return false;
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index 570c220..2821209 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -512,6 +512,11 @@
   return rs_rT9;
 }
 
+RegStorage MipsMir2Lir::LoadHelper(ThreadOffset<8> offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return RegStorage::InvalidReg();
+}
+
 LIR* MipsMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = AllocTemp();
   // NOTE: native pointer.
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index 58fbace..2757b7b 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -670,6 +670,11 @@
   return NULL;
 }
 
+LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) {
+  UNIMPLEMENTED(FATAL) << "Should not be called.";
+  return nullptr;
+}
+
 LIR* MipsMir2Lir::OpMem(OpKind op, RegStorage r_base, int disp) {
   LOG(FATAL) << "Unexpected use of OpMem for MIPS";
   return NULL;
diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h
index b5b50a4..2973e14 100644
--- a/compiler/dex/quick/mir_to_lir-inl.h
+++ b/compiler/dex/quick/mir_to_lir-inl.h
@@ -25,7 +25,8 @@
 
 /* Mark a temp register as dead.  Does not affect allocation state. */
 inline void Mir2Lir::ClobberBody(RegisterInfo* p) {
-  if (p->IsTemp()) {
+  DCHECK(p->IsTemp());
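+  // Callers now guarantee p is a temp; nothing to do if no part is still live.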
+  if (!p->IsDead()) {
     DCHECK(!(p->IsLive() && p->IsDirty()))  << "Live & dirty temp in clobber";
     p->MarkDead();
     p->ResetDefBody();
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 0ffd189..77119a4 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -979,7 +979,7 @@
   }
 
   // Free temp registers and reset redundant store tracking.
-  ClobberAllRegs();
+  ClobberAllTemps();
 
   if (bb->block_type == kEntryBlock) {
     ResetRegPool();
@@ -994,7 +994,7 @@
   for (mir = bb->first_mir_insn; mir != NULL; mir = mir->next) {
     ResetRegPool();
     if (cu_->disable_opt & (1 << kTrackLiveTemps)) {
-      ClobberAllRegs();
+      ClobberAllTemps();
       // Reset temp allocation to minimize differences when A/B testing.
       reg_pool_->ResetNextTemp();
     }
@@ -1074,7 +1074,7 @@
   // Free temp registers and reset redundant store tracking.
   ResetRegPool();
   ResetDefTracking();
-  ClobberAllRegs();
+  ClobberAllTemps();
 
   return GenSpecialCase(bb, mir, special);
 }
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 74245a4..77e5649 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -333,6 +333,9 @@
       bool InUse() { return (storage_mask_ & master_->used_storage_) != 0; }
       void MarkInUse() { master_->used_storage_ |= storage_mask_; }
       void MarkFree() { master_->used_storage_ &= ~storage_mask_; }
+      // No part of the containing storage is live in this view.
+      bool IsDead() { return (master_->liveness_ & storage_mask_) == 0; }
+      // Every storage unit covered by this view is live.  Note: not equivalent to !IsDead().
       bool IsLive() { return (master_->liveness_ & storage_mask_) == storage_mask_; }
       void MarkLive() { master_->liveness_ |= storage_mask_; }
       void MarkDead() {
@@ -358,9 +361,13 @@
         master_ = master;
         if (master != this) {
           master_->aliased_ = true;
+          DCHECK(alias_chain_ == nullptr);
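+          // Link this view into the master's alias chain (walked by ClobberAliases()).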
+          alias_chain_ = master_->alias_chain_;
+          master_->alias_chain_ = this;
         }
       }
       bool IsAliased() { return aliased_; }
+      RegisterInfo* GetAliasChain() { return alias_chain_; }
       uint32_t StorageMask() { return storage_mask_; }
       void SetStorageMask(uint32_t storage_mask) { storage_mask_ = storage_mask; }
       LIR* DefStart() { return def_start_; }
@@ -385,6 +392,7 @@
       uint32_t storage_mask_;      // Track allocation of sub-units.
       LIR *def_start_;             // Starting inst in last def sequence.
       LIR *def_end_;               // Ending inst in last def sequence.
+      RegisterInfo* alias_chain_;  // Chain of aliased registers.
     };
 
     class RegisterPool {
@@ -462,7 +470,7 @@
      public:
       LIRSlowPath(Mir2Lir* m2l, const DexOffset dexpc, LIR* fromfast,
                   LIR* cont = nullptr) :
-        m2l_(m2l), current_dex_pc_(dexpc), fromfast_(fromfast), cont_(cont) {
+        m2l_(m2l), cu_(m2l->cu_), current_dex_pc_(dexpc), fromfast_(fromfast), cont_(cont) {
       }
       virtual ~LIRSlowPath() {}
       virtual void Compile() = 0;
@@ -475,6 +483,7 @@
       LIR* GenerateTargetLabel(int opcode = kPseudoTargetLabel);
 
       Mir2Lir* const m2l_;
+      CompilationUnit* const cu_;
       const DexOffset current_dex_pc_;
       LIR* const fromfast_;
       LIR* const cont_;
@@ -655,7 +664,7 @@
     void ResetDefLoc(RegLocation rl);
     void ResetDefLocWide(RegLocation rl);
     void ResetDefTracking();
-    void ClobberAllRegs();
+    void ClobberAllTemps();
     void FlushSpecificReg(RegisterInfo* info);
     void FlushAllRegs();
     bool RegClassMatches(int reg_class, RegStorage reg);
@@ -673,9 +682,9 @@
     RegLocation UpdateRawLoc(RegLocation loc);
 
     /**
-     * @brief Used to load register location into a typed temporary or pair of temporaries.
+     * @brief Used to prepare a register location to receive a wide value.
      * @see EvalLoc
-     * @param loc The register location to load from.
+     * @param loc The location where the value will be stored.
      * @param reg_class Type of register needed.
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register pairs.
@@ -683,8 +692,8 @@
     RegLocation EvalLocWide(RegLocation loc, int reg_class, bool update);
 
     /**
-     * @brief Used to load register location into a typed temporary.
-     * @param loc The register location to load from.
+     * @brief Used to prepare a register location to receive a value.
+     * @param loc The location where the value will be stored.
      * @param reg_class Type of register needed.
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register.
@@ -756,7 +765,8 @@
                           RegLocation rl_src, int lit);
     void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_src2);
-    void GenConversionCall(ThreadOffset<4> func_offset, RegLocation rl_dest,
+    template <size_t pointer_size>
+    void GenConversionCall(ThreadOffset<pointer_size> func_offset, RegLocation rl_dest,
                            RegLocation rl_src);
     void GenSuspendTest(int opt_flags);
     void GenSuspendTestAndBranch(int opt_flags, LIR* target);
@@ -767,45 +777,66 @@
                        RegLocation rl_src1, RegLocation rl_src2);
 
     // Shared by all targets - implemented in gen_invoke.cc.
-    LIR* CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
+    template <size_t pointer_size>
+    LIR* CallHelper(RegStorage r_tgt, ThreadOffset<pointer_size> helper_offset, bool safepoint_pc,
                     bool use_link = true);
     RegStorage CallHelperSetup(ThreadOffset<4> helper_offset);
-    void CallRuntimeHelper(ThreadOffset<4> helper_offset, bool safepoint_pc);
-    void CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc);
-    void CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0, bool safepoint_pc);
-    void CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
+    RegStorage CallHelperSetup(ThreadOffset<8> helper_offset);
+    template <size_t pointer_size>
+    void CallRuntimeHelper(ThreadOffset<pointer_size> helper_offset, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperImm(ThreadOffset<pointer_size> helper_offset, int arg0, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperReg(ThreadOffset<pointer_size> helper_offset, RegStorage arg0, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegLocation(ThreadOffset<pointer_size> helper_offset, RegLocation arg0,
                                       bool safepoint_pc);
-    void CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmImm(ThreadOffset<pointer_size> helper_offset, int arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmRegLocation(ThreadOffset<pointer_size> helper_offset, int arg0,
                                          RegLocation arg1, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegLocationImm(ThreadOffset<pointer_size> helper_offset, RegLocation arg0,
                                          int arg1, bool safepoint_pc);
-    void CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmReg(ThreadOffset<pointer_size> helper_offset, int arg0, RegStorage arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegImm(ThreadOffset<pointer_size> helper_offset, RegStorage arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmMethod(ThreadOffset<pointer_size> helper_offset, int arg0,
                                     bool safepoint_pc);
-    void CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegMethod(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
                                     bool safepoint_pc);
-    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
-                                               RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset<pointer_size> helper_offset,
+                                               RegStorage arg0, RegLocation arg2, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                  RegLocation arg0, RegLocation arg1,
                                                  bool safepoint_pc);
-    void CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
-                                 bool safepoint_pc);
-    void CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
-                                    int arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset, int arg0,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegReg(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
+                                 RegStorage arg1, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegRegImm(ThreadOffset<pointer_size> helper_offset, RegStorage arg0,
+                                    RegStorage arg1, int arg2, bool safepoint_pc);
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset<pointer_size> helper_offset, int arg0,
                                                RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0, int arg2,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmMethodImm(ThreadOffset<pointer_size> helper_offset, int arg0, int arg2,
                                        bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
+    template <size_t pointer_size>
+    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                     int arg0, RegLocation arg1, RegLocation arg2,
                                                     bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
+    template <size_t pointer_size>
+    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<pointer_size> helper_offset,
                                                             RegLocation arg0, RegLocation arg1,
                                                             RegLocation arg2,
                                                             bool safepoint_pc);
@@ -1002,7 +1033,10 @@
                                     RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
+
     virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0;
+    virtual RegStorage LoadHelper(ThreadOffset<8> offset) = 0;
+
     virtual LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
                                       OpSize size) = 0;
     virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -1233,12 +1267,14 @@
                              RegStorage r_src2) = 0;
     virtual LIR* OpTestSuspend(LIR* target) = 0;
     virtual LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) = 0;
+    virtual LIR* OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) = 0;
     virtual LIR* OpVldm(RegStorage r_base, int count) = 0;
     virtual LIR* OpVstm(RegStorage r_base, int count) = 0;
     virtual void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale,
                        int offset) = 0;
     virtual void OpRegCopyWide(RegStorage dest, RegStorage src) = 0;
     virtual void OpTlsCmp(ThreadOffset<4> offset, int val) = 0;
+    virtual void OpTlsCmp(ThreadOffset<8> offset, int val) = 0;
     virtual bool InexpensiveConstantInt(int32_t value) = 0;
     virtual bool InexpensiveConstantFloat(int32_t value) = 0;
     virtual bool InexpensiveConstantLong(int64_t value) = 0;
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index ca9a3ab..bcc077b 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -40,7 +40,8 @@
 
 Mir2Lir::RegisterInfo::RegisterInfo(RegStorage r, uint64_t mask)
   : reg_(r), is_temp_(false), wide_value_(false), dirty_(false), aliased_(false), partner_(r),
-    s_reg_(INVALID_SREG), def_use_mask_(mask), master_(this) {
+    s_reg_(INVALID_SREG), def_use_mask_(mask), master_(this), def_start_(nullptr),
+    def_end_(nullptr), alias_chain_(nullptr) {
   switch (r.StorageSize()) {
     case 0: storage_mask_ = 0xffffffff; break;
     case 4: storage_mask_ = 0x00000001; break;
@@ -66,9 +67,13 @@
     next_sp_reg_(0), dp_regs_(arena, dp_regs.size()), next_dp_reg_(0), m2l_(m2l)  {
   // Initialize the fast lookup map.
   m2l_->reginfo_map_.Reset();
-  m2l_->reginfo_map_.Resize(RegStorage::kMaxRegs);
-  for (unsigned i = 0; i < RegStorage::kMaxRegs; i++) {
-    m2l_->reginfo_map_.Insert(nullptr);
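+  // Debug builds null every slot so uses of stale entries fail fast; release builds
+  // only set the size and skip the per-slot stores.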
+  if (kIsDebugBuild) {
+    m2l_->reginfo_map_.Resize(RegStorage::kMaxRegs);
+    for (unsigned i = 0; i < RegStorage::kMaxRegs; i++) {
+      m2l_->reginfo_map_.Insert(nullptr);
+    }
+  } else {
+    m2l_->reginfo_map_.SetSize(RegStorage::kMaxRegs);
   }
 
   // Construct the register pool.
@@ -139,29 +144,43 @@
 }
 
 void Mir2Lir::Clobber(RegStorage reg) {
-  if (reg.IsPair()) {
+  if (UNLIKELY(reg.IsPair())) {
     DCHECK(!GetRegInfo(reg.GetLow())->IsAliased());
-    ClobberBody(GetRegInfo(reg.GetLow()));
+    Clobber(reg.GetLow());
     DCHECK(!GetRegInfo(reg.GetHigh())->IsAliased());
-    ClobberBody(GetRegInfo(reg.GetHigh()));
+    Clobber(reg.GetHigh());
   } else {
     RegisterInfo* info = GetRegInfo(reg);
-    if (info->IsAliased()) {
-      ClobberAliases(info);
-    } else if (info != info->Master() && info->Master()->SReg() != INVALID_SREG) {
-      ClobberBody(info->Master());
+    if (info->IsTemp() && !info->IsDead()) {
+      ClobberBody(info);
+      if (info->IsAliased()) {
+        ClobberAliases(info);
+      } else {
+        RegisterInfo* master = info->Master();
+        if (info != master) {
+          ClobberBody(info->Master());
+        }
+      }
     }
-    ClobberBody(info);
   }
 }
 
 void Mir2Lir::ClobberAliases(RegisterInfo* info) {
-  DCHECK(info->IsAliased());
-  GrowableArray<RegisterInfo*>::Iterator iter(&tempreg_info_);
-  for (RegisterInfo* tmpreg_info = iter.Next(); tmpreg_info != NULL; tmpreg_info = iter.Next()) {
-    if (tmpreg_info->Master() == info) {
-      // tmpreg_info is an alias of info.
-      ClobberBody(tmpreg_info);
+  for (RegisterInfo* alias = info->GetAliasChain(); alias != nullptr;
+       alias = alias->GetAliasChain()) {
+    DCHECK(!alias->IsAliased());  // Only the master should be marked as aliased.
+    if (alias->SReg() != INVALID_SREG) {
+      alias->SetSReg(INVALID_SREG);
+      alias->ResetDefBody();
+      if (alias->IsWide()) {
+        alias->SetIsWide(false);
+        if (alias->GetReg() != alias->Partner()) {
+          RegisterInfo* p = GetRegInfo(alias->Partner());
+          p->SetIsWide(false);
+          p->MarkDead();
+          p->ResetDefBody();
+        }
+      }
     }
   }
 }
@@ -185,11 +204,10 @@
     GrowableArray<RegisterInfo*>::Iterator iter(&tempreg_info_);
     for (RegisterInfo* info = iter.Next(); info != NULL; info = iter.Next()) {
       if (info->SReg() == s_reg) {
+        ClobberBody(info);
         if (info->IsAliased()) {
-          // TUNING: if this gets hot, we could add links to follow - aliasing is static.
           ClobberAliases(info);
         }
-        ClobberBody(info);
       }
     }
   }
@@ -645,7 +663,7 @@
   }
 }
 
-void Mir2Lir::ClobberAllRegs() {
+void Mir2Lir::ClobberAllTemps() {
   GrowableArray<RegisterInfo*>::Iterator iter(&tempreg_info_);
   for (RegisterInfo* info = iter.Next(); info != NULL; info = iter.Next()) {
     ClobberBody(info);
@@ -703,10 +721,9 @@
 void Mir2Lir::FlushAllRegs() {
   GrowableArray<RegisterInfo*>::Iterator it(&tempreg_info_);
   for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
-    if (info->IsLive() && info->IsDirty()) {
+    if (info->IsDirty() && info->IsLive()) {
       FlushSpecificReg(info);
     }
-    DCHECK(info->IsTemp());
     info->MarkDead();
     info->SetSReg(INVALID_SREG);
     info->ResetDefBody();
@@ -937,9 +954,8 @@
   /* If already in registers, we can assume proper form.  Right reg class? */
   if (loc.location == kLocPhysReg) {
     if (!RegClassMatches(reg_class, loc.reg)) {
-      /* Wrong register class.  Reallocate and copy */
+      // Wrong register class.  Reallocate and transfer ownership.
       RegStorage new_regs = AllocTypedTempWide(loc.fp, reg_class);
-      OpRegCopyWide(new_regs, loc.reg);
       // Associate the old sreg with the new register and clobber the old register.
       GetRegInfo(new_regs)->SetSReg(GetRegInfo(loc.reg)->SReg());
       Clobber(loc.reg);
@@ -971,9 +987,8 @@
 
   if (loc.location == kLocPhysReg) {
     if (!RegClassMatches(reg_class, loc.reg)) {
-      /* Wrong register class.  Realloc, copy and transfer ownership */
+      // Wrong register class.  Reallocate and transfer ownership.
       RegStorage new_reg = AllocTypedTemp(loc.fp, reg_class);
-      OpRegCopy(new_reg, loc.reg);
       // Associate the old sreg with the new register and clobber the old register.
       GetRegInfo(new_reg)->SetSReg(GetRegInfo(loc.reg)->SReg());
       Clobber(loc.reg);
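
Reviewer note: ClobberAliases() now walks an intrusive chain hung off the master
RegisterInfo instead of scanning every entry in tempreg_info_. A minimal
standalone sketch of the pattern (illustrative names, not the actual ART types):

    struct RegInfo {
      int s_reg;             // Symbolic register tracked by this view, -1 if none.
      bool wide;             // Part of a wide (64-bit) value.
      RegInfo* alias_chain;  // Next alias of the same physical container.
    };

    // O(number of aliases) walk, replacing the old O(number of temps) scan.
    // Only the master holds the chain; aliases are plain list nodes.
    void ClobberAliases(RegInfo* master) {
      for (RegInfo* alias = master->alias_chain; alias != nullptr;
           alias = alias->alias_chain) {
        if (alias->s_reg != -1) {
          alias->s_reg = -1;   // The alias no longer names a live value.
          alias->wide = false;
        }
      }
    }

This is what the removed TUNING comment anticipated: aliasing is static, so the
links can be built once when the register pool is constructed.
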
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index f701a1f..cf2b10a 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -161,7 +161,9 @@
 }
 
 void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
+  int ex_offset = Is64BitInstructionSet(cu_->instruction_set) ?
+      Thread::ExceptionOffset<8>().Int32Value() :
+      Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
   NewLIR2(kX86Mov32TI, ex_offset, 0);
@@ -175,7 +177,10 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), Thread::CardTableOffset<4>().Int32Value());
+  int ct_offset = Is64BitInstructionSet(cu_->instruction_set) ?
+      Thread::CardTableOffset<8>().Int32Value() :
+      Thread::CardTableOffset<4>().Int32Value();
+  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -222,10 +227,14 @@
         GenerateTargetLabel(kPseudoThrowTarget);
         m2l_->OpRegImm(kOpAdd, rs_rX86_SP, sp_displace_);
         m2l_->ClobberCallerSave();
-        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
         // Assumes codegen and target are in thumb2 mode.
-        m2l_->CallHelper(RegStorage::InvalidReg(), func_offset, false /* MarkSafepointPC */,
-                         false /* UseLink */);
+        if (Is64BitInstructionSet(cu_->instruction_set)) {
+          m2l_->CallHelper(RegStorage::InvalidReg(), QUICK_ENTRYPOINT_OFFSET(8, pThrowStackOverflow),
+                           false /* MarkSafepointPC */, false /* UseLink */);
+        } else {
+          m2l_->CallHelper(RegStorage::InvalidReg(), QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow),
+                           false /* MarkSafepointPC */, false /* UseLink */);
+        }
       }
 
      private:
@@ -240,9 +249,15 @@
     // in case a signal comes in that's not using an alternate signal stack and the large frame may
     // have moved us outside of the reserved area at the end of the stack.
     // cmp rX86_SP, fs:[stack_end_]; jcc throw_slowpath
-    OpRegThreadMem(kOpCmp, rs_rX86_SP, Thread::StackEndOffset<4>());
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      OpRegThreadMem(kOpCmp, rs_rX86_SP, Thread::StackEndOffset<8>());
+    } else {
+      OpRegThreadMem(kOpCmp, rs_rX86_SP, Thread::StackEndOffset<4>());
+    }
     LIR* branch = OpCondBranch(kCondUlt, nullptr);
-    AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_ - 4));
+    AddSlowPath(new (arena_) StackOverflowSlowPath(this, branch,
+                                                   frame_size_ -
+                                                   GetInstructionSetPointerSize(cu_->instruction_set)));
   }
 
   FlushIns(ArgLocs, rl_method);
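
Reviewer note: the x86 changes all follow one shape: branch once on
Is64BitInstructionSet() and pick the ThreadOffset<4> or ThreadOffset<8>
overload, each of which DCHECKs the instruction set. A sketch of the pattern in
isolation (simplified stand-ins, not the real art::Thread layout; the offset
values below are hypothetical):

    #include <cstddef>

    template <size_t kPointerSize>
    struct ThreadOffset {              // Stand-in for art::ThreadOffset<n>.
      explicit ThreadOffset(int value) : value_(value) {}
      int Int32Value() const { return value_; }
      int value_;
    };

    enum InstructionSet { kX86, kX86_64 };
    inline bool Is64BitInstructionSet(InstructionSet isa) { return isa == kX86_64; }

    // Hypothetical offsets; the real values come from the Thread layout.
    ThreadOffset<4> ExceptionOffset32() { return ThreadOffset<4>(8); }
    ThreadOffset<8> ExceptionOffset64() { return ThreadOffset<8>(16); }

    int GetExceptionOffset(InstructionSet isa) {
      // One runtime branch selects the statically-typed offset, so each
      // backend overload can assert it was called for the right ISA.
      return Is64BitInstructionSet(isa) ? ExceptionOffset64().Int32Value()
                                        : ExceptionOffset32().Int32Value();
    }
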
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 9648312..11e7ff9 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -31,7 +31,8 @@
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset<4> offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE;
+    RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE;
     LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -245,14 +246,17 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) OVERRIDE;
+    LIR* OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) OVERRIDE;
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset<4> offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val) OVERRIDE;
+    void OpTlsCmp(ThreadOffset<8> offset, int val) OVERRIDE;
 
     void OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<4> thread_offset);
+    void OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<8> thread_offset);
     void SpillCoreRegs();
     void UnSpillCoreRegs();
     static const X86EncodingMap EncodingMap[kX86Last];
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 698fce4..368234e 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -724,6 +724,12 @@
 }
 
 void X86Mir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
+  DCHECK_EQ(kX86, cu_->instruction_set);
+  NewLIR2(kX86Cmp16TI8, offset.Int32Value(), val);
+}
+
+void X86Mir2Lir::OpTlsCmp(ThreadOffset<8> offset, int val) {
+  DCHECK_EQ(kX86_64, cu_->instruction_set);
   NewLIR2(kX86Cmp16TI8, offset.Int32Value(), val);
 }
 
@@ -956,7 +962,11 @@
 
 // Test suspend flag, return target of taken suspend branch
 LIR* X86Mir2Lir::OpTestSuspend(LIR* target) {
-  OpTlsCmp(Thread::ThreadFlagsOffset<4>(), 0);
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    OpTlsCmp(Thread::ThreadFlagsOffset<8>(), 0);
+  } else {
+    OpTlsCmp(Thread::ThreadFlagsOffset<4>(), 0);
+  }
   return OpCondBranch((target == NULL) ? kCondNe : kCondEq, target);
 }
 
@@ -1196,7 +1206,7 @@
   if (rl_src.location == kLocPhysReg) {
     // Both operands are in registers.
     // But we must ensure that rl_src is in pair
-    rl_src = EvalLocWide(rl_src, kCoreReg, true);
+    rl_src = LoadValueWide(rl_src, kCoreReg);
     if (rl_dest.reg.GetLowReg() == rl_src.reg.GetHighReg()) {
       // The registers are the same, so we would clobber it before the use.
       RegStorage temp_reg = AllocTemp();
@@ -1365,6 +1375,20 @@
 }
 
 void X86Mir2Lir::OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<4> thread_offset) {
+  DCHECK_EQ(kX86, cu_->instruction_set);
+  X86OpCode opcode = kX86Bkpt;
+  switch (op) {
+  case kOpCmp: opcode = kX86Cmp32RT;  break;
+  case kOpMov: opcode = kX86Mov32RT;  break;
+  default:
+    LOG(FATAL) << "Bad opcode: " << op;
+    break;
+  }
+  NewLIR2(opcode, r_dest.GetReg(), thread_offset.Int32Value());
+}
+
+void X86Mir2Lir::OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<8> thread_offset) {
+  DCHECK_EQ(kX86_64, cu_->instruction_set);
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
   case kOpCmp: opcode = kX86Cmp32RT;  break;
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index c401baf..2db9845 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -589,6 +589,12 @@
   return RegStorage::InvalidReg();
 }
 
+// Not used in x86
+RegStorage X86Mir2Lir::LoadHelper(ThreadOffset<8> offset) {
+  LOG(FATAL) << "Unexpected use of LoadHelper in x86";
+  return RegStorage::InvalidReg();
+}
+
 LIR* X86Mir2Lir::CheckSuspendUsingLoad() {
   LOG(FATAL) << "Unexpected use of CheckSuspendUsingLoad in x86";
   return nullptr;
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index a4e1255..1da4f17 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -472,6 +472,20 @@
 }
 
 LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
+  DCHECK_EQ(kX86, cu_->instruction_set);
+  X86OpCode opcode = kX86Bkpt;
+  switch (op) {
+    case kOpBlx: opcode = kX86CallT;  break;
+    case kOpBx: opcode = kX86JmpT;  break;
+    default:
+      LOG(FATAL) << "Bad opcode: " << op;
+      break;
+  }
+  return NewLIR1(opcode, thread_offset.Int32Value());
+}
+
+LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<8> thread_offset) {
+  DCHECK_EQ(kX86_64, cu_->instruction_set);
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
     case kOpBlx: opcode = kX86CallT;  break;
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index df5aa7b..979f516 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -22,14 +22,14 @@
 
 /*
  * 16-bit representation of the physical register container holding a Dalvik value.
- * The encoding allows up to 32 physical elements per storage class, and supports eight
+ * The encoding allows up to 64 physical elements per storage class, and supports eight
  * register container shapes.
  *
- * [V] [D] [HHHHH] [SSS] [F] [LLLLL]
+ * [V] [HHHHH] [SSS] [F] [LLLLLL]
  *
- * [LLLLL]
+ * [LLLLLL]
  *  Physical register number for the low or solo register.
- *    0..31
+ *    0..63
  *
  * [F]
  *  Describes type of the [LLLLL] register.
@@ -51,19 +51,13 @@
  *  Physical register number of the high register (valid only for register pair).
  *    0..31
  *
- * [D]
- *  Describes type of the [HHHHH] register (valid only for register pair).
- *    0: Core
- *    1: Floating point
- *
  * [V]
  *    0 -> Invalid
  *    1 -> Valid
  *
  * Note that in all non-invalid cases, we can determine if the storage is floating point
- * by testing bit 6.  Though a mismatch appears to be permitted by the format, the [F][D] values
- * from each half of a pair must match (this allows the high and low regs of a pair to be more
- * easily individually manipulated).
+ * by testing bit 7.  Note also that a register pair is effectively limited to a pair of
+ * physical register numbers in the 0..31 range.
  *
  * On some target architectures, the same underlying physical register container can be given
  * different views.  For example, Arm's 32-bit single-precision floating point registers
@@ -82,30 +76,30 @@
     kValidMask     = 0x8000,
     kValid         = 0x8000,
     kInvalid       = 0x0000,
-    kShapeMask     = 0x01c0,
-    k32BitSolo     = 0x0040,
-    k64BitSolo     = 0x0080,
-    k64BitPair     = 0x00c0,
-    k128BitSolo    = 0x0100,
-    k256BitSolo    = 0x0140,
-    k512BitSolo    = 0x0180,
-    k1024BitSolo   = 0x01c0,
-    k64BitMask     = 0x0180,
-    k64Bits        = 0x0080,
-    kShapeTypeMask = 0x01e0,
-    kFloatingPoint = 0x0020,
+    kShapeMask     = 0x0380,
+    k32BitSolo     = 0x0080,
+    k64BitSolo     = 0x0100,
+    k64BitPair     = 0x0180,
+    k128BitSolo    = 0x0200,
+    k256BitSolo    = 0x0280,
+    k512BitSolo    = 0x0300,
+    k1024BitSolo   = 0x0380,
+    k64BitMask     = 0x0300,
+    k64Bits        = 0x0100,
+    kShapeTypeMask = 0x03c0,
+    kFloatingPoint = 0x0040,
     kCoreRegister  = 0x0000,
   };
 
-  static const uint16_t kRegValMask  = 0x01ff;  // Num, type and shape.
-  static const uint16_t kRegTypeMask = 0x003f;  // Num and type.
-  static const uint16_t kRegNumMask  = 0x001f;  // Num only.
+  static const uint16_t kRegValMask  = 0x03ff;     // Num, type and shape.
+  static const uint16_t kRegTypeMask = 0x007f;     // Num and type.
+  static const uint16_t kRegNumMask  = 0x003f;     // Num only.
+  static const uint16_t kHighRegNumMask = 0x001f;  // 0..31 for high reg
   static const uint16_t kMaxRegs     = kRegValMask + 1;
-  // TODO: deprecate use of kInvalidRegVal and speed up GetReg().
-  static const uint16_t kInvalidRegVal = 0x01ff;
-  static const uint16_t kHighRegShift = 9;
-  static const uint16_t kShapeMaskShift = 6;
-  static const uint16_t kHighRegMask = (kRegTypeMask << kHighRegShift);
+  // TODO: deprecate use of kInvalidRegVal and speed up GetReg().  Rely on valid bit instead.
+  static const uint16_t kInvalidRegVal = 0x03ff;
+  static const uint16_t kHighRegShift = 10;
+  static const uint16_t kHighRegMask = (kHighRegNumMask << kHighRegShift);
 
   // Reg is [F][LLLLL], will override any existing shape and use rs_kind.
   RegStorage(RegStorageKind rs_kind, int reg) {
@@ -116,7 +110,9 @@
   RegStorage(RegStorageKind rs_kind, int low_reg, int high_reg) {
     DCHECK_EQ(rs_kind, k64BitPair);
     DCHECK_EQ(low_reg & kFloatingPoint, high_reg & kFloatingPoint);
-    reg_ = kValid | rs_kind | ((high_reg & kRegTypeMask) << kHighRegShift) | (low_reg & kRegTypeMask);
+    DCHECK_LE(high_reg & kRegNumMask, kHighRegNumMask) << "High reg must be in 0..31";
+    reg_ = kValid | rs_kind | ((high_reg & kHighRegNumMask) << kHighRegShift) |
+        (low_reg & kRegTypeMask);
   }
   constexpr explicit RegStorage(uint16_t val) : reg_(val) {}
   RegStorage() : reg_(kInvalid) {}
@@ -206,7 +202,7 @@
   // Retrieve the most significant register of a pair.
   int GetHighReg() const {
     DCHECK(IsPair());
-    return k32BitSolo | ((reg_ & kHighRegMask) >> kHighRegShift);
+    return k32BitSolo | ((reg_ & kHighRegMask) >> kHighRegShift) | (reg_ & kFloatingPoint);
   }
 
   // Create a stand-alone RegStorage from the high reg of a pair.
@@ -217,7 +213,7 @@
 
   void SetHighReg(int reg) {
     DCHECK(IsPair());
-    reg_ = (reg_ & ~kHighRegMask) | ((reg & kRegTypeMask) << kHighRegShift);
+    reg_ = (reg_ & ~kHighRegMask) | ((reg & kHighRegNumMask) << kHighRegShift);
   }
 
   // Return the register number of low or solo.
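
Reviewer note: a quick self-contained check of the widened layout, using the
mask values from this hunk (the MakePair helper is illustrative; RegStorage
itself does this in its pair constructor):

    #include <cassert>
    #include <cstdint>

    constexpr uint16_t kValid          = 0x8000;
    constexpr uint16_t k64BitPair      = 0x0180;
    constexpr uint16_t kFloatingPoint  = 0x0040;
    constexpr uint16_t kRegNumMask     = 0x003f;  // Low/solo number: now 0..63.
    constexpr uint16_t kHighRegNumMask = 0x001f;  // High number: still 0..31.
    constexpr uint16_t kHighRegShift   = 10;

    constexpr uint16_t MakePair(uint16_t low, uint16_t high) {
      return kValid | k64BitPair |
             ((high & kHighRegNumMask) << kHighRegShift) | (low & kRegNumMask);
    }

    int main() {
      uint16_t pair = MakePair(/*low=*/5, /*high=*/6);
      assert(((pair >> kHighRegShift) & kHighRegNumMask) == 6);
      assert((pair & kRegNumMask) == 5);
      assert((pair & kFloatingPoint) == 0);  // Core pair: the 0x0040 bit is clear.
      return 0;
    }

The high register of a pair is limited to 0..31 because only five bits remain
above the shift, which the new DCHECK_LE in the pair constructor enforces.
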
diff --git a/compiler/optimizing/dominator_test.cc b/compiler/optimizing/dominator_test.cc
index 0417050..3062e37 100644
--- a/compiler/optimizing/dominator_test.cc
+++ b/compiler/optimizing/dominator_test.cc
@@ -167,7 +167,8 @@
     0,
     1,
     1,
-    3
+    3,
+    1,  // Synthesized block to avoid critical edge.
   };
 
   TestCode(data, dominators, sizeof(dominators) / sizeof(int));
@@ -185,7 +186,9 @@
     0,
     1,
     1,
-    -1  // exit block is not dominated by any block due to the spin loop.
+    -1,  // exit block is not dominated by any block due to the spin loop.
+    1,   // block to avoid critical edge.
+    1    // block to avoid critical edge.
   };
 
   TestCode(data, dominators, sizeof(dominators) / sizeof(int));
@@ -205,7 +208,8 @@
     1,
     1,
     1,
-    -1  // exit block is not dominated by any block due to the spin loop.
+    -1,  // exit block is not dominated by any block due to the spin loop.
+    1    // block to avoid critical edge.
   };
 
   TestCode(data, dominators, sizeof(dominators) / sizeof(int));
@@ -225,7 +229,8 @@
     1,
     1,
     1,
-    -1  // exit block is not dominated by any block due to the spin loop.
+    -1,  // exit block is not dominated by any block due to the spin loop.
+    1    // block to avoid critical edge.
   };
 
   TestCode(data, dominators, sizeof(dominators) / sizeof(int));
@@ -247,7 +252,9 @@
     2,
     2,
     1,
-    5  // Block number 5 dominates exit block
+    5,    // Block number 5 dominates exit block
+    1,    // block to avoid critical edge.
+    2     // block to avoid critical edge.
   };
 
   TestCode(data, dominators, sizeof(dominators) / sizeof(int));
diff --git a/compiler/optimizing/find_loops_test.cc b/compiler/optimizing/find_loops_test.cc
new file mode 100644
index 0000000..fab9f7a
--- /dev/null
+++ b/compiler/optimizing/find_loops_test.cc
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "builder.h"
+#include "dex_file.h"
+#include "dex_instruction.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+#include "ssa_liveness_analysis.h"
+#include "utils/arena_allocator.h"
+#include "pretty_printer.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+static HGraph* TestCode(const uint16_t* data, ArenaPool* pool) {
+  ArenaAllocator allocator(pool);
+  HGraphBuilder builder(&allocator);
+  const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
+  HGraph* graph = builder.BuildGraph(*item);
+  graph->BuildDominatorTree();
+  graph->FindNaturalLoops();
+  return graph;
+}
+
+TEST(FindLoopsTest, CFG1) {
+  // Constant is not used.
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::RETURN_VOID);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  for (size_t i = 0, e = graph->GetBlocks().Size(); i < e; ++i) {
+    ASSERT_EQ(graph->GetBlocks().Get(i)->GetLoopInformation(), nullptr);
+  }
+}
+
+TEST(FindLoopsTest, CFG2) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::RETURN);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  for (size_t i = 0, e = graph->GetBlocks().Size(); i < e; ++i) {
+    ASSERT_EQ(graph->GetBlocks().Get(i)->GetLoopInformation(), nullptr);
+  }
+}
+
+TEST(FindLoopsTest, CFG3) {
+  const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 3 << 12 | 0,
+    Instruction::CONST_4 | 4 << 12 | 1 << 8,
+    Instruction::ADD_INT_2ADDR | 1 << 12,
+    Instruction::GOTO | 0x100,
+    Instruction::RETURN);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  for (size_t i = 0, e = graph->GetBlocks().Size(); i < e; ++i) {
+    ASSERT_EQ(graph->GetBlocks().Get(i)->GetLoopInformation(), nullptr);
+  }
+}
+
+TEST(FindLoopsTest, CFG4) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 4,
+    Instruction::CONST_4 | 4 << 12 | 0,
+    Instruction::GOTO | 0x200,
+    Instruction::CONST_4 | 5 << 12 | 0,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  for (size_t i = 0, e = graph->GetBlocks().Size(); i < e; ++i) {
+    ASSERT_EQ(graph->GetBlocks().Get(i)->GetLoopInformation(), nullptr);
+  }
+}
+
+TEST(FindLoopsTest, CFG5) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 3,
+    Instruction::CONST_4 | 4 << 12 | 0,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  for (size_t i = 0, e = graph->GetBlocks().Size(); i < e; ++i) {
+    ASSERT_EQ(graph->GetBlocks().Get(i)->GetLoopInformation(), nullptr);
+  }
+}
+
+static void TestBlock(HGraph* graph,
+                      int block_id,
+                      bool is_loop_header,
+                      int parent_loop_header_id,
+                      const int* blocks_in_loop = nullptr,
+                      size_t number_of_blocks = 0) {
+  HBasicBlock* block = graph->GetBlocks().Get(block_id);
+  ASSERT_EQ(block->IsLoopHeader(), is_loop_header);
+  if (parent_loop_header_id == -1) {
+    ASSERT_EQ(block->GetLoopInformation(), nullptr);
+  } else {
+    ASSERT_EQ(block->GetLoopInformation()->GetHeader()->GetBlockId(), parent_loop_header_id);
+  }
+
+  if (blocks_in_loop != nullptr) {
+    HLoopInformation* info = block->GetLoopInformation();
+    const BitVector& blocks = info->GetBlocks();
+    ASSERT_EQ(blocks.NumSetBits(), number_of_blocks);
+    for (size_t i = 0; i < number_of_blocks; ++i) {
+      ASSERT_TRUE(blocks.IsBitSet(blocks_in_loop[i]));
+    }
+  } else {
+    ASSERT_FALSE(block->IsLoopHeader());
+  }
+}
+
+TEST(FindLoopsTest, Loop1) {
+  // Simple loop with one preheader and one back edge.
+  // var a = 0;
+  // while (a == a) {
+  // }
+  // return;
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFE00,
+    Instruction::RETURN_VOID);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header
+  const int blocks2[] = {2, 3};
+  TestBlock(graph, 2, true, 2, blocks2, 2);  // loop header
+  TestBlock(graph, 3, false, 2);             // block in loop
+  TestBlock(graph, 4, false, -1);            // return block
+  TestBlock(graph, 5, false, -1);            // exit block
+}
+
+TEST(FindLoopsTest, Loop2) {
+  // Make sure we support a preheader of a loop not being the first predecessor
+  // in the predecessor list of the header.
+  // var a = 0;
+  // while (a == a) {
+  // }
+  // return a;
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::GOTO | 0x400,
+    Instruction::IF_EQ, 4,
+    Instruction::GOTO | 0xFE00,
+    Instruction::GOTO | 0xFD00,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // goto block
+  const int blocks2[] = {2, 3};
+  TestBlock(graph, 2, true, 2, blocks2, 2);  // loop header
+  TestBlock(graph, 3, false, 2);             // block in loop
+  TestBlock(graph, 4, false, -1);            // pre header
+  TestBlock(graph, 5, false, -1);            // return block
+  TestBlock(graph, 6, false, -1);            // exit block
+}
+
+TEST(FindLoopsTest, Loop3) {
+  // Make sure we create a preheader of a loop when a header originally has two
+  // incoming blocks and one back edge.
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0x100,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFE00,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // goto block
+  TestBlock(graph, 2, false, -1);
+  const int blocks2[] = {3, 4};
+  TestBlock(graph, 3, true, 3, blocks2, 2);  // loop header
+  TestBlock(graph, 4, false, 3);             // block in loop
+  TestBlock(graph, 5, false, -1);            // pre header
+  TestBlock(graph, 6, false, -1);            // return block
+  TestBlock(graph, 7, false, -1);            // exit block
+  TestBlock(graph, 8, false, -1);            // synthesized pre header
+}
+
+TEST(FindLoopsTest, Loop4) {
+  // Test loop with originally two back edges.
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 6,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFC00,
+    Instruction::GOTO | 0xFB00,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header
+  const int blocks2[] = {2, 3, 4, 5, 8};
+  TestBlock(graph, 2, true, 2, blocks2, 5);  // loop header
+  TestBlock(graph, 3, false, 2);             // block in loop
+  TestBlock(graph, 4, false, 2);             // original back edge
+  TestBlock(graph, 5, false, 2);             // original back edge
+  TestBlock(graph, 6, false, -1);            // return block
+  TestBlock(graph, 7, false, -1);            // exit block
+  TestBlock(graph, 8, false, 2);             // synthesized back edge
+}
+
+
+TEST(FindLoopsTest, Loop5) {
+  // Test loop with two exit edges.
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 6,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0x0200,
+    Instruction::GOTO | 0xFB00,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header
+  const int blocks2[] = {2, 3, 5};
+  TestBlock(graph, 2, true, 2, blocks2, 3);  // loop header
+  TestBlock(graph, 3, false, 2);             // block in loop
+  TestBlock(graph, 4, false, -1);            // loop exit
+  TestBlock(graph, 5, false, 2);             // back edge
+  TestBlock(graph, 6, false, -1);            // return block
+  TestBlock(graph, 7, false, -1);            // exit block
+  TestBlock(graph, 8, false, -1);            // synthesized block at the loop exit
+}
+
+TEST(FindLoopsTest, InnerLoop) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 6,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFE00,  // inner loop
+    Instruction::GOTO | 0xFB00,
+    Instruction::RETURN | 0 << 8);
+
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header of outer loop
+  const int blocks2[] = {2, 3, 4, 5, 8};
+  TestBlock(graph, 2, true, 2, blocks2, 5);  // outer loop header
+  const int blocks3[] = {3, 4};
+  TestBlock(graph, 3, true, 3, blocks3, 2);  // inner loop header
+  TestBlock(graph, 4, false, 3);             // back edge on inner loop
+  TestBlock(graph, 5, false, 2);             // back edge on outer loop
+  TestBlock(graph, 6, false, -1);            // return block
+  TestBlock(graph, 7, false, -1);            // exit block
+  TestBlock(graph, 8, false, 2);             // synthesized block as pre header of inner loop
+
+  ASSERT_TRUE(graph->GetBlocks().Get(3)->GetLoopInformation()->IsIn(
+                    *graph->GetBlocks().Get(2)->GetLoopInformation()));
+  ASSERT_FALSE(graph->GetBlocks().Get(2)->GetLoopInformation()->IsIn(
+                    *graph->GetBlocks().Get(3)->GetLoopInformation()));
+}
+
+TEST(FindLoopsTest, TwoLoops) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFE00,  // first loop
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFE00,  // second loop
+    Instruction::RETURN | 0 << 8);
+
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header of first loop
+  const int blocks2[] = {2, 3};
+  TestBlock(graph, 2, true, 2, blocks2, 2);  // first loop header
+  TestBlock(graph, 3, false, 2);             // back edge of first loop
+  const int blocks4[] = {4, 5};
+  TestBlock(graph, 4, true, 4, blocks4, 2);  // second loop header
+  TestBlock(graph, 5, false, 4);             // back edge of second loop
+  TestBlock(graph, 6, false, -1);            // return block
+  TestBlock(graph, 7, false, -1);            // exit block
+
+  ASSERT_FALSE(graph->GetBlocks().Get(4)->GetLoopInformation()->IsIn(
+                    *graph->GetBlocks().Get(2)->GetLoopInformation()));
+  ASSERT_FALSE(graph->GetBlocks().Get(2)->GetLoopInformation()->IsIn(
+                    *graph->GetBlocks().Get(4)->GetLoopInformation()));
+}
+
+TEST(FindLoopsTest, NonNaturalLoop) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0x0100,
+    Instruction::IF_EQ, 3,
+    Instruction::GOTO | 0xFD00,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+  ASSERT_TRUE(graph->GetBlocks().Get(3)->IsLoopHeader());
+  HLoopInformation* info = graph->GetBlocks().Get(3)->GetLoopInformation();
+  ASSERT_FALSE(info->GetHeader()->Dominates(info->GetBackEdges().Get(0)));
+}
+
+TEST(FindLoopsTest, DoWhileLoop) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::GOTO | 0x0100,
+    Instruction::IF_EQ, 0xFFFF,
+    Instruction::RETURN | 0 << 8);
+
+  ArenaPool arena;
+  HGraph* graph = TestCode(data, &arena);
+
+  TestBlock(graph, 0, false, -1);            // entry block
+  TestBlock(graph, 1, false, -1);            // pre header of first loop
+  const int blocks2[] = {2, 3, 6};
+  TestBlock(graph, 2, true, 2, blocks2, 3);  // loop header
+  TestBlock(graph, 3, false, 2);             // back edge of first loop
+  TestBlock(graph, 4, false, -1);            // return block
+  TestBlock(graph, 5, false, -1);            // exit block
+  TestBlock(graph, 6, false, 2);             // synthesized block to avoid a critical edge
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index aa4d35e..d665ab9 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -188,7 +188,7 @@
     "  kill: (1100)\n"
     "Block 1\n"  // block with if
     "  live in: (1100)\n"
-    "  live out: (0100)\n"
+    "  live out: (1100)\n"
     "  kill: (0010)\n"
     "Block 2\n"  // else block
     "  live in: (0100)\n"
@@ -201,6 +201,10 @@
     "Block 4\n"  // exit block
     "  live in: (0000)\n"
     "  live out: (0000)\n"
+    "  kill: (0000)\n"
+    "Block 5\n"  // block to avoid critical edge. Predecessor is 1, successor is 3.
+    "  live in: (1000)\n"
+    "  live out: (0000)\n"
     "  kill: (0000)\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
@@ -412,40 +416,45 @@
 
 TEST(LivenessTest, Loop6) {
   // Bitsets are made of:
-  // (constant0, constant4, constant5, phi in block 2, equal in block 2, equal in block 3)
+  // (constant0, constant4, constant5, phi in block 2, equal in block 2, equal in block 3,
+  //  phi in block 8)
   const char* expected =
     "Block 0\n"
-    "  live in: (000000)\n"
-    "  live out: (111000)\n"
-    "  kill: (111000)\n"
+    "  live in: (0000000)\n"
+    "  live out: (1110000)\n"
+    "  kill: (1110000)\n"
     "Block 1\n"
-    "  live in: (111000)\n"
-    "  live out: (011000)\n"
-    "  kill: (000000)\n"
+    "  live in: (1110000)\n"
+    "  live out: (0110000)\n"
+    "  kill: (0000000)\n"
     "Block 2\n"  // loop header
-    "  live in: (011000)\n"
-    "  live out: (011100)\n"
-    "  kill: (000110)\n"
+    "  live in: (0110000)\n"
+    "  live out: (0111000)\n"
+    "  kill: (0001100)\n"
     "Block 3\n"
-    "  live in: (011000)\n"
-    "  live out: (011000)\n"
-    "  kill: (000001)\n"
-    "Block 4\n"  // back edge
-    "  live in: (011000)\n"
-    "  live out: (011000)\n"
-    "  kill: (000000)\n"
-    "Block 5\n"  // back edge
-    "  live in: (011000)\n"
-    "  live out: (011000)\n"
-    "  kill: (000000)\n"
+    "  live in: (0110000)\n"
+    "  live out: (0110000)\n"
+    "  kill: (0000010)\n"
+    "Block 4\n"  // original back edge
+    "  live in: (0110000)\n"
+    "  live out: (0110000)\n"
+    "  kill: (0000000)\n"
+    "Block 5\n"  // original back edge
+    "  live in: (0110000)\n"
+    "  live out: (0110000)\n"
+    "  kill: (0000000)\n"
     "Block 6\n"  // return block
-    "  live in: (000100)\n"
-    "  live out: (000000)\n"
-    "  kill: (000000)\n"
+    "  live in: (0001000)\n"
+    "  live out: (0000000)\n"
+    "  kill: (0000000)\n"
     "Block 7\n"  // exit block
-    "  live in: (000000)\n"
-    "  live out: (000000)\n"
-    "  kill: (000000)\n";
+    "  live in: (0000000)\n"
+    "  live out: (0000000)\n"
+    "  kill: (0000000)\n"
+    "Block 8\n"  // synthesized back edge
+    "  live in: (0110000)\n"
+    "  live out: (0110000)\n"
+    "  kill: (0000001)\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -476,7 +485,7 @@
     "  kill: (0000000)\n"
     "Block 2\n"  // loop header
     "  live in: (0110000)\n"
-    "  live out: (0110000)\n"
+    "  live out: (0111000)\n"
     "  kill: (0001100)\n"
     "Block 3\n"
     "  live in: (0110000)\n"
@@ -497,6 +506,10 @@
     "Block 7\n"  // exit block
     "  live in: (0000000)\n"
     "  live out: (0000000)\n"
+    "  kill: (0000000)\n"
+    "Block 8\n"  // synthesized block to avoid critical edge.
+    "  live in: (0001000)\n"
+    "  live out: (0000000)\n"
     "  kill: (0000000)\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
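
Reviewer note: for anyone decoding the expected strings, they are the fixed
point of the usual backward liveness equations over one bit per SSA value. A
sketch under that assumption (simplified containers, not the ART BitVector;
gen is implicit in the strings above but shown for the textbook formulation):

    #include <vector>

    struct Block {
      std::vector<int> successors;
      std::vector<bool> gen, kill, live_in, live_out;  // One bit per SSA value.
    };

    // live_out(B) = union of live_in(S) for S in succ(B)
    // live_in(B)  = gen(B) | (live_out(B) & ~kill(B))
    bool UpdateBlock(std::vector<Block>& blocks, int b) {
      Block& blk = blocks[b];
      bool changed = false;
      for (size_t i = 0; i < blk.live_out.size(); ++i) {
        bool out = false;
        for (int s : blk.successors) {
          out = out || blocks[s].live_in[i];
        }
        const bool in = blk.gen[i] || (out && !blk.kill[i]);
        changed = changed || out != blk.live_out[i] || in != blk.live_in[i];
        blk.live_out[i] = out;
        blk.live_in[i] = in;
      }
      return changed;  // Re-run over all blocks until no block changes.
    }

The new rows for blocks 5 and 8 simply apply the same equations to the blocks
synthesized by SimplifyCFG.
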
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index d153bf7..cf2d1ee 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -31,11 +31,11 @@
 }
 
 void HGraph::RemoveDeadBlocks(const ArenaBitVector& visited) const {
-  for (size_t i = 0; i < blocks_.Size(); i++) {
+  for (size_t i = 0; i < blocks_.Size(); ++i) {
     if (!visited.IsBitSet(i)) {
       HBasicBlock* block = blocks_.Get(i);
-      for (size_t j = 0; j < block->GetSuccessors()->Size(); j++) {
-        block->GetSuccessors()->Get(j)->RemovePredecessor(block, false);
+      for (size_t j = 0; j < block->GetSuccessors().Size(); ++j) {
+        block->GetSuccessors().Get(j)->RemovePredecessor(block, false);
       }
       for (HInstructionIterator it(*block->GetPhis()); !it.Done(); it.Advance()) {
         block->RemovePhi(it.Current()->AsPhi());
@@ -55,15 +55,14 @@
 
   visited->SetBit(id);
   visiting->SetBit(id);
-  for (size_t i = 0; i < block->GetSuccessors()->Size(); i++) {
-    HBasicBlock* successor = block->GetSuccessors()->Get(i);
+  for (size_t i = 0; i < block->GetSuccessors().Size(); i++) {
+    HBasicBlock* successor = block->GetSuccessors().Get(i);
     if (visiting->IsBitSet(successor->GetBlockId())) {
       successor->AddBackEdge(block);
     } else {
       VisitBlockForBackEdges(successor, visited, visiting);
     }
   }
-  post_order_.Add(block);
   visiting->ClearBit(id);
 }
 
@@ -78,13 +77,18 @@
   //     predecessors list of live blocks.
   RemoveDeadBlocks(visited);
 
-  // (3) Compute the immediate dominator of each block. We visit
+  // (3) Simplify the CFG now, so that we don't need to recompute
+  //     dominators and the reverse post order.
+  SimplifyCFG();
+
+  // (4) Compute the immediate dominator of each block. We visit
   //     the successors of a block only when all its forward branches
   //     have been processed.
   GrowableArray<size_t> visits(arena_, blocks_.Size());
   visits.SetSize(blocks_.Size());
-  for (size_t i = 0; i < entry_block_->GetSuccessors()->Size(); i++) {
-    VisitBlockForDominatorTree(entry_block_->GetSuccessors()->Get(i), entry_block_, &visits);
+  reverse_post_order_.Add(entry_block_);
+  for (size_t i = 0; i < entry_block_->GetSuccessors().Size(); i++) {
+    VisitBlockForDominatorTree(entry_block_->GetSuccessors().Get(i), entry_block_, &visits);
   }
 }
 
@@ -119,59 +123,172 @@
   // Once all the forward edges have been visited, we know the immediate
   // dominator of the block. We can then start visiting its successors.
   if (visits->Get(block->GetBlockId()) ==
-      block->GetPredecessors()->Size() - block->NumberOfBackEdges()) {
-    for (size_t i = 0; i < block->GetSuccessors()->Size(); i++) {
-      VisitBlockForDominatorTree(block->GetSuccessors()->Get(i), block, visits);
+      block->GetPredecessors().Size() - block->NumberOfBackEdges()) {
+    reverse_post_order_.Add(block);
+    for (size_t i = 0; i < block->GetSuccessors().Size(); i++) {
+      VisitBlockForDominatorTree(block->GetSuccessors().Get(i), block, visits);
     }
   }
 }
 
 void HGraph::TransformToSSA() {
-  DCHECK(!post_order_.IsEmpty());
-  SimplifyCFG();
+  DCHECK(!reverse_post_order_.IsEmpty());
   SsaBuilder ssa_builder(this);
   ssa_builder.BuildSsa();
 }
 
-void HGraph::SimplifyCFG() {
-  for (size_t i = post_order_.Size(); i > 0; --i) {
-    HBasicBlock* current = post_order_.Get(i - 1);
-    if (current->IsLoopHeader()) {
-      // Make sure the loop has only one pre header. This simplifies SSA building by having
-      // to just look at the pre header to know which locals are initialized at entry of the
-      // loop.
-      HLoopInformation* info = current->GetLoopInformation();
-      size_t number_of_incomings = current->GetPredecessors()->Size() - info->NumberOfBackEdges();
-      if (number_of_incomings != 1) {
-        HBasicBlock* pre_header = new (arena_) HBasicBlock(this);
-        AddBlock(pre_header);
-        pre_header->AddInstruction(new (arena_) HGoto());
-        pre_header->SetDominator(current->GetDominator());
-        current->SetDominator(pre_header);
-        post_order_.InsertAt(i, pre_header);
-
-        ArenaBitVector back_edges(arena_, GetBlocks().Size(), false);
-        for (size_t pred = 0; pred < info->GetBackEdges()->Size(); pred++) {
-          back_edges.SetBit(info->GetBackEdges()->Get(pred)->GetBlockId());
-        }
-        for (size_t pred = 0; pred < current->GetPredecessors()->Size(); pred++) {
-          HBasicBlock* predecessor = current->GetPredecessors()->Get(pred);
-          if (!back_edges.IsBitSet(predecessor->GetBlockId())) {
-            current->RemovePredecessor(predecessor);
-            pred--;
-            predecessor->AddSuccessor(pre_header);
-          }
-        }
-        pre_header->AddSuccessor(current);
-      }
-      info->SetPreHeader(current->GetDominator());
+void HGraph::SplitCriticalEdge(HBasicBlock* block, HBasicBlock* successor) {
+  // Insert a new node between `block` and `successor` to split the
+  // critical edge.
+  HBasicBlock* new_block = new (arena_) HBasicBlock(this);
+  AddBlock(new_block);
+  new_block->AddInstruction(new (arena_) HGoto());
+  block->RemoveSuccessor(successor);
+  block->AddSuccessor(new_block);
+  new_block->AddSuccessor(successor);
+  if (successor->IsLoopHeader()) {
+    // If we split at a back edge boundary, make the new block the back edge.
+    HLoopInformation* info = successor->GetLoopInformation();
+    if (info->IsBackEdge(block)) {
+      info->RemoveBackEdge(block);
+      info->AddBackEdge(new_block);
     }
   }
 }
 
-void HLoopInformation::SetPreHeader(HBasicBlock* block) {
-  DCHECK_EQ(header_->GetDominator(), block);
-  pre_header_ = block;
+void HGraph::SimplifyLoop(HBasicBlock* header) {
+  HLoopInformation* info = header->GetLoopInformation();
+
+  // If there is more than one back edge, make them branch to the same block that
+  // will become the only back edge. This simplifies finding natural loops in the
+  // graph.
+  if (info->NumberOfBackEdges() > 1) {
+    HBasicBlock* new_back_edge = new (arena_) HBasicBlock(this);
+    AddBlock(new_back_edge);
+    new_back_edge->AddInstruction(new (arena_) HGoto());
+    for (size_t pred = 0, e = info->GetBackEdges().Size(); pred < e; ++pred) {
+      HBasicBlock* back_edge = info->GetBackEdges().Get(pred);
+      header->RemovePredecessor(back_edge);
+      back_edge->AddSuccessor(new_back_edge);
+    }
+    info->ClearBackEdges();
+    info->AddBackEdge(new_back_edge);
+    new_back_edge->AddSuccessor(header);
+  }
+
+  // Make sure the loop has only one pre header. This simplifies SSA building: only
+  // the pre header needs to be inspected to know which locals are initialized at
+  // entry of the loop.
+  size_t number_of_incomings = header->GetPredecessors().Size() - info->NumberOfBackEdges();
+  if (number_of_incomings != 1) {
+    HBasicBlock* pre_header = new (arena_) HBasicBlock(this);
+    AddBlock(pre_header);
+    pre_header->AddInstruction(new (arena_) HGoto());
+
+    ArenaBitVector back_edges(arena_, GetBlocks().Size(), false);
+    HBasicBlock* back_edge = info->GetBackEdges().Get(0);
+    for (size_t pred = 0; pred < header->GetPredecessors().Size(); ++pred) {
+      HBasicBlock* predecessor = header->GetPredecessors().Get(pred);
+      if (predecessor != back_edge) {
+        header->RemovePredecessor(predecessor);
+        pred--;
+        predecessor->AddSuccessor(pre_header);
+      }
+    }
+    pre_header->AddSuccessor(header);
+  }
+}
+
+void HGraph::SimplifyCFG() {
+  // Simplify the CFG for future analysis, and code generation:
+  // (1): Split critical edges.
+  // (2): Simplify loops by having only one back edge, and one preheader.
+  for (size_t i = 0; i < blocks_.Size(); ++i) {
+    HBasicBlock* block = blocks_.Get(i);
+    if (block->GetSuccessors().Size() > 1) {
+      for (size_t j = 0; j < block->GetSuccessors().Size(); ++j) {
+        HBasicBlock* successor = block->GetSuccessors().Get(j);
+        if (successor->GetPredecessors().Size() > 1) {
+          SplitCriticalEdge(block, successor);
+          --j;
+        }
+      }
+    }
+    if (block->IsLoopHeader()) {
+      SimplifyLoop(block);
+    }
+  }
+}
+
+bool HGraph::FindNaturalLoops() const {
+  for (size_t i = 0; i < blocks_.Size(); ++i) {
+    HBasicBlock* block = blocks_.Get(i);
+    if (block->IsLoopHeader()) {
+      HLoopInformation* info = block->GetLoopInformation();
+      if (!info->Populate()) {
+        // Abort if the loop is non-natural. We currently bail out in such cases.
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void HLoopInformation::PopulateRecursive(HBasicBlock* block) {
+  if (blocks_.IsBitSet(block->GetBlockId())) {
+    return;
+  }
+
+  blocks_.SetBit(block->GetBlockId());
+  block->SetInLoop(this);
+  for (size_t i = 0, e = block->GetPredecessors().Size(); i < e; ++i) {
+    PopulateRecursive(block->GetPredecessors().Get(i));
+  }
+}
+
+bool HLoopInformation::Populate() {
+  DCHECK_EQ(GetBackEdges().Size(), 1u);
+  HBasicBlock* back_edge = GetBackEdges().Get(0);
+  DCHECK(back_edge->GetDominator() != nullptr);
+  if (!header_->Dominates(back_edge)) {
+    // This loop is not natural. Do not bother going further.
+    return false;
+  }
+
+  // Populate this loop: starting with the back edge, recursively add predecessors
+  // that are not already part of that loop. Set the header as part of the loop
+  // to end the recursion.
+  // This is a recursive implementation of the algorithm described in
+  // "Advanced Compiler Design & Implementation" (Muchnick) p192.
+  blocks_.SetBit(header_->GetBlockId());
+  PopulateRecursive(back_edge);
+  return true;
+}
+
+HBasicBlock* HLoopInformation::GetPreHeader() const {
+  DCHECK_EQ(header_->GetPredecessors().Size(), 2u);
+  return header_->GetDominator();
+}
+
+bool HLoopInformation::Contains(const HBasicBlock& block) const {
+  return blocks_.IsBitSet(block.GetBlockId());
+}
+
+bool HLoopInformation::IsIn(const HLoopInformation& other) const {
+  return other.blocks_.IsBitSet(header_->GetBlockId());
+}
+
+bool HBasicBlock::Dominates(HBasicBlock* other) const {
+  // Walk up the dominator tree from `other`, to find out if `this`
+  // is an ancestor.
+  HBasicBlock* current = other;
+  while (current != nullptr) {
+    if (current == this) {
+      return true;
+    }
+    current = current->GetDominator();
+  }
+  return false;
 }
 
 static void Add(HInstructionList* instruction_list,
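
Reviewer note: SplitCriticalEdge() is the standard transformation; a toy
version over plain adjacency lists (not the H* types) shows the rewiring,
assuming the new block carries only a goto:

    #include <deque>
    #include <vector>

    struct Block {
      std::vector<Block*> preds, succs;
    };

    // An edge is critical when its source has several successors and its
    // destination has several predecessors: neither end can host code that
    // must run only along this edge, hence the fresh block.
    bool IsCriticalEdge(const Block* from, const Block* to) {
      return from->succs.size() > 1 && to->preds.size() > 1;
    }

    Block* Split(std::deque<Block>& arena, Block* from, Block* to) {
      arena.emplace_back();          // deque: stable addresses on growth.
      Block* mid = &arena.back();
      for (Block*& s : from->succs) {
        if (s == to) s = mid;        // from -> mid
      }
      for (Block*& p : to->preds) {
        if (p == from) p = mid;      // mid -> to
      }
      mid->preds.push_back(from);
      mid->succs.push_back(to);
      return mid;
    }

In the patch the same idea also serves SimplifyLoop(): funneling multiple back
edges through one synthesized block leaves every loop with exactly one back
edge and one pre header.
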
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index bd3d703..081c2bd 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -60,7 +60,7 @@
   explicit HGraph(ArenaAllocator* arena)
       : arena_(arena),
         blocks_(arena, kDefaultNumberOfBlocks),
-        post_order_(arena, kDefaultNumberOfBlocks),
+        reverse_post_order_(arena, kDefaultNumberOfBlocks),
         maximum_number_of_out_vregs_(0),
         number_of_vregs_(0),
         number_of_in_vregs_(0),
@@ -81,6 +81,14 @@
   void TransformToSSA();
   void SimplifyCFG();
 
+  // Find all natural loops in this graph. Aborts computation and returns false
+  // if one loop is not natural, that is, the header does not dominate the back
+  // edge.
+  bool FindNaturalLoops() const;
+
+  void SplitCriticalEdge(HBasicBlock* block, HBasicBlock* successor);
+  void SimplifyLoop(HBasicBlock* header);
+
   int GetNextInstructionId() {
     return current_instruction_id_++;
   }
@@ -109,8 +117,8 @@
     return number_of_in_vregs_;
   }
 
-  const GrowableArray<HBasicBlock*>& GetPostOrder() const {
-    return post_order_;
+  const GrowableArray<HBasicBlock*>& GetReversePostOrder() const {
+    return reverse_post_order_;
   }
 
  private:
@@ -129,8 +137,8 @@
   // List of blocks in insertion order.
   GrowableArray<HBasicBlock*> blocks_;
 
-  // List of blocks to perform a post order tree traversal.
-  GrowableArray<HBasicBlock*> post_order_;
+  // List of blocks to perform a reverse post order tree traversal.
+  GrowableArray<HBasicBlock*> reverse_post_order_;
 
   HBasicBlock* entry_block_;
   HBasicBlock* exit_block_;
@@ -154,30 +162,63 @@
  public:
   HLoopInformation(HBasicBlock* header, HGraph* graph)
       : header_(header),
-        back_edges_(graph->GetArena(), kDefaultNumberOfBackEdges) { }
+        back_edges_(graph->GetArena(), kDefaultNumberOfBackEdges),
+        blocks_(graph->GetArena(), graph->GetBlocks().Size(), false) {}
+
+  HBasicBlock* GetHeader() const {
+    return header_;
+  }
 
   void AddBackEdge(HBasicBlock* back_edge) {
     back_edges_.Add(back_edge);
   }
 
+  void RemoveBackEdge(HBasicBlock* back_edge) {
+    back_edges_.Delete(back_edge);
+  }
+
+  bool IsBackEdge(HBasicBlock* block) {
+    for (size_t i = 0, e = back_edges_.Size(); i < e; ++i) {
+      if (back_edges_.Get(i) == block) return true;
+    }
+    return false;
+  }
+
   int NumberOfBackEdges() const {
     return back_edges_.Size();
   }
 
-  void SetPreHeader(HBasicBlock* block);
+  HBasicBlock* GetPreHeader() const;
 
-  HBasicBlock* GetPreHeader() const {
-    return pre_header_;
+  const GrowableArray<HBasicBlock*>& GetBackEdges() const {
+    return back_edges_;
   }
 
-  const GrowableArray<HBasicBlock*>* GetBackEdges() const {
-    return &back_edges_;
+  void ClearBackEdges() {
+    back_edges_.Reset();
   }
 
+  // Find blocks that are part of this loop. Returns whether the loop is a natural loop,
+  // that is, the header dominates the back edge.
+  bool Populate();
+
+  // Returns whether this loop information contains `block`.
+  // Note that this loop information *must* be populated before entering this function.
+  bool Contains(const HBasicBlock& block) const;
+
+  // Returns whether this loop information is an inner loop of `other`.
+  // Note that `other` *must* be populated before entering this function.
+  bool IsIn(const HLoopInformation& other) const;
+
+  const ArenaBitVector& GetBlocks() const { return blocks_; }
+
  private:
-  HBasicBlock* pre_header_;
+  // Internal recursive implementation of `Populate`.
+  void PopulateRecursive(HBasicBlock* block);
+
   HBasicBlock* header_;
   GrowableArray<HBasicBlock*> back_edges_;
+  ArenaBitVector blocks_;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopInformation);
 };
@@ -195,18 +236,19 @@
         dominator_(nullptr),
         block_id_(-1) { }
 
-  const GrowableArray<HBasicBlock*>* GetPredecessors() const {
-    return &predecessors_;
+  const GrowableArray<HBasicBlock*>& GetPredecessors() const {
+    return predecessors_;
   }
 
-  const GrowableArray<HBasicBlock*>* GetSuccessors() const {
-    return &successors_;
+  const GrowableArray<HBasicBlock*>& GetSuccessors() const {
+    return successors_;
   }
 
   void AddBackEdge(HBasicBlock* back_edge) {
     if (loop_information_ == nullptr) {
       loop_information_ = new (graph_->GetArena()) HLoopInformation(this, graph_);
     }
+    DCHECK_EQ(loop_information_->GetHeader(), this);
     loop_information_->AddBackEdge(back_edge);
   }
 
@@ -241,19 +283,57 @@
     }
   }
 
+  void RemoveSuccessor(HBasicBlock* block, bool remove_in_predecessor = true) {
+    successors_.Delete(block);
+    if (remove_in_predecessor) {
+      block->predecessors_.Delete(this);
+    }
+  }
+
+  void ClearAllPredecessors() {
+    predecessors_.Reset();
+  }
+
+  void AddPredecessor(HBasicBlock* block) {
+    predecessors_.Add(block);
+    block->successors_.Add(this);
+  }
+
   void AddInstruction(HInstruction* instruction);
   void RemoveInstruction(HInstruction* instruction);
   void AddPhi(HPhi* phi);
   void RemovePhi(HPhi* phi);
 
   bool IsLoopHeader() const {
-    return loop_information_ != nullptr;
+    return (loop_information_ != nullptr) && (loop_information_->GetHeader() == this);
   }
 
   HLoopInformation* GetLoopInformation() const {
     return loop_information_;
   }
 
+  // Set the loop_information_ on this block. This method overrides the current
+  // loop_information if it is an outer loop of the passed loop information.
+  void SetInLoop(HLoopInformation* info) {
+    if (IsLoopHeader()) {
+      // Nothing to do. This just means `info` is an outer loop.
+    } else if (loop_information_ == nullptr) {
+      loop_information_ = info;
+    } else if (loop_information_->Contains(*info->GetHeader())) {
+      // Block is currently part of an outer loop. Make it part of this inner loop.
+      // Note that a non-loop header having loop information means this loop
+      // information has already been populated.
+      loop_information_ = info;
+    } else {
+      // Block is part of an inner loop. Do not update the loop information.
+      // Note that we cannot do the check `info->Contains(*loop_information_->GetHeader())`
+      // at this point, because this method is being called while populating `info`.
+    }
+  }
+
+  // Returns whether this block dominates the block passed as parameter.
+  bool Dominates(HBasicBlock* block) const;
+
  private:
   HGraph* const graph_;
   GrowableArray<HBasicBlock*> predecessors_;
@@ -638,7 +718,7 @@
   HGoto() { }
 
   HBasicBlock* GetSuccessor() const {
-    return GetBlock()->GetSuccessors()->Get(0);
+    return GetBlock()->GetSuccessors().Get(0);
   }
 
   DECLARE_INSTRUCTION(Goto)
@@ -656,11 +736,11 @@
   }
 
   HBasicBlock* IfTrueSuccessor() const {
-    return GetBlock()->GetSuccessors()->Get(0);
+    return GetBlock()->GetSuccessors().Get(0);
   }
 
   HBasicBlock* IfFalseSuccessor() const {
-    return GetBlock()->GetSuccessors()->Get(1);
+    return GetBlock()->GetSuccessors().Get(1);
   }
 
   DECLARE_INSTRUCTION(If)
@@ -1011,35 +1091,35 @@
   DISALLOW_COPY_AND_ASSIGN(HInsertionOrderIterator);
 };
 
-class HPostOrderIterator : public ValueObject {
+class HReversePostOrderIterator : public ValueObject {
  public:
-  explicit HPostOrderIterator(const HGraph& graph) : graph_(graph), index_(0) {}
+  explicit HReversePostOrderIterator(const HGraph& graph) : graph_(graph), index_(0) {}
 
-  bool Done() const { return index_ == graph_.GetPostOrder().Size(); }
-  HBasicBlock* Current() const { return graph_.GetPostOrder().Get(index_); }
+  bool Done() const { return index_ == graph_.GetReversePostOrder().Size(); }
+  HBasicBlock* Current() const { return graph_.GetReversePostOrder().Get(index_); }
   void Advance() { ++index_; }
 
  private:
   const HGraph& graph_;
   size_t index_;
 
-  DISALLOW_COPY_AND_ASSIGN(HPostOrderIterator);
+  DISALLOW_COPY_AND_ASSIGN(HReversePostOrderIterator);
 };
 
-class HReversePostOrderIterator : public ValueObject {
+class HPostOrderIterator : public ValueObject {
  public:
-  explicit HReversePostOrderIterator(const HGraph& graph)
-      : graph_(graph), index_(graph_.GetPostOrder().Size()) {}
+  explicit HPostOrderIterator(const HGraph& graph)
+      : graph_(graph), index_(graph_.GetReversePostOrder().Size()) {}
 
   bool Done() const { return index_ == 0; }
-  HBasicBlock* Current() const { return graph_.GetPostOrder().Get(index_ - 1); }
+  HBasicBlock* Current() const { return graph_.GetReversePostOrder().Get(index_ - 1); }
   void Advance() { --index_; }
 
  private:
   const HGraph& graph_;
   size_t index_;
 
-  DISALLOW_COPY_AND_ASSIGN(HReversePostOrderIterator);
+  DISALLOW_COPY_AND_ASSIGN(HPostOrderIterator);
 };
 
 }  // namespace art
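
Reviewer note: Populate()/PopulateRecursive() implement the Muchnick-style
backward reachability walk. A standalone sketch (illustrative types, not the
H* classes):

    #include <vector>

    struct Node {
      int id;
      std::vector<Node*> preds;
    };

    void PopulateRecursive(Node* n, std::vector<bool>& in_loop) {
      if (in_loop[n->id]) {
        return;  // The pre-marked header stops the walk from leaving the loop.
      }
      in_loop[n->id] = true;
      for (Node* p : n->preds) {
        PopulateRecursive(p, in_loop);
      }
    }

    // Natural loop (header, back_edge): every block that can reach the back
    // edge without passing through the header, plus the header itself.
    std::vector<bool> PopulateLoop(const Node* header, Node* back_edge, int n) {
      std::vector<bool> in_loop(n, false);
      in_loop[header->id] = true;
      PopulateRecursive(back_edge, in_loop);
      return in_loop;
    }

This is only valid once the header is known to dominate the back edge, which
is exactly the check Populate() performs before walking.
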
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 8594c69..a5031e0 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -104,6 +104,7 @@
   // Run these phases to get some test coverage.
   graph->BuildDominatorTree();
   graph->TransformToSSA();
+  graph->FindNaturalLoops();
   SsaLivenessAnalysis(*graph).Analyze();
 
   return new CompiledMethod(GetCompilerDriver(),
diff --git a/compiler/optimizing/pretty_printer.h b/compiler/optimizing/pretty_printer.h
index c82d0cc..dfeafe7 100644
--- a/compiler/optimizing/pretty_printer.h
+++ b/compiler/optimizing/pretty_printer.h
@@ -70,23 +70,23 @@
   virtual void VisitBasicBlock(HBasicBlock* block) {
     PrintString("BasicBlock ");
     PrintInt(block->GetBlockId());
-    const GrowableArray<HBasicBlock*>* blocks = block->GetPredecessors();
-    if (!blocks->IsEmpty()) {
+    const GrowableArray<HBasicBlock*>& predecessors = block->GetPredecessors();
+    if (!predecessors.IsEmpty()) {
       PrintString(", pred: ");
-      for (size_t i = 0; i < blocks->Size() -1; i++) {
-        PrintInt(blocks->Get(i)->GetBlockId());
+      for (size_t i = 0; i < predecessors.Size() - 1; i++) {
+        PrintInt(predecessors.Get(i)->GetBlockId());
         PrintString(", ");
       }
-      PrintInt(blocks->Peek()->GetBlockId());
+      PrintInt(predecessors.Peek()->GetBlockId());
     }
-    blocks = block->GetSuccessors();
-    if (!blocks->IsEmpty()) {
+    const GrowableArray<HBasicBlock*>& successors = block->GetSuccessors();
+    if (!successors.IsEmpty()) {
       PrintString(", succ: ");
-      for (size_t i = 0; i < blocks->Size() - 1; i++) {
-        PrintInt(blocks->Get(i)->GetBlockId());
+      for (size_t i = 0; i < successors.Size() - 1; i++) {
+        PrintInt(successors.Get(i)->GetBlockId());
         PrintString(", ");
       }
-      PrintInt(blocks->Peek()->GetBlockId());
+      PrintInt(successors.Peek()->GetBlockId());
     }
     PrintNewLine();
     HGraphVisitor::VisitBasicBlock(block);
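The two loops above print comma-separated block ids with no trailing separator (Peek() returns the last element of a GrowableArray). For a block with id 1, predecessor 0 and successors 2 and 5 this produces:

    BasicBlock 1, pred: 0, succ: 2, 5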
diff --git a/compiler/optimizing/pretty_printer_test.cc b/compiler/optimizing/pretty_printer_test.cc
index 04db7a6..006349c 100644
--- a/compiler/optimizing/pretty_printer_test.cc
+++ b/compiler/optimizing/pretty_printer_test.cc
@@ -57,7 +57,7 @@
     PrintString("  ");
     PrintInt(gota->GetId());
     PrintString(": Goto ");
-    PrintInt(current_block_->GetSuccessors()->Get(0)->GetBlockId());
+    PrintInt(current_block_->GetSuccessors().Get(0)->GetBlockId());
     PrintNewLine();
   }
 
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index ee1e1e4..1fc041c 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -32,8 +32,8 @@
     HBasicBlock* block = loop_headers_.Get(i);
     for (HInstructionIterator it(*block->GetPhis()); !it.Done(); it.Advance()) {
       HPhi* phi = it.Current()->AsPhi();
-      for (size_t pred = 0; pred < block->GetPredecessors()->Size(); pred++) {
-        phi->AddInput(ValueOfLocal(block->GetPredecessors()->Get(pred), phi->GetRegNumber()));
+      for (size_t pred = 0; pred < block->GetPredecessors().Size(); pred++) {
+        phi->AddInput(ValueOfLocal(block->GetPredecessors().Get(pred), phi->GetRegNumber()));
       }
     }
   }
@@ -75,14 +75,14 @@
     // Save the loop header so that the last phase of the analysis knows which
     // blocks need to be updated.
     loop_headers_.Add(block);
-  } else if (block->GetPredecessors()->Size() > 0) {
+  } else if (block->GetPredecessors().Size() > 0) {
     // All predecessors have already been visited because we are visiting in reverse post order.
     // We merge the values of all locals, creating phis if those values differ.
     for (size_t local = 0; local < current_locals_->Size(); local++) {
       bool is_different = false;
-      HInstruction* value = ValueOfLocal(block->GetPredecessors()->Get(0), local);
-      for (size_t i = 1; i < block->GetPredecessors()->Size(); i++) {
-        if (ValueOfLocal(block->GetPredecessors()->Get(i), local) != value) {
+      HInstruction* value = ValueOfLocal(block->GetPredecessors().Get(0), local);
+      for (size_t i = 1; i < block->GetPredecessors().Size(); i++) {
+        if (ValueOfLocal(block->GetPredecessors().Get(i), local) != value) {
           is_different = true;
           break;
         }
@@ -90,9 +90,9 @@
       if (is_different) {
         // TODO: Compute union type.
         HPhi* phi = new (GetGraph()->GetArena()) HPhi(
-            GetGraph()->GetArena(), local, block->GetPredecessors()->Size(), Primitive::kPrimVoid);
-        for (size_t i = 0; i < block->GetPredecessors()->Size(); i++) {
-          phi->SetRawInputAt(i, ValueOfLocal(block->GetPredecessors()->Get(i), local));
+            GetGraph()->GetArena(), local, block->GetPredecessors().Size(), Primitive::kPrimVoid);
+        for (size_t i = 0; i < block->GetPredecessors().Size(); i++) {
+          phi->SetRawInputAt(i, ValueOfLocal(block->GetPredecessors().Get(i), local));
         }
         block->AddPhi(phi);
         value = phi;
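The merge above is the textbook local-to-SSA rule. Restated as a sketch (ValueOfLocal is the helper used in this hunk; BuildPhi is a hypothetical stand-in for the phi construction shown above):

    HInstruction* MergeLocal(HBasicBlock* block, size_t local) {
      HInstruction* value = ValueOfLocal(block->GetPredecessors().Get(0), local);
      for (size_t i = 1; i < block->GetPredecessors().Size(); i++) {
        if (ValueOfLocal(block->GetPredecessors().Get(i), local) != value) {
          return BuildPhi(block, local);  // one input per predecessor
        }
      }
      return value;  // all predecessors agree: no phi needed
    }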
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 838597d..0ab77ca 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -110,7 +110,7 @@
       for (size_t i = 0, e = current->InputCount(); i < e; ++i) {
         HInstruction* input = current->InputAt(i);
 
-        HBasicBlock* predecessor = block->GetPredecessors()->Get(i);
+        HBasicBlock* predecessor = block->GetPredecessors().Get(i);
         size_t ssa_index = input->GetSsaIndex();
         BitVector* predecessor_kill = GetKillSet(*predecessor);
         BitVector* predecessor_live_in = GetLiveInSet(*predecessor);
@@ -147,8 +147,8 @@
   BitVector* live_out = GetLiveOutSet(block);
   bool changed = false;
   // The live_out set of a block is the union of live_in sets of its successors.
-  for (size_t i = 0, e = block.GetSuccessors()->Size(); i < e; ++i) {
-    HBasicBlock* successor = block.GetSuccessors()->Get(i);
+  for (size_t i = 0, e = block.GetSuccessors().Size(); i < e; ++i) {
+    HBasicBlock* successor = block.GetSuccessors().Get(i);
     if (live_out->Union(GetLiveInSet(*successor))) {
       changed = true;
     }
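The hunk above implements the standard backward dataflow step. In equation form (notation only, with gen/kill as computed earlier in this file):

    live_out(B) = live_in(S1) ∪ ... ∪ live_in(Sn)   for successors S1..Sn of B
    live_in(B)  = gen(B) ∪ (live_out(B) \ kill(B))

The function returns whether the union grew, so the enclosing fixed-point iteration knows when to stop.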
diff --git a/compiler/optimizing/ssa_test.cc b/compiler/optimizing/ssa_test.cc
index e4aafb7..9be2197 100644
--- a/compiler/optimizing/ssa_test.cc
+++ b/compiler/optimizing/ssa_test.cc
@@ -98,15 +98,18 @@
     "BasicBlock 0, succ: 1\n"
     "  0: IntConstant 0 [2, 2]\n"
     "  1: Goto\n"
-    "BasicBlock 1, pred: 0, succ: 3, 2\n"
+    "BasicBlock 1, pred: 0, succ: 2, 5\n"
     "  2: Equal(0, 0) [3]\n"
     "  3: If(2)\n"
     "BasicBlock 2, pred: 1, succ: 3\n"
     "  4: Goto\n"
-    "BasicBlock 3, pred: 1, 2, succ: 4\n"
+    "BasicBlock 3, pred: 2, 5, succ: 4\n"
     "  5: ReturnVoid\n"
     "BasicBlock 4, pred: 3\n"
-    "  6: Exit\n";
+    "  6: Exit\n"
+    // Synthesized block to avoid a critical edge.
+    "BasicBlock 5, pred: 1, succ: 3\n"
+    "  7: Goto\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -125,16 +128,19 @@
     "  0: IntConstant 0 [6, 3, 3]\n"
     "  1: IntConstant 4 [6]\n"
     "  2: Goto\n"
-    "BasicBlock 1, pred: 0, succ: 3, 2\n"
+    "BasicBlock 1, pred: 0, succ: 2, 5\n"
     "  3: Equal(0, 0) [4]\n"
     "  4: If(3)\n"
     "BasicBlock 2, pred: 1, succ: 3\n"
     "  5: Goto\n"
-    "BasicBlock 3, pred: 1, 2, succ: 4\n"
-    "  6: Phi(0, 1) [7]\n"
+    "BasicBlock 3, pred: 2, 5, succ: 4\n"
+    "  6: Phi(1, 0) [7]\n"
     "  7: Return(6)\n"
     "BasicBlock 4, pred: 3\n"
-    "  8: Exit\n";
+    "  8: Exit\n"
+    // Synthesized block to avoid a critical edge.
+    "BasicBlock 5, pred: 1, succ: 3\n"
+    "  9: Goto\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -184,16 +190,21 @@
     "BasicBlock 0, succ: 1\n"
     "  0: IntConstant 0 [6, 4, 2, 2]\n"
     "  1: Goto\n"
-    "BasicBlock 1, pred: 0, succ: 3, 2\n"
+    "BasicBlock 1, pred: 0, succ: 5, 6\n"
     "  2: Equal(0, 0) [3]\n"
     "  3: If(2)\n"
-    "BasicBlock 2, pred: 1, 3, succ: 3\n"
-    "  4: Phi(0, 6) [6]\n"
+    "BasicBlock 2, pred: 3, 6, succ: 3\n"
+    "  4: Phi(6, 0) [6]\n"
     "  5: Goto\n"
-    "BasicBlock 3, pred: 1, 2, succ: 2\n"
-    "  6: Phi(0, 4) [4]\n"
+    "BasicBlock 3, pred: 2, 5, succ: 2\n"
+    "  6: Phi(4, 0) [4]\n"
     "  7: Goto\n"
-    "BasicBlock 4\n";
+    "BasicBlock 4\n"
+    // Synthesized blocks to avoid critical edges.
+    "BasicBlock 5, pred: 1, succ: 3\n"
+    "  8: Goto\n"
+    "BasicBlock 6, pred: 1, succ: 2\n"
+    "  9: Goto\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -349,26 +360,30 @@
   const char* expected =
     "BasicBlock 0, succ: 1\n"
     "  0: IntConstant 0 [5]\n"
-    "  1: IntConstant 4 [5, 8, 8]\n"
-    "  2: IntConstant 5 [5]\n"
+    "  1: IntConstant 4 [14, 8, 8]\n"
+    "  2: IntConstant 5 [14]\n"
     "  3: Goto\n"
     "BasicBlock 1, pred: 0, succ: 2\n"
     "  4: Goto\n"
-    "BasicBlock 2, pred: 1, 4, 5, succ: 6, 3\n"
-    "  5: Phi(0, 2, 1) [12, 6, 6]\n"
+    "BasicBlock 2, pred: 1, 8, succ: 6, 3\n"
+    "  5: Phi(0, 14) [12, 6, 6]\n"
     "  6: Equal(5, 5) [7]\n"
     "  7: If(6)\n"
     "BasicBlock 3, pred: 2, succ: 5, 4\n"
     "  8: Equal(1, 1) [9]\n"
     "  9: If(8)\n"
-    "BasicBlock 4, pred: 3, succ: 2\n"
+    "BasicBlock 4, pred: 3, succ: 8\n"
     "  10: Goto\n"
-    "BasicBlock 5, pred: 3, succ: 2\n"
+    "BasicBlock 5, pred: 3, succ: 8\n"
     "  11: Goto\n"
     "BasicBlock 6, pred: 2, succ: 7\n"
     "  12: Return(5)\n"
     "BasicBlock 7, pred: 6\n"
-    "  13: Exit\n";
+    "  13: Exit\n"
+    // Synthesized single back edge of the loop.
+    "BasicBlock 8, pred: 5, 4, succ: 2\n"
+    "  14: Phi(1, 2) [5]\n"
+    "  15: Goto\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -393,7 +408,7 @@
     "  3: Goto\n"
     "BasicBlock 1, pred: 0, succ: 2\n"
     "  4: Goto\n"
-    "BasicBlock 2, pred: 1, 5, succ: 6, 3\n"
+    "BasicBlock 2, pred: 1, 5, succ: 3, 8\n"
     "  5: Phi(0, 1) [12, 6, 6]\n"
     "  6: Equal(5, 5) [7]\n"
     "  7: If(6)\n"
@@ -404,11 +419,13 @@
     "  10: Goto\n"
     "BasicBlock 5, pred: 3, succ: 2\n"
     "  11: Goto\n"
-    "BasicBlock 6, pred: 2, 4, succ: 7\n"
-    "  12: Phi(5, 2) [13]\n"
+    "BasicBlock 6, pred: 4, 8, succ: 7\n"
+    "  12: Phi(2, 5) [13]\n"
     "  13: Return(12)\n"
     "BasicBlock 7, pred: 6\n"
-    "  14: Exit\n";
+    "  14: Exit\n"
+    "BasicBlock 8, pred: 2, succ: 6\n"
+    "  15: Goto\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
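The synthesized blocks in these expectations split critical edges: edges leaving a block with more than one successor and entering a block with more than one predecessor. A sketch of the condition (hypothetical helper; the accessors are the ones used throughout this change):

    bool IsCriticalEdge(HBasicBlock* from, HBasicBlock* to) {
      // Such an edge cannot host code of its own, so a new block holding a
      // single Goto (e.g. "BasicBlock 5" above) is inserted on it.
      return from->GetSuccessors().Size() > 1u && to->GetPredecessors().Size() > 1u;
    }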
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index fb909a8..d03b99f 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -62,18 +62,15 @@
 
   switch (abi) {
     case kInterpreterAbi:  // Thread* is first argument (X0) in interpreter ABI.
-      // FIXME IPx used by VIXL - this is unsafe.
       __ JumpTo(Arm64ManagedRegister::FromCoreRegister(X0), Offset(offset.Int32Value()),
           Arm64ManagedRegister::FromCoreRegister(IP1));
 
       break;
     case kJniAbi:  // Load via Thread* held in JNIEnv* in first argument (X0).
-
       __ LoadRawPtr(Arm64ManagedRegister::FromCoreRegister(IP1),
                       Arm64ManagedRegister::FromCoreRegister(X0),
                       Offset(JNIEnvExt::SelfOffset().Int32Value()));
 
-      // FIXME IPx used by VIXL - this is unsafe.
       __ JumpTo(Arm64ManagedRegister::FromCoreRegister(IP1), Offset(offset.Int32Value()),
                 Arm64ManagedRegister::FromCoreRegister(IP0));
 
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index b4bb979..f486b3c 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -50,11 +50,11 @@
 }
 
 void Arm64Assembler::GetCurrentThread(ManagedRegister tr) {
-  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR1));
+  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(ETR));
 }
 
 void Arm64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister /* scratch */) {
-  StoreToOffset(TR1, SP, offset.Int32Value());
+  StoreToOffset(ETR, SP, offset.Int32Value());
 }
 
 // See Arm64 PCS Section 5.2.2.1.
@@ -79,11 +79,13 @@
     // VIXL macro-assembler handles all variants.
     ___ Add(reg_x(rd), reg_x(rn), value);
   } else {
-    // ip1 = rd + value
-    // rd = cond ? ip1 : rn
-    CHECK_NE(rn, IP1);
-    ___ Add(reg_x(IP1), reg_x(rn), value);
-    ___ Csel(reg_x(rd), reg_x(IP1), reg_x(rd), COND_OP(cond));
+    // temp = rd + value
+    // rd = cond ? temp : rn
+    vixl::UseScratchRegisterScope temps(vixl_masm_);
+    temps.Exclude(reg_x(rd), reg_x(rn));
+    vixl::Register temp = temps.AcquireX();
+    ___ Add(temp, reg_x(rn), value);
+    ___ Csel(reg_x(rd), temp, reg_x(rd), COND_OP(cond));
   }
 }
 
@@ -162,7 +164,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), TR1, offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, offs.Int32Value());
 }
 
 void Arm64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> tr_offs,
@@ -171,13 +173,14 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   AddConstant(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset<8> tr_offs) {
-  // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
-  ___ Mov(reg_x(IP1), reg_x(SP));
-  StoreToOffset(IP1, TR1, tr_offs.Int32Value());
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  vixl::Register temp = temps.AcquireX();
+  ___ Mov(temp, reg_x(SP));
+  ___ Str(temp, MEM_OP(reg_x(ETR), tr_offs.Int32Value()));
 }
 
 void Arm64Assembler::StoreSpanning(FrameOffset dest_off, ManagedRegister m_source,
@@ -195,12 +198,14 @@
   if ((cond == AL) || (cond == NV)) {
     ___ Mov(reg_x(dest), value);
   } else {
-    // ip1 = value
-    // rd = cond ? ip1 : rd
+    // temp = value
+    // rd = cond ? temp : rd
     if (value != 0) {
-      CHECK_NE(dest, IP1);
-      ___ Mov(reg_x(IP1), value);
-      ___ Csel(reg_x(dest), reg_x(IP1), reg_x(dest), COND_OP(cond));
+      vixl::UseScratchRegisterScope temps(vixl_masm_);
+      temps.Exclude(reg_x(dest));
+      vixl::Register temp = temps.AcquireX();
+      ___ Mov(temp, value);
+      ___ Csel(reg_x(dest), temp, reg_x(dest), COND_OP(cond));
     } else {
       ___ Csel(reg_x(dest), reg_x(XZR), reg_x(dest), COND_OP(cond));
     }
@@ -276,7 +281,7 @@
 }
 
 void Arm64Assembler::LoadFromThread64(ManagedRegister m_dst, ThreadOffset<8> src, size_t size) {
-  return Load(m_dst.AsArm64(), TR1, src.Int32Value(), size);
+  return Load(m_dst.AsArm64(), ETR, src.Int32Value(), size);
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, FrameOffset offs) {
@@ -298,13 +303,16 @@
   Arm64ManagedRegister dst = m_dst.AsArm64();
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsCoreRegister() && base.IsCoreRegister());
-  LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  // Remove dst and base from the temp list - higher level API uses IP1, IP0.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(dst.AsCoreRegister()), reg_x(base.AsCoreRegister()));
+  ___ Ldr(reg_x(dst.AsCoreRegister()), MEM_OP(reg_x(base.AsCoreRegister()), offs.Int32Value()));
 }
 
 void Arm64Assembler::LoadRawPtrFromThread64(ManagedRegister m_dst, ThreadOffset<8> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), TR1, offs.Int32Value());
+  LoadFromOffset(dst.AsCoreRegister(), ETR, offs.Int32Value());
 }
 
 // Copying routines.
@@ -342,7 +350,7 @@
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
@@ -352,7 +360,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadFromOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::CopyRef(FrameOffset dest, FrameOffset src,
@@ -511,7 +519,10 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(base.IsCoreRegister()) << base;
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  // Remove base and scratch from the temp list - higher level API uses IP1, IP0.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(base.AsCoreRegister()), reg_x(scratch.AsCoreRegister()));
+  ___ Ldr(reg_x(scratch.AsCoreRegister()), MEM_OP(reg_x(base.AsCoreRegister()), offs.Int32Value()));
   ___ Br(reg_x(scratch.AsCoreRegister()));
 }
 
@@ -595,13 +606,17 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR1, Thread::ExceptionOffset<8>().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), ETR, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
 
 void Arm64Assembler::EmitExceptionPoll(Arm64Exception *exception) {
-    // Bind exception poll entry.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(exception->scratch_.AsCoreRegister()));
+  vixl::Register temp = temps.AcquireX();
+
+  // Bind exception poll entry.
   ___ Bind(exception->Entry());
   if (exception->stack_adjust_ != 0) {  // Fix up the frame.
     DecreaseFrameSize(exception->stack_adjust_);
@@ -609,12 +624,14 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR1, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+  ___ Ldr(temp, MEM_OP(reg_x(ETR), QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value()));
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR), reg_x(TR1));
+  // Move ETR (callee-saved) back to TR (caller-saved). We use ETR on calls
+  // to external functions that might trash TR. We do not need the original
+  // X19 saved in BuildFrame().
+  ___ Mov(reg_x(TR), reg_x(ETR));
 
-  ___ Blr(reg_x(IP1));
+  ___ Blr(temp);
   // Call should never return.
   ___ Brk();
 }
@@ -634,8 +651,10 @@
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
   ___ PushCalleeSavedRegisters();
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR1), reg_x(TR));
+  // Move TR (caller-saved) to ETR (callee-saved). The original X19 has been
+  // saved by PushCalleeSavedRegisters(). This ensures that TR is not trashed
+  // by native code.
+  ___ Mov(reg_x(ETR), reg_x(TR));
 
   // Increase frame to required size - must be at least space to push Method*.
   CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
@@ -681,8 +700,10 @@
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR), reg_x(TR1));
+  // Move ETR (callee-saved) back to TR (caller-saved), since TR might have
+  // been trashed by the native call. The original X19 (ETR) is restored as
+  // part of PopCalleeSavedRegisters().
+  ___ Mov(reg_x(TR), reg_x(ETR));
 
   // Pop callee saved and return to LR.
   ___ PopCalleeSavedRegisters();
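Every IP1 use replaced in this file follows the same VIXL idiom; a minimal sketch of the pattern (not a new API, argument names illustrative):

    void EmitWithTemp(vixl::MacroAssembler* masm, vixl::Register rd, vixl::Register rn) {
      vixl::UseScratchRegisterScope temps(masm);
      temps.Exclude(rd, rn);                   // keep live registers out of the scratch pool
      vixl::Register temp = temps.AcquireX();  // borrow a scratch X register (IP0/IP1)
      // ... emit code that may clobber temp ...
    }                                          // scope exit returns temp to the pool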
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 97fb93a..583150c 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -85,6 +85,7 @@
   vixl_masm_(new vixl::MacroAssembler(vixl_buf_, kBufferSizeArm64)) {}
 
   virtual ~Arm64Assembler() {
+    delete vixl_masm_;
     delete[] vixl_buf_;
   }
 
@@ -237,8 +238,8 @@
   // Vixl buffer.
   byte* vixl_buf_;
 
-  // Unique ptr - vixl assembler.
-  UniquePtr<vixl::MacroAssembler> vixl_masm_;
+  // Vixl assembler.
+  vixl::MacroAssembler* vixl_masm_;
 
   // List of exception blocks to generate at the end of the code cache.
   std::vector<Arm64Exception*> exception_blocks_;
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index c2af767..e0bfc6b 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -765,7 +765,7 @@
 
   for (int i = 0; i < argc; i++) {
     const StringPiece option(argv[i]);
-    bool log_options = false;
+    const bool log_options = false;
     if (log_options) {
       LOG(INFO) << "dex2oat: option[" << i << "]=" << argv[i];
     }
@@ -958,7 +958,9 @@
   bool image = (!image_filename.empty());
   if (!image && boot_image_filename.empty()) {
     boot_image_filename += GetAndroidRoot();
-    boot_image_filename += "/framework/boot.art";
+    boot_image_filename += "/framework/boot-";
+    boot_image_filename += GetInstructionSetString(instruction_set);
+    boot_image_filename += ".art";
   }
   std::string boot_image_option;
   if (!boot_image_filename.empty()) {
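With this change a runtime started without an explicit boot image looks for a per-ISA default. Illustrative results, assuming GetAndroidRoot() returns "/system":

    /system/framework/boot-arm.art      (arm)
    /system/framework/boot-arm64.art    (arm64)
    /system/framework/boot-x86.art      (x86)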
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 412a052..fc60c02 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -1548,7 +1548,7 @@
   }
   UniquePtr<Runtime> runtime(Runtime::Current());
   // Runtime::Create acquired the mutator_lock_ that is normally given away when we Runtime::Start,
-  // give it away now and then switch to a more managable ScopedObjectAccess.
+  // give it away now and then switch to a more manageable ScopedObjectAccess.
   Thread::Current()->TransitionFromRunnableToSuspended(kNative);
   ScopedObjectAccess soa(Thread::Current());
   gc::Heap* heap = Runtime::Current()->GetHeap();
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index dcf4561..4886561 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -442,8 +442,8 @@
     cmp    r1, #65536
     bpl    .Lrecursive_thin_unlock
     @ transition to unlocked, r3 holds 0
-    str    r3, [r0, #LOCK_WORD_OFFSET]
     dmb    ish                        @ full (StoreLoad) memory barrier
+    str    r3, [r0, #LOCK_WORD_OFFSET]
     bx     lr
 .Lrecursive_thin_unlock:
     sub    r1, r1, #65536
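Swapping the barrier and the store gives the unlock release semantics: writes made inside the critical section become visible before the lock word is cleared. In C++11 terms the fast path behaves like a release store (a sketch, not ART code):

    #include <atomic>

    void ThinUnlockFastPath(std::atomic<uint32_t>* lock_word) {
      // Equivalent of "dmb ish" followed by "str r3, [r0, #LOCK_WORD_OFFSET]":
      // nothing from the critical section may be reordered past this store.
      lock_word->store(0u, std::memory_order_release);
    }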
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 8079460..c056b2f 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -443,35 +443,32 @@
     DELIVER_PENDING_EXCEPTION
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro NO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
-    mov x0, x19                        // pass Thread::Current
+    mov x0, xSELF                        // pass Thread::Current
     mov x1, sp                        // pass SP
     b   \cxx_name                     // \cxx_name(Thread*, SP)
 END \c_name
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro ONE_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context.
-    mov x1, x19                       // pass Thread::Current.
+    mov x1, xSELF                       // pass Thread::Current.
     mov x2, sp                        // pass SP.
     b   \cxx_name                     // \cxx_name(arg, Thread*, SP).
     brk 0
 END \c_name
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro TWO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
-    mov x2, x19                       // pass Thread::Current
+    mov x2, xSELF                       // pass Thread::Current
     mov x3, sp                        // pass SP
     b   \cxx_name                     // \cxx_name(arg1, arg2, Thread*, SP)
     brk 0
@@ -991,7 +988,6 @@
      * failure.
      */
     .extern artHandleFillArrayDataFromCode
-// TODO: xSELF -> x19.
 ENTRY art_quick_handle_fill_data
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // Save callee saves in case exception allocation triggers GC.
     mov    x2, xSELF                       // Pass Thread::Current.
@@ -1002,8 +998,81 @@
     DELIVER_PENDING_EXCEPTION
 END art_quick_handle_fill_data
 
-UNIMPLEMENTED art_quick_lock_object
-UNIMPLEMENTED art_quick_unlock_object
+    /*
+     * Entry from managed code that calls artLockObjectFromCode, may block for GC. x0 holds the
+     * possibly null object to lock.
+     *
+     * Derived from arm32 code.
+     */
+    .extern artLockObjectFromCode
+ENTRY art_quick_lock_object
+    cbz    w0, .Lslow_lock
+    add    x4, x0, #LOCK_WORD_OFFSET  // exclusive load/store has no immediate offset form
+.Lretry_lock:
+    ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
+    ldxr   w1, [x4]
+    cbnz   w1, .Lnot_unlocked         // already thin locked
+    stxr   w3, w2, [x4]
+    cbnz   w3, .Lstrex_fail           // store failed, retry
+    dmb    ishld                      // LoadLoad memory barrier, TODO: acquire-release
+    ret
+.Lstrex_fail:
+    b .Lretry_lock                    // reached via unlikely forward branch; reload and recheck w1/w2
+.Lnot_unlocked:
+    lsr    w3, w1, 30
+    cbnz   w3, .Lslow_lock            // if either of the top two bits is set, go slow path
+    eor    w2, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
+    uxth   w2, w2                     // zero top 16 bits
+    cbnz   w2, .Lslow_lock            // if thread ids differ: contention, go to slow path
+                                      // else ids match: recursive lock, fall through
+    add    w2, w1, #65536             // increment count in lock word, placing it in w2 for storing
+    lsr    w1, w2, 30                 // if either of the top two bits is set, we overflowed.
+    cbnz   w1, .Lslow_lock            // if we overflowed the count, go slow path
+    str    w2, [x0, #LOCK_WORD_OFFSET]  // no need for stxr as we hold the lock
+    ret
+.Lslow_lock:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case we block
+    mov    x1, xSELF                  // pass Thread::Current
+    mov    x2, sp                     // pass SP
+    bl     artLockObjectFromCode      // (Object* obj, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_W0_IS_ZERO_OR_DELIVER
+END art_quick_lock_object
+
+    /*
+     * Entry from managed code that calls artUnlockObjectFromCode and delivers exception on failure.
+     * x0 holds the possibly null object to unlock.
+     *
+     * Derived from arm32 code.
+     */
+    .extern artUnlockObjectFromCode
+ENTRY art_quick_unlock_object
+    cbz    x0, .Lslow_unlock
+    ldr    w1, [x0, #LOCK_WORD_OFFSET]
+    lsr    w2, w1, 30
+    cbnz   w2, .Lslow_unlock          // if either of the top two bits is set, go slow path
+    ldr    w2, [xSELF, #THREAD_ID_OFFSET]
+    eor    w3, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
+    uxth   w3, w3                     // zero top 16 bits
+    cbnz   w3, .Lslow_unlock          // if thread ids don't match, go slow path
+    cmp    w1, #65536
+    bpl    .Lrecursive_thin_unlock
+    // transition to unlocked, w3 holds 0
+    dmb    ish                        // full (StoreLoad) memory barrier
+    str    w3, [x0, #LOCK_WORD_OFFSET]
+    ret
+.Lrecursive_thin_unlock:
+    sub    w1, w1, #65536
+    str    w1, [x0, #LOCK_WORD_OFFSET]
+    ret
+.Lslow_unlock:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case exception allocation triggers GC
+    mov    x1, xSELF                  // pass Thread::Current
+    mov    x2, sp                     // pass SP
+    bl     artUnlockObjectFromCode    // (Object* obj, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_W0_IS_ZERO_OR_DELIVER
+END art_quick_unlock_object
 
     /*
      * Entry from managed code that calls artIsAssignableFromCode and on failure calls
@@ -1166,12 +1235,7 @@
     brk 0                         // Unreached.
 END art_quick_aput_obj
 
-UNIMPLEMENTED art_quick_initialize_static_storage
-UNIMPLEMENTED art_quick_initialize_type
-UNIMPLEMENTED art_quick_initialize_type_and_verify_access
-
 // Macro to facilitate adding new allocation entrypoints.
-// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro TWO_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
@@ -1186,7 +1250,6 @@
 .endm
 
 // Macro to facilitate adding new array allocation entrypoints.
-// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro THREE_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
@@ -1244,6 +1307,16 @@
 END \name
 .endm
 
+    /*
+     * Entry from managed code when static storage is uninitialized; this stub will run the
+     * class initializer and deliver the exception on error. On success the static storage
+     * base is returned.
+     */
+TWO_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO
+
+UNIMPLEMENTED art_quick_initialize_type
+UNIMPLEMENTED art_quick_initialize_type_and_verify_access
+
 ONE_ARG_REF_DOWNCALL art_quick_get32_static, artGet32StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get64_static, artGet64StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get_obj_static, artGetObjStaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
@@ -1273,8 +1346,13 @@
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_set64_static
 
-
-UNIMPLEMENTED art_quick_resolve_string
+    /*
+     * Entry from managed code to resolve a string; this stub will allocate a String and deliver
+     * an exception on error. On success the String is returned. x0 holds the referring method,
+     * w1 holds the string index. The fast-path check for a hit in the strings cache has already
+     * been performed.
+     */
+TWO_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALL_ALLOC_ENTRYPOINTS
@@ -1293,7 +1371,7 @@
     mov     x2, xSELF                   // pass Thread::Current
     mov     x3, sp                      // pass SP
     bl      artQuickProxyInvokeHandler  // (Method* proxy method, receiver, Thread*, SP)
-    ldr  xSELF, [sp, #200]              // Restore self pointer.
+    ldr     xSELF, [sp, #200]           // Restore self pointer.
     ldr     x2, [xSELF, THREAD_EXCEPTION_OFFSET]
     cbnz    x2, .Lexception_in_proxy    // success if no exception is pending
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME_NO_D0 // keep d0
@@ -1308,14 +1386,13 @@
 
 ENTRY art_quick_resolution_trampoline
     SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME
-    mov x19, x0           // save the called method
     mov x2, xSELF
     mov x3, sp
     bl artQuickResolutionTrampoline  // (called, receiver, Thread*, SP)
-    mov x9, x0            // Remember returned code pointer in x9.
-    mov x0, x19           // Restore the method, before x19 is restored to on-call value
+    cbz x0, 1f
+    mov x9, x0              // Remember returned code pointer in x9.
+    ldr x0, [sp, #0]        // artQuickResolutionTrampoline puts called method in *SP.
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
-    cbz x9, 1f
     br x9
 1:
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
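The new lock/unlock stubs all test the same lock word layout. A simplified sketch of the fields the assembly manipulates (the runtime's LockWord class is authoritative):

    #include <cstdint>

    // bits 31..30  state: 00 = thin or unlocked; anything else (fat monitor,
    //              hash code) sends the stubs to the slow path
    // bits 29..16  recursive lock count ("add w2, w1, #65536" adds one)
    // bits 15..0   owner thread id; zero means unlocked ("uxth" isolates this)
    constexpr uint32_t kStateMask    = 0xC0000000u;  // what "lsr wN, wM, 30" checks
    constexpr uint32_t kThreadIdMask = 0x0000FFFFu;

    bool IsThinLockedBySelf(uint32_t lock_word, uint32_t thread_id) {
      return (lock_word & kStateMask) == 0u &&
             (lock_word & kThreadIdMask) == thread_id;
    }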
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index 2503918..ea346e0 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -56,8 +56,8 @@
   X29 = 29,
   X30 = 30,
   X31 = 31,
-  TR  = 18,     // ART Thread Register - Needs to be one of the callee saved regs.
-  TR1 = 19,     // FIXME!
+  TR  = 18,     // ART Thread Register - Managed Runtime (Caller Saved Reg)
+  ETR = 19,     // ART Thread Register - External Calls  (Callee Saved Reg)
   IP0 = 16,     // Used as scratch by VIXL.
   IP1 = 17,     // Used as scratch by ART JNI Assembler.
   FP  = 29,
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 5e8edf0..6a2bfb5 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -52,6 +52,12 @@
     }
   }
 
+  // Helper function needed since TEST_F makes a new class.
+  Thread::tls_ptr_sized_values* GetTlsPtr(Thread* self) {
+    return &self->tlsPtr_;
+  }
+
+ public:
   size_t Invoke3(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, Thread* self) {
     // Push a transition back into managed code onto the linked list in thread.
     ManagedStack fragment;
@@ -165,7 +171,6 @@
     return result;
   }
 
- public:
   // TODO: Set up a frame according to referrer's specs.
   size_t Invoke3WithReferrer(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, Thread* self,
                              mirror::ArtMethod* referrer) {
@@ -353,12 +358,12 @@
 #endif
 }
 
-#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 extern "C" void art_quick_lock_object(void);
 #endif
 
 TEST_F(StubTest, LockObject) {
-#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
   static constexpr size_t kThinLockLoops = 100;
 
   Thread* self = Thread::Current();
@@ -392,8 +397,21 @@
     EXPECT_EQ(l_inc.ThinLockCount(), i);
   }
 
-  // TODO: Improve this test. Somehow force it to go to fat locked. But that needs another thread.
+  // Force a fat lock by running identity hashcode to fill up the lock word.
+  SirtRef<mirror::Object> obj2(soa.Self(), mirror::String::AllocFromModifiedUtf8(soa.Self(),
+                                                                                 "hello, world!"));
 
+  obj2->IdentityHashCode();
+
+  Invoke3(reinterpret_cast<size_t>(obj2.get()), 0U, 0U,
+          reinterpret_cast<uintptr_t>(&art_quick_lock_object), self);
+
+  LockWord lock_after2 = obj2->GetLockWord(false);
+  LockWord::LockState new_state2 = lock_after2.GetState();
+  EXPECT_EQ(LockWord::LockState::kFatLocked, new_state2);
+  EXPECT_NE(lock_after2.FatLockMonitor(), static_cast<Monitor*>(nullptr));
+
+  // Test done.
 #else
   LOG(INFO) << "Skipping lock_object as I don't know how to do that on " << kRuntimeISA;
   // Force-print to std::cout so it's also outside the logcat.
@@ -415,13 +433,14 @@
 };
 
 
-#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 extern "C" void art_quick_lock_object(void);
 extern "C" void art_quick_unlock_object(void);
 #endif
 
-TEST_F(StubTest, UnlockObject) {
-#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+// NO_THREAD_SAFETY_ANALYSIS as we do not want to grab the exclusive mutator lock for MonitorInfo.
+static void TestUnlockObject(StubTest* test) NO_THREAD_SAFETY_ANALYSIS {
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
   static constexpr size_t kThinLockLoops = 100;
 
   Thread* self = Thread::Current();
@@ -435,8 +454,8 @@
   LockWord::LockState old_state = lock.GetState();
   EXPECT_EQ(LockWord::LockState::kUnlocked, old_state);
 
-  Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
-          reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
+  test->Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
+                reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
 
   // This should be an illegal monitor state.
   EXPECT_TRUE(self->IsExceptionPending());
@@ -446,15 +465,15 @@
   LockWord::LockState new_state = lock_after.GetState();
   EXPECT_EQ(LockWord::LockState::kUnlocked, new_state);
 
-  Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
-          reinterpret_cast<uintptr_t>(&art_quick_lock_object), self);
+  test->Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
+                reinterpret_cast<uintptr_t>(&art_quick_lock_object), self);
 
   LockWord lock_after2 = obj->GetLockWord(false);
   LockWord::LockState new_state2 = lock_after2.GetState();
   EXPECT_EQ(LockWord::LockState::kThinLocked, new_state2);
 
-  Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
-          reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
+  test->Invoke3(reinterpret_cast<size_t>(obj.get()), 0U, 0U,
+                reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
 
   LockWord lock_after3 = obj->GetLockWord(false);
   LockWord::LockState new_state3 = lock_after3.GetState();
@@ -468,13 +487,16 @@
 
   constexpr size_t kNumberOfLocks = 10;  // Number of objects = number of locks
   constexpr size_t kIterations = 10000;  // Number of iterations
+  constexpr size_t kMoveToFat = 1000;    // Chance of 1:kMoveToFat to make a lock fat.
 
   size_t counts[kNumberOfLocks];
+  bool fat[kNumberOfLocks];  // Whether a lock should be thin or fat.
   SirtRef<mirror::String>* objects[kNumberOfLocks];
 
   // Initialize = allocate.
   for (size_t i = 0; i < kNumberOfLocks; ++i) {
     counts[i] = 0;
+    fat[i] = false;
     objects[i] = new SirtRef<mirror::String>(soa.Self(),
                                              mirror::String::AllocFromModifiedUtf8(soa.Self(), ""));
   }
@@ -483,36 +505,57 @@
     // Select which lock to update.
     size_t index = r.next() % kNumberOfLocks;
 
-    bool lock;  // Whether to lock or unlock in this step.
-    if (counts[index] == 0) {
-      lock = true;
-    } else if (counts[index] == kThinLockLoops) {
-      lock = false;
-    } else {
-      // Randomly.
-      lock = r.next() % 2 == 0;
-    }
+    // Make lock fat?
+    if (!fat[index] && (r.next() % kMoveToFat == 0)) {
+      fat[index] = true;
+      objects[index]->get()->IdentityHashCode();
 
-    if (lock) {
-      Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
-              reinterpret_cast<uintptr_t>(&art_quick_lock_object), self);
-      counts[index]++;
+      LockWord lock_iter = objects[index]->get()->GetLockWord(false);
+      LockWord::LockState iter_state = lock_iter.GetState();
+      if (counts[index] == 0) {
+        EXPECT_EQ(LockWord::LockState::kHashCode, iter_state);
+      } else {
+        EXPECT_EQ(LockWord::LockState::kFatLocked, iter_state);
+      }
     } else {
-      Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
-              reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
-      counts[index]--;
-    }
+      bool lock;  // Whether to lock or unlock in this step.
+      if (counts[index] == 0) {
+        lock = true;
+      } else if (counts[index] == kThinLockLoops) {
+        lock = false;
+      } else {
+        // Randomly.
+        lock = r.next() % 2 == 0;
+      }
 
-    EXPECT_FALSE(self->IsExceptionPending());
+      if (lock) {
+        test->Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
+                      reinterpret_cast<uintptr_t>(&art_quick_lock_object), self);
+        counts[index]++;
+      } else {
+        test->Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
+                      reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
+        counts[index]--;
+      }
 
-    // Check the new state.
-    LockWord lock_iter = objects[index]->get()->GetLockWord(false);
-    LockWord::LockState iter_state = lock_iter.GetState();
-    if (counts[index] > 0) {
-      EXPECT_EQ(LockWord::LockState::kThinLocked, iter_state);
-      EXPECT_EQ(counts[index] - 1, lock_iter.ThinLockCount());
-    } else {
-      EXPECT_EQ(LockWord::LockState::kUnlocked, iter_state);
+      EXPECT_FALSE(self->IsExceptionPending());
+
+      // Check the new state.
+      LockWord lock_iter = objects[index]->get()->GetLockWord(true);
+      LockWord::LockState iter_state = lock_iter.GetState();
+      if (fat[index]) {
+        // Abuse MonitorInfo.
+        EXPECT_EQ(LockWord::LockState::kFatLocked, iter_state) << index;
+        MonitorInfo info(objects[index]->get());
+        EXPECT_EQ(counts[index], info.entry_count_) << index;
+      } else {
+        if (counts[index] > 0) {
+          EXPECT_EQ(LockWord::LockState::kThinLocked, iter_state);
+          EXPECT_EQ(counts[index] - 1, lock_iter.ThinLockCount());
+        } else {
+          EXPECT_EQ(LockWord::LockState::kUnlocked, iter_state);
+        }
+      }
     }
   }
 
@@ -522,21 +565,21 @@
     size_t index = kNumberOfLocks - 1 - i;
     size_t count = counts[index];
     while (count > 0) {
-      Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
-              reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
+      test->Invoke3(reinterpret_cast<size_t>(objects[index]->get()), 0U, 0U,
+                    reinterpret_cast<uintptr_t>(&art_quick_unlock_object), self);
 
       count--;
     }
 
     LockWord lock_after4 = objects[index]->get()->GetLockWord(false);
     LockWord::LockState new_state4 = lock_after4.GetState();
-    EXPECT_EQ(LockWord::LockState::kUnlocked, new_state4);
+    EXPECT_TRUE(LockWord::LockState::kUnlocked == new_state4
+                || LockWord::LockState::kFatLocked == new_state4);
 
     delete objects[index];
   }
 
-  // TODO: Improve this test. Somehow force it to go to fat locked. But that needs another thread.
-
+  // Test done.
 #else
   LOG(INFO) << "Skipping unlock_object as I don't know how to do that on " << kRuntimeISA;
   // Force-print to std::cout so it's also outside the logcat.
@@ -544,6 +587,9 @@
 #endif
 }
 
+TEST_F(StubTest, UnlockObject) {
+  TestUnlockObject(this);
+}
 
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 extern "C" void art_quick_check_cast(void);
@@ -728,13 +774,6 @@
 #endif
 }
 
-
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
-extern "C" void art_quick_alloc_object_rosalloc(void);
-extern "C" void art_quick_alloc_object_resolved_rosalloc(void);
-extern "C" void art_quick_alloc_object_initialized_rosalloc(void);
-#endif
-
 TEST_F(StubTest, AllocObject) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
 
@@ -752,13 +791,12 @@
   // Play with it...
 
   EXPECT_FALSE(self->IsExceptionPending());
-
   {
     // Use an arbitrary method from c to use as referrer
     size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
                             reinterpret_cast<size_t>(c->GetVirtualMethod(0)),  // arbitrary
                             0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObject),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -772,7 +810,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectResolved),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -786,7 +824,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectInitialized),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -843,7 +881,7 @@
     self->ClearException();
 
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectInitialized),
                             self);
 
     EXPECT_TRUE(self->IsExceptionPending());
@@ -867,12 +905,6 @@
 #endif
 }
 
-
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
-extern "C" void art_quick_alloc_array_rosalloc(void);
-extern "C" void art_quick_alloc_array_resolved_rosalloc(void);
-#endif
-
 TEST_F(StubTest, AllocObjectArray) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
 
@@ -903,7 +935,7 @@
     size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
                             reinterpret_cast<size_t>(c_obj->GetVirtualMethod(0)),  // arbitrary
                             10U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArray),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -918,7 +950,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 10U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArrayResolved),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending()) << PrettyTypeOf(self->GetException(nullptr));
@@ -938,7 +970,7 @@
   {
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr),
                             GB,  // that should fail...
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArrayResolved),
                             self);
 
     EXPECT_TRUE(self->IsExceptionPending());
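Routing allocations through GetTlsPtr(self)->quick_entrypoints, instead of rosalloc-specific symbols, makes these tests allocator-agnostic. This is what the new "friend class StubTest" in runtime/thread.h at the end of this change exists for, since tlsPtr_ is otherwise private to Thread.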
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index a55dbb6..b886fb0 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -697,8 +697,8 @@
     jz   .Lslow_unlock
     movl LOCK_WORD_OFFSET(%edi), %ecx     // ecx := lock word
     movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test %ecx, %ecx
-    jb   .Lslow_unlock                    // lock word contains a monitor
+    test LITERAL(0xC0000000), %ecx
+    jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     cmpl LITERAL(65536), %ecx
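The removed sequence could never take the branch: "test" always clears CF, so "jb" (jump if CF set) was dead code. Masking the two state bits and branching on ZF expresses the intended check; in C++ terms (a sketch):

    #include <cstdint>

    // Slow path if the lock word's state bits (31..30) are set, i.e. it holds
    // a monitor pointer or a hash code rather than a thin lock.
    bool NeedsSlowUnlock(uint32_t lock_word) {
      return (lock_word & 0xC0000000u) != 0u;
    }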
diff --git a/runtime/base/bit_vector.cc b/runtime/base/bit_vector.cc
index 0e01dc2..a3e2b15 100644
--- a/runtime/base/bit_vector.cc
+++ b/runtime/base/bit_vector.cc
@@ -399,13 +399,13 @@
   return count;
 }
 
-void BitVector::Dump(std::ostream& os, const char *prefix) {
+void BitVector::Dump(std::ostream& os, const char *prefix) const {
   std::ostringstream buffer;
   DumpHelper(buffer, prefix);
   os << buffer.str() << std::endl;
 }
 
-void BitVector::DumpDot(FILE* file, const char* prefix, bool last_entry) {
+void BitVector::DumpDot(FILE* file, const char* prefix, bool last_entry) const {
   std::ostringstream buffer;
   Dump(buffer, prefix);
 
@@ -421,7 +421,7 @@
   fprintf(file, "\\\n");
 }
 
-void BitVector::DumpHelper(std::ostringstream& buffer, const char* prefix) {
+void BitVector::DumpHelper(std::ostringstream& buffer, const char* prefix) const {
   // Initialize it.
   if (prefix != nullptr) {
     buffer << prefix;
diff --git a/runtime/base/bit_vector.h b/runtime/base/bit_vector.h
index 6ee6b00..2a68396 100644
--- a/runtime/base/bit_vector.h
+++ b/runtime/base/bit_vector.h
@@ -148,11 +148,11 @@
 
     bool EnsureSizeAndClear(unsigned int num);
 
-    void Dump(std::ostream& os, const char* prefix);
-    void DumpDot(FILE* file, const char* prefix, bool last_entry = false);
+    void Dump(std::ostream& os, const char* prefix) const;
+    void DumpDot(FILE* file, const char* prefix, bool last_entry = false) const;
 
   protected:
-    void DumpHelper(std::ostringstream& buffer, const char* prefix);
+    void DumpHelper(std::ostringstream& buffer, const char* prefix) const;
 
   private:
     Allocator* const allocator_;
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index f2919e8..7235729 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1331,11 +1331,6 @@
       if (collector_type == collector_type_) {
         return;
       }
-      if (Runtime::Current()->IsShuttingDown(self)) {
-        // Don't allow heap transitions to happen if the runtime is shutting down since these can
-        // cause objects to get finalized.
-        return;
-      }
       // GC can be disabled if someone has used GetPrimitiveArrayCritical but not yet released it.
       if (!copying_transition || disable_moving_gc_count_ == 0) {
         // TODO: Not hard code in semi-space collector?
@@ -1345,6 +1340,12 @@
     }
     usleep(1000);
   }
+  if (Runtime::Current()->IsShuttingDown(self)) {
+    // Don't allow heap transitions to happen if the runtime is shutting down since these can
+    // cause objects to get finalized.
+    FinishGC(self, collector::kGcTypeNone);
+    return;
+  }
   tl->SuspendAll();
   switch (collector_type) {
     case kCollectorTypeSS:
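Note the added FinishGC(self, collector::kGcTypeNone): the shutdown check now runs after the wait loop has claimed the pending transition, so bailing out must release that state; otherwise threads blocked in the heap's wait-for-GC path would presumably never wake during shutdown.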
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index f1da6cd..446f898 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -144,8 +144,8 @@
   std::string error_msg;
   bool is_system = false;
   if (FindImageFilename(image_location, image_isa, &image_filename, &is_system)) {
-    ImageSpace* space = ImageSpace::Init(image_filename.c_str(), image_location,
-                                         !is_system, &error_msg);
+    ImageSpace* space = ImageSpace::Init(image_filename.c_str(), image_location, !is_system,
+                                         &error_msg);
     if (space != nullptr) {
       return space;
     }
diff --git a/runtime/image.h b/runtime/image.h
index ce2bc58..abe1ad8 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -91,7 +91,7 @@
   static std::string GetOatLocationFromImageLocation(const std::string& image) {
     std::string oat_filename = image;
     if (oat_filename.length() <= 3) {
-      return oat_filename + ".oat";
+      oat_filename += ".oat";
     } else {
       oat_filename.replace(oat_filename.length() - 3, 3, "oat");
     }
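A worked example of the rewrite (illustrative path): an image location of /system/framework/boot-arm64.art maps to /system/framework/boot-arm64.oat via the replace branch. Only degenerate locations of three characters or fewer take the append branch, and with this fix that branch falls through to the function's common return instead of returning early.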
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index f541633..7490e6a 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -58,12 +58,12 @@
     Runtime* runtime = Runtime::Current();
     JavaVMExt* vm = runtime->GetJavaVM();
     if (!vm->check_jni) {
-      VLOG(jni) << "Late-enabling -Xcheck:jni";
+      LOG(INFO) << "Late-enabling -Xcheck:jni";
       vm->SetCheckJniEnabled(true);
       // There's only one thread running at this point, so only one JNIEnv to fix up.
       Thread::Current()->GetJniEnv()->SetCheckJniEnabled(true);
     } else {
-      VLOG(jni) << "Not late-enabling -Xcheck:jni (already on)";
+      LOG(INFO) << "Not late-enabling -Xcheck:jni (already on)";
     }
     debug_flags &= ~DEBUG_ENABLE_CHECKJNI;
   }
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 84ca23b..8c18dff 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -181,8 +181,16 @@
   parallel_gc_threads_ = sysconf(_SC_NPROCESSORS_CONF) - 1;
   // Only the main GC thread, no workers.
   conc_gc_threads_ = 0;
-  // Default is CMS which is Sticky + Partial + Full CMS GC.
+  // The default GC type is set in makefiles.
+#if ART_DEFAULT_GC_TYPE_IS_CMS
   collector_type_ = gc::kCollectorTypeCMS;
+#elif ART_DEFAULT_GC_TYPE_IS_SS
+  collector_type_ = gc::kCollectorTypeSS;
+#elif ART_DEFAULT_GC_TYPE_IS_GSS
+  collector_type_ = gc::kCollectorTypeGSS;
+#else
+#error "ART default GC type must be set"
+#endif
   // If background_collector_type_ is kCollectorTypeNone, it defaults to the collector_type_ after
   // parsing options.
   background_collector_type_ = gc::kCollectorTypeNone;
@@ -674,7 +682,9 @@
 
   if (compiler_callbacks_ == nullptr && image_.empty()) {
     image_ += GetAndroidRoot();
-    image_ += "/framework/boot.art";
+    image_ += "/framework/boot-";
+    image_ += GetInstructionSetString(image_isa_);
+    image_ += ".art";
   }
   if (heap_growth_limit_ == 0) {
     heap_growth_limit_ = heap_maximum_size_;
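The preprocessor chain above expects exactly one ART_DEFAULT_GC_TYPE_IS_* macro to be defined at build time: defining ART_DEFAULT_GC_TYPE_IS_SS, for example, selects gc::kCollectorTypeSS, while defining none of the three trips the #error instead of silently picking a default.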
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 48322f0..99d43f4 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -209,6 +209,8 @@
     Thread* self = Thread::Current();
     if (self == nullptr) {
       os << "(Aborting thread was not attached to runtime!)\n";
+      DumpKernelStack(os, GetTid(), "  kernel: ", false);
+      DumpNativeStack(os, GetTid(), "  native: ", nullptr);
     } else {
       os << "Aborting thread:\n";
       if (Locks::mutator_lock_->IsExclusiveHeld(self) || Locks::mutator_lock_->IsSharedHeld(self)) {
@@ -936,8 +938,8 @@
 }
 
 void Runtime::VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
-  VisitConcurrentRoots(callback, arg, flags);
   VisitNonConcurrentRoots(callback, arg);
+  VisitConcurrentRoots(callback, arg, flags);
 }
 
 mirror::ObjectArray<mirror::ArtMethod>* Runtime::CreateDefaultImt(ClassLinker* cl) {
@@ -1199,17 +1201,9 @@
   // Make the dex2oat instruction set match that of the launching runtime. If we have multiple
   // architecture support, dex2oat may be compiled as a different instruction-set than that
   // currently being executed.
-#if defined(__arm__)
-  argv->push_back("--instruction-set=arm");
-#elif defined(__aarch64__)
-  argv->push_back("--instruction-set=arm64");
-#elif defined(__i386__)
-  argv->push_back("--instruction-set=x86");
-#elif defined(__x86_64__)
-  argv->push_back("--instruction-set=x86_64");
-#elif defined(__mips__)
-  argv->push_back("--instruction-set=mips");
-#endif
+  std::string instruction_set("--instruction-set=");
+  instruction_set += GetInstructionSetString(kRuntimeISA);
+  argv->push_back(instruction_set);
 
   std::string features("--instruction-set-features=");
   features += GetDefaultInstructionSetFeatures();
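kRuntimeISA is a compile-time InstructionSet constant matching the build target, so GetInstructionSetString(kRuntimeISA) yields "arm", "arm64", "x86", "x86_64" or "mips", exactly the strings the removed #if chain pushed by hand.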
diff --git a/runtime/stack.cc b/runtime/stack.cc
index b984aa6..fd31ec6 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -269,7 +269,7 @@
 void StackVisitor::SanityCheckFrame() const {
   if (kIsDebugBuild) {
     mirror::ArtMethod* method = GetMethod();
-    CHECK(method->GetClass() == mirror::ArtMethod::GetJavaLangReflectArtMethod());
+    CHECK_EQ(method->GetClass(), mirror::ArtMethod::GetJavaLangReflectArtMethod());
     if (cur_quick_frame_ != nullptr) {
       method->AssertPcIsWithinQuickCode(cur_quick_frame_pc_);
       // Frame sanity.
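CHECK_EQ is preferred over CHECK(a == b) here because its failure message prints both operand values, which is what you want when a debug-build sanity check like this fires.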
diff --git a/runtime/thread.h b/runtime/thread.h
index 8c17082..32311e1 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1082,6 +1082,7 @@
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.
+  friend class StubTest;  // For accessing entrypoints.
   friend class ThreadList;  // For ~Thread and Destroy.
 
   DISALLOW_COPY_AND_ASSIGN(Thread);