Merge "Don't create bump pointer spaces unless necessary."
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 5b4492f..767ffbf 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -22,6 +22,7 @@
 namespace art {
 
 enum RegisterClass {
+  kInvalidRegClass,
   kCoreReg,
   kFPReg,
   kAnyReg,
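
The kInvalidRegClass sentinel added as the first enumerator gives code that selects a register class per opcode a cheap way to assert that every switch case assigned a real class. A minimal standalone sketch of the pattern (mirroring the DCHECK_NE checks this change adds in fp_arm64.cc below):

    #include <cassert>

    enum RegisterClass { kInvalidRegClass, kCoreReg, kFPReg, kAnyReg };

    RegisterClass ClassForOpcode(int opcode) {
      RegisterClass dst_reg_class = kInvalidRegClass;
      switch (opcode) {
        case 0: dst_reg_class = kFPReg; break;    // e.g. INT_TO_FLOAT
        case 1: dst_reg_class = kCoreReg; break;  // e.g. FLOAT_TO_INT
        // ... every case must assign a real class ...
      }
      assert(dst_reg_class != kInvalidRegClass);  // a missed case trips here, like DCHECK_NE
      return dst_reg_class;
    }
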
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index 9bad736..c3f694d 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -180,10 +180,10 @@
     // Instruction::GOTO_32,
     // Instruction::PACKED_SWITCH,
     // Instruction::SPARSE_SWITCH,
-    // Instruction::CMPL_FLOAT,
-    // Instruction::CMPG_FLOAT,
-    // Instruction::CMPL_DOUBLE,
-    // Instruction::CMPG_DOUBLE,
+    Instruction::CMPL_FLOAT,
+    Instruction::CMPG_FLOAT,
+    Instruction::CMPL_DOUBLE,
+    Instruction::CMPG_DOUBLE,
     Instruction::CMP_LONG,
     // Instruction::IF_EQ,
     // Instruction::IF_NE,
@@ -262,20 +262,20 @@
     Instruction::NOT_INT,
     Instruction::NEG_LONG,
     Instruction::NOT_LONG,
-    // Instruction::NEG_FLOAT,
-    // Instruction::NEG_DOUBLE,
+    Instruction::NEG_FLOAT,
+    Instruction::NEG_DOUBLE,
     Instruction::INT_TO_LONG,
-    // Instruction::INT_TO_FLOAT,
-    // Instruction::INT_TO_DOUBLE,
+    Instruction::INT_TO_FLOAT,
+    Instruction::INT_TO_DOUBLE,
     Instruction::LONG_TO_INT,
-    // Instruction::LONG_TO_FLOAT,
-    // Instruction::LONG_TO_DOUBLE,
-    // Instruction::FLOAT_TO_INT,
-    // Instruction::FLOAT_TO_LONG,
-    // Instruction::FLOAT_TO_DOUBLE,
-    // Instruction::DOUBLE_TO_INT,
-    // Instruction::DOUBLE_TO_LONG,
-    // Instruction::DOUBLE_TO_FLOAT,
+    Instruction::LONG_TO_FLOAT,
+    Instruction::LONG_TO_DOUBLE,
+    Instruction::FLOAT_TO_INT,
+    Instruction::FLOAT_TO_LONG,
+    Instruction::FLOAT_TO_DOUBLE,
+    Instruction::DOUBLE_TO_INT,
+    Instruction::DOUBLE_TO_LONG,
+    Instruction::DOUBLE_TO_FLOAT,
     Instruction::INT_TO_BYTE,
     Instruction::INT_TO_CHAR,
     Instruction::INT_TO_SHORT,
@@ -301,15 +301,15 @@
     Instruction::SHL_LONG,
     Instruction::SHR_LONG,
     Instruction::USHR_LONG,
-    // Instruction::ADD_FLOAT,
-    // Instruction::SUB_FLOAT,
-    // Instruction::MUL_FLOAT,
-    // Instruction::DIV_FLOAT,
+    Instruction::ADD_FLOAT,
+    Instruction::SUB_FLOAT,
+    Instruction::MUL_FLOAT,
+    Instruction::DIV_FLOAT,
     // Instruction::REM_FLOAT,
-    // Instruction::ADD_DOUBLE,
-    // Instruction::SUB_DOUBLE,
-    // Instruction::MUL_DOUBLE,
-    // Instruction::DIV_DOUBLE,
+    Instruction::ADD_DOUBLE,
+    Instruction::SUB_DOUBLE,
+    Instruction::MUL_DOUBLE,
+    Instruction::DIV_DOUBLE,
     // Instruction::REM_DOUBLE,
     Instruction::ADD_INT_2ADDR,
     Instruction::SUB_INT_2ADDR,
@@ -333,15 +333,15 @@
     Instruction::SHL_LONG_2ADDR,
     Instruction::SHR_LONG_2ADDR,
     Instruction::USHR_LONG_2ADDR,
-    // Instruction::ADD_FLOAT_2ADDR,
-    // Instruction::SUB_FLOAT_2ADDR,
-    // Instruction::MUL_FLOAT_2ADDR,
-    // Instruction::DIV_FLOAT_2ADDR,
+    Instruction::ADD_FLOAT_2ADDR,
+    Instruction::SUB_FLOAT_2ADDR,
+    Instruction::MUL_FLOAT_2ADDR,
+    Instruction::DIV_FLOAT_2ADDR,
     // Instruction::REM_FLOAT_2ADDR,
-    // Instruction::ADD_DOUBLE_2ADDR,
-    // Instruction::SUB_DOUBLE_2ADDR,
-    // Instruction::MUL_DOUBLE_2ADDR,
-    // Instruction::DIV_DOUBLE_2ADDR,
+    Instruction::ADD_DOUBLE_2ADDR,
+    Instruction::SUB_DOUBLE_2ADDR,
+    Instruction::MUL_DOUBLE_2ADDR,
+    Instruction::DIV_DOUBLE_2ADDR,
     // Instruction::REM_DOUBLE_2ADDR,
     Instruction::ADD_INT_LIT16,
     Instruction::RSUB_INT,
@@ -692,14 +692,14 @@
 // S : short
 // C : char
 // I : int
-// L : long
+// J : long
 // F : float
 // D : double
 // L : reference(object, array)
 // V : void
 // (ARM64) Current calling convention only supports 32bit softfp,
 //         which has problems with long, float, double
-constexpr char arm64_supported_types[] = "ZBSCILVJ";
+constexpr char arm64_supported_types[] = "ZBSCILVJFD";
 // (x86_64) We still have trouble compiling longs/doubles/floats
 constexpr char x86_64_supported_types[] = "ZBSCILV";
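
These strings are shorty alphabets: a method is only picked up by the compiler if every character of its shorty appears in the target's string, so appending 'F' and 'D' for arm64 unlocks methods that take or return floats and doubles. A minimal sketch of that gate, with an illustrative helper name (the actual check in frontend.cc may be shaped differently):

    #include <cstring>

    // Returns true if every shorty character names a compilable type.
    static bool ShortyIsSupported(const char* shorty, const char* supported_types) {
      for (const char* p = shorty; *p != '\0'; ++p) {
        if (strchr(supported_types, *p) == nullptr) {
          return false;  // e.g. 'D' fails on "ZBSCILV" but passes on "ZBSCILVJFD"
        }
      }
      return true;
    }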
 
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index ed7e1f5..47b233b 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -953,18 +953,34 @@
   defs[reg_index] = ssa_reg;
 }
 
+void MIRGraph::AllocateSSAUseData(MIR *mir, int num_uses) {
+  mir->ssa_rep->num_uses = num_uses;
+
+  if (mir->ssa_rep->num_uses_allocated < num_uses) {
+    mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, kArenaAllocDFInfo));
+    // NOTE: will be filled in during type & size inference pass
+    mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, kArenaAllocDFInfo));
+  }
+}
+
+void MIRGraph::AllocateSSADefData(MIR *mir, int num_defs) {
+  mir->ssa_rep->num_defs = num_defs;
+
+  if (mir->ssa_rep->num_defs_allocated < num_defs) {
+    mir->ssa_rep->defs = static_cast<int*>(arena_->Alloc(sizeof(int) * num_defs,
+          kArenaAllocDFInfo));
+    mir->ssa_rep->fp_def = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_defs,
+          kArenaAllocDFInfo));
+  }
+}
+
 /* Look up new SSA names for format_35c instructions */
 void MIRGraph::DataFlowSSAFormat35C(MIR* mir) {
   MIR::DecodedInstruction* d_insn = &mir->dalvikInsn;
   int num_uses = d_insn->vA;
   int i;
 
-  mir->ssa_rep->num_uses = num_uses;
-  mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                       kArenaAllocDFInfo));
-  // NOTE: will be filled in during type & size inference pass
-  mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                          kArenaAllocDFInfo));
+  AllocateSSAUseData(mir, num_uses);
 
   for (i = 0; i < num_uses; i++) {
     HandleSSAUse(mir->ssa_rep->uses, d_insn->arg[i], i);
@@ -977,12 +993,7 @@
   int num_uses = d_insn->vA;
   int i;
 
-  mir->ssa_rep->num_uses = num_uses;
-  mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                       kArenaAllocDFInfo));
-  // NOTE: will be filled in during type & size inference pass
-  mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                          kArenaAllocDFInfo));
+  AllocateSSAUseData(mir, num_uses);
 
   for (i = 0; i < num_uses; i++) {
     HandleSSAUse(mir->ssa_rep->uses, d_insn->vC+i, i);
@@ -999,6 +1010,7 @@
     mir->ssa_rep =
         static_cast<struct SSARepresentation *>(arena_->Alloc(sizeof(SSARepresentation),
                                                               kArenaAllocDFInfo));
+    memset(mir->ssa_rep, 0, sizeof(*mir->ssa_rep));
 
     uint64_t df_attributes = GetDataFlowAttributes(mir);
 
@@ -1045,13 +1057,7 @@
       }
     }
 
-    if (num_uses) {
-      mir->ssa_rep->num_uses = num_uses;
-      mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                           kArenaAllocDFInfo));
-      mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                              kArenaAllocDFInfo));
-    }
+    AllocateSSAUseData(mir, num_uses);
 
     int num_defs = 0;
 
@@ -1062,13 +1068,7 @@
       }
     }
 
-    if (num_defs) {
-      mir->ssa_rep->num_defs = num_defs;
-      mir->ssa_rep->defs = static_cast<int*>(arena_->Alloc(sizeof(int) * num_defs,
-                                                           kArenaAllocDFInfo));
-      mir->ssa_rep->fp_def = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_defs,
-                                                              kArenaAllocDFInfo));
-    }
+    AllocateSSADefData(mir, num_defs);
 
     MIR::DecodedInstruction* d_insn = &mir->dalvikInsn;
 
@@ -1114,11 +1114,11 @@
    * input to PHI nodes can be derived from the snapshot of all
    * predecessor blocks.
    */
-  bb->data_flow_info->vreg_to_ssa_map =
+  bb->data_flow_info->vreg_to_ssa_map_exit =
       static_cast<int*>(arena_->Alloc(sizeof(int) * cu_->num_dalvik_registers,
                                       kArenaAllocDFInfo));
 
-  memcpy(bb->data_flow_info->vreg_to_ssa_map, vreg_to_ssa_map_,
+  memcpy(bb->data_flow_info->vreg_to_ssa_map_exit, vreg_to_ssa_map_,
          sizeof(int) * cu_->num_dalvik_registers);
   return true;
 }
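
The new AllocateSSAUseData/AllocateSSADefData helpers centralize a grow-only idiom: arena storage is never freed, so the arrays are re-allocated only when the recorded capacity is too small, and the memset added above ensures the *_allocated counters start at zero for a fresh SSARepresentation. A generic sketch of the idiom, with the capacity update written out explicitly (the ArenaAllocator calls mirror the ones above; the struct is a stand-in):

    struct IntArray {
      int* data = nullptr;
      int size = 0;       // like num_uses: entries meaningful right now
      int capacity = 0;   // like num_uses_allocated: entries ever allocated
    };

    void Resize(IntArray* a, int new_size, ArenaAllocator* arena) {
      a->size = new_size;
      if (a->capacity < new_size) {
        // Grow-only: the old block stays in the arena; no copy is needed
        // because callers rewrite the contents immediately afterwards.
        a->data = static_cast<int*>(arena->Alloc(sizeof(int) * new_size, kArenaAllocDFInfo));
        a->capacity = new_size;
      }
    }
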
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 24fea71..0fffa01 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -74,6 +74,7 @@
       use_counts_(arena, 256, kGrowableArrayMisc),
       raw_use_counts_(arena, 256, kGrowableArrayMisc),
       num_reachable_blocks_(0),
+      max_num_reachable_blocks_(0),
       dfs_order_(NULL),
       dfs_post_order_(NULL),
       dom_post_order_traversal_(NULL),
@@ -1435,6 +1436,9 @@
   /* Rename register names by local defs and phi nodes */
   ClearAllVisitedFlags();
   DoDFSPreOrderSSARename(GetEntryBlock());
+
+  // Update the maximum number of reachable blocks.
+  max_num_reachable_blocks_ = num_reachable_blocks_;
 }
 
 void MIRGraph::SSATransformationEnd() {
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 53a997e..3655125 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -223,7 +223,7 @@
   ArenaBitVector* def_v;
   ArenaBitVector* live_in_v;
   ArenaBitVector* phi_v;
-  int32_t* vreg_to_ssa_map;
+  int32_t* vreg_to_ssa_map_exit;
   ArenaBitVector* ending_check_v;  // For null check and class init check elimination.
 };
 
@@ -234,14 +234,21 @@
  * Following SSA renaming, this is the primary struct used by code generators to locate
  * operand and result registers.  This is a somewhat confusing and unhelpful convention that
  * we may want to revisit in the future.
+ *
+ * TODO:
+ *  1. Add accessors for uses/defs and make data private
+ *  2. Change fp_use/fp_def to a bit array (could help memory usage)
+ *  3. Combine array storage into an internal array, handled via the accessors from 1.
  */
 struct SSARepresentation {
-  int16_t num_uses;
-  int16_t num_defs;
   int32_t* uses;
   bool* fp_use;
   int32_t* defs;
   bool* fp_def;
+  int16_t num_uses_allocated;
+  int16_t num_defs_allocated;
+  int16_t num_uses;
+  int16_t num_defs;
 
   static uint32_t GetStartUseIndex(Instruction::Code opcode);
 };
@@ -1020,6 +1027,10 @@
   void CombineBlocks(BasicBlock* bb);
 
   void ClearAllVisitedFlags();
+
+  void AllocateSSAUseData(MIR *mir, int num_uses);
+  void AllocateSSADefData(MIR *mir, int num_defs);
+
   /*
    * IsDebugBuild sanity check: keep track of the Dex PCs for catch entries so that later on
    * we can verify that all catch entries have native PC entries.
@@ -1105,6 +1116,7 @@
   GrowableArray<uint32_t> use_counts_;      // Weighted by nesting depth
   GrowableArray<uint32_t> raw_use_counts_;  // Not weighted
   unsigned int num_reachable_blocks_;
+  unsigned int max_num_reachable_blocks_;
   GrowableArray<BasicBlockId>* dfs_order_;
   GrowableArray<BasicBlockId>* dfs_post_order_;
   GrowableArray<BasicBlockId>* dom_post_order_traversal_;
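
TODO item 2 in the SSARepresentation comment suggests replacing the parallel bool arrays with packed bits; one illustrative shape for that (not part of this change):

    #include <cstdint>

    // Packed replacement for fp_use/fp_def: one bit per SSA operand.
    struct FpBitVector {
      uint32_t* words;  // ceil(n / 32) words, arena-allocated in practice

      bool Get(int i) const { return ((words[i >> 5] >> (i & 31)) & 1u) != 0; }
      void Set(int i, bool v) {
        uint32_t mask = 1u << (i & 31);
        if (v) {
          words[i >> 5] |= mask;
        } else {
          words[i >> 5] &= ~mask;
        }
      }
    };
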
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index bb02f74..dde8ff0 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -141,8 +141,11 @@
       break;
     case Instruction::LONG_TO_DOUBLE: {
       rl_src = LoadValueWide(rl_src, kFPReg);
-      RegStorage src_low = rl_src.reg.DoubleToLowSingle();
-      RegStorage src_high = rl_src.reg.DoubleToHighSingle();
+      RegisterInfo* info = GetRegInfo(rl_src.reg);
+      RegStorage src_low = info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg();
+      DCHECK(src_low.Valid());
+      RegStorage src_high = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg();
+      DCHECK(src_high.Valid());
       rl_result = EvalLoc(rl_dest, kFPReg, true);
       RegStorage tmp1 = AllocTempDouble();
       RegStorage tmp2 = AllocTempDouble();
@@ -161,8 +164,11 @@
       return;
     case Instruction::LONG_TO_FLOAT: {
       rl_src = LoadValueWide(rl_src, kFPReg);
-      RegStorage src_low = rl_src.reg.DoubleToLowSingle();
-      RegStorage src_high = rl_src.reg.DoubleToHighSingle();
+      RegisterInfo* info = GetRegInfo(rl_src.reg);
+      RegStorage src_low = info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg();
+      DCHECK(src_low.Valid());
+      RegStorage src_high = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg();
+      DCHECK(src_high.Valid());
       rl_result = EvalLoc(rl_dest, kFPReg, true);
       // Allocate temp registers.
       RegStorage high_val = AllocTempDouble();
@@ -334,22 +340,11 @@
 
 bool ArmMir2Lir::GenInlinedSqrt(CallInfo* info) {
   DCHECK_EQ(cu_->instruction_set, kThumb2);
-  LIR *branch;
   RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);  // double place for result
   rl_src = LoadValueWide(rl_src, kFPReg);
   RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
   NewLIR2(kThumb2Vsqrtd, rl_result.reg.GetReg(), rl_src.reg.GetReg());
-  NewLIR2(kThumb2Vcmpd, rl_result.reg.GetReg(), rl_result.reg.GetReg());
-  NewLIR0(kThumb2Fmstat);
-  branch = NewLIR2(kThumbBCond, 0, kArmCondEq);
-  ClobberCallerSave();
-  LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pSqrt));
-  NewLIR3(kThumb2Fmrrd, rs_r0.GetReg(), rs_r1.GetReg(), rl_src.reg.GetReg());
-  NewLIR1(kThumbBlxR, r_tgt.GetReg());
-  NewLIR3(kThumb2Fmdrr, rl_result.reg.GetReg(), rs_r0.GetReg(), rs_r1.GetReg());
-  branch->target = NewLIR0(kPseudoTargetLabel);
   StoreValueWide(rl_dest, rl_result);
   return true;
 }
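
A plausible reading of the GenInlinedSqrt simplification: vsqrtd already produces NaN for negative inputs, which is exactly the result Math.sqrt specifies, so the compare-and-call-out path to pSqrt was redundant. The equivalent fact in standalone C++:

    #include <cassert>
    #include <cmath>

    int main() {
      // IEEE-754 sqrt of a negative value is NaN -- the same answer
      // Math.sqrt requires -- so the hardware instruction alone suffices.
      assert(std::isnan(std::sqrt(-1.0)));
      return 0;
    }
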
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 1520c52..309f676 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -575,10 +575,10 @@
     // Redirect single precision's master storage to master.
     info->SetMaster(dp_reg_info);
     // Singles should show a single 32-bit mask bit, at first referring to the low half.
-    DCHECK_EQ(info->StorageMask(), 0x1U);
+    DCHECK_EQ(info->StorageMask(), RegisterInfo::kLowSingleStorageMask);
     if (sp_reg_num & 1) {
-      // For odd singles, change to user the high word of the backing double.
-      info->SetStorageMask(0x2);
+      // For odd singles, change to use the high word of the backing double.
+      info->SetStorageMask(RegisterInfo::kHighSingleStorageMask);
     }
   }
 
@@ -786,10 +786,13 @@
     }
   }
   if (res.Valid()) {
+    RegisterInfo* info = GetRegInfo(res);
     promotion_map_[p_map_idx].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx].FpReg = res.DoubleToLowSingle().GetReg();
+    promotion_map_[p_map_idx].FpReg =
+        info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg().GetReg();
     promotion_map_[p_map_idx+1].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx+1].FpReg = res.DoubleToHighSingle().GetReg();
+    promotion_map_[p_map_idx+1].FpReg =
+        info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg().GetReg();
   }
   return res;
 }
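
Replacing the magic numbers with named masks also makes the layout invariant visible: an even single aliases the low word of its backing double, the odd single the high word, so the two single views partition the double's storage. Stated as standalone compile-time checks over the new constants:

    #include <cstdint>

    // Values copied from the new RegisterInfo constants in mir_to_lir.h.
    constexpr uint32_t kLowSingleStorageMask  = 0x00000001;
    constexpr uint32_t kHighSingleStorageMask = 0x00000002;
    constexpr uint32_t k64SoloStorageMask     = 0x00000003;

    static_assert((kLowSingleStorageMask & kHighSingleStorageMask) == 0,
                  "single views must not overlap");
    static_assert((kLowSingleStorageMask | kHighSingleStorageMask) == k64SoloStorageMask,
                  "the two single views together cover exactly the backing double");
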
diff --git a/compiler/dex/quick/arm64/fp_arm64.cc b/compiler/dex/quick/arm64/fp_arm64.cc
index 87ab6fe..882ee66 100644
--- a/compiler/dex/quick/arm64/fp_arm64.cc
+++ b/compiler/dex/quick/arm64/fp_arm64.cc
@@ -25,10 +25,6 @@
   int op = kA64Brk1d;
   RegLocation rl_result;
 
-  /*
-   * Don't attempt to optimize register usage since these opcodes call out to
-   * the handlers.
-   */
   switch (opcode) {
     case Instruction::ADD_FLOAT_2ADDR:
     case Instruction::ADD_FLOAT:
@@ -119,49 +115,75 @@
                                  RegLocation rl_dest, RegLocation rl_src) {
   int op = kA64Brk1d;
   RegLocation rl_result;
+  RegisterClass src_reg_class = kInvalidRegClass;
+  RegisterClass dst_reg_class = kInvalidRegClass;
 
   switch (opcode) {
     case Instruction::INT_TO_FLOAT:
       op = kA64Scvtf2fw;
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_INT:
       op = kA64Fcvtzs2wf;
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::DOUBLE_TO_FLOAT:
       op = kA64Fcvt2sS;
+      src_reg_class = kFPReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_DOUBLE:
       op = kA64Fcvt2Ss;
+      src_reg_class = kFPReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::INT_TO_DOUBLE:
       op = FWIDE(kA64Scvtf2fw);
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::DOUBLE_TO_INT:
       op = FWIDE(kA64Fcvtzs2wf);
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::LONG_TO_DOUBLE:
       op = FWIDE(kA64Scvtf2fx);
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_LONG:
       op = kA64Fcvtzs2xf;
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::LONG_TO_FLOAT:
       op = kA64Scvtf2fx;
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::DOUBLE_TO_LONG:
       op = FWIDE(kA64Fcvtzs2xf);
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
   }
 
+  DCHECK_NE(src_reg_class, kInvalidRegClass);
+  DCHECK_NE(dst_reg_class, kInvalidRegClass);
+  DCHECK_NE(op, kA64Brk1d);
+
   if (rl_src.wide) {
-    rl_src = LoadValueWide(rl_src, kFPReg);
+    rl_src = LoadValueWide(rl_src, src_reg_class);
   } else {
-    rl_src = LoadValue(rl_src, kFPReg);
+    rl_src = LoadValue(rl_src, src_reg_class);
   }
 
-  rl_result = EvalLoc(rl_dest, kFPReg, true);
+  rl_result = EvalLoc(rl_dest, dst_reg_class, true);
   NewLIR2(op, rl_result.reg.GetReg(), rl_src.reg.GetReg());
 
   if (rl_dest.wide) {
@@ -296,25 +318,11 @@
 }
 
 bool Arm64Mir2Lir::GenInlinedSqrt(CallInfo* info) {
-  // TODO(Arm64): implement this.
-  UNIMPLEMENTED(FATAL) << "GenInlinedSqrt not implemented for Arm64";
-
-  DCHECK_EQ(cu_->instruction_set, kArm64);
-  LIR *branch;
   RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);  // double place for result
   rl_src = LoadValueWide(rl_src, kFPReg);
   RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
   NewLIR2(FWIDE(kA64Fsqrt2ff), rl_result.reg.GetReg(), rl_src.reg.GetReg());
-  NewLIR2(FWIDE(kA64Fcmp2ff), rl_result.reg.GetReg(), rl_result.reg.GetReg());
-  branch = NewLIR2(kA64B2ct, kArmCondEq, 0);
-  ClobberCallerSave();
-  LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pSqrt));
-  // NewLIR3(kThumb2Fmrrd, r0, r1, rl_src.reg.GetReg());
-  NewLIR1(kA64Blr1x, r_tgt.GetReg());
-  // NewLIR3(kThumb2Fmdrr, rl_result.reg.GetReg(), r0, r1);
-  branch->target = NewLIR0(kPseudoTargetLabel);
   StoreValueWide(rl_dest, rl_result);
   return true;
 }
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 38f110e..8dad90a 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -725,17 +725,10 @@
 
 // Test suspend flag, return target of taken suspend branch
 LIR* Arm64Mir2Lir::OpTestSuspend(LIR* target) {
-  // TODO(Arm64): re-enable suspend checks, once art_quick_test_suspend is implemented and
-  //   the suspend register is properly handled in the trampolines.
-#if 0
+  // FIXME: Define rA64_SUSPEND as w19, once we no longer need two copies of the reserved register.
+  // Note: The opcode is not set as wide, so we are actually using the 32-bit version of the register.
   NewLIR3(kA64Subs3rRd, rA64_SUSPEND, rA64_SUSPEND, 1);
   return OpCondBranch((target == NULL) ? kCondEq : kCondNe, target);
-#else
-  // TODO(Arm64): Fake suspend check. Will always fail to branch. Remove this.
-  LIR* branch = NewLIR2((target == NULL) ? kA64Cbnz2rt : kA64Cbz2rt, rwzr, 0);
-  branch->target = target;
-  return branch;
-#endif
 }
 
 // Decrement register and branch on condition
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 808060d..0222447 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -630,12 +630,6 @@
     DCHECK_EQ(info->StorageMask(), 0x1U);
   }
 
-  // TODO: re-enable this when we can safely save r4 over the suspension code path.
-  bool no_suspend = NO_SUSPEND;  // || !Runtime::Current()->ExplicitSuspendChecks();
-  if (no_suspend) {
-    GetRegInfo(rs_rA64_SUSPEND)->MarkFree();
-  }
-
   // Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
   // TODO: adjust when we roll to hard float calling convention.
   reg_pool_->next_core_reg_ = 2;
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 256135d..3fbbc4e 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1201,21 +1201,27 @@
 }
 
 RegLocation Mir2Lir::NarrowRegLoc(RegLocation loc) {
-  loc.wide = false;
   if (loc.location == kLocPhysReg) {
+    DCHECK(!loc.reg.Is32Bit());
     if (loc.reg.IsPair()) {
-      loc.reg = loc.reg.GetLow();
+      RegisterInfo* info_lo = GetRegInfo(loc.reg.GetLow());
+      RegisterInfo* info_hi = GetRegInfo(loc.reg.GetHigh());
+      info_lo->SetIsWide(false);
+      info_hi->SetIsWide(false);
+      loc.reg = info_lo->GetReg();
     } else {
-      // FIXME: temp workaround.
-      // Issue here: how do we narrow to a 32-bit value in 64-bit container?
-      // Probably the wrong thing to narrow the RegStorage container here.  That
-      // should be a target decision.  At the RegLocation level, we're only
-      // modifying the view of the Dalvik value - this is orthogonal to the storage
-      // container size.  Consider this a temp workaround.
-      DCHECK(loc.reg.IsDouble());
-      loc.reg = loc.reg.DoubleToLowSingle();
+      RegisterInfo* info = GetRegInfo(loc.reg);
+      RegisterInfo* info_new = info->FindMatchingView(RegisterInfo::k32SoloStorageMask);
+      DCHECK(info_new != nullptr);
+      if (info->IsLive() && (info->SReg() == loc.s_reg_low)) {
+        info->MarkDead();
+        info_new->MarkLive(loc.s_reg_low);
+      }
+      loc.reg = info_new->GetReg();
     }
+    DCHECK(loc.reg.Valid());
   }
+  loc.wide = false;
   return loc;
 }
 
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 5ec1ca9..a6d56bd 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -864,8 +864,17 @@
       // Wide spans, we need the 2nd half of uses[2].
       rl_arg = UpdateLocWide(rl_use2);
       if (rl_arg.location == kLocPhysReg) {
-        // NOTE: not correct for 64-bit core regs, but this needs rewriting for hard-float.
-        reg = rl_arg.reg.IsPair() ? rl_arg.reg.GetHigh() : rl_arg.reg.DoubleToHighSingle();
+        if (rl_arg.reg.IsPair()) {
+          reg = rl_arg.reg.GetHigh();
+        } else {
+          RegisterInfo* info = GetRegInfo(rl_arg.reg);
+          info = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask);
+          if (info == nullptr) {
+            // NOTE: For hard float convention we won't split arguments across reg/mem.
+            UNIMPLEMENTED(FATAL) << "Needs hard float api.";
+          }
+          reg = info->GetReg();
+        }
       } else {
         // kArg2 & rArg3 can safely be used here
         reg = TargetReg(kArg3);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index f58f078..361aba8 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -332,6 +332,15 @@
         return arena->Alloc(size, kArenaAllocRegAlloc);
       }
 
+      static const uint32_t k32SoloStorageMask     = 0x00000001;
+      static const uint32_t kLowSingleStorageMask  = 0x00000001;
+      static const uint32_t kHighSingleStorageMask = 0x00000002;
+      static const uint32_t k64SoloStorageMask     = 0x00000003;
+      static const uint32_t k128SoloStorageMask    = 0x0000000f;
+      static const uint32_t k256SoloStorageMask    = 0x000000ff;
+      static const uint32_t k512SoloStorageMask    = 0x0000ffff;
+      static const uint32_t k1024SoloStorageMask   = 0xffffffff;
+
       bool InUse() { return (storage_mask_ & master_->used_storage_) != 0; }
       void MarkInUse() { master_->used_storage_ |= storage_mask_; }
       void MarkFree() { master_->used_storage_ &= ~storage_mask_; }
@@ -389,7 +398,15 @@
       LIR* DefEnd() { return def_end_; }
       void SetDefEnd(LIR* def_end) { def_end_ = def_end; }
       void ResetDefBody() { def_start_ = def_end_ = nullptr; }
-
+      // Find member of aliased set matching storage_used; return nullptr if none.
+      RegisterInfo* FindMatchingView(uint32_t storage_used) {
+        RegisterInfo* res = Master();
+        for (; res != nullptr; res = res->GetAliasChain()) {
+          if (res->StorageMask() == storage_used)
+            break;
+        }
+        return res;
+      }
 
      private:
       RegStorage reg_;
@@ -648,7 +665,7 @@
     virtual void EndInvoke(CallInfo* info) {}
 
 
-    // Handle bookkeeping to convert a wide RegLocation to a narow RegLocation.  No code generated.
+    // Handle bookkeeping to convert a wide RegLocation to a narrow RegLocation.  No code generated.
     RegLocation NarrowRegLoc(RegLocation loc);
 
     // Shared by all targets - implemented in local_optimizations.cc
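
Typical use of the masks with FindMatchingView, as the ARM code above does when it needs the two single-precision views of a wide FP value (variable names are illustrative):

    RegisterInfo* info = GetRegInfo(rl_src.reg);  // any view reaches the set via Master()
    RegisterInfo* low  = info->FindMatchingView(RegisterInfo::kLowSingleStorageMask);
    RegisterInfo* high = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask);
    DCHECK(low != nullptr && high != nullptr);    // ARM doubles always expose both views
    RegStorage src_low  = low->GetReg();
    RegStorage src_high = high->GetReg();
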
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 91a66d3..39a0365 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -63,27 +63,36 @@
 { kX86 ## opname ## 16TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0x66, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "16TI8", "fs:[!0d],!1d" }, \
   \
 { kX86 ## opname ## 32MR,  kMemReg,    mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32MR", "[!0r+!1d],!2r" }, \
-{ kX86 ## opname ## 64MR,  kMemReg64,  mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64MR", "[!0r+!1d],!2r" }, \
 { kX86 ## opname ## 32AR,  kArrayReg,  mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { 0,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
-{ kX86 ## opname ## 64AR,  kArrayReg64, mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
 { kX86 ## opname ## 32TR,  kThreadReg, mem_use | IS_BINARY_OP   |           REG_USE1   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32TR", "fs:[!0d],!1r" }, \
 { kX86 ## opname ## 32RR,  kRegReg,              IS_BINARY_OP   | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RR", "!0r,!1r" }, \
 { kX86 ## opname ## 32RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## 64RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,         0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## 32RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
-{ kX86 ## opname ## 64RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { REX_W,         0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
 { kX86 ## opname ## 32RT,  kRegThread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RT", "!0r,fs:[!1d]" }, \
-{ kX86 ## opname ## 64RT,  kReg64Thread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RT", "!0r,fs:[!1d]" }, \
 { kX86 ## opname ## 32RI,  kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "32RI", "!0r,!1d" }, \
-{ kX86 ## opname ## 64RI,  kReg64Imm,            IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "32RI", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI,  kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32MI", "[!0r+!1d],!2d" }, \
 { kX86 ## opname ## 32AI,  kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
 { kX86 ## opname ## 32TI,  kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32TI", "fs:[!0d],!1d" }, \
 { kX86 ## opname ## 32RI8, kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32RI8", "!0r,!1d" }, \
-{ kX86 ## opname ## 64RI8, kReg64Imm,            IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64RI8", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI8, kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32MI8", "[!0r+!1d],!2d" }, \
 { kX86 ## opname ## 32AI8, kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32AI8", "[!0r+!1r<<!2d+!3d],!4d" }, \
-{ kX86 ## opname ## 32TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32TI8", "fs:[!0d],!1d" }
+{ kX86 ## opname ## 32TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32TI8", "fs:[!0d],!1d" }, \
+  \
+{ kX86 ## opname ## 64MR,  kMemReg,    mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64MR", "[!0r+!1d],!2r" }, \
+{ kX86 ## opname ## 64AR,  kArrayReg,  mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
+{ kX86 ## opname ## 64TR,  kThreadReg, mem_use | IS_BINARY_OP   |           REG_USE1   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64TR", "fs:[!0d],!1r" }, \
+{ kX86 ## opname ## 64RR,  kRegReg,              IS_BINARY_OP   | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RR", "!0r,!1r" }, \
+{ kX86 ## opname ## 64RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## 64RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
+{ kX86 ## opname ## 64RT,  kRegThread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RT", "!0r,fs:[!1d]" }, \
+{ kX86 ## opname ## 64RI,  kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "64RI", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI,  kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64MI", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI,  kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64TI,  kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64TI", "fs:[!0d],!1d" }, \
+{ kX86 ## opname ## 64RI8, kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64RI8", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI8, kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64MI8", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI8, kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64AI8", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64TI8", "fs:[!0d],!1d" }
 
 ENCODING_MAP(Add, IS_LOAD | IS_STORE, REG_DEF0, 0,
   0x00 /* RegMem8/Reg8 */,     0x01 /* RegMem32/Reg32 */,
@@ -146,6 +155,13 @@
   { kX86Imul32RMI8,  kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { 0, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul32RMI8", "!0r,[!1r+!2d],!3d" },
   { kX86Imul32RAI8,  kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { 0, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul32RAI8", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
 
+  { kX86Imul64RRI,   kRegRegImm,             IS_TERTIARY_OP | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RRI", "!0r,!1r,!2d" },
+  { kX86Imul64RMI,   kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RMI", "!0r,[!1r+!2d],!3d" },
+  { kX86Imul64RAI,   kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RAI", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
+  { kX86Imul64RRI8,  kRegRegImm,             IS_TERTIARY_OP | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RRI8", "!0r,!1r,!2d" },
+  { kX86Imul64RMI8,  kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RMI8", "!0r,[!1r+!2d],!3d" },
+  { kX86Imul64RAI8,  kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RAI8", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
+
   { kX86Mov8MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { 0,             0, 0x88, 0, 0, 0, 0, 0 }, "Mov8MR", "[!0r+!1d],!2r" },
   { kX86Mov8AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { 0,             0, 0x88, 0, 0, 0, 0, 0 }, "Mov8AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov8TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, 0, 0x88, 0, 0, 0, 0, 0 }, "Mov8TR", "fs:[!0d],!1r" },
@@ -171,30 +187,42 @@
   { kX86Mov16TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0x66, 0xC7, 0, 0, 0, 0, 2 }, "Mov16TI", "fs:[!0d],!1d" },
 
   { kX86Mov32MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { 0,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov32MR", "[!0r+!1d],!2r" },
-  { kX86Mov64MR, kMemReg64,  IS_STORE | IS_TERTIARY_OP | REG_USE02,      { REX_W,         0, 0x89, 0, 0, 0, 0, 0 }, "Mov64MR", "[!0r+!1d],!2r" },
   { kX86Mov32AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { 0,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov32AR", "[!0r+!1r<<!2d+!3d],!4r" },
-  { kX86Mov64AR, kArrayReg64, IS_STORE | IS_QUIN_OP     | REG_USE014,     { REX_W,        0, 0x89, 0, 0, 0, 0, 0 }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov32TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, 0, 0x89, 0, 0, 0, 0, 0 }, "Mov32TR", "fs:[!0d],!1r" },
   { kX86Mov32RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RR", "!0r,!1r" },
   { kX86Mov32RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RM", "!0r,[!1r+!2d]" },
-  { kX86Mov64RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { REX_W,         0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RM", "!0r,[!1r+!2d]" },
   { kX86Mov32RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
-  { kX86Mov64RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { REX_W,         0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
   { kX86Mov32RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, 0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RT", "!0r,fs:[!1d]" },
-  { kX86Mov64RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, REX_W, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RT", "!0r,fs:[!1d]" },
   { kX86Mov32RI, kMovRegImm,            IS_BINARY_OP   | REG_DEF0,       { 0,             0, 0xB8, 0, 0, 0, 0, 4 }, "Mov32RI", "!0r,!1d" },
   { kX86Mov32MI, kMemImm,    IS_STORE | IS_TERTIARY_OP | REG_USE0,       { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32MI", "[!0r+!1d],!2d" },
   { kX86Mov32AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32AI", "[!0r+!1r<<!2d+!3d],!4d" },
   { kX86Mov32TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32TI", "fs:[!0d],!1d" },
-  { kX86Mov64TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, REX_W, 0xC7, 0, 0, 0, 0, 4 }, "Mov64TI", "fs:[!0d],!1d" },
 
-  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
+  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { 0,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
 
-  { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+  { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12,                 { 0,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
 
-  { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
+  { kX86Mov64MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { REX_W,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov64MR", "[!0r+!1d],!2r" },
+  { kX86Mov64AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { REX_W,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86Mov64TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, REX_W, 0x89, 0, 0, 0, 0, 0 }, "Mov64TR", "fs:[!0d],!1r" },
+  { kX86Mov64RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RR", "!0r,!1r" },
+  { kX86Mov64RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RM", "!0r,[!1r+!2d]" },
+  { kX86Mov64RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+  { kX86Mov64RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, REX_W, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RT", "!0r,fs:[!1d]" },
+  { kX86Mov64RI, kMovRegImm,            IS_BINARY_OP   | REG_DEF0,       { REX_W,             0, 0xB8, 0, 0, 0, 0, 8 }, "Mov64RI", "!0r,!1d" },
+  { kX86Mov64MI, kMemImm,    IS_STORE | IS_TERTIARY_OP | REG_USE0,       { REX_W,             0, 0xC7, 0, 0, 0, 0, 8 }, "Mov64MI", "[!0r+!1d],!2d" },
+  { kX86Mov64AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { REX_W,             0, 0xC7, 0, 0, 0, 0, 8 }, "Mov64AI", "[!0r+!1r<<!2d+!3d],!4d" },
+  { kX86Mov64TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, REX_W, 0xC7, 0, 0, 0, 0, 8 }, "Mov64TI", "fs:[!0d],!1d" },
 
-  { kX86Cmov32RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RM", "!3c !0r,[!1r+!2d]" },
+  { kX86Lea64RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { REX_W,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea64RM", "!0r,[!1r+!2d]" },
+
+  { kX86Lea64RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12,                 { REX_W,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+
+  { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0,     0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
+  { kX86Cmov64RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {REX_W, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc64RR", "!2c !0r,!1r" },
+
+  { kX86Cmov32RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {0,     0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RM", "!3c !0r,[!1r+!2d]" },
+  { kX86Cmov64RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {REX_W, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc64RM", "!3c !0r,[!1r+!2d]" },
 
 #define SHIFT_ENCODING_MAP(opname, modrm_opcode) \
 { kX86 ## opname ## 8RI, kShiftRegImm,                        IS_BINARY_OP   | REG_DEF0_USE0 |            SETS_CCODES, { 0,    0, 0xC0, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "8RI", "!0r,!1d" }, \
@@ -216,7 +244,14 @@
 { kX86 ## opname ## 32AI, kShiftArrayImm, IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     |            SETS_CCODES, { 0,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
 { kX86 ## opname ## 32RC, kShiftRegCl,                         IS_BINARY_OP   | REG_DEF0_USE0 | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32RC", "!0r,cl" }, \
 { kX86 ## opname ## 32MC, kShiftMemCl,    IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32MC", "[!0r+!1d],cl" }, \
-{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "[!0r+!1r<<!2d+!3d],cl" }
+{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "[!0r+!1r<<!2d+!3d],cl" }, \
+  \
+{ kX86 ## opname ## 64RI, kShiftRegImm,                        IS_BINARY_OP   | REG_DEF0_USE0 |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64RI", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI, kShiftMemImm,   IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64MI", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI, kShiftArrayImm, IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64RC, kShiftRegCl,                         IS_BINARY_OP   | REG_DEF0_USE0 | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64RC", "!0r,cl" }, \
+{ kX86 ## opname ## 64MC, kShiftMemCl,    IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64MC", "[!0r+!1d],cl" }, \
+{ kX86 ## opname ## 64AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64AC", "[!0r+!1r<<!2d+!3d],cl" }
 
   SHIFT_ENCODING_MAP(Rol, 0x0),
   SHIFT_ENCODING_MAP(Ror, 0x1),
@@ -232,6 +267,10 @@
   { kX86Shld32MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld32MRI", "[!0r+!1d],!2r,!3d" },
   { kX86Shrd32RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd32RRI", "!0r,!1r,!2d" },
   { kX86Shrd32MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd32MRI", "[!0r+!1d],!2r,!3d" },
+  { kX86Shld64RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld64RRI", "!0r,!1r,!2d" },
+  { kX86Shld64MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld64MRI", "[!0r+!1d],!2r,!3d" },
+  { kX86Shrd64RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { REX_W,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd64RRI", "!0r,!1r,!2d" },
+  { kX86Shrd64MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { REX_W,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd64MRI", "[!0r+!1d],!2r,!3d" },
 
   { kX86Test8RI,  kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8RI", "!0r,!1d" },
   { kX86Test8MI,  kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8MI", "[!0r+!1d],!2d" },
@@ -242,7 +281,12 @@
   { kX86Test32RI, kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32RI", "!0r,!1d" },
   { kX86Test32MI, kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32MI", "[!0r+!1d],!2d" },
   { kX86Test32AI, kArrayImm, IS_LOAD | IS_QUIN_OP     | REG_USE01 | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32AI", "[!0r+!1r<<!2d+!3d],!4d" },
+  { kX86Test64RI, kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64RI", "!0r,!1d" },
+  { kX86Test64MI, kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64MI", "[!0r+!1d],!2d" },
+  { kX86Test64AI, kArrayImm, IS_LOAD | IS_QUIN_OP     | REG_USE01 | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64AI", "[!0r+!1r<<!2d+!3d],!4d" },
+
   { kX86Test32RR, kRegReg,             IS_BINARY_OP   | REG_USE01 | SETS_CCODES, { 0,    0, 0x85, 0, 0, 0, 0, 0}, "Test32RR", "!0r,!1r" },
+  { kX86Test64RR, kRegReg,             IS_BINARY_OP   | REG_USE01 | SETS_CCODES, { REX_W, 0, 0x85, 0, 0, 0, 0, 0}, "Test64RR", "!0r,!1r" },
 
 #define UNARY_ENCODING_MAP(opname, modrm, is_store, sets_ccodes, \
                            reg, reg_kind, reg_flags, \
@@ -258,7 +302,10 @@
 { kX86 ## opname ## 16 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | hw_flags | sets_ccodes, { 0x66, 0, 0xF7, 0, 0, modrm, 0, imm << 1}, #opname "16" #arr, hw_format "[!0r+!1r<<!2d+!3d]" }, \
 { kX86 ## opname ## 32 ## reg, reg_kind,                      reg_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #reg, w_format "!0r" }, \
 { kX86 ## opname ## 32 ## mem, mem_kind, IS_LOAD | is_store | mem_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #mem, w_format "[!0r+!1d]" }, \
-{ kX86 ## opname ## 32 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #arr, w_format "[!0r+!1r<<!2d+!3d]" }
+{ kX86 ## opname ## 32 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #arr, w_format "[!0r+!1r<<!2d+!3d]" }, \
+{ kX86 ## opname ## 64 ## reg, reg_kind,                      reg_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #reg, w_format "!0r" }, \
+{ kX86 ## opname ## 64 ## mem, mem_kind, IS_LOAD | is_store | mem_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #mem, w_format "[!0r+!1d]" }, \
+{ kX86 ## opname ## 64 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #arr, w_format "[!0r+!1r<<!2d+!3d]" }
 
   UNARY_ENCODING_MAP(Not, 0x2, IS_STORE, 0,           R, kReg, IS_UNARY_OP | REG_DEF0_USE0, M, kMem, IS_BINARY_OP | REG_USE0, A, kArray, IS_QUAD_OP | REG_USE01, 0, 0, 0, 0, "", "", ""),
   UNARY_ENCODING_MAP(Neg, 0x3, IS_STORE, SETS_CCODES, R, kReg, IS_UNARY_OP | REG_DEF0_USE0, M, kMem, IS_BINARY_OP | REG_USE0, A, kArray, IS_QUAD_OP | REG_USE01, 0, 0, 0, 0, "", "", ""),
@@ -431,7 +478,8 @@
   { kX86RepneScasw, kPrefix2Nullary, NO_OPERAND | REG_USEA | REG_USEC | SETS_CCODES, { 0x66, 0xF2, 0xAF, 0, 0, 0, 0, 0 }, "RepNE ScasW", "" },
 };
 
-size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib) {
+size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int base, int displacement,
+                               int reg_r, int reg_x, bool has_sib) {
   size_t size = 0;
   if (entry->skeleton.prefix1 > 0) {
     ++size;
@@ -439,6 +487,10 @@
       ++size;
     }
   }
+  if ((NeedsRex(base) || NeedsRex(reg_r) || NeedsRex(reg_x)) &&
+       entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
+    ++size;  // REX_R
+  }
   ++size;  // opcode
   if (entry->skeleton.opcode == 0x0F) {
     ++size;
@@ -447,13 +499,13 @@
     }
   }
   ++size;  // modrm
-  if (has_sib || RegStorage::RegNum(base) == rs_rX86_SP.GetRegNum()
+  if (has_sib || LowRegisterBits(RegStorage::RegNum(base)) == rs_rX86_SP.GetRegNum()
       || (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX)) {
     // SP requires a SIB byte.
     // GS access also needs a SIB byte for absolute addressing in 64-bit mode.
     ++size;
   }
-  if (displacement != 0 || RegStorage::RegNum(base) == rs_rBP.GetRegNum()) {
+  if (displacement != 0 || LowRegisterBits(RegStorage::RegNum(base)) == rs_rBP.GetRegNum()) {
     // BP requires an explicit displacement, even when it's 0.
     if (entry->opcode != kX86Lea32RA) {
       DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0ULL) << entry->name;
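
The new reg_r/reg_x parameters let ComputeSize account for the x86-64 REX prefix: registers r8-r15 (and xmm8-xmm15) carry a fourth register-number bit that only a REX byte can encode, while ModRM/SIB hold the low three bits. A standalone model of the rule the helpers capture (signatures are illustrative, not the exact ones in this file):

    constexpr int NO_REG = -1;  // stand-in for the "no register here" sentinel

    static bool NeedsRex(int raw_reg_num) {
      return raw_reg_num != NO_REG && (raw_reg_num & 0x8) != 0;  // r8-r15 et al.
    }

    static int LowRegisterBits(int raw_reg_num) {
      return raw_reg_num & 0x7;  // the part that fits in ModRM/SIB fields
    }
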
@@ -477,38 +529,41 @@
     case kPrefix2Nullary:
       return 3;  // 1 byte of opcode + 2 prefixes
     case kRegOpcode:  // lir operands - 0: reg
-      return ComputeSize(entry, 0, 0, false) - 1;  // substract 1 for modrm
-    case kReg64:
+      // subtract 1 for modrm
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false) - 1;
     case kReg:  // lir operands - 0: reg
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kMem:  // lir operands - 0: base, 1: disp
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
-    case kMemReg64:
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         lir->operands[2], NO_REG, false);
     case kMemRegImm:  // lir operands - 0: base, 1: disp, 2: reg 3: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
-    case kArrayReg64:
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         lir->operands[2], NO_REG, false);
     case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         lir->operands[4], lir->operands[1], true);
     case kThreadReg:  // lir operands - 0: disp, 1: reg
-      return ComputeSize(entry, 0, lir->operands[0], false);
-    case kRegReg:
-      return ComputeSize(entry, 0, 0, false);
-    case kRegRegStore:
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, lir->operands[0], lir->operands[1], NO_REG, false);
+    case kRegReg:  // lir operands - 0: reg1, 1: reg2
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
+    case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
+      return ComputeSize(entry, 0, 0, lir->operands[1], lir->operands[0], false);
     case kRegMem:  // lir operands - 0: reg, 1: base, 2: disp
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], NO_REG, false);
     case kRegArray:   // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
-      return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
-    case kReg64Thread:  // lir operands - 0: reg, 1: disp
+      return ComputeSize(entry, lir->operands[1], lir->operands[4],
+                         lir->operands[0], lir->operands[2], true);
     case kRegThread:  // lir operands - 0: reg, 1: disp
-      return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
-    case kReg64Imm:
+      // displacement size is always 32bit
+      return ComputeSize(entry, 0, 0x12345678, lir->operands[0], NO_REG, false);
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
-      size_t size = ComputeSize(entry, 0, 0, false);
+      size_t size = ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
       if (entry->skeleton.ax_opcode == 0) {
         return size;
       } else {
@@ -518,47 +573,58 @@
       }
     }
     case kMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         NO_REG, lir->operands[0], false);
     case kArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp 4: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kThreadImm:  // lir operands - 0: disp, 1: imm
-      return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+      // displacement size is always 32bit
+      return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
     case kRegRegImm:  // lir operands - 0: reg, 1: reg, 2: imm
     case kRegRegImmRev:
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
     case kRegMemImm:  // lir operands - 0: reg, 1: base, 2: disp, 3: imm
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], NO_REG, false);
     case kRegArrayImm:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp, 5: imm
-      return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
+      return ComputeSize(entry, lir->operands[1], lir->operands[4],
+                         lir->operands[0], lir->operands[2], true);
     case kMovRegImm:  // lir operands - 0: reg, 1: immediate
-      return 1 + entry->skeleton.immediate_bytes;
+      return (entry->skeleton.prefix1 != 0 || NeedsRex(lir->operands[0]) ? 1 : 0) +
+             1 + entry->skeleton.immediate_bytes;
     case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, 0, 0, false) - (lir->operands[1] == 1 ? 1 : 0);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false) -
+             (lir->operands[1] == 1 ? 1 : 0);
     case kShiftMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false) -
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false) -
              (lir->operands[2] == 1 ? 1 : 0);
    case kShiftArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true) -
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true) -
              (lir->operands[4] == 1 ? 1 : 0);
-    case kShiftRegCl:
-      return ComputeSize(entry, 0, 0, false);
+    case kShiftRegCl:  // lir operands - 0: reg, 1: cl
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kShiftMemCl:  // lir operands - 0: base, 1: disp, 2: cl
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kShiftArrayCl:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         lir->operands[4], lir->operands[1], true);
     case kRegCond:  // lir operands - 0: reg, 1: cond
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kMemCond:  // lir operands - 0: base, 1: disp, 2: cond
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kArrayCond:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: cond
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kRegRegCond:  // lir operands - 0: reg, 1: reg, 2: cond
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
    case kRegMemCond:  // lir operands - 0: reg, 1: base, 2: disp, 3: cond
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], lir->operands[1], false);
     case kJcc:
       if (lir->opcode == kX86Jcc8) {
         return 2;  // opcode + rel8
@@ -572,21 +638,28 @@
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
       } else if (lir->opcode == kX86JmpT) {
-        return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+        // displacement size is always 32-bit
+        return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
       } else {
         DCHECK(lir->opcode == kX86JmpR);
-        return 2;  // opcode + modrm
+        if (NeedsRex(lir->operands[0])) {
+          return 3;  // REX.B + opcode + modrm
+        } else {
+          return 2;  // opcode + modrm
+        }
       }
     case kCall:
       switch (lir->opcode) {
         case kX86CallI: return 5;  // opcode 0:disp
         case kX86CallR: return 2;  // opcode modrm
         case kX86CallM:  // lir operands - 0: base, 1: disp
-          return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+          return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
         case kX86CallA:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-          return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+          return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                             NO_REG, lir->operands[1], true);
         case kX86CallT:  // lir operands - 0: disp
-          return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+          // displacement size is always 32-bit
+          return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
         default:
           break;
       }
@@ -594,16 +667,19 @@
     case kPcRel:
       if (entry->opcode == kX86PcRelLoadRA) {
         // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
-        return ComputeSize(entry, lir->operands[1], 0x12345678, true);
+        return ComputeSize(entry, lir->operands[1], 0x12345678,
+                           lir->operands[0], lir->operands[2], true);
       } else {
         DCHECK(entry->opcode == kX86PcRelAdr);
         return 5;  // opcode with reg + 4 byte immediate
       }
-    case kMacro:
+    case kMacro:  // lir operands - 0: reg
       DCHECK_EQ(lir->opcode, static_cast<int>(kX86StartOfMethod));
       return 5 /* call opcode + 4 byte displacement */ + 1 /* pop reg */ +
-          ComputeSize(&X86Mir2Lir::EncodingMap[kX86Sub32RI], 0, 0, false) -
-          (RegStorage::RegNum(lir->operands[0]) == rs_rAX.GetRegNum()  ? 1 : 0);  // shorter ax encoding
+          ComputeSize(&X86Mir2Lir::EncodingMap[kX86Sub32RI], 0, 0,
+                      lir->operands[0], NO_REG, false) -
+          // shorter ax encoding
+          (RegStorage::RegNum(lir->operands[0]) == rs_rAX.GetRegNum() ? 1 : 0);
     default:
       break;
   }
@@ -612,19 +688,62 @@
 }
 
 void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry) {
+  EmitPrefix(entry, NO_REG, NO_REG, NO_REG);
+}
+
+void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
+                            uint8_t reg_r, uint8_t reg_x, uint8_t reg_b) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
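+  // e.g. a 64-bit op with r8 as rm/base takes REX.W|REX.B = 0x48|0x41 = 0x49.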
+  bool force = false;
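+  // force stays false: none of these encodings needs an otherwise-empty 0x40 REX
+  // prefix (only byte ops on spl/bpl/sil/dil would).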
+  bool w = (entry->skeleton.prefix1 == REX_W) || (entry->skeleton.prefix2 == REX_W);
+  bool r = NeedsRex(reg_r);
+  bool x = NeedsRex(reg_x);
+  bool b = NeedsRex(reg_b);
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (r) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (x) {
+    rex |= 0x42;  // REX.00X0
+  }
+  if (b) {
+    rex |= 0x41;  // REX.000B
+  }
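+  // rex is now 0 when no REX bits are needed, otherwise 0x41..0x4F.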
   if (entry->skeleton.prefix1 != 0) {
     if (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX) {
      // 64-bit addresses use GS, not FS
       code_buffer_.push_back(THREAD_PREFIX_GS);
     } else {
-      code_buffer_.push_back(entry->skeleton.prefix1);
+      if (entry->skeleton.prefix1 == REX_W) {
+        rex |= entry->skeleton.prefix1;
+        code_buffer_.push_back(rex);
+        rex = 0;
+      } else {
+        code_buffer_.push_back(entry->skeleton.prefix1);
+      }
     }
     if (entry->skeleton.prefix2 != 0) {
-      code_buffer_.push_back(entry->skeleton.prefix2);
+      if (entry->skeleton.prefix2 == REX_W) {
+        rex |= entry->skeleton.prefix2;
+        code_buffer_.push_back(rex);
+        rex = 0;
+      } else {
+        code_buffer_.push_back(entry->skeleton.prefix2);
+      }
     }
   } else {
     DCHECK_EQ(0, entry->skeleton.prefix2);
   }
+  if (rex != 0) {
+    code_buffer_.push_back(rex);
+  }
 }
 
 void X86Mir2Lir::EmitOpcode(const X86EncodingMap* entry) {
@@ -643,7 +762,12 @@
 }
 
 void X86Mir2Lir::EmitPrefixAndOpcode(const X86EncodingMap* entry) {
-  EmitPrefix(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
+}
+
+void X86Mir2Lir::EmitPrefixAndOpcode(const X86EncodingMap* entry,
+                                     uint8_t reg_r, uint8_t reg_x, uint8_t reg_b) {
+  EmitPrefix(entry, reg_r, reg_x, reg_b);
   EmitOpcode(entry);
 }
 
@@ -712,7 +836,7 @@
   EmitDisp(base, disp);
 }
 
-void X86Mir2Lir::EmitImm(const X86EncodingMap* entry, int imm) {
+void X86Mir2Lir::EmitImm(const X86EncodingMap* entry, int64_t imm) {
   switch (entry->skeleton.immediate_bytes) {
     case 1:
       DCHECK(IS_SIMM8(imm));
@@ -724,11 +848,26 @@
       code_buffer_.push_back((imm >> 8) & 0xFF);
       break;
     case 4:
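+      // imm is now 64-bit wide; check that the value fits in the four bytes emitted below.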
+      if (imm < 0) {
+        CHECK_EQ((-imm) & 0x0FFFFFFFFLL, -imm);
+      } else {
+        CHECK_EQ(imm & 0x0FFFFFFFFLL, imm);
+      }
       code_buffer_.push_back(imm & 0xFF);
       code_buffer_.push_back((imm >> 8) & 0xFF);
       code_buffer_.push_back((imm >> 16) & 0xFF);
       code_buffer_.push_back((imm >> 24) & 0xFF);
       break;
+    case 8:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      code_buffer_.push_back((imm >> 32) & 0xFF);
+      code_buffer_.push_back((imm >> 40) & 0xFF);
+      code_buffer_.push_back((imm >> 48) & 0xFF);
+      code_buffer_.push_back((imm >> 56) & 0xFF);
+      break;
     default:
       LOG(FATAL) << "Unexpected immediate bytes (" << entry->skeleton.immediate_bytes
                  << ") for instruction: " << entry->name;
@@ -737,7 +876,8 @@
 }
 
 void X86Mir2Lir::EmitOpRegOpcode(const X86EncodingMap* entry, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
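+  // The prefix consumed the full register number; the opcode byte below takes only the low 3 bits.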
   // There's no 3-byte instruction with +rd
   DCHECK(entry->skeleton.opcode != 0x0F ||
          (entry->skeleton.extra_opcode1 != 0x38 && entry->skeleton.extra_opcode1 != 0x3A));
@@ -749,7 +889,8 @@
 }
 
 void X86Mir2Lir::EmitOpReg(const X86EncodingMap* entry, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " "
         << static_cast<int>(RegStorage::RegNum(reg))
@@ -763,7 +904,8 @@
 }
 
 void X86Mir2Lir::EmitOpMem(const X86EncodingMap* entry, uint8_t base, int disp) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -775,15 +917,29 @@
 
 void X86Mir2Lir::EmitOpArray(const X86EncodingMap* entry, uint8_t base, uint8_t index,
                              int scale, int disp) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, index, base);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+uint8_t X86Mir2Lir::LowRegisterBits(uint8_t reg) {
+  return reg & kRegNumMask32;  // Keep the low 3 bits.
+}
+
+bool X86Mir2Lir::NeedsRex(uint8_t reg) {
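+  // Register numbers above 7 (r8-r15, xmm8-xmm15) are only encodable via a REX bit.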
+  return RegStorage::RegNum(reg) > 7;
+}
+
 void X86Mir2Lir::EmitMemReg(const X86EncodingMap* entry,
                        uint8_t base, int disp, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, base);
+  reg = LowRegisterBits(reg);
+  base = LowRegisterBits(base);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL ||
            entry->opcode == kX86Movzx8RM || entry->opcode == kX86Movsx8RM)
@@ -802,9 +958,12 @@
   EmitMemReg(entry, base, disp, reg);
 }
 
-void X86Mir2Lir::EmitRegArray(const X86EncodingMap* entry, uint8_t reg, uint8_t base, uint8_t index,
-                              int scale, int disp) {
-  EmitPrefixAndOpcode(entry);
+void X86Mir2Lir::EmitRegArray(const X86EncodingMap* entry, uint8_t reg, uint8_t base,
+                              uint8_t index, int scale, int disp) {
+  EmitPrefixAndOpcode(entry, reg, index, base);
+  reg = LowRegisterBits(reg);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(reg, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.modrm_opcode);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
@@ -819,7 +978,9 @@
 
 void X86Mir2Lir::EmitArrayImm(const X86EncodingMap* entry, uint8_t base, uint8_t index, int scale,
                               int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, index, base);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   EmitImm(entry, imm);
@@ -827,7 +988,8 @@
 
 void X86Mir2Lir::EmitRegThread(const X86EncodingMap* entry, uint8_t reg, int disp) {
   DCHECK_NE(entry->skeleton.prefix1, 0);
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " "
         << static_cast<int>(RegStorage::RegNum(reg))
@@ -845,7 +1007,9 @@
 }
 
 void X86Mir2Lir::EmitRegReg(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
   DCHECK_LT(RegStorage::RegNum(reg1), 8);
   DCHECK_LT(RegStorage::RegNum(reg2), 8);
   uint8_t modrm = (3 << 6) | (RegStorage::RegNum(reg1) << 3) | RegStorage::RegNum(reg2);
@@ -857,7 +1021,9 @@
 
 void X86Mir2Lir::EmitRegRegImm(const X86EncodingMap* entry,
                           uint8_t reg1, uint8_t reg2, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
   DCHECK_LT(RegStorage::RegNum(reg1), 8);
   DCHECK_LT(RegStorage::RegNum(reg2), 8);
   uint8_t modrm = (3 << 6) | (RegStorage::RegNum(reg1) << 3) | RegStorage::RegNum(reg2);
@@ -874,7 +1040,9 @@
 
 void X86Mir2Lir::EmitRegMemImm(const X86EncodingMap* entry,
                                uint8_t reg, uint8_t base, int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, base);
+  reg = LowRegisterBits(reg);
+  base = LowRegisterBits(base);
   DCHECK(!RegStorage::IsFloat(reg));
   DCHECK_LT(RegStorage::RegNum(reg), 8);
   EmitModrmDisp(reg, base, disp);
@@ -889,10 +1057,11 @@
 }
 
 void X86Mir2Lir::EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
   if (RegStorage::RegNum(reg) == rs_rAX.GetRegNum() && entry->skeleton.ax_opcode != 0) {
     code_buffer_.push_back(entry->skeleton.ax_opcode);
   } else {
+    reg = LowRegisterBits(reg);
     EmitOpcode(entry);
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | RegStorage::RegNum(reg);
     code_buffer_.push_back(modrm);
@@ -901,7 +1070,8 @@
 }
 
 void X86Mir2Lir::EmitMemImm(const X86EncodingMap* entry, uint8_t base, int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   EmitImm(entry, imm);
@@ -918,17 +1088,37 @@
   DCHECK_EQ(entry->skeleton.ax_opcode, 0);
 }
 
-void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
+void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int64_t imm) {
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
+  reg = LowRegisterBits(reg);
   DCHECK_LT(RegStorage::RegNum(reg), 8);
   code_buffer_.push_back(0xB8 + RegStorage::RegNum(reg));
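+  // 0xB8+rd encodes mov reg, imm; with a REX.W prefix the immediate below widens to 8 bytes.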
-  code_buffer_.push_back(imm & 0xFF);
-  code_buffer_.push_back((imm >> 8) & 0xFF);
-  code_buffer_.push_back((imm >> 16) & 0xFF);
-  code_buffer_.push_back((imm >> 24) & 0xFF);
+  switch (entry->skeleton.immediate_bytes) {
+    case 4:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      break;
+    case 8:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      code_buffer_.push_back((imm >> 32) & 0xFF);
+      code_buffer_.push_back((imm >> 40) & 0xFF);
+      code_buffer_.push_back((imm >> 48) & 0xFF);
+      code_buffer_.push_back((imm >> 56) & 0xFF);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported immediate size for EmitMovRegImm: "
+                 << static_cast<uint32_t>(entry->skeleton.immediate_bytes);
+  }
 }
 
 void X86Mir2Lir::EmitShiftRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
+  reg = LowRegisterBits(reg);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -955,7 +1145,8 @@
 
 void X86Mir2Lir::EmitShiftRegCl(const X86EncodingMap* entry, uint8_t reg, uint8_t cl) {
   DCHECK_EQ(cl, static_cast<uint8_t>(rs_rCX.GetReg()));
-  EmitPrefix(entry);
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -970,7 +1161,8 @@
 void X86Mir2Lir::EmitShiftMemCl(const X86EncodingMap* entry, uint8_t base,
                                 int displacement, uint8_t cl) {
   DCHECK_EQ(cl, static_cast<uint8_t>(rs_rCX.GetReg()));
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -983,7 +1175,8 @@
 
 void X86Mir2Lir::EmitShiftMemImm(const X86EncodingMap* entry, uint8_t base,
                                 int displacement, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -1002,7 +1195,8 @@
 }
 
 void X86Mir2Lir::EmitRegCond(const X86EncodingMap* entry, uint8_t reg, uint8_t condition) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0x0F, entry->skeleton.opcode);
   code_buffer_.push_back(0x0F);
@@ -1015,7 +1209,8 @@
   DCHECK_EQ(entry->skeleton.immediate_bytes, 0);
 }
 
-void X86Mir2Lir::EmitMemCond(const X86EncodingMap* entry, uint8_t base, int displacement, uint8_t condition) {
+void X86Mir2Lir::EmitMemCond(const X86EncodingMap* entry, uint8_t base, int displacement,
+                             uint8_t condition) {
   if (entry->skeleton.prefix1 != 0) {
     code_buffer_.push_back(entry->skeleton.prefix1);
     if (entry->skeleton.prefix2 != 0) {
@@ -1037,7 +1232,9 @@
 void X86Mir2Lir::EmitRegRegCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2,
                                 uint8_t condition) {
   // Generate prefix and opcode without the condition
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
 
  // Now add the condition. The last byte of the opcode is the one that receives it.
   DCHECK_LE(condition, 0xF);
@@ -1059,9 +1256,12 @@
   code_buffer_.push_back(modrm);
 }
 
-void X86Mir2Lir::EmitRegMemCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t base, int displacement, uint8_t condition) {
+void X86Mir2Lir::EmitRegMemCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t base,
+                                int displacement, uint8_t condition) {
   // Generate prefix and opcode without the condition
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, base);
+  reg1 = LowRegisterBits(reg1);
+  base = LowRegisterBits(base);
 
  // Now add the condition. The last byte of the opcode is the one that receives it.
   DCHECK_LE(condition, 0xF);
@@ -1094,8 +1294,10 @@
     code_buffer_.push_back(rel & 0xFF);
   } else {
     DCHECK(entry->opcode == kX86JmpR);
-    code_buffer_.push_back(entry->skeleton.opcode);
     uint8_t reg = static_cast<uint8_t>(rel);
+    EmitPrefix(entry, NO_REG, NO_REG, reg);
+    code_buffer_.push_back(entry->skeleton.opcode);
+    reg = LowRegisterBits(reg);
     DCHECK_LT(RegStorage::RegNum(reg), 8);
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | RegStorage::RegNum(reg);
     code_buffer_.push_back(modrm);
@@ -1120,7 +1322,8 @@
 }
 
 void X86Mir2Lir::EmitCallMem(const X86EncodingMap* entry, uint8_t base, int disp) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -1161,9 +1364,12 @@
         reinterpret_cast<Mir2Lir::EmbeddedData*>(UnwrapPointer(base_or_table));
     disp = tab_rec->offset;
   }
-  EmitPrefix(entry);
-  DCHECK_LT(RegStorage::RegNum(reg), 8);
   if (entry->opcode == kX86PcRelLoadRA) {
+    EmitPrefix(entry, reg, index, base_or_table);
+    reg = LowRegisterBits(reg);
+    base_or_table = LowRegisterBits(base_or_table);
+    index = LowRegisterBits(index);
+    DCHECK_LT(RegStorage::RegNum(reg), 8);
     code_buffer_.push_back(entry->skeleton.opcode);
     DCHECK_NE(0x0F, entry->skeleton.opcode);
     DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1178,6 +1384,7 @@
     code_buffer_.push_back(sib);
     DCHECK_EQ(0, entry->skeleton.immediate_bytes);
   } else {
+    DCHECK_LT(RegStorage::RegNum(reg), 8);
     code_buffer_.push_back(entry->skeleton.opcode + RegStorage::RegNum(reg));
   }
   code_buffer_.push_back(disp & 0xFF);
@@ -1190,6 +1397,8 @@
 
 void X86Mir2Lir::EmitMacro(const X86EncodingMap* entry, uint8_t reg, int offset) {
   DCHECK(entry->opcode == kX86StartOfMethod) << entry->name;
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   code_buffer_.push_back(0xE8);  // call +0
   code_buffer_.push_back(0);
   code_buffer_.push_back(0);
@@ -1380,7 +1589,6 @@
       case kRegOpcode:  // lir operands - 0: reg
         EmitOpRegOpcode(entry, lir->operands[0]);
         break;
-      case kReg64:
       case kReg:  // lir operands - 0: reg
         EmitOpReg(entry, lir->operands[0]);
         break;
@@ -1390,7 +1598,6 @@
       case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
         EmitOpArray(entry, lir->operands[0], lir->operands[1], lir->operands[2], lir->operands[3]);
         break;
-      case kMemReg64:
       case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
         EmitMemReg(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
@@ -1401,7 +1608,6 @@
         EmitArrayImm(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
         break;
-      case kArrayReg64:
       case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
         EmitArrayReg(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
@@ -1413,7 +1619,6 @@
         EmitRegArray(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
         break;
-      case kReg64Thread:  // lir operands - 0: reg, 1: disp
       case kRegThread:  // lir operands - 0: reg, 1: disp
         EmitRegThread(entry, lir->operands[0], lir->operands[1]);
         break;
@@ -1437,7 +1642,6 @@
         EmitRegMemImm(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                       lir->operands[3]);
         break;
-      case kReg64Imm:
       case kRegImm:  // lir operands - 0: reg, 1: immediate
         EmitRegImm(entry, lir->operands[0], lir->operands[1]);
         break;
@@ -1469,7 +1673,8 @@
         EmitRegRegCond(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
      case kRegMemCond:  // lir operands - 0: reg, 1: base, 2: displacement, 3: condition
-        EmitRegMemCond(entry, lir->operands[0], lir->operands[1], lir->operands[2], lir->operands[3]);
+        EmitRegMemCond(entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                       lir->operands[3]);
         break;
       case kJmp:  // lir operands - 0: rel
         if (entry->opcode == kX86JmpT) {
@@ -1503,7 +1708,7 @@
         EmitPcRel(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                   lir->operands[3], lir->operands[4]);
         break;
-      case kMacro:
+      case kMacro:  // lir operands - 0: reg
         EmitMacro(entry, lir->operands[0], lir->offset);
         break;
       default:
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 1807d5c..d66790d 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -331,15 +331,21 @@
     std::vector<uint8_t>* ReturnCallFrameInformation();
 
   protected:
-    size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib);
+    size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement,
+                       int reg_r, int reg_x, bool has_sib);
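+    // reg_r and reg_x carry full register numbers so the size can include a REX prefix
+    // byte when one is needed; callers pass NO_REG for unused operand slots.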
+    uint8_t LowRegisterBits(uint8_t reg);
+    bool NeedsRex(uint8_t reg);
     void EmitPrefix(const X86EncodingMap* entry);
+    void EmitPrefix(const X86EncodingMap* entry, uint8_t reg_r, uint8_t reg_x, uint8_t reg_b);
     void EmitOpcode(const X86EncodingMap* entry);
     void EmitPrefixAndOpcode(const X86EncodingMap* entry);
+    void EmitPrefixAndOpcode(const X86EncodingMap* entry,
+                             uint8_t reg_r, uint8_t reg_x, uint8_t reg_b);
     void EmitDisp(uint8_t base, int disp);
     void EmitModrmThread(uint8_t reg_or_opcode);
     void EmitModrmDisp(uint8_t reg_or_opcode, uint8_t base, int disp);
     void EmitModrmSibDisp(uint8_t reg_or_opcode, uint8_t base, uint8_t index, int scale, int disp);
-    void EmitImm(const X86EncodingMap* entry, int imm);
+    void EmitImm(const X86EncodingMap* entry, int64_t imm);
     void EmitOpRegOpcode(const X86EncodingMap* entry, uint8_t reg);
     void EmitOpReg(const X86EncodingMap* entry, uint8_t reg);
     void EmitOpMem(const X86EncodingMap* entry, uint8_t base, int disp);
@@ -362,7 +368,7 @@
     void EmitMemRegImm(const X86EncodingMap* entry, uint8_t base, int disp, uint8_t reg1, int32_t imm);
     void EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
     void EmitThreadImm(const X86EncodingMap* entry, int disp, int imm);
-    void EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
+    void EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int64_t imm);
     void EmitShiftRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
     void EmitShiftMemImm(const X86EncodingMap* entry, uint8_t base, int disp, int imm);
     void EmitShiftMemCl(const X86EncodingMap* entry, uint8_t base, int displacement, uint8_t cl);
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 19ad2f8..8b34168 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -38,7 +38,7 @@
 static const RegStorage core_regs_arr_64q[] = {
     rs_r0q, rs_r1q, rs_r2q, rs_r3q, rs_rX86_SP_64, rs_r5q, rs_r6q, rs_r7q,
 #ifdef TARGET_REX_SUPPORT
-    rs_r8, rs_r9, rs_r10, rs_r11, rs_r12, rs_r13, rs_r14, rs_r15
+    rs_r8q, rs_r9q, rs_r10q, rs_r11q, rs_r12q, rs_r13q, rs_r14q, rs_r15q
 #endif
 };
 static const RegStorage sp_regs_arr_32[] = {
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index fed31c1..ee02257 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -89,7 +89,11 @@
     res = NewLIR2(kX86Xor32RR, r_dest.GetReg(), r_dest.GetReg());
   } else {
    // Note: there is no byte-immediate form of a 32-bit immediate move.
-    res = NewLIR2(kX86Mov32RI, r_dest.GetReg(), value);
+    if (r_dest.Is64Bit()) {
+      res = NewLIR2(kX86Mov64RI, r_dest.GetReg(), value);
+    } else {
+      res = NewLIR2(kX86Mov32RI, r_dest.GetReg(), value);
+    }
   }
 
   if (r_dest_save.IsFloat()) {
@@ -181,7 +185,6 @@
         LOG(FATAL) << "Bad case in OpRegImm " << op;
     }
   }
-  CHECK(!r_dest_src1.Is64Bit() || X86Mir2Lir::EncodingMap[opcode].kind == kReg64Imm) << "OpRegImm(" << op << ")";
   return NewLIR2(opcode, r_dest_src1.GetReg(), value);
 }
 
@@ -559,7 +562,7 @@
         // We don't know the proper offset for the value, so pick one that will force
         // 4 byte offset.  We will fix this up in the assembler later to have the right
         // value.
-        res = LoadBaseDisp(rl_method.reg, 256 /* bogus */, RegStorage::Solo64(low_reg_val),
+        res = LoadBaseDisp(rl_method.reg, 256 /* bogus */, RegStorage::FloatSolo64(low_reg_val),
                            kDouble);
         res->target = data_target;
         res->flags.fixup = kFixupLoad;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 964422c..bb8df89 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -175,6 +175,16 @@
   fr5  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 5,
   fr6  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
   fr7  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
+#ifdef TARGET_REX_SUPPORT
+  fr8  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 8,
+  fr9  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 9,
+  fr10 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 10,
+  fr11 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 11,
+  fr12 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 12,
+  fr13 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 13,
+  fr14 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 14,
+  fr15 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 15,
+#endif
 
   // xmm registers, double precision aliases.
   dr0  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
@@ -185,8 +195,18 @@
   dr5  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 5,
   dr6  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
   dr7  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
+#ifdef TARGET_REX_SUPPORT
+  dr8  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 8,
+  dr9  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 9,
+  dr10 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 10,
+  dr11 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 11,
+  dr12 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 12,
+  dr13 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 13,
+  dr14 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 14,
+  dr15 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 15,
+#endif
 
-  // xmm registers aliases.
+  // xmm registers, quad precision aliases.
   xr0  = RegStorage::k128BitSolo | 0,
   xr1  = RegStorage::k128BitSolo | 1,
   xr2  = RegStorage::k128BitSolo | 2,
@@ -195,6 +215,16 @@
   xr5  = RegStorage::k128BitSolo | 5,
   xr6  = RegStorage::k128BitSolo | 6,
   xr7  = RegStorage::k128BitSolo | 7,
+#ifdef TARGET_REX_SUPPORT
+  xr8  = RegStorage::k128BitSolo | 8,
+  xr9  = RegStorage::k128BitSolo | 9,
+  xr10 = RegStorage::k128BitSolo | 10,
+  xr11 = RegStorage::k128BitSolo | 11,
+  xr12 = RegStorage::k128BitSolo | 12,
+  xr13 = RegStorage::k128BitSolo | 13,
+  xr14 = RegStorage::k128BitSolo | 14,
+  xr15 = RegStorage::k128BitSolo | 15,
+#endif
 
   // TODO: as needed, add 256, 512 and 1024-bit xmm views.
 };
@@ -251,6 +281,16 @@
 constexpr RegStorage rs_fr5(RegStorage::kValid | fr5);
 constexpr RegStorage rs_fr6(RegStorage::kValid | fr6);
 constexpr RegStorage rs_fr7(RegStorage::kValid | fr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_fr8(RegStorage::kValid | fr8);
+constexpr RegStorage rs_fr9(RegStorage::kValid | fr9);
+constexpr RegStorage rs_fr10(RegStorage::kValid | fr10);
+constexpr RegStorage rs_fr11(RegStorage::kValid | fr11);
+constexpr RegStorage rs_fr12(RegStorage::kValid | fr12);
+constexpr RegStorage rs_fr13(RegStorage::kValid | fr13);
+constexpr RegStorage rs_fr14(RegStorage::kValid | fr14);
+constexpr RegStorage rs_fr15(RegStorage::kValid | fr15);
+#endif
 
 constexpr RegStorage rs_dr0(RegStorage::kValid | dr0);
 constexpr RegStorage rs_dr1(RegStorage::kValid | dr1);
@@ -260,6 +300,16 @@
 constexpr RegStorage rs_dr5(RegStorage::kValid | dr5);
 constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
 constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_dr8(RegStorage::kValid | dr8);
+constexpr RegStorage rs_dr9(RegStorage::kValid | dr9);
+constexpr RegStorage rs_dr10(RegStorage::kValid | dr10);
+constexpr RegStorage rs_dr11(RegStorage::kValid | dr11);
+constexpr RegStorage rs_dr12(RegStorage::kValid | dr12);
+constexpr RegStorage rs_dr13(RegStorage::kValid | dr13);
+constexpr RegStorage rs_dr14(RegStorage::kValid | dr14);
+constexpr RegStorage rs_dr15(RegStorage::kValid | dr15);
+#endif
 
 constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);
 constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
@@ -269,6 +319,16 @@
 constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
 constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
 constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_xr8(RegStorage::kValid | xr8);
+constexpr RegStorage rs_xr9(RegStorage::kValid | xr9);
+constexpr RegStorage rs_xr10(RegStorage::kValid | xr10);
+constexpr RegStorage rs_xr11(RegStorage::kValid | xr11);
+constexpr RegStorage rs_xr12(RegStorage::kValid | xr12);
+constexpr RegStorage rs_xr13(RegStorage::kValid | xr13);
+constexpr RegStorage rs_xr14(RegStorage::kValid | xr14);
+constexpr RegStorage rs_xr15(RegStorage::kValid | xr15);
+#endif
 
 extern X86NativeRegisterPool rX86_ARG0;
 extern X86NativeRegisterPool rX86_ARG1;
@@ -351,10 +411,14 @@
   opcode ## 16RR, opcode ## 16RM, opcode ## 16RA, opcode ## 16RT, \
   opcode ## 16RI, opcode ## 16MI, opcode ## 16AI, opcode ## 16TI, \
   opcode ## 16RI8, opcode ## 16MI8, opcode ## 16AI8, opcode ## 16TI8, \
-  opcode ## 32MR, opcode ## 64MR, opcode ## 32AR, opcode ## 64AR, opcode ## 32TR,  \
-  opcode ## 32RR, opcode ## 32RM, opcode ## 64RM, opcode ## 32RA, opcode ## 64RA, opcode ## 32RT, opcode ## 64RT, \
-  opcode ## 32RI, opcode ## 64RI, opcode ## 32MI, opcode ## 32AI, opcode ## 32TI, \
-  opcode ## 32RI8, opcode ## 64RI8, opcode ## 32MI8, opcode ## 32AI8, opcode ## 32TI8
+  opcode ## 32MR, opcode ## 32AR, opcode ## 32TR,  \
+  opcode ## 32RR, opcode ## 32RM, opcode ## 32RA, opcode ## 32RT, \
+  opcode ## 32RI, opcode ## 32MI, opcode ## 32AI, opcode ## 32TI, \
+  opcode ## 32RI8, opcode ## 32MI8, opcode ## 32AI8, opcode ## 32TI8, \
+  opcode ## 64MR, opcode ## 64AR, opcode ## 64TR,  \
+  opcode ## 64RR, opcode ## 64RM, opcode ## 64RA, opcode ## 64RT, \
+  opcode ## 64RI, opcode ## 64MI, opcode ## 64AI, opcode ## 64TI, \
+  opcode ## 64RI8, opcode ## 64MI8, opcode ## 64AI8, opcode ## 64TI8
   BinaryOpCode(kX86Add),
   BinaryOpCode(kX86Or),
   BinaryOpCode(kX86Adc),
@@ -367,23 +431,32 @@
   kX86Imul16RRI, kX86Imul16RMI, kX86Imul16RAI,
   kX86Imul32RRI, kX86Imul32RMI, kX86Imul32RAI,
   kX86Imul32RRI8, kX86Imul32RMI8, kX86Imul32RAI8,
+  kX86Imul64RRI, kX86Imul64RMI, kX86Imul64RAI,
+  kX86Imul64RRI8, kX86Imul64RMI8, kX86Imul64RAI8,
   kX86Mov8MR, kX86Mov8AR, kX86Mov8TR,
   kX86Mov8RR, kX86Mov8RM, kX86Mov8RA, kX86Mov8RT,
   kX86Mov8RI, kX86Mov8MI, kX86Mov8AI, kX86Mov8TI,
   kX86Mov16MR, kX86Mov16AR, kX86Mov16TR,
   kX86Mov16RR, kX86Mov16RM, kX86Mov16RA, kX86Mov16RT,
   kX86Mov16RI, kX86Mov16MI, kX86Mov16AI, kX86Mov16TI,
-  kX86Mov32MR, kX86Mov64MR, kX86Mov32AR, kX86Mov64AR, kX86Mov32TR,
-  kX86Mov32RR, kX86Mov32RM, kX86Mov64RM, kX86Mov32RA, kX86Mov64RA, kX86Mov32RT, kX86Mov64RT,
-  kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI, kX86Mov64TI,
+  kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
+  kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
+  kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
   kX86Lea32RM,
   kX86Lea32RA,
+  kX86Mov64MR, kX86Mov64AR, kX86Mov64TR,
+  kX86Mov64RR, kX86Mov64RM, kX86Mov64RA, kX86Mov64RT,
+  kX86Mov64RI, kX86Mov64MI, kX86Mov64AI, kX86Mov64TI,
+  kX86Lea64RM,
+  kX86Lea64RA,
   // RRC - Register Register ConditionCode - cond_opcode reg1, reg2
   //             - lir operands - 0: reg1, 1: reg2, 2: CC
   kX86Cmov32RRC,
+  kX86Cmov64RRC,
   // RMC - Register Memory ConditionCode - cond_opcode reg1, [base + disp]
   //             - lir operands - 0: reg1, 1: base, 2: disp 3: CC
   kX86Cmov32RMC,
+  kX86Cmov64RMC,
 
   // RC - Register CL - opcode reg, CL
   //          - lir operands - 0: reg, 1: CL
@@ -397,7 +470,9 @@
   opcode ## 16RI, opcode ## 16MI, opcode ## 16AI, \
   opcode ## 16RC, opcode ## 16MC, opcode ## 16AC, \
   opcode ## 32RI, opcode ## 32MI, opcode ## 32AI, \
-  opcode ## 32RC, opcode ## 32MC, opcode ## 32AC
+  opcode ## 32RC, opcode ## 32MC, opcode ## 32AC, \
+  opcode ## 64RI, opcode ## 64MI, opcode ## 64AI, \
+  opcode ## 64RC, opcode ## 64MC, opcode ## 64AC
   BinaryShiftOpCode(kX86Rol),
   BinaryShiftOpCode(kX86Ror),
   BinaryShiftOpCode(kX86Rcl),
@@ -411,12 +486,18 @@
   kX86Shld32MRI,
   kX86Shrd32RRI,
   kX86Shrd32MRI,
+  kX86Shld64RRI,
+  kX86Shld64MRI,
+  kX86Shrd64RRI,
+  kX86Shrd64MRI,
 #define UnaryOpcode(opcode, reg, mem, array) \
   opcode ## 8 ## reg, opcode ## 8 ## mem, opcode ## 8 ## array, \
   opcode ## 16 ## reg, opcode ## 16 ## mem, opcode ## 16 ## array, \
-  opcode ## 32 ## reg, opcode ## 32 ## mem, opcode ## 32 ## array
+  opcode ## 32 ## reg, opcode ## 32 ## mem, opcode ## 32 ## array, \
+  opcode ## 64 ## reg, opcode ## 64 ## mem, opcode ## 64 ## array
   UnaryOpcode(kX86Test, RI, MI, AI),
   kX86Test32RR,
+  kX86Test64RR,
   UnaryOpcode(kX86Not, R, M, A),
   UnaryOpcode(kX86Neg, R, M, A),
   UnaryOpcode(kX86Mul,  DaR, DaM, DaA),
@@ -544,20 +625,20 @@
 
 /* Instruction assembly field_loc kind */
 enum X86EncodingKind {
-  kData,                                   // Special case for raw data.
-  kNop,                                    // Special case for variable length nop.
-  kNullary,                                // Opcode that takes no arguments.
-  kPrefix2Nullary,                         // Opcode that takes no arguments, but 2 prefixes.
-  kRegOpcode,                              // Shorter form of R instruction kind (opcode+rd)
-  kReg, kReg64, kMem, kArray,              // R, M and A instruction kinds.
-  kMemReg, kMemReg64, kArrayReg, kArrayReg64, kThreadReg,          // MR, AR and TR instruction kinds.
-  kRegReg, kRegMem, kRegArray, kRegThread, kReg64Thread,  // RR, RM, RA and RT instruction kinds.
-  kRegRegStore,                            // RR following the store modrm reg-reg encoding rather than the load.
-  kRegImm, kReg64Imm, kMemImm, kArrayImm, kThreadImm,  // RI, MI, AI and TI instruction kinds.
-  kRegRegImm, kRegMemImm, kRegArrayImm,    // RRI, RMI and RAI instruction kinds.
-  kMovRegImm,                              // Shorter form move RI.
-  kRegRegImmRev,                           // RRI with first reg in r/m
-  kMemRegImm,                              // MRI instruction kinds.
+  kData,                                    // Special case for raw data.
+  kNop,                                     // Special case for variable length nop.
+  kNullary,                                 // Opcode that takes no arguments.
+  kPrefix2Nullary,                          // Opcode that takes no arguments, but 2 prefixes.
+  kRegOpcode,                               // Shorter form of R instruction kind (opcode+rd)
+  kReg, kMem, kArray,                       // R, M and A instruction kinds.
+  kMemReg, kArrayReg, kThreadReg,           // MR, AR and TR instruction kinds.
+  kRegReg, kRegMem, kRegArray, kRegThread,  // RR, RM, RA and RT instruction kinds.
+  kRegRegStore,                             // RR following the store modrm reg-reg encoding rather than the load.
+  kRegImm, kMemImm, kArrayImm, kThreadImm,  // RI, MI, AI and TI instruction kinds.
+  kRegRegImm, kRegMemImm, kRegArrayImm,     // RRI, RMI and RAI instruction kinds.
+  kMovRegImm,                               // Shorter form move RI.
+  kRegRegImmRev,                            // RRI with first reg in r/m
+  kMemRegImm,                               // MRI instruction kinds.
   kShiftRegImm, kShiftMemImm, kShiftArrayImm,  // Shift opcode with immediate.
   kShiftRegCl, kShiftMemCl, kShiftArrayCl,     // Shift opcode with register CL.
   kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.
@@ -607,6 +688,15 @@
 // 64 Bit Operand Size
 #define REX_W 0x48
 // Extension of the ModR/M reg field
+#define REX_R 0x44
+// Extension of the SIB index field
+#define REX_X 0x42
+// Extension of the ModR/M r/m field, SIB base field, or Opcode reg field
+#define REX_B 0x41
+// Mask extracting the low 3 bits of r0..r15
+#define kRegNumMask32 0x07
+// Value indicating that base or reg is not used
+#define NO_REG 0
 
 #define IS_SIMM8(v) ((-128 <= (v)) && ((v) <= 127))
 #define IS_SIMM16(v) ((-32768 <= (v)) && ((v) <= 32767))
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index 2f7e701..7e50c31 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -225,24 +225,6 @@
     return reg_ & kRegNumMask;
   }
 
-  // Aliased double to low single.
-  RegStorage DoubleToLowSingle() const {
-    DCHECK(IsDouble());
-    return FloatSolo32(GetRegNum() << 1);
-  }
-
-  // Aliased double to high single.
-  RegStorage DoubleToHighSingle() const {
-    DCHECK(IsDouble());
-    return FloatSolo32((GetRegNum() << 1) + 1);
-  }
-
-  // Single to aliased double.
-  RegStorage SingleToDouble() const {
-    DCHECK(IsSingle());
-    return FloatSolo64(GetRegNum() >> 1);
-  }
-
   // Is register number in 0..7?
   bool Low8() const {
     return GetRegNum() < 8;
diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc
index 6f47b8f..0c5a4ca 100644
--- a/compiler/dex/ssa_transformation.cc
+++ b/compiler/dex/ssa_transformation.cc
@@ -167,8 +167,8 @@
 }
 
 void MIRGraph::ComputeDomPostOrderTraversal(BasicBlock* bb) {
-  if (dom_post_order_traversal_ == NULL) {
-    // First time - create the array.
+  if (dom_post_order_traversal_ == NULL || max_num_reachable_blocks_ < num_reachable_blocks_) {
+    // First time or too small - create the array.
     dom_post_order_traversal_ =
         new (arena_) GrowableArray<BasicBlockId>(arena_, num_reachable_blocks_,
                                         kGrowableArrayDomPostOrderTraversal);
@@ -373,8 +373,8 @@
     InitializeDominationInfo(bb);
   }
 
-  /* Initalize & Clear i_dom_list */
-  if (i_dom_list_ == NULL) {
+  /* Initialize & Clear i_dom_list */
+  if (max_num_reachable_blocks_ < num_reachable_blocks_) {
     i_dom_list_ = static_cast<int*>(arena_->Alloc(sizeof(int) * num_reachable_blocks,
                                                   kArenaAllocDFInfo));
   }
@@ -542,12 +542,8 @@
     /* Iterate through the predecessors */
     GrowableArray<BasicBlockId>::Iterator iter(bb->predecessors);
     size_t num_uses = bb->predecessors->Size();
-    mir->ssa_rep->num_uses = num_uses;
-    int* uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                kArenaAllocDFInfo));
-    mir->ssa_rep->uses = uses;
-    mir->ssa_rep->fp_use =
-        static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, kArenaAllocDFInfo));
+    AllocateSSAUseData(mir, num_uses);
+    int* uses = mir->ssa_rep->uses;
     BasicBlockId* incoming =
         static_cast<BasicBlockId*>(arena_->Alloc(sizeof(BasicBlockId) * num_uses,
                                                  kArenaAllocDFInfo));
@@ -556,9 +552,9 @@
     while (true) {
       BasicBlock* pred_bb = GetBasicBlock(iter.Next());
       if (!pred_bb) {
-        break;
+       break;
       }
-      int ssa_reg = pred_bb->data_flow_info->vreg_to_ssa_map[v_reg];
+      int ssa_reg = pred_bb->data_flow_info->vreg_to_ssa_map_exit[v_reg];
       uses[idx] = ssa_reg;
       incoming[idx] = pred_bb->id;
       idx++;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 3304561..8d4e283 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1135,7 +1135,7 @@
       if (dex_method_idx != DexFile::kDexNoIndex) {
         target_method->dex_method_index = dex_method_idx;
       } else {
-        if (compiling_boot) {
+        if (compiling_boot && !use_dex_cache) {
           target_method->dex_method_index = method->GetDexMethodIndex();
           target_method->dex_file = method->GetDeclaringClass()->GetDexCache()->GetDexFile();
         }
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 6812f3c..49cf71b 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -180,7 +180,7 @@
   EXPECT_EQ(80U, sizeof(OatHeader));
   EXPECT_EQ(8U, sizeof(OatMethodOffsets));
   EXPECT_EQ(24U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(80 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(79 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 23e3433..340a83e 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -196,7 +196,6 @@
   qpoints->pCmplDouble = CmplDouble;
   qpoints->pCmplFloat = CmplFloat;
   qpoints->pFmod = fmod;
-  qpoints->pSqrt = sqrt;
   qpoints->pL2d = __aeabi_l2d;
   qpoints->pFmodf = fmodf;
   qpoints->pL2f = __aeabi_l2f;
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index 9614c29..b94375e 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -21,6 +21,9 @@
 
 // Define special registers.
 
+// Register holding the suspend check countdown.
+// 32-bit is enough for the suspend register.
+#define wSUSPEND w19
 // Register holding Thread::Current().
 #define xSELF x18
 // Frame Pointer
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index cb9f53b..46e819e 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -182,7 +182,6 @@
   qpoints->pCmplDouble = CmplDouble;
   qpoints->pCmplFloat = CmplFloat;
   qpoints->pFmod = fmod;
-  qpoints->pSqrt = sqrt;
   qpoints->pL2d = NULL;
   qpoints->pFmodf = fmodf;
   qpoints->pL2f = NULL;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 7f31fb6..97caa1f 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -197,7 +197,8 @@
 .endm
 
 .macro RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
-    brk 0
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    ret
 .endm
 
 
@@ -561,32 +562,33 @@
 SAVE_SIZE=5*8   // x4, x5, SP, LR & FP saved.
 SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
 
-    mov x9, sp                          // Save stack pointer.
+    mov x9, sp                             // Save stack pointer.
     .cfi_register sp,x9
 
-    add x10, x2, # SAVE_SIZE_AND_METHOD // calculate size of frame.
-    sub x10, sp, x10                    // Calculate SP position - saves + ArtMethod* +  args
-    and x10, x10, # ~0xf                // Enforce 16 byte stack alignment.
-    mov sp, x10                         // Set new SP.
+    add x10, x2, # SAVE_SIZE_AND_METHOD    // Calculate size of frame.
+    sub x10, sp, x10                       // Calculate SP position - saves + ArtMethod* + args
+    and x10, x10, # ~0xf                   // Enforce 16 byte stack alignment.
+    mov sp, x10                            // Set new SP.
 
-    sub x10, x9, #SAVE_SIZE             // Calculate new FP (later). Done here as we must move SP
-    .cfi_def_cfa_register x10           // before this.
+    sub x10, x9, #SAVE_SIZE                // Calculate new FP (later). Done here as we must move SP
+    .cfi_def_cfa_register x10              // before this.
     .cfi_adjust_cfa_offset SAVE_SIZE
 
-    str x9, [x10, #32]                  // Save old stack pointer.
+    str x9, [x10, #32]                     // Save old stack pointer.
     .cfi_rel_offset sp, 32
 
-    stp x4, x5, [x10, #16]              // Save result and shorty addresses.
+    stp x4, x5, [x10, #16]                 // Save result and shorty addresses.
     .cfi_rel_offset x4, 16
     .cfi_rel_offset x5, 24
 
-    stp xFP, xLR, [x10]                 // Store LR & FP.
+    stp xFP, xLR, [x10]                    // Store LR & FP.
     .cfi_rel_offset x29, 0
     .cfi_rel_offset x30, 8
 
-    mov xFP, x10                        // Use xFP now, as it's callee-saved.
+    mov xFP, x10                           // Use xFP now, as it's callee-saved.
     .cfi_def_cfa_register x29
-    mov xSELF, x3                       // Move thread pointer into SELF register.
+    mov xSELF, x3                          // Move thread pointer into SELF register.
+    mov wSUSPEND, #SUSPEND_CHECK_INTERVAL  // reset wSUSPEND to suspend check interval
 
     // Copy arguments into stack frame.
     // Use simple copy routine for now.
@@ -595,7 +597,7 @@
     // W2 - args length
     // X9 - destination address.
     // W10 - temporary
-    add x9, sp, #8     // Destination address is bottom of stack + NULL.
+    add x9, sp, #8                         // Destination address is bottom of stack + NULL.
 
     // Use \@ to differentiate between macro invocations.
 .LcopyParams\@:
@@ -693,6 +695,7 @@
  *  x1-x7 - integer parameters.
  *  d0-d7 - Floating point parameters.
  *  xSELF = self
+ *  wSUSPEND = suspend count
  *  SP = & of ArtMethod*
  *  x1 = "this" pointer.
  *
@@ -1373,7 +1376,22 @@
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALL_ALLOC_ENTRYPOINTS
 
-UNIMPLEMENTED art_quick_test_suspend
+    /*
+     * Called by managed code when the value in wSUSPEND has been decremented to 0.
+     */
+    .extern artTestSuspendFromCode
+ENTRY art_quick_test_suspend
+    ldrh   w0, [xSELF, #THREAD_FLAGS_OFFSET]  // get xSELF->state_and_flags.as_struct.flags
+    mov    wSUSPEND, #SUSPEND_CHECK_INTERVAL  // reset wSUSPEND to SUSPEND_CHECK_INTERVAL
+    cbnz   w0, .Lneed_suspend                 // check flags == 0
+    ret                                       // return if flags == 0
+.Lneed_suspend:
+    mov    x0, xSELF
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME          // save callee saves for stack crawl
+    mov    x1, sp
+    bl     artTestSuspendFromCode             // (Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+END art_quick_test_suspend
 
      /*
      * Called by managed code that is attempting to call a method on a proxy class. On entry
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 8ad29dd..c53fa1e 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -177,7 +177,6 @@
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
   qpoints->pFmod = art_quick_fmod;
-  // qpoints->pSqrt = NULL;  // Not needed on x86.
   // qpoints->pL2d = NULL;  // Not needed on x86.
   qpoints->pFmodf = art_quick_fmodf;
   // qpoints->pL2f = NULL;  // Not needed on x86.
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 86dcf36..aeda072 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -176,7 +176,6 @@
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
   qpoints->pFmod = fmod;
-  // qpoints->pSqrt = NULL;  // Not needed on x86.
   // qpoints->pL2d = NULL;  // Not needed on x86.
   qpoints->pFmodf = fmodf;
   // qpoints->pL2f = NULL;  // Not needed on x86.
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index afff7a2..b9c42ee 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -563,8 +563,7 @@
                                   const char* oat_cache_filename,
                                   std::string* error_msg) {
   Locks::mutator_lock_->AssertNotHeld(Thread::Current());  // Avoid starving GC.
-  std::string dex2oat(GetAndroidRoot());
-  dex2oat += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  std::string dex2oat(Runtime::Current()->GetCompilerExecutable());
 
   gc::Heap* heap = Runtime::Current()->GetHeap();
   std::string boot_image_option("--boot-image=");
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index a1c8c71..17d1ffc 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -698,6 +698,7 @@
   const byte* file_end = begin_ + size_;
 
   for (uint32_t i = 0; i < size; i++) {
+    CHECK_LT(i, size);  // b/15014252 Prevents hitting the impossible case below
     if (UNLIKELY(ptr_ >= file_end)) {
       ErrorStringPrintf("String data would go beyond end-of-file");
       return false;
@@ -710,6 +711,7 @@
       case 0x00:
         // Special case of bit pattern 0xxx.
         if (UNLIKELY(byte == 0)) {
+          CHECK_LT(i, size);  // b/15014252 Actually hit this impossible case with clang
           ErrorStringPrintf("String data shorter than indicated utf16_size %x", size);
           return false;
         }
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index ec69e28..7bd1582 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -98,7 +98,6 @@
   int32_t (*pCmplDouble)(double, double);
   int32_t (*pCmplFloat)(float, float);
   double (*pFmod)(double, double);
-  double (*pSqrt)(double);
   double (*pL2d)(int64_t);
   float (*pFmodf)(float, float);
   float (*pL2f)(int64_t);
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 45fee14..3d35c00 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -53,8 +53,7 @@
 
   std::vector<std::string> arg_vector;
 
-  std::string dex2oat(GetAndroidRoot());
-  dex2oat += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  std::string dex2oat(Runtime::Current()->GetCompilerExecutable());
   arg_vector.push_back(dex2oat);
 
   std::string image_option_string("--image=");
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index e710409..57ed0bd 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -50,12 +50,12 @@
     CHECK(IsAligned<kGcCardSize>(reinterpret_cast<uintptr_t>(mem_map->End())));
     live_bitmap_.reset(accounting::ContinuousSpaceBitmap::Create(
         StringPrintf("allocspace %s live-bitmap %d", name.c_str(), static_cast<int>(bitmap_index)),
-        Begin(), Capacity()));
+        Begin(), NonGrowthLimitCapacity()));
     DCHECK(live_bitmap_.get() != nullptr) << "could not create allocspace live bitmap #"
         << bitmap_index;
     mark_bitmap_.reset(accounting::ContinuousSpaceBitmap::Create(
         StringPrintf("allocspace %s mark-bitmap %d", name.c_str(), static_cast<int>(bitmap_index)),
-        Begin(), Capacity()));
+        Begin(), NonGrowthLimitCapacity()));
     DCHECK(live_bitmap_.get() != nullptr) << "could not create allocspace mark bitmap #"
         << bitmap_index;
   }
@@ -218,10 +218,12 @@
 
 void MallocSpace::Dump(std::ostream& os) const {
   os << GetType()
-      << " begin=" << reinterpret_cast<void*>(Begin())
-      << ",end=" << reinterpret_cast<void*>(End())
-      << ",size=" << PrettySize(Size()) << ",capacity=" << PrettySize(Capacity())
-      << ",name=\"" << GetName() << "\"]";
+     << " begin=" << reinterpret_cast<void*>(Begin())
+     << ",end=" << reinterpret_cast<void*>(End())
+     << ",limit=" << reinterpret_cast<void*>(Limit())
+     << ",size=" << PrettySize(Size()) << ",capacity=" << PrettySize(Capacity())
+     << ",non_growth_limit_capacity=" << PrettySize(NonGrowthLimitCapacity())
+     << ",name=\"" << GetName() << "\"]";
 }
 
 void MallocSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg) {
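
Sizing the live and mark bitmaps with NonGrowthLimitCapacity() instead of Capacity() makes them cover the space's entire mapping rather than only the current growth-limited portion. Capacity() can later increase (the growth limit is a soft cap that can be lifted), and a bitmap allocated against the smaller value would then be too short; covering the full mapping up front avoids ever regrowing the bitmaps. The Dump() hunk surfaces both values, plus Limit(), so the distinction is visible in logs. A hedged sketch of the two accessors, assuming simplified fields (the real MallocSpace derives them from its mem map):

    #include <cstddef>

    // Hedged sketch: a space with a soft growth limit inside a larger mapping.
    class SpaceSketch {
     public:
      SpaceSketch(size_t mapped_bytes, size_t growth_limit)
          : mapped_bytes_(mapped_bytes), growth_limit_(growth_limit) {}
      size_t Capacity() const { return growth_limit_; }                // soft cap, may grow
      size_t NonGrowthLimitCapacity() const { return mapped_bytes_; }  // whole mapping
      void ClearGrowthLimit() { growth_limit_ = mapped_bytes_; }
      // A bitmap sized with NonGrowthLimitCapacity() already covers everything
      // ClearGrowthLimit() can expose; one sized with Capacity() would not.
     private:
      size_t mapped_bytes_;
      size_t growth_limit_;
    };
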
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 2dbcc80..194cb18 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -180,13 +180,6 @@
 
     virtual bool VisitFrame() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
       mirror::ArtMethod* m = GetMethod();
-      if (GetCurrentQuickFrame() == NULL) {
-        if (kVerboseInstrumentation) {
-          LOG(INFO) << "  Ignoring a shadow frame. Frame " << GetFrameId()
-                    << " Method=" << PrettyMethod(m);
-        }
-        return true;  // Ignore shadow frames.
-      }
       if (m == NULL) {
         if (kVerboseInstrumentation) {
           LOG(INFO) << "  Skipping upcall. Frame " << GetFrameId();
@@ -204,6 +197,14 @@
       if (kVerboseInstrumentation) {
         LOG(INFO) << "  Installing exit stub in " << DescribeLocation();
       }
+      if (GetCurrentQuickFrame() == NULL) {
+        InstrumentationStackFrame instrumentation_frame(GetThisObject(), m, 0, GetFrameId(), false);
+        if (kVerboseInstrumentation) {
+          LOG(INFO) << "Pushing shadow frame " << instrumentation_frame.Dump();
+        }
+        shadow_stack_.push_back(instrumentation_frame);
+        return true;  // Continue.
+      }
       uintptr_t return_pc = GetReturnPc();
       if (return_pc == instrumentation_exit_pc_) {
         // We've reached a frame which has already been installed with instrumentation exit stub.
@@ -238,6 +239,7 @@
       return true;  // Continue.
     }
     std::deque<InstrumentationStackFrame>* const instrumentation_stack_;
+    std::vector<InstrumentationStackFrame> shadow_stack_;
     const size_t existing_instrumentation_frames_count_;
     std::vector<uint32_t> dex_pcs_;
     const uintptr_t instrumentation_exit_pc_;
@@ -261,14 +263,16 @@
   if (instrumentation->ShouldNotifyMethodEnterExitEvents()) {
     // Create method enter events for all methods currently on the thread's stack. We only do this
     // if no debugger is attached to prevent from posting events twice.
-    typedef std::deque<InstrumentationStackFrame>::const_reverse_iterator It;
-    for (It it = thread->GetInstrumentationStack()->rbegin(),
-        end = thread->GetInstrumentationStack()->rend(); it != end; ++it) {
-      mirror::Object* this_object = (*it).this_object_;
-      mirror::ArtMethod* method = (*it).method_;
+    auto ssi = visitor.shadow_stack_.rbegin();
+    for (auto isi = thread->GetInstrumentationStack()->rbegin(),
+        end = thread->GetInstrumentationStack()->rend(); isi != end; ++isi) {
+      while (ssi != visitor.shadow_stack_.rend() && (*ssi).frame_id_ < (*isi).frame_id_) {
+        instrumentation->MethodEnterEvent(thread, (*ssi).this_object_, (*ssi).method_, 0);
+        ++ssi;
+      }
       uint32_t dex_pc = visitor.dex_pcs_.back();
       visitor.dex_pcs_.pop_back();
-      instrumentation->MethodEnterEvent(thread, this_object, method, dex_pc);
+      instrumentation->MethodEnterEvent(thread, (*isi).this_object_, (*isi).method_, dex_pc);
     }
   }
   thread->VerifyStack();
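
Previously the stack walk ignored shadow (interpreter) frames entirely; now it collects them into the visitor's shadow_stack_ and, when replaying MethodEnterEvent calls, merges them with the instrumented quick frames by frame_id_. The loop is a standard two-iterator merge: both containers are walked via rbegin(), and each shadow frame is emitted as soon as its frame_id_ sorts before the next instrumented frame's, so the combined events come out as one frame-id-ordered sequence. A self-contained sketch of the merge with stand-in types (the real code also threads through this_object_, method_ and the recorded dex pcs):

    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <vector>

    struct FrameSketch {
      size_t frame_id;  // stands in for InstrumentationStackFrame::frame_id_
    };

    // Merge-by-frame_id replay, mirroring the loop in the hunk above. Assumes
    // both sequences are ascending in frame_id when traversed via rbegin(),
    // which is what the merge relies on.
    void ReplayEnterEvents(const std::vector<FrameSketch>& shadow,
                           const std::deque<FrameSketch>& instrumented) {
      auto ssi = shadow.rbegin();
      for (auto isi = instrumented.rbegin(); isi != instrumented.rend(); ++isi) {
        while (ssi != shadow.rend() && ssi->frame_id < isi->frame_id) {
          std::cout << "MethodEnterEvent(shadow frame " << ssi->frame_id << ")\n";
          ++ssi;  // shadow frames always report dex_pc 0 in the real code
        }
        std::cout << "MethodEnterEvent(quick frame " << isi->frame_id << ")\n";
      }
    }
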
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 339eb36..f12043e 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -82,11 +82,25 @@
   // Note: we deliberately don't visit the weak_interns_ table and the immutable image roots.
 }
 
-mirror::String* InternTable::Lookup(Table& table, mirror::String* s, int32_t hash_code) {
+mirror::String* InternTable::LookupStrong(mirror::String* s, int32_t hash_code) {
+  return Lookup<kWithoutReadBarrier>(&strong_interns_, s, hash_code);
+}
+
+mirror::String* InternTable::LookupWeak(mirror::String* s, int32_t hash_code) {
+  // Weak interns need a read barrier because they are weak roots.
+  return Lookup<kWithReadBarrier>(&weak_interns_, s, hash_code);
+}
+
+template<ReadBarrierOption kReadBarrierOption>
+mirror::String* InternTable::Lookup(Table* table, mirror::String* s, int32_t hash_code) {
+  CHECK_EQ(table == &weak_interns_, kReadBarrierOption == kWithReadBarrier)
+      << "Only weak_interns_ needs a read barrier.";
   Locks::intern_table_lock_->AssertHeld(Thread::Current());
-  for (auto it = table.lower_bound(hash_code), end = table.end();
+  for (auto it = table->lower_bound(hash_code), end = table->end();
        it != end && it->first == hash_code; ++it) {
-    mirror::String* existing_string = it->second;
+    mirror::String** weak_root = &it->second;
+    mirror::String* existing_string =
+        ReadBarrier::BarrierForWeakRoot<mirror::String, kReadBarrierOption>(weak_root);
     if (existing_string->Equals(s)) {
       return existing_string;
     }
@@ -115,19 +129,29 @@
   return s;
 }
 
+void InternTable::RemoveStrong(mirror::String* s, int32_t hash_code) {
+  Remove<kWithoutReadBarrier>(&strong_interns_, s, hash_code);
+}
+
 void InternTable::RemoveWeak(mirror::String* s, int32_t hash_code) {
   Runtime* runtime = Runtime::Current();
   if (runtime->IsActiveTransaction()) {
     runtime->RecordWeakStringRemoval(s, hash_code);
   }
-  Remove(weak_interns_, s, hash_code);
+  Remove<kWithReadBarrier>(&weak_interns_, s, hash_code);
 }
 
-void InternTable::Remove(Table& table, mirror::String* s, int32_t hash_code) {
-  for (auto it = table.lower_bound(hash_code), end = table.end();
+template<ReadBarrierOption kReadBarrierOption>
+void InternTable::Remove(Table* table, mirror::String* s, int32_t hash_code) {
+  CHECK_EQ(table == &weak_interns_, kReadBarrierOption == kWithReadBarrier)
+      << "Only weak_interns_ needs a read barrier.";
+  for (auto it = table->lower_bound(hash_code), end = table->end();
        it != end && it->first == hash_code; ++it) {
-    if (it->second == s) {
-      table.erase(it);
+    mirror::String** weak_root = &it->second;
+    mirror::String* existing_string =
+        ReadBarrier::BarrierForWeakRoot<mirror::String, kReadBarrierOption>(weak_root);
+    if (existing_string == s) {
+      table->erase(it);
       return;
     }
   }
@@ -144,11 +168,11 @@
 }
 void InternTable::RemoveStrongFromTransaction(mirror::String* s, int32_t hash_code) {
   DCHECK(!Runtime::Current()->IsActiveTransaction());
-  Remove(strong_interns_, s, hash_code);
+  RemoveStrong(s, hash_code);
 }
 void InternTable::RemoveWeakFromTransaction(mirror::String* s, int32_t hash_code) {
   DCHECK(!Runtime::Current()->IsActiveTransaction());
-  Remove(weak_interns_, s, hash_code);
+  RemoveWeak(s, hash_code);
 }
 
 static mirror::String* LookupStringFromImage(mirror::String* s)
@@ -202,7 +226,7 @@
 
   if (is_strong) {
     // Check the strong table for a match.
-    mirror::String* strong = Lookup(strong_interns_, s, hash_code);
+    mirror::String* strong = LookupStrong(s, hash_code);
     if (strong != NULL) {
       return strong;
     }
@@ -214,7 +238,7 @@
     }
 
     // There is no match in the strong table, check the weak table.
-    mirror::String* weak = Lookup(weak_interns_, s, hash_code);
+    mirror::String* weak = LookupWeak(s, hash_code);
     if (weak != NULL) {
       // A match was found in the weak table. Promote to the strong table.
       RemoveWeak(weak, hash_code);
@@ -227,7 +251,7 @@
   }
 
   // Check the strong table for a match.
-  mirror::String* strong = Lookup(strong_interns_, s, hash_code);
+  mirror::String* strong = LookupStrong(s, hash_code);
   if (strong != NULL) {
     return strong;
   }
@@ -237,7 +261,7 @@
     return InsertWeak(image, hash_code);
   }
   // Check the weak table for a match.
-  mirror::String* weak = Lookup(weak_interns_, s, hash_code);
+  mirror::String* weak = LookupWeak(s, hash_code);
   if (weak != NULL) {
     return weak;
   }
@@ -272,13 +296,14 @@
 
 bool InternTable::ContainsWeak(mirror::String* s) {
   MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
-  const mirror::String* found = Lookup(weak_interns_, s, s->GetHashCode());
+  const mirror::String* found = LookupWeak(s, s->GetHashCode());
   return found == s;
 }
 
 void InternTable::SweepInternTableWeaks(IsMarkedCallback* callback, void* arg) {
   MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
   for (auto it = weak_interns_.begin(), end = weak_interns_.end(); it != end;) {
+    // This does not need a read barrier because it is called by the GC.
     mirror::Object* object = it->second;
     mirror::Object* new_object = callback(object, arg);
     if (new_object == nullptr) {
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 47d5e09..3df2aeb 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -79,15 +79,26 @@
       LOCKS_EXCLUDED(Locks::intern_table_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  mirror::String* Lookup(Table& table, mirror::String* s, int32_t hash_code)
+  mirror::String* LookupStrong(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  mirror::String* LookupWeak(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  mirror::String* Lookup(Table* table, mirror::String* s, int32_t hash_code)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::String* InsertStrong(mirror::String* s, int32_t hash_code)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
   mirror::String* InsertWeak(mirror::String* s, int32_t hash_code)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
-  void RemoveWeak(mirror::String* s, int32_t hash_code)
+  void RemoveStrong(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
-  void Remove(Table& table, mirror::String* s, int32_t hash_code)
+  void RemoveWeak(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  void Remove(Table* table, mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
 
   // Transaction rollback access.
@@ -96,8 +107,10 @@
   mirror::String* InsertWeakFromTransaction(mirror::String* s, int32_t hash_code)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
   void RemoveStrongFromTransaction(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
   void RemoveWeakFromTransaction(mirror::String* s, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
   friend class Transaction;
 
@@ -107,6 +120,9 @@
   Table strong_interns_ GUARDED_BY(Locks::intern_table_lock_);
   std::vector<std::pair<int32_t, mirror::String*>> new_strong_intern_roots_
       GUARDED_BY(Locks::intern_table_lock_);
+  // Since weak_interns_ contains weak roots, it needs a read
+  // barrier. Do not access the strings in it directly; use
+  // functions that apply the read barrier.
   Table weak_interns_ GUARDED_BY(Locks::intern_table_lock_);
 };
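
The net effect of the intern table changes: Lookup and Remove become templates over ReadBarrierOption, thin LookupStrong/LookupWeak and RemoveStrong/RemoveWeak wrappers pick the option, and a CHECK_EQ pins the invariant that exactly the weak table is read through a barrier. Strong interns are ordinary strong roots the GC visits directly, while weak entries can be swept (or eventually moved) underneath readers, so every mutator-side read of weak_interns_ must go through BarrierForWeakRoot. A compilable sketch of the dispatch pattern with stand-in types:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    enum ReadBarrierOption { kWithoutReadBarrier, kWithReadBarrier };
    using TableSketch = std::multimap<int32_t, std::string*>;

    struct InternTableSketch {
      TableSketch strong_interns_;
      TableSketch weak_interns_;

      template <ReadBarrierOption kOpt>
      std::string* Lookup(TableSketch* table, std::string* s, int32_t hash) {
        // Mirrors the CHECK_EQ in the hunk: barrier iff the weak table.
        assert((table == &weak_interns_) == (kOpt == kWithReadBarrier));
        for (auto it = table->lower_bound(hash);
             it != table->end() && it->first == hash; ++it) {
          std::string* existing = it->second;  // real code: BarrierForWeakRoot(&it->second)
          if (*existing == *s) {
            return existing;
          }
        }
        return nullptr;
      }

      std::string* LookupStrong(std::string* s, int32_t hash) {
        return Lookup<kWithoutReadBarrier>(&strong_interns_, s, hash);
      }
      std::string* LookupWeak(std::string* s, int32_t hash) {
        return Lookup<kWithReadBarrier>(&weak_interns_, s, hash);  // weak roots
      }
    };
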
 
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 9e6d255..bd0e23c 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -94,8 +94,8 @@
   static bool IsValidLockWord(LockWord lock_word);
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
-  mirror::Object* GetObject() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return ReadBarrier::BarrierForWeakRoot<mirror::Object, kReadBarrierOption>(obj_);
+  mirror::Object* GetObject() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return ReadBarrier::BarrierForWeakRoot<mirror::Object, kReadBarrierOption>(&obj_);
   }
 
   void SetObject(mirror::Object* object);
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 10d335e..9c14a4f 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '2', '9', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '3', '0', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 3756435..db2a61b 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -15,6 +15,7 @@
  */
 
 #include "parsed_options.h"
+#include "utils.h"
 #ifdef HAVE_ANDROID_OS
 #include "cutils/properties.h"
 #endif
@@ -604,6 +605,10 @@
           return false;
         }
       }
+    } else if (StartsWith(option, "-Xcompiler:")) {
+      if (!ParseStringAfterChar(option, ':', &compiler_executable_)) {
+        return false;
+      }
     } else if (option == "-Xcompiler-option") {
       i++;
       if (i == options.size()) {
@@ -791,6 +796,7 @@
   UsageMessage(stream, "  -Xprofile-duration:integervalue\n");
   UsageMessage(stream, "  -Xprofile-interval:integervalue\n");
   UsageMessage(stream, "  -Xprofile-backoff:doublevalue\n");
+  UsageMessage(stream, "  -Xcompiler:filename\n");
   UsageMessage(stream, "  -Xcompiler-option dex2oat-option\n");
   UsageMessage(stream, "  -Ximage-compiler-option dex2oat-option\n");
   UsageMessage(stream, "\n");
diff --git a/runtime/parsed_options.h b/runtime/parsed_options.h
index e0b0fb5..25fc12a 100644
--- a/runtime/parsed_options.h
+++ b/runtime/parsed_options.h
@@ -74,6 +74,7 @@
   void (*hook_exit_)(jint status);
   void (*hook_abort_)();
   std::vector<std::string> properties_;
+  std::string compiler_executable_;
   std::vector<std::string> compiler_options_;
   std::vector<std::string> image_compiler_options_;
   bool profile_;
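
The new -Xcompiler:filename option is parsed with the same ParseStringAfterChar helper used by the neighboring options: everything after the ':' becomes compiler_executable_, which Runtime::Init() copies out below. A hedged stand-in for that parse step (the real helper lives elsewhere in parsed_options.cc and may differ in details such as error reporting):

    #include <string>

    // Hedged stand-in for ParseStringAfterChar; illustrative only.
    static bool ParseStringAfterCharSketch(const std::string& option, char after_char,
                                           std::string* parsed_value) {
      auto pos = option.find(after_char);
      if (pos == std::string::npos) {
        return false;  // malformed option, parsing fails
      }
      *parsed_value = option.substr(pos + 1);
      return true;
    }

    // "-Xcompiler:/system/bin/dex2oat" -> parsed_value == "/system/bin/dex2oat"
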
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 6e33f9d..5459ce3 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -193,7 +193,7 @@
 
       valid_samples += barrier_count;
 
-      ThreadState old_state = self->SetState(kWaitingForCheckPointsToRun);
+      ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
 
       // Wait for the barrier to be crossed by all runnable threads.  This wait
       // is done with a timeout so that we can detect problems with the checkpoint
@@ -211,13 +211,11 @@
       // code.  Crash the process in this case.
       CHECK_LT(waitdiff_us, kWaitTimeoutUs);
 
-      self->SetState(old_state);
-
       // Update the current time.
       now_us = MicroTime();
     }
 
-    if (valid_samples > 0 && !ShuttingDown(self)) {
+    if (valid_samples > 0) {
       // After the profile has been taken, write it out.
       ScopedObjectAccess soa(self);   // Acquire the mutator lock.
       uint32_t size = profiler->WriteProfile();
@@ -335,6 +333,7 @@
   pthread_t profiler_pthread = 0U;
   {
     MutexLock trace_mu(Thread::Current(), *Locks::profiler_lock_);
+    CHECK(!shutting_down_);
     profiler = profiler_;
     shutting_down_ = true;
     profiler_pthread = profiler_pthread_;
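
Two reliability fixes here. First, the manual SetState()/SetState() pair becomes a ScopedThreadStateChange, so the thread state is restored by a destructor on every path out of the block instead of by a second call that must be kept in sync. Second, CHECK(!shutting_down_) turns a double Shutdown() into a hard failure, pairing with the new, earlier call site added in Runtime's destructor below; the !ShuttingDown(self) test is dropped since the profiler is now stopped before runtime shutdown proceeds. A simplified RAII sketch of the state-change idiom (the real ScopedThreadStateChange in scoped_thread_state_change.h also handles transitions to and from kRunnable):

    // Simplified sketch; the hypothetical ThreadSketch stands in for art::Thread.
    enum ThreadStateSketch { kRunnable, kWaitingForCheckPointsToRun };

    class ThreadSketch {
     public:
      ThreadStateSketch SetState(ThreadStateSketch new_state) {
        ThreadStateSketch old = state_;
        state_ = new_state;
        return old;
      }
     private:
      ThreadStateSketch state_ = kRunnable;
    };

    class ScopedThreadStateChangeSketch {
     public:
      ScopedThreadStateChangeSketch(ThreadSketch* self, ThreadStateSketch new_state)
          : self_(self), old_state_(self->SetState(new_state)) {}
      ~ScopedThreadStateChangeSketch() { self_->SetState(old_state_); }  // restore on scope exit
     private:
      ThreadSketch* const self_;
      const ThreadStateSketch old_state_;
    };
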
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 4302c9e..e252b7b 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -44,8 +44,8 @@
 }
 
 template <typename MirrorType, ReadBarrierOption kReadBarrierOption>
-inline MirrorType* ReadBarrier::BarrierForWeakRoot(MirrorType* ref) {
-  UNUSED(ref);
+inline MirrorType* ReadBarrier::BarrierForWeakRoot(MirrorType** weak_root) {
+  MirrorType* ref = *weak_root;
   const bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
   if (with_read_barrier && kUseBakerReadBarrier) {
     // To be implemented.
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index e40e8ea..7232a3f 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -39,7 +39,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   template <typename MirrorType, ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
-  ALWAYS_INLINE static MirrorType* BarrierForWeakRoot(MirrorType* ref)
+  ALWAYS_INLINE static MirrorType* BarrierForWeakRoot(MirrorType** weak_root)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 };
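
The signature change from MirrorType* to MirrorType** is the point of this pair of hunks: given only a copied pointer value, a read barrier can at best return it unchanged, but given the root's address it can both return the current referent and update the root in place once a concurrent or moving collector is wired in (the Baker-barrier branch is still "To be implemented"). The call sites switch accordingly, passing &obj_ in monitor.h and &it->second in intern_table.cc, and Monitor::GetObject() drops its const qualifier since the barrier may now write back through &obj_. A sketch under that assumption; Forward() is a hypothetical helper, not an ART API:

    // Forward() is hypothetical: returns the new address of a relocated object,
    // or nullptr if it has not moved. Stubbed so the sketch stands alone.
    template <typename MirrorType>
    MirrorType* Forward(MirrorType* /* ref */) {
      return nullptr;  // pretend nothing has moved
    }

    template <typename MirrorType>
    MirrorType* BarrierForWeakRootSketch(MirrorType** weak_root) {
      MirrorType* ref = *weak_root;  // matches the new read_barrier-inl.h body
      if (MirrorType* moved = Forward(ref)) {
        *weak_root = moved;  // heal the root in place -- impossible with a value copy
        ref = moved;
      }
      return ref;
    }
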
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index dcbf42d..23a49cb 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -164,6 +164,11 @@
     }
     shutting_down_ = true;
   }
+  // Shut down the background profiler before the runtime exits.
+  if (profile_) {
+    BackgroundMethodSamplingProfiler::Shutdown();
+  }
+
   Trace::Shutdown();
 
   // Make sure to let the GC complete if it is running.
@@ -365,6 +370,15 @@
   return env->NewGlobalRef(system_class_loader.get());
 }
 
+std::string Runtime::GetCompilerExecutable() const {
+  if (!compiler_executable_.empty()) {
+    return compiler_executable_;
+  }
+  std::string compiler_executable(GetAndroidRoot());
+  compiler_executable += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  return compiler_executable;
+}
+
 bool Runtime::Start() {
   VLOG(startup) << "Runtime::Start entering";
 
@@ -531,6 +545,7 @@
   default_stack_size_ = options->stack_size_;
   stack_trace_file_ = options->stack_trace_file_;
 
+  compiler_executable_ = options->compiler_executable_;
   compiler_options_ = options->compiler_options_;
   image_compiler_options_ = options->image_compiler_options_;
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index f7074f6..261429e 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -104,6 +104,8 @@
     return is_explicit_gc_disabled_;
   }
 
+  std::string GetCompilerExecutable() const;
+
   const std::vector<std::string>& GetCompilerOptions() const {
     return compiler_options_;
   }
@@ -482,6 +484,7 @@
   bool is_concurrent_gc_enabled_;
   bool is_explicit_gc_disabled_;
 
+  std::string compiler_executable_;
   std::vector<std::string> compiler_options_;
   std::vector<std::string> image_compiler_options_;
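
End to end, the -Xcompiler plumbing is: parsed_options.cc fills compiler_executable_, Runtime::Init() copies it into the Runtime, and GetCompilerExecutable() hands it to the two dex2oat launch sites (class_linker.cc and image_space.cc), falling back to the old <android-root>/bin/dex2oat[d] default when the option is absent. A usage sketch through the standard JNI invocation API; the path is illustrative, not a recommendation:

    #include <jni.h>

    int StartVmWithCustomCompiler() {
      JavaVMOption options[1];
      options[0].optionString =
          const_cast<char*>("-Xcompiler:/data/local/tmp/my_dex2oat");  // illustrative path

      JavaVMInitArgs vm_args;
      vm_args.version = JNI_VERSION_1_6;
      vm_args.nOptions = 1;
      vm_args.options = options;
      vm_args.ignoreUnrecognized = JNI_FALSE;

      JavaVM* vm = nullptr;
      JNIEnv* env = nullptr;
      // Any later dex2oat launch (class_linker.cc, image_space.cc) will exec the
      // configured binary instead of the built-in dex2oat/dex2oatd default.
      return JNI_CreateJavaVM(&vm, reinterpret_cast<void**>(&env), &vm_args);
    }
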
 
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 41cfc58..55bec1e 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1816,7 +1816,6 @@
   QUICK_ENTRY_POINT_INFO(pCmplDouble)
   QUICK_ENTRY_POINT_INFO(pCmplFloat)
   QUICK_ENTRY_POINT_INFO(pFmod)
-  QUICK_ENTRY_POINT_INFO(pSqrt)
   QUICK_ENTRY_POINT_INFO(pL2d)
   QUICK_ENTRY_POINT_INFO(pFmodf)
   QUICK_ENTRY_POINT_INFO(pL2f)
diff --git a/runtime/transaction.h b/runtime/transaction.h
index 6fd86c8..7859126 100644
--- a/runtime/transaction.h
+++ b/runtime/transaction.h
@@ -147,7 +147,9 @@
       DCHECK(s != nullptr);
     }
 
-    void Undo(InternTable* intern_table) EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
+    void Undo(InternTable* intern_table)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+        EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
     void VisitRoots(RootCallback* callback, void* arg);
 
    private:
@@ -169,7 +171,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void UndoInternStringTableModifications()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(log_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(log_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void VisitObjectLogs(RootCallback* callback, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(log_lock_)