Allow mixing of thread offsets between 32-bit and 64-bit architectures.

Begin a fuller implementation of x86-64 REX prefixes.
This change doesn't implement 64-bit thread offset support for the JNI compiler.

Change-Id: If9af2f08a1833c21ddb4b4077f9b03add1a05147
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index d6724f1..2e37877 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -164,7 +164,7 @@
   // Making a call - use explicit registers
   FlushAllRegs();   /* Everything to home location */
   LoadValueDirectFixed(rl_src, rs_r0);
-  LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData).Int32Value(),
+  LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData).Int32Value(),
                rs_rARM_LR);
   // Materialize a pointer to the fill data image
   NewLIR3(kThumb2Adr, r1, 0, WrapPointer(tab_rec));
@@ -192,7 +192,7 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_r0, 0, NULL);
       }
     }
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_r1, 0, NULL);
@@ -207,7 +207,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artLockObjectFromCode(obj);
-    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pLockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pLockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -218,7 +218,7 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_r0, opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     OpRegImm(kOpCmp, rs_r1, 0);
@@ -227,7 +227,7 @@
     OpRegImm(kOpCmp, rs_r1, 0);
     OpIT(kCondNe, "T");
     // Go expensive route - artLockObjectFromCode(self, obj);
-    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pLockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pLockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -245,7 +245,7 @@
   LoadValueDirectFixed(rl_src, rs_r0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   LIR* null_check_branch = nullptr;
-  LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+  LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
   constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
   if (kArchVariantHasGoodBranchPredictor) {
     if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
@@ -270,7 +270,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artUnlockObjectFromCode(obj);
-    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pUnlockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -283,14 +283,14 @@
     GenNullCheck(rs_r0, opt_flags);
     LoadWordDisp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
     MarkPossibleNullPointerException(opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     LoadConstantNoClobber(rs_r3, 0);
     // Is lock unheld on lock or held by us (==thread_id) on unlock?
     OpRegReg(kOpCmp, rs_r1, rs_r2);
     OpIT(kCondEq, "EE");
     StoreWordDisp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
     // Go expensive route - UnlockObjectFromCode(obj);
-    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pUnlockObject).Int32Value(),
+    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
                        rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
@@ -300,7 +300,7 @@
 }
 
 void ArmMir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
   LoadWordDisp(rs_rARM_SELF, ex_offset, rl_result.reg);
@@ -317,7 +317,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  LoadWordDisp(rs_rARM_SELF, Thread::CardTableOffset().Int32Value(), reg_card_base);
+  LoadWordDisp(rs_rARM_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -350,7 +350,7 @@
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       /* Load stack limit */
-      LoadWordDisp(rs_rARM_SELF, Thread::StackEndOffset().Int32Value(), rs_r12);
+      LoadWordDisp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
     }
   }
   /* Spill core callee saves */
@@ -384,7 +384,7 @@
           }
           m2l_->OpRegImm(kOpAdd, rs_rARM_SP, sp_displace_);
           m2l_->ClobberCallerSave();
-          ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+          ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
           // Load the entrypoint directly into the pc instead of doing a load + branch. Assumes
           // codegen and target are in thumb2 mode.
           m2l_->LoadWordDisp(rs_rARM_SELF, func_offset.Int32Value(), rs_rARM_PC);
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 8bfdb6a..7982231 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -30,7 +30,7 @@
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
     LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest, int s_reg);
@@ -171,12 +171,12 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                           int s_reg);
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index 398bf96..07a13ce 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -49,7 +49,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -92,7 +92,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -162,7 +162,7 @@
       return;
     }
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_FLOAT: {
       rl_src = LoadValueWide(rl_src, kFPReg);
@@ -192,7 +192,7 @@
       return;
     }
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
@@ -359,7 +359,7 @@
   branch = NewLIR2(kThumbBCond, 0, kArmCondEq);
   ClobberCallerSave();
   LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pSqrt));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pSqrt));
   NewLIR3(kThumb2Fmrrd, r0, r1, S2d(rl_src.reg.GetLowReg(), rl_src.reg.GetHighReg()));
   NewLIR1(kThumbBlxR, r_tgt.GetReg());
   NewLIR3(kThumb2Fmdrr, S2d(rl_result.reg.GetLowReg(), rl_result.reg.GetHighReg()), r0, r1);
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 46db466..fde6e8a 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -578,7 +578,7 @@
   LOG(FATAL) << "Unexpected use of OpLea for Arm";
 }
 
-void ArmMir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
+void ArmMir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
 }
 
@@ -848,7 +848,7 @@
      */
     RegLocation rl_result;
     if (BadOverlap(rl_src1, rl_dest) || (BadOverlap(rl_src2, rl_dest))) {
-      ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pLmul);
+      ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmul);
       FlushAllRegs();
       CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_src2, false);
       rl_result = GetReturnWide(false);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 5bab0e3..5ebe0a3 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -727,14 +727,14 @@
   FreeTemp(r3);
 }
 
-RegStorage ArmMir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage ArmMir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LoadWordDisp(rs_rARM_SELF, offset.Int32Value(), rs_rARM_LR);
   return rs_rARM_LR;
 }
 
 LIR* ArmMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = rs_r0;
-  LoadWordDisp(rs_rARM_SELF, Thread::ThreadSuspendTriggerOffset().Int32Value(), tmp);
+  LoadWordDisp(rs_rARM_SELF, Thread::ThreadSuspendTriggerOffset<4>().Int32Value(), tmp);
   LIR* load2 = LoadWordDisp(tmp, 0, tmp);
   return load2;
 }
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 8df5b25..1634905 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -1109,7 +1109,7 @@
   return res;
 }
 
-LIR* ArmMir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* ArmMir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   LOG(FATAL) << "Unexpected use of OpThreadMem for Arm";
   return NULL;
 }
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 866ce5f..44f81f8 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -251,7 +251,7 @@
 void Mir2Lir::GenNewArray(uint32_t type_idx, RegLocation rl_dest,
                           RegLocation rl_src) {
   FlushAllRegs();  /* Everything to home location */
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
   if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *dex_file,
@@ -265,22 +265,22 @@
       // The fast path.
       if (!use_direct_type_ptr) {
         LoadClassType(type_idx, kArg0);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArrayResolved);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
         CallRuntimeHelperRegMethodRegLocation(func_offset, TargetReg(kArg0), rl_src, true);
       } else {
         // Use the direct pointer.
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArrayResolved);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
         CallRuntimeHelperImmMethodRegLocation(func_offset, direct_type_ptr, rl_src, true);
       }
     } else {
       // The slow path.
       DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArray);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArray);
       CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset= QUICK_ENTRYPOINT_OFFSET(pAllocArrayWithAccessCheck);
+    func_offset= QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayWithAccessCheck);
     CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
   }
   RegLocation rl_result = GetReturn(false);
@@ -297,12 +297,12 @@
   int elems = info->num_arg_words;
   int type_idx = info->index;
   FlushAllRegs();  /* Everything to home location */
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *cu_->dex_file,
                                                        type_idx)) {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pCheckAndAllocArray);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArray);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pCheckAndAllocArrayWithAccessCheck);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArrayWithAccessCheck);
   }
   CallRuntimeHelperImmMethodImm(func_offset, type_idx, elems, true);
   FreeTemp(TargetReg(kArg2));
@@ -410,7 +410,7 @@
   void Compile() {
     LIR* unresolved_target = GenerateTargetLabel();
     uninit_->target = unresolved_target;
-    m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeStaticStorage),
+    m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeStaticStorage),
                                storage_index_, true);
     // Copy helper's result into r_base, a no-op on all but MIPS.
     m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0));
@@ -502,10 +502,10 @@
     FreeTemp(r_base);
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pSet64Static)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pSetObjStatic)
-                                       : QUICK_ENTRYPOINT_OFFSET(pSet32Static));
+    ThreadOffset<4> setter_offset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Static)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjStatic)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Static));
     CallRuntimeHelperImmRegLocation(setter_offset, field_info.FieldIndex(), rl_src, true);
   }
 }
@@ -583,10 +583,10 @@
     }
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pGet64Static)
-                          :(is_object ? QUICK_ENTRYPOINT_OFFSET(pGetObjStatic)
-                                      : QUICK_ENTRYPOINT_OFFSET(pGet32Static));
+    ThreadOffset<4> getterOffset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Static)
+                          :(is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjStatic)
+                                      : QUICK_ENTRYPOINT_OFFSET(4, pGet32Static));
     CallRuntimeHelperImm(getterOffset, field_info.FieldIndex(), true);
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
@@ -610,7 +610,7 @@
 
 void Mir2Lir::HandleSuspendLaunchPads() {
   int num_elems = suspend_launchpads_.Size();
-  ThreadOffset helper_offset = QUICK_ENTRYPOINT_OFFSET(pTestSuspend);
+  ThreadOffset<4> helper_offset = QUICK_ENTRYPOINT_OFFSET(4, pTestSuspend);
   for (int i = 0; i < num_elems; i++) {
     ResetRegPool();
     ResetDefTracking();
@@ -632,13 +632,13 @@
     LIR* lab = throw_launchpads_.Get(i);
     current_dalvik_offset_ = lab->operands[1];
     AppendLIR(lab);
-    ThreadOffset func_offset(-1);
+    ThreadOffset<4> func_offset(-1);
     int v1 = lab->operands[2];
     int v2 = lab->operands[3];
     const bool target_x86 = cu_->instruction_set == kX86;
     switch (lab->operands[0]) {
       case kThrowNullPointer:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowNullPointer);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer);
         break;
       case kThrowConstantArrayBounds:  // v1 is length reg (for Arm/Mips), v2 constant index
         // v1 holds the constant array index.  Mips/Arm uses v2 for length, x86 reloads.
@@ -651,7 +651,7 @@
         // Make sure the following LoadConstant doesn't mess with kArg1.
         LockTemp(TargetReg(kArg1));
         LoadConstant(TargetReg(kArg0), v2);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowArrayBounds);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
         break;
       case kThrowArrayBounds:
         // Move v1 (array index) to kArg0 and v2 (array length) to kArg1
@@ -687,15 +687,15 @@
             OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
           }
         }
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowArrayBounds);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
         break;
       case kThrowDivZero:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowDivZero);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero);
         break;
       case kThrowNoSuchMethod:
         OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
         func_offset =
-          QUICK_ENTRYPOINT_OFFSET(pThrowNoSuchMethod);
+          QUICK_ENTRYPOINT_OFFSET(4, pThrowNoSuchMethod);
         break;
       default:
         LOG(FATAL) << "Unexpected throw kind: " << lab->operands[0];
@@ -762,10 +762,10 @@
       StoreValue(rl_dest, rl_result);
     }
   } else {
-    ThreadOffset getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pGet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pGetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(pGet32Instance));
+    ThreadOffset<4> getterOffset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Instance)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjInstance)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pGet32Instance));
     CallRuntimeHelperImmRegLocation(getterOffset, field_info.FieldIndex(), rl_obj, true);
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
@@ -820,10 +820,10 @@
       }
     }
   } else {
-    ThreadOffset setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pSet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pSetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(pSet32Instance));
+    ThreadOffset<4> setter_offset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Instance)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjInstance)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Instance));
     CallRuntimeHelperImmRegLocationRegLocation(setter_offset, field_info.FieldIndex(),
                                                rl_obj, rl_src, true);
   }
@@ -834,10 +834,10 @@
   bool needs_range_check = !(opt_flags & MIR_IGNORE_RANGE_CHECK);
   bool needs_null_check = !((cu_->disable_opt & (1 << kNullCheckElimination)) &&
       (opt_flags & MIR_IGNORE_NULL_CHECK));
-  ThreadOffset helper = needs_range_check
-      ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(pAputObjectWithNullAndBoundCheck)
-                          : QUICK_ENTRYPOINT_OFFSET(pAputObjectWithBoundCheck))
-      : QUICK_ENTRYPOINT_OFFSET(pAputObject);
+  ThreadOffset<4> helper = needs_range_check
+      ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithNullAndBoundCheck)
+                          : QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithBoundCheck))
+      : QUICK_ENTRYPOINT_OFFSET(4, pAputObject);
   CallRuntimeHelperRegLocationRegLocationRegLocation(helper, rl_array, rl_index, rl_src, true);
 }
 
@@ -850,7 +850,7 @@
                                                    type_idx)) {
     // Call out to helper which resolves type and verifies access.
     // Resolved type returned in kRet0.
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                             type_idx, rl_method.reg, true);
     RegLocation rl_result = GetReturn(false);
     StoreValue(rl_dest, rl_result);
@@ -882,7 +882,7 @@
         void Compile() {
           GenerateTargetLabel();
 
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx_,
+          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
                                         rl_method_.reg, true);
           m2l_->OpRegCopy(rl_result_.reg,  m2l_->TargetReg(kRet0));
 
@@ -950,7 +950,7 @@
         void Compile() {
           GenerateTargetLabel();
 
-          RegStorage r_tgt = m2l_->CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(pResolveString));
+          RegStorage r_tgt = m2l_->CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(4, pResolveString));
 
           m2l_->OpRegCopy(m2l_->TargetReg(kArg0), r_method_);   // .eq
           LIR* call_inst = m2l_->OpReg(kOpBlx, r_tgt);
@@ -970,7 +970,7 @@
       DCHECK_EQ(cu_->instruction_set, kX86);
       LIR* branch = OpCmpImmBranch(kCondNe, TargetReg(kRet0), 0, NULL);
       LoadConstant(TargetReg(kArg1), string_idx);
-      CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pResolveString), r_method, TargetReg(kArg1),
+      CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pResolveString), r_method, TargetReg(kArg1),
                               true);
       LIR* target = NewLIR0(kPseudoTargetLabel);
       branch->target = target;
@@ -995,7 +995,7 @@
   FlushAllRegs();  /* Everything to home location */
   // alloc will always check for resolution, do we also need to verify
   // access because the verifier was unable to?
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
   if (driver->CanAccessInstantiableTypeWithoutChecks(
@@ -1010,31 +1010,31 @@
       if (!use_direct_type_ptr) {
         LoadClassType(type_idx, kArg0);
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectResolved);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
           CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectInitialized);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
           CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
         }
       } else {
         // Use the direct pointer.
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectResolved);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
           CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectInitialized);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
           CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         }
       }
     } else {
       // The slow path.
       DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObject);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObject);
       CallRuntimeHelperImmMethod(func_offset, type_idx, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectWithAccessCheck);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectWithAccessCheck);
     CallRuntimeHelperImmMethod(func_offset, type_idx, true);
   }
   RegLocation rl_result = GetReturn(false);
@@ -1043,7 +1043,7 @@
 
 void Mir2Lir::GenThrow(RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pDeliverException), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException), rl_src, true);
 }
 
 // For final classes there are no sub-classes to check and so we can answer the instance-of
@@ -1118,7 +1118,7 @@
   if (needs_access_check) {
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kArg0
-    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                          type_idx, true);
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
     LoadValueDirectFixed(rl_src, TargetReg(kArg0));  // kArg0 <= ref
@@ -1140,7 +1140,7 @@
       LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL);
       // Not resolved
       // Call out to helper, which will return resolved type in kRet0
-      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx, true);
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
       OpRegCopy(TargetReg(kArg2), TargetReg(kRet0));  // Align usage with fast path
       LoadValueDirectFixed(rl_src, TargetReg(kArg0));  /* reload Ref */
       // Rejoin code paths
@@ -1175,7 +1175,7 @@
     }
   } else {
     if (cu_->instruction_set == kThumb2) {
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       if (!type_known_abstract) {
       /* Uses conditional nullification */
         OpRegReg(kOpCmp, TargetReg(kArg1), TargetReg(kArg2));  // Same?
@@ -1191,7 +1191,7 @@
         LoadConstant(rl_result.reg, 1);     // assume true
         branchover = OpCmpBranch(kCondEq, TargetReg(kArg1), TargetReg(kArg2), NULL);
       }
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));    // .ne case - arg0 <= class
       OpReg(kOpBlx, r_tgt);    // .ne case: helper(class, ref->class)
       FreeTemp(r_tgt);
@@ -1252,7 +1252,7 @@
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kRet0
     // InitializeTypeAndVerifyAccess(idx, method)
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                             type_idx, TargetReg(kArg1), true);
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
   } else if (use_declaring_class) {
@@ -1285,7 +1285,7 @@
 
           // Call out to helper, which will return resolved type in kArg0
           // InitializeTypeFromCode(idx, method)
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx_,
+          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
                                         m2l_->TargetReg(kArg1), true);
           m2l_->OpRegCopy(class_reg_, m2l_->TargetReg(kRet0));  // Align usage with fast path
           m2l_->OpUnconditionalBranch(cont_);
@@ -1316,7 +1316,7 @@
         m2l_->LoadWordDisp(m2l_->TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(),
                            m2l_->TargetReg(kArg1));
       }
-      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pCheckCast), m2l_->TargetReg(kArg2),
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pCheckCast), m2l_->TargetReg(kArg2),
                                     m2l_->TargetReg(kArg1), true);
 
       m2l_->OpUnconditionalBranch(cont_);
@@ -1401,20 +1401,20 @@
 
 void Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
                              RegLocation rl_src1, RegLocation rl_shift) {
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
 
   switch (opcode) {
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pShlLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShlLong);
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pShrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShrLong);
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pUshrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pUshrLong);
       break;
     default:
       LOG(FATAL) << "Unexpected case";
@@ -1547,7 +1547,7 @@
 
     // If we haven't already generated the code use the callout function.
     if (!done) {
-      ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
+      ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
       FlushAllRegs();   /* Send everything to home location */
       LoadValueDirectFixed(rl_src2, TargetReg(kArg1));
       RegStorage r_tgt = CallHelperSetup(func_offset);
@@ -1798,7 +1798,7 @@
         FlushAllRegs();   /* Everything to home location. */
         LoadValueDirectFixed(rl_src, TargetReg(kArg0));
         Clobber(TargetReg(kArg0));
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
         CallRuntimeHelperRegImm(func_offset, TargetReg(kArg0), lit, false);
         if (is_div)
           rl_result = GetReturn(false);
@@ -1829,7 +1829,7 @@
   OpKind second_op = kOpBkpt;
   bool call_out = false;
   bool check_zero = false;
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   int ret_reg = TargetReg(kRet0).GetReg();
 
   switch (opcode) {
@@ -1875,7 +1875,7 @@
       } else {
         call_out = true;
         ret_reg = TargetReg(kRet0).GetReg();
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pLmul);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmul);
       }
       break;
     case Instruction::DIV_LONG:
@@ -1883,13 +1883,13 @@
       call_out = true;
       check_zero = true;
       ret_reg = TargetReg(kRet0).GetReg();
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pLdiv);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLdiv);
       break;
     case Instruction::REM_LONG:
     case Instruction::REM_LONG_2ADDR:
       call_out = true;
       check_zero = true;
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pLmod);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmod);
       /* NOTE - for Arm, result is in kArg2/kArg3 instead of kRet0/kRet1 */
       ret_reg = (cu_->instruction_set == kThumb2) ? TargetReg(kArg2).GetReg() : TargetReg(kRet0).GetReg();
       break;
@@ -1951,7 +1951,7 @@
   }
 }
 
-void Mir2Lir::GenConversionCall(ThreadOffset func_offset,
+void Mir2Lir::GenConversionCall(ThreadOffset<4> func_offset,
                                 RegLocation rl_dest, RegLocation rl_src) {
   /*
    * Don't optimize the register usage since it calls out to support
@@ -2024,13 +2024,13 @@
 /* Call out to helper assembly routine that will null check obj and then lock it. */
 void Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pLockObject), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pLockObject), rl_src, true);
 }
 
 /* Call out to helper assembly routine that will null check obj and then unlock it. */
 void Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pUnlockObject), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject), rl_src, true);
 }
 
 /* Generic code for generating a wide constant into a VR. */
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 7689b51..0746913 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -66,12 +66,12 @@
  * has a memory call operation, part 1 is a NOP for x86.  For other targets,
  * load arguments between the two parts.
  */
-RegStorage Mir2Lir::CallHelperSetup(ThreadOffset helper_offset) {
+RegStorage Mir2Lir::CallHelperSetup(ThreadOffset<4> helper_offset) {
   return (cu_->instruction_set == kX86) ? RegStorage::InvalidReg() : LoadHelper(helper_offset);
 }
 
 /* NOTE: if r_tgt is a temp, it will be freed following use */
-LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset helper_offset, bool safepoint_pc,
+LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
                          bool use_link) {
   LIR* call_inst;
   OpKind op = use_link ? kOpBlx : kOpBx;
@@ -87,21 +87,22 @@
   return call_inst;
 }
 
-void Mir2Lir::CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperReg(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0,
+                                   bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
+void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
                                            bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg0.wide == 0) {
@@ -114,7 +115,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset helper_offset, int arg0, int arg1,
+void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
@@ -123,7 +124,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset helper_offset, int arg0,
+void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                               RegLocation arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg1.wide == 0) {
@@ -137,8 +138,8 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset helper_offset, RegLocation arg0, int arg1,
-                                              bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
+                                              int arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg0, TargetReg(kArg0));
   LoadConstant(TargetReg(kArg1), arg1);
@@ -146,7 +147,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset helper_offset, int arg0, RegStorage arg1,
+void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg1), arg1);
@@ -155,7 +156,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset helper_offset, RegStorage arg0, int arg1,
+void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
@@ -164,7 +165,8 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset helper_offset, int arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
+                                         bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadConstant(TargetReg(kArg0), arg0);
@@ -172,7 +174,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
                                          bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
@@ -184,7 +186,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
                                                     RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
@@ -197,8 +199,9 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset helper_offset, RegLocation arg0,
-                                                      RegLocation arg1, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
+                                                      RegLocation arg0, RegLocation arg1,
+                                                      bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg0.wide == 0) {
     LoadValueDirectFixed(arg0, arg0.fp ? TargetReg(kFArg0) : TargetReg(kArg0));
@@ -246,7 +249,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0,
                                       RegStorage arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
@@ -256,7 +259,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0,
                                          RegStorage arg1, int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
@@ -267,7 +270,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset,
                                                     int arg0, RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg2, TargetReg(kArg2));
@@ -277,7 +280,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset helper_offset, int arg0,
+void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0,
                                             int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
@@ -287,7 +290,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                          int arg0, RegLocation arg1,
                                                          RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
@@ -304,7 +307,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                                  RegLocation arg0, RegLocation arg1,
                                                                  RegLocation arg2,
                                                                  bool safepoint_pc) {
@@ -597,7 +600,7 @@
   return state + 1;
 }
 
-static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info, ThreadOffset trampoline,
+static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info, ThreadOffset<4> trampoline,
                             int state, const MethodReference& target_method,
                             uint32_t method_idx) {
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
@@ -623,7 +626,7 @@
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -631,7 +634,7 @@
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -639,7 +642,7 @@
                                const MethodReference& target_method,
                                uint32_t unused, uintptr_t unused2,
                                uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -647,7 +650,7 @@
                            const MethodReference& target_method,
                            uint32_t unused, uintptr_t unused2,
                            uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -656,7 +659,8 @@
                                                 const MethodReference& target_method,
                                                 uint32_t unused, uintptr_t unused2,
                                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline =
+      QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -986,7 +990,7 @@
     // Generate memcpy
     OpRegRegImm(kOpAdd, TargetReg(kArg0), TargetReg(kSp), outs_offset);
     OpRegRegImm(kOpAdd, TargetReg(kArg1), TargetReg(kSp), start_offset);
-    CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(pMemcpy), TargetReg(kArg0),
+    CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(4, pMemcpy), TargetReg(kArg0),
                                TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
   }
 
@@ -1318,7 +1322,7 @@
     RegLocation rl_start = info->args[2];     // 3rd arg only present in III flavor of IndexOf.
     LoadValueDirectFixed(rl_start, reg_start);
   }
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pIndexOf));
   GenExplicitNullCheck(reg_ptr, info->opt_flags);
   LIR* high_code_point_branch =
       rl_char.is_const ? nullptr : OpCmpImmBranch(kCondGt, reg_char, 0xFFFF, nullptr);
@@ -1356,7 +1360,7 @@
   LoadValueDirectFixed(rl_this, reg_this);
   LoadValueDirectFixed(rl_cmp, reg_cmp);
   RegStorage r_tgt = (cu_->instruction_set != kX86) ?
-      LoadHelper(QUICK_ENTRYPOINT_OFFSET(pStringCompareTo)) : RegStorage::InvalidReg();
+      LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo)) : RegStorage::InvalidReg();
   GenExplicitNullCheck(reg_this, info->opt_flags);
   info->opt_flags |= MIR_IGNORE_NULL_CHECK;  // Record that we've null checked.
   // TUNING: check if rl_cmp.s_reg_low is already null checked
@@ -1366,7 +1370,7 @@
   if (cu_->instruction_set != kX86) {
     OpReg(kOpBlx, r_tgt);
   } else {
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pStringCompareTo));
+    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo));
   }
   RegLocation rl_return = GetReturn(false);
   RegLocation rl_dest = InlineTarget(info);
@@ -1377,7 +1381,7 @@
 bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
   RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  ThreadOffset offset = Thread::PeerOffset();
+  ThreadOffset<4> offset = Thread::PeerOffset<4>();
   if (cu_->instruction_set == kThumb2 || cu_->instruction_set == kMips) {
     LoadWordDisp(TargetReg(kSelf), offset.Int32Value(), rl_result.reg);
   } else {
@@ -1551,22 +1555,22 @@
                           mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value());
       }
     } else {
-      ThreadOffset trampoline(-1);
+      ThreadOffset<4> trampoline(-1);
       switch (info->type) {
       case kInterface:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
         break;
       case kDirect:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
         break;
       case kStatic:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
         break;
       case kSuper:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
         break;
       case kVirtual:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
         break;
       default:
         LOG(FATAL) << "Unexpected invoke type";
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 972457a..a938478 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -244,7 +244,7 @@
   GenBarrier();
   NewLIR0(kMipsCurrPC);  // Really a jal to .+8
   // Now, fill the branch delay slot with the helper load
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData));
   GenBarrier();  // Scheduling barrier
 
   // Construct BaseLabel and set up table base register
@@ -260,7 +260,7 @@
 }
 
 void MipsMir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
   LoadWordDisp(rs_rMIPS_SELF, ex_offset, rl_result.reg);
@@ -277,7 +277,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  LoadWordDisp(rs_rMIPS_SELF, Thread::CardTableOffset().Int32Value(), reg_card_base);
+  LoadWordDisp(rs_rMIPS_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -310,7 +310,7 @@
   RegStorage new_sp = AllocTemp();
   if (!skip_overflow_check) {
     /* Load stack limit */
-    LoadWordDisp(rs_rMIPS_SELF, Thread::StackEndOffset().Int32Value(), check_reg);
+    LoadWordDisp(rs_rMIPS_SELF, Thread::StackEndOffset<4>().Int32Value(), check_reg);
   }
   /* Spill core callee saves */
   SpillCoreRegs();
@@ -331,7 +331,7 @@
         m2l_->LoadWordDisp(rs_rMIPS_SP, 0, rs_rRA);
         m2l_->OpRegImm(kOpAdd, rs_rMIPS_SP, sp_displace_);
         m2l_->ClobberCallerSave();
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
         RegStorage r_tgt = m2l_->CallHelperSetup(func_offset);  // Doesn't clobber LR.
         m2l_->CallHelper(r_tgt, func_offset, false /* MarkSafepointPC */, false /* UseLink */);
       }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index bc1ad02..0f9da6a 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -30,7 +30,7 @@
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(int r_base, int displacement, int r_dest, OpSize size, int s_reg);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
@@ -170,12 +170,12 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
     // TODO: collapse r_dest.
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/mips/fp_mips.cc b/compiler/dex/quick/mips/fp_mips.cc
index 2bc5540..a479dc7 100644
--- a/compiler/dex/quick/mips/fp_mips.cc
+++ b/compiler/dex/quick/mips/fp_mips.cc
@@ -50,7 +50,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -93,7 +93,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -135,22 +135,22 @@
       op = kMipsFcvtdw;
       break;
     case Instruction::FLOAT_TO_INT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2iz), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2iz), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_INT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2iz), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2iz), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_DOUBLE:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2d), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pL2d), rl_dest, rl_src);
       return;
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_FLOAT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2f), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pL2f), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
@@ -176,22 +176,22 @@
 void MipsMir2Lir::GenCmpFP(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2) {
   bool wide = true;
-  ThreadOffset offset(-1);
+  ThreadOffset<4> offset(-1);
 
   switch (opcode) {
     case Instruction::CMPL_FLOAT:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmplFloat);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmplFloat);
       wide = false;
       break;
     case Instruction::CMPG_FLOAT:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmpgFloat);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmpgFloat);
       wide = false;
       break;
     case Instruction::CMPL_DOUBLE:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmplDouble);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmplDouble);
       break;
     case Instruction::CMPG_DOUBLE:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmpgDouble);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmpgDouble);
       break;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index dfe8b35..2424dc5 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -270,7 +270,7 @@
   LOG(FATAL) << "Unexpected use of OpLea for Arm";
 }
 
-void MipsMir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
-  LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
+void MipsMir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
+  LOG(FATAL) << "Unexpected use of OpTlsCmp for MIPS";
 }
 
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index 67a44fa..1fe2bea 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -514,14 +514,14 @@
  * ensure that all branch instructions can be restarted if
  * there is a trap in the shadow.  Allocate a temp register.
  */
-RegStorage MipsMir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage MipsMir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LoadWordDisp(rs_rMIPS_SELF, offset.Int32Value(), rs_rT9);
   return rs_rT9;
 }
 
 LIR* MipsMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = AllocTemp();
-  LoadWordDisp(rs_rMIPS_SELF, Thread::ThreadSuspendTriggerOffset().Int32Value(), tmp);
+  LoadWordDisp(rs_rMIPS_SELF, Thread::ThreadSuspendTriggerOffset<4>().Int32Value(), tmp);
   LIR *inst = LoadWordDisp(tmp, 0, tmp);
   FreeTemp(tmp);
   return inst;
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index 4f31341..c959510 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -642,7 +642,7 @@
   return StoreBaseDispBody(r_base, displacement, r_src.GetLow(), r_src.GetHigh(), kLong);
 }
 
-LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   LOG(FATAL) << "Unexpected use of OpThreadMem for MIPS";
   return NULL;
 }
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 10f431f..276c4b8 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -601,7 +601,7 @@
                           RegLocation rl_src, int lit);
     void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_src2);
-    void GenConversionCall(ThreadOffset func_offset, RegLocation rl_dest,
+    void GenConversionCall(ThreadOffset<4> func_offset, RegLocation rl_dest,
                            RegLocation rl_src);
     void GenSuspendTest(int opt_flags);
     void GenSuspendTestAndBranch(int opt_flags, LIR* target);
@@ -612,43 +612,44 @@
                        RegLocation rl_src1, RegLocation rl_src2);
 
     // Shared by all targets - implemented in gen_invoke.cc.
-    LIR* CallHelper(RegStorage r_tgt, ThreadOffset helper_offset, bool safepoint_pc,
+    LIR* CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
                     bool use_link = true);
-    RegStorage CallHelperSetup(ThreadOffset helper_offset);
-    void CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
-    void CallRuntimeHelperReg(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc);
-    void CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
+    RegStorage CallHelperSetup(ThreadOffset<4> helper_offset);
+    void CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc);
+    void CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0, bool safepoint_pc);
+    void CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
                                       bool safepoint_pc);
-    void CallRuntimeHelperImmImm(ThreadOffset helper_offset, int arg0, int arg1,
+    void CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocation(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                          RegLocation arg1, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationImm(ThreadOffset helper_offset, RegLocation arg0,
+    void CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
                                          int arg1, bool safepoint_pc);
-    void CallRuntimeHelperImmReg(ThreadOffset helper_offset, int arg0, RegStorage arg1,
+    void CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperRegImm(ThreadOffset helper_offset, RegStorage arg0, int arg1,
+    void CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmMethod(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
                                     bool safepoint_pc);
-    void CallRuntimeHelperRegMethod(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc);
-    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset helper_offset, RegStorage arg0,
+    void CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
+                                    bool safepoint_pc);
+    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
                                                RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                  RegLocation arg0, RegLocation arg1,
                                                  bool safepoint_pc);
-    void CallRuntimeHelperRegReg(ThreadOffset helper_offset, RegStorage arg0, RegStorage arg1,
+    void CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperRegRegImm(ThreadOffset helper_offset, RegStorage arg0, RegStorage arg1,
+    void CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
                                     int arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                                RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodImm(ThreadOffset helper_offset, int arg0, int arg2,
+    void CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0, int arg2,
                                        bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                     int arg0, RegLocation arg1, RegLocation arg2,
                                                     bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                             RegLocation arg0, RegLocation arg1,
                                                             RegLocation arg2,
                                                             bool safepoint_pc);
@@ -670,7 +671,8 @@
 
     /**
      * @brief Used to determine the register location of destination.
-     * @details This is needed during generation of inline intrinsics because it finds destination of return,
+     * @details This is needed during generation of inline intrinsics because it finds destination
+     *  of return,
      * either the physical register or the target of move-result.
      * @param info Information about the invoke.
      * @return Returns the destination location.
@@ -731,7 +733,8 @@
      * @brief Used to do the final store in a wide destination as per bytecode semantics.
      * @see StoreValue
      * @param rl_dest The destination dalvik register location.
-     * @param rl_src The source register location. Can be either physical register or dalvik register.
+     * @param rl_src The source register location. Can be either physical register or dalvik
+     *  register.
      */
     void StoreValueWide(RegLocation rl_dest, RegLocation rl_src);
 
@@ -812,7 +815,7 @@
     virtual bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div,
                                     RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
-    virtual RegStorage LoadHelper(ThreadOffset offset) = 0;
+    virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0;
     virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                               int s_reg) = 0;
     virtual LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -948,7 +951,8 @@
 
     /**
      * @brief Used for generating code that throws ArithmeticException if both registers are zero.
-     * @details This is used for generating DivideByZero checks when divisor is held in two separate registers.
+     * @details This is used for generating DivideByZero checks when divisor is held in two
+     *  separate registers.
      * @param reg_lo The register holding the lower 32-bits.
      * @param reg_hi The register holding the upper 32-bits.
      */
@@ -1047,13 +1051,13 @@
     virtual LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1,
                              RegStorage r_src2) = 0;
     virtual LIR* OpTestSuspend(LIR* target) = 0;
-    virtual LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset) = 0;
+    virtual LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) = 0;
     virtual LIR* OpVldm(RegStorage r_base, int count) = 0;
     virtual LIR* OpVstm(RegStorage r_base, int count) = 0;
     virtual void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale,
                        int offset) = 0;
     virtual void OpRegCopyWide(RegStorage dest, RegStorage src) = 0;
-    virtual void OpTlsCmp(ThreadOffset offset, int val) = 0;
+    virtual void OpTlsCmp(ThreadOffset<4> offset, int val) = 0;
     virtual bool InexpensiveConstantInt(int32_t value) = 0;
     virtual bool InexpensiveConstantFloat(int32_t value) = 0;
     virtual bool InexpensiveConstantLong(int64_t value) = 0;
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index d97cf4d..729b30d 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -156,12 +156,12 @@
   }
   NewLIR2(kX86PcRelAdr, rX86_ARG1, WrapPointer(tab_rec));
   NewLIR2(kX86Add32RR, rX86_ARG1, rX86_ARG2);
-  CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData), rs_rX86_ARG0,
+  CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData), rs_rX86_ARG0,
                           rs_rX86_ARG1, true);
 }
 
 void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
   NewLIR2(kX86Mov32TI, ex_offset, 0);
@@ -175,7 +175,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), Thread::CardTableOffset().Int32Value());
+  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), Thread::CardTableOffset<4>().Int32Value());
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -222,7 +222,7 @@
         GenerateTargetLabel();
         m2l_->OpRegImm(kOpAdd, rs_rX86_SP, sp_displace_);
         m2l_->ClobberCallerSave();
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
         // Assumes codegen and target are in thumb2 mode.
         m2l_->CallHelper(RegStorage::InvalidReg(), func_offset, false /* MarkSafepointPC */,
                          false /* UseLink */);
@@ -240,7 +240,7 @@
     // in case a signal comes in that's not using an alternate signal stack and the large frame may
     // have moved us outside of the reserved area at the end of the stack.
     // cmp rX86_SP, fs:[stack_end_]; jcc throw_launchpad
-    OpRegThreadMem(kOpCmp, rX86_SP, Thread::StackEndOffset());
+    OpRegThreadMem(kOpCmp, rX86_SP, Thread::StackEndOffset<4>());
     LIR* branch = OpCondBranch(kCondUlt, nullptr);
     AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_ - 4));
   }
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6d427e7..56b64dd 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -30,7 +30,7 @@
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
     LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest, int s_reg);
@@ -245,14 +245,14 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
-    void OpRegThreadMem(OpKind op, int r_dest, ThreadOffset thread_offset);
+    void OpRegThreadMem(OpKind op, int r_dest, ThreadOffset<4> thread_offset);
     void SpillCoreRegs();
     void UnSpillCoreRegs();
     static const X86EncodingMap EncodingMap[kX86Last];
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 3fb9012..ee5387f 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -49,7 +49,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -100,7 +100,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -274,10 +274,10 @@
       GenLongToFP(rl_dest, rl_src, false /* is_double */);
       return;
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(INFO) << "Unexpected opcode: " << opcode;
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 851f448..0e7ba6b 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -742,7 +742,7 @@
   NewLIR5(kX86Lea32RA, r_base.GetReg(), reg1.GetReg(), reg2.GetReg(), scale, offset);
 }
 
-void X86Mir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
+void X86Mir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
   NewLIR2(kX86Cmp16TI8, offset.Int32Value(), val);
 }
 
@@ -893,7 +893,7 @@
 
 // Test suspend flag, return target of taken suspend branch
 LIR* X86Mir2Lir::OpTestSuspend(LIR* target) {
-  OpTlsCmp(Thread::ThreadFlagsOffset(), 0);
+  OpTlsCmp(Thread::ThreadFlagsOffset<4>(), 0);
   return OpCondBranch((target == NULL) ? kCondNe : kCondEq, target);
 }
 
@@ -1293,7 +1293,7 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
-void X86Mir2Lir::OpRegThreadMem(OpKind op, int r_dest, ThreadOffset thread_offset) {
+void X86Mir2Lir::OpRegThreadMem(OpKind op, int r_dest, ThreadOffset<4> thread_offset) {
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
   case kOpCmp: opcode = kX86Cmp32RT;  break;
@@ -1834,7 +1834,7 @@
   if (needs_access_check) {
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // Caller function returns Class* in kArg0.
-    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                          type_idx, true);
     OpRegCopy(class_reg, TargetReg(kRet0));
     LoadValueDirectFixed(rl_src, TargetReg(kArg0));
@@ -1855,7 +1855,7 @@
       // Need to test presence of type in dex cache at runtime.
       LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL);
       // Type is not resolved. Call out to helper, which will return resolved type in kRet0/kArg0.
-      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx, true);
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
       OpRegCopy(TargetReg(kArg2), TargetReg(kRet0));  // Align usage with fast path.
       LoadValueDirectFixed(rl_src, TargetReg(kArg0));  /* Reload Ref. */
       // Rejoin code paths
@@ -1889,7 +1889,7 @@
       branchover = OpCmpBranch(kCondEq, TargetReg(kArg1), TargetReg(kArg2), NULL);
     }
     OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
   }
   // TODO: only clobber when type isn't final?
   ClobberCallerSave();
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index da64250..925e736 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -581,16 +581,18 @@
 
 X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena),
+      base_of_code_(nullptr), store_method_addr_(false), store_method_addr_used_(false),
       method_address_insns_(arena, 100, kGrowableArrayMisc),
       class_type_address_insns_(arena, 100, kGrowableArrayMisc),
       call_method_insns_(arena, 100, kGrowableArrayMisc),
       stack_decrement_(nullptr), stack_increment_(nullptr) {
-  store_method_addr_used_ = false;
-  for (int i = 0; i < kX86Last; i++) {
-    if (X86Mir2Lir::EncodingMap[i].opcode != i) {
-      LOG(FATAL) << "Encoding order for " << X86Mir2Lir::EncodingMap[i].name
-                 << " is wrong: expecting " << i << ", seeing "
-                 << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
+  if (kIsDebugBuild) {
+    for (int i = 0; i < kX86Last; i++) {
+      if (X86Mir2Lir::EncodingMap[i].opcode != i) {
+        LOG(FATAL) << "Encoding order for " << X86Mir2Lir::EncodingMap[i].name
+            << " is wrong: expecting " << i << ", seeing "
+            << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
+      }
     }
   }
 }
@@ -601,7 +603,7 @@
 }
 
 // Not used in x86
-RegStorage X86Mir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage X86Mir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LOG(FATAL) << "Unexpected use of LoadHelper in x86";
   return RegStorage::InvalidReg();
 }
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index bb5d387..e9faa7f 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -468,7 +468,7 @@
   return OpRegImm(op, r_dest, value);
 }
 
-LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
     case kOpBlx: opcode = kX86CallT;  break;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 59754d5..c367260 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -336,6 +336,7 @@
       compiler_(Compiler::Create(compiler_kind)),
       instruction_set_(instruction_set),
       instruction_set_features_(instruction_set_features),
+      instruction_set_is_64_bit_(instruction_set == kX86_64 || instruction_set == kArm64),
       freezing_constructor_lock_("freezing constructor lock"),
       compiled_classes_lock_("compiled classes lock"),
       compiled_methods_lock_("compiled method lock"),
@@ -443,54 +444,55 @@
   return res;
 }
 
+#define CREATE_TRAMPOLINE(type, abi, offset) \
+    if (instruction_set_is_64_bit_) { \
+      return CreateTrampoline64(instruction_set_, abi, \
+                                type ## _ENTRYPOINT_OFFSET(8, offset)); \
+    } else { \
+      return CreateTrampoline32(instruction_set_, abi, \
+                                type ## _ENTRYPOINT_OFFSET(4, offset)); \
+    }
+
 const std::vector<uint8_t>* CompilerDriver::CreateInterpreterToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kInterpreterAbi,
-                          INTERPRETER_ENTRYPOINT_OFFSET(pInterpreterToInterpreterBridge));
+  CREATE_TRAMPOLINE(INTERPRETER, kInterpreterAbi, pInterpreterToInterpreterBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateInterpreterToCompiledCodeBridge() const {
-  return CreateTrampoline(instruction_set_, kInterpreterAbi,
-                          INTERPRETER_ENTRYPOINT_OFFSET(pInterpreterToCompiledCodeBridge));
+  CREATE_TRAMPOLINE(INTERPRETER, kInterpreterAbi, pInterpreterToCompiledCodeBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateJniDlsymLookup() const {
-  return CreateTrampoline(instruction_set_, kJniAbi, JNI_ENTRYPOINT_OFFSET(pDlsymLookup));
+  CREATE_TRAMPOLINE(JNI, kJniAbi, pDlsymLookup)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableImtConflictTrampoline() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableImtConflictTrampoline));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableImtConflictTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableResolutionTrampoline() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableResolutionTrampoline));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableResolutionTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableToInterpreterBridge));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableToInterpreterBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickGenericJniTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickGenericJniTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickGenericJniTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickImtConflictTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickImtConflictTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickImtConflictTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickResolutionTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickResolutionTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickResolutionTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickToInterpreterBridge));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickToInterpreterBridge)
 }
+#undef CREATE_TRAMPOLINE
 
 void CompilerDriver::CompileAll(jobject class_loader,
                                 const std::vector<const DexFile*>& dex_files,
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 256aa46..ddb62e1 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -719,6 +719,7 @@
 
   const InstructionSet instruction_set_;
   const InstructionSetFeatures instruction_set_features_;
+  const bool instruction_set_is_64_bit_;
 
   // All class references that require
   mutable ReaderWriterMutex freezing_constructor_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index c89bc40..dcdcdd1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -101,10 +101,10 @@
   __ StoreImmediateToFrame(main_jni_conv->SirtNumRefsOffset(),
                            main_jni_conv->ReferenceCount(),
                            mr_conv->InterproceduralScratchRegister());
-  __ CopyRawPtrFromThread(main_jni_conv->SirtLinkOffset(),
-                          Thread::TopSirtOffset(),
+  __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
+                          Thread::TopSirtOffset<4>(),
                           mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread(Thread::TopSirtOffset(),
+  __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
                               main_jni_conv->SirtOffset(),
                               mr_conv->InterproceduralScratchRegister());
 
@@ -154,8 +154,8 @@
   }
 
   // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset());
-  __ StoreImmediateToThread(Thread::TopOfManagedStackPcOffset(), 0,
+  __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
+  __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
                             mr_conv->InterproceduralScratchRegister());
 
   // 5. Move frame down to allow space for out going args.
@@ -169,8 +169,8 @@
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodStartSynchronized)
-                                           : QUICK_ENTRYPOINT_OFFSET(pJniMethodStart);
+  ThreadOffset<4> jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
+                                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_sirt_offset(0);
   if (is_synchronized) {
@@ -197,7 +197,7 @@
   } else {
     __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
                         main_jni_conv->InterproceduralScratchRegister());
-    __ Call(ThreadOffset(jni_start), main_jni_conv->InterproceduralScratchRegister());
+    __ CallFromThread32(jni_start, main_jni_conv->InterproceduralScratchRegister());
   }
   if (is_synchronized) {  // Check for exceptions from monitor enter.
     __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
@@ -259,10 +259,10 @@
   if (main_jni_conv->IsCurrentParamInRegister()) {
     ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
     DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread(jni_env, Thread::JniEnvOffset());
+    __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
   } else {
     FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread(jni_env, Thread::JniEnvOffset(),
+    __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
                             main_jni_conv->InterproceduralScratchRegister());
   }
 
@@ -298,16 +298,16 @@
   // 12. Call into JNI method end possibly passing a returned reference, the method and the current
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-  ThreadOffset jni_end(-1);
+  ThreadOffset<4> jni_end(-1);
   if (reference_return) {
     // Pass result.
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodEndWithReferenceSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(pJniMethodEndWithReference);
+    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
+                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
     SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
     end_jni_conv->Next();
   } else {
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodEndSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(pJniMethodEnd);
+    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
+                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -339,7 +339,7 @@
   } else {
     __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
                         end_jni_conv->InterproceduralScratchRegister());
-    __ Call(ThreadOffset(jni_end), end_jni_conv->InterproceduralScratchRegister());
+    __ CallFromThread32(jni_end, end_jni_conv->InterproceduralScratchRegister());
   }
 
   // 13. Reload return value
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 8ebea46..24298d2 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -39,7 +39,7 @@
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
-    return X86_64ManagedRegister::FromXmmRegister(_XMM0);
+    return X86_64ManagedRegister::FromXmmRegister(XMM0);
   } else if (shorty[0] == 'J') {
     return X86_64ManagedRegister::FromCpuRegister(RAX);
   } else if (shorty[0] == 'V') {
@@ -89,7 +89,7 @@
   } else if (itr_float_and_doubles_ < 8) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
-                                 static_cast<XmmRegister>(_XMM0 + itr_float_and_doubles_));
+                                 static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
   }
   return res;
 }
@@ -171,15 +171,15 @@
   } else if (itr_float_and_doubles_ < 8) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
-                                 static_cast<XmmRegister>(_XMM0 + itr_float_and_doubles_));
+                                 static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
   }
   return res;
 }
 
 FrameOffset X86_64JniCallingConvention::CurrentParamStackOffset() {
   size_t offset = itr_args_
-                  - std::min(8U, itr_float_and_doubles_)               // Float arguments passed through Xmm0..Xmm7
-                  - std::min(6U, itr_args_ - itr_float_and_doubles_);  // Integer arguments passed through GPR
+      - std::min(8U, itr_float_and_doubles_)               // Float arguments passed through Xmm0..Xmm7
+      - std::min(6U, itr_args_ - itr_float_and_doubles_);  // Integer arguments passed through GPR
   return FrameOffset(displacement_.Int32Value() - OutArgSize() + (offset * kPointerSize));
 }
 
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index 32980cb..fb909a8 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -21,6 +21,7 @@
 #include "utils/arm64/assembler_arm64.h"
 #include "utils/mips/assembler_mips.h"
 #include "utils/x86/assembler_x86.h"
+#include "utils/x86_64/assembler_x86_64.h"
 
 #define __ assembler->
 
@@ -28,7 +29,7 @@
 
 namespace arm {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<4> offset) {
   UniquePtr<ArmAssembler> assembler(static_cast<ArmAssembler*>(Assembler::Create(kArm)));
 
   switch (abi) {
@@ -56,7 +57,7 @@
 
 namespace arm64 {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<8> offset) {
   UniquePtr<Arm64Assembler> assembler(static_cast<Arm64Assembler*>(Assembler::Create(kArm64)));
 
   switch (abi) {
@@ -96,7 +97,7 @@
 
 namespace mips {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<4> offset) {
   UniquePtr<MipsAssembler> assembler(static_cast<MipsAssembler*>(Assembler::Create(kMips)));
 
   switch (abi) {
@@ -125,7 +126,7 @@
 }  // namespace mips
 
 namespace x86 {
-static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset offset) {
+static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<4> offset) {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
   // All x86 trampolines call via the Thread* held in fs.
@@ -142,11 +143,12 @@
 }  // namespace x86
 
 namespace x86_64 {
-static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset offset) {
-  UniquePtr<x86::X86Assembler> assembler(static_cast<x86::X86Assembler*>(Assembler::Create(kX86_64)));
+static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<8> offset) {
+  UniquePtr<x86_64::X86_64Assembler>
+      assembler(static_cast<x86_64::X86_64Assembler*>(Assembler::Create(kX86_64)));
 
   // All x86 trampolines call via the Thread* held in gs.
-  __ gs()->jmp(x86::Address::Absolute(offset, true));
+  __ gs()->jmp(x86_64::Address::Absolute(offset, true));
   __ int3();
 
   size_t cs = assembler->CodeSize();
@@ -158,23 +160,32 @@
 }
 }  // namespace x86_64
 
-const std::vector<uint8_t>* CreateTrampoline(InstructionSet isa, EntryPointCallingConvention abi,
-                                             ThreadOffset offset) {
+const std::vector<uint8_t>* CreateTrampoline64(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<8> offset) {
+  switch (isa) {
+    case kArm64:
+      return arm64::CreateTrampoline(abi, offset);
+    case kX86_64:
+      return x86_64::CreateTrampoline(offset);
+    default:
+      LOG(FATAL) << "Unexpected InstructionSet: " << isa;
+      return nullptr;
+  }
+}
+
+const std::vector<uint8_t>* CreateTrampoline32(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<4> offset) {
   switch (isa) {
     case kArm:
     case kThumb2:
       return arm::CreateTrampoline(abi, offset);
-    case kArm64:
-      return arm64::CreateTrampoline(abi, offset);
     case kMips:
       return mips::CreateTrampoline(abi, offset);
     case kX86:
       return x86::CreateTrampoline(offset);
-    case kX86_64:
-      return x86_64::CreateTrampoline(offset);
     default:
-      LOG(FATAL) << "Unknown InstructionSet: " << isa;
-      return NULL;
+      LOG(FATAL) << "Unexpected InstructionSet: " << isa;
+      return nullptr;
   }
 }
 
diff --git a/compiler/trampolines/trampoline_compiler.h b/compiler/trampolines/trampoline_compiler.h
index cb5aa27..bdab279 100644
--- a/compiler/trampolines/trampoline_compiler.h
+++ b/compiler/trampolines/trampoline_compiler.h
@@ -25,8 +25,11 @@
 namespace art {
 
 // Create code that will invoke the function held in thread local storage.
-const std::vector<uint8_t>* CreateTrampoline(InstructionSet isa, EntryPointCallingConvention abi,
-                                             ThreadOffset entry_point_offset)
+const std::vector<uint8_t>* CreateTrampoline32(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<4> entry_point_offset)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+const std::vector<uint8_t>* CreateTrampoline64(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<8> entry_point_offset)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 }  // namespace art
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index 872a557..59eb98e 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -1577,7 +1577,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void ArmAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void ArmAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                        ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1609,18 +1609,18 @@
   return EmitLoad(this, m_dst, SP, src.Int32Value(), size);
 }
 
-void ArmAssembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void ArmAssembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return EmitLoad(this, m_dst, TR, src.Int32Value(), size);
 }
 
-void ArmAssembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void ArmAssembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   ArmManagedRegister dst = m_dst.AsArm();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(kLoadWord, dst.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1630,7 +1630,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1641,7 +1641,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void ArmAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1651,7 +1651,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void ArmAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, TR, thr_offs.Int32Value());
 }
 
@@ -1844,7 +1844,7 @@
   // TODO: place reference map on call
 }
 
-void ArmAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void ArmAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);
 }
 
@@ -1862,7 +1862,7 @@
   ArmExceptionSlowPath* slow = new ArmExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 TR, Thread::ExceptionOffset().Int32Value());
+                 TR, Thread::ExceptionOffset<4>().Int32Value());
   cmp(scratch.AsCoreRegister(), ShifterOperand(0));
   b(slow->Entry(), NE);
 }
@@ -1878,7 +1878,7 @@
   // Don't care about preserving R0 as this call won't return
   __ mov(R0, ShifterOperand(scratch_.AsCoreRegister()));
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ blx(R12);
   // Call never returns
   __ bkpt(0);
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index bb9207c..f5be04a 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -35,6 +35,7 @@
   // Data-processing operands - Uninitialized
   ShifterOperand() {
     type_ = -1;
+    encoding_ = 0;
   }
 
   // Data-processing operands - Immediate
@@ -210,7 +211,7 @@
 };
 
 
-class ArmAssembler : public Assembler {
+class ArmAssembler FINAL : public Assembler {
  public:
   ArmAssembler() {}
   virtual ~ArmAssembler() {}
@@ -438,127 +439,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister scratch);
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitType01(Condition cond,
@@ -642,12 +632,12 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class ArmExceptionSlowPath : public SlowPath {
+class ArmExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit ArmExceptionSlowPath(ArmManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {
   }
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const ArmManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/arm/constants_arm.h b/compiler/utils/arm/constants_arm.h
index cc795b1..058f945 100644
--- a/compiler/utils/arm/constants_arm.h
+++ b/compiler/utils/arm/constants_arm.h
@@ -242,22 +242,22 @@
   }
 
   // Get the raw instruction bits.
-  inline int32_t InstructionBits() const {
+  int32_t InstructionBits() const {
     return *reinterpret_cast<const int32_t*>(this);
   }
 
   // Set the raw instruction bits to value.
-  inline void SetInstructionBits(int32_t value) {
+  void SetInstructionBits(int32_t value) {
     *reinterpret_cast<int32_t*>(this) = value;
   }
 
   // Read one particular bit out of the instruction bits.
-  inline int Bit(int nr) const {
+  int Bit(int nr) const {
     return (InstructionBits() >> nr) & 1;
   }
 
   // Read a bit field out of the instruction bits.
-  inline int Bits(int shift, int count) const {
+  int Bits(int shift, int count) const {
     return (InstructionBits() >> shift) & ((1 << count) - 1);
   }
 
@@ -265,80 +265,80 @@
   // Accessors for the different named fields used in the ARM encoding.
   // The naming of these accessor corresponds to figure A3-1.
   // Generally applicable fields
-  inline Condition ConditionField() const {
+  Condition ConditionField() const {
     return static_cast<Condition>(Bits(kConditionShift, kConditionBits));
   }
-  inline int TypeField() const { return Bits(kTypeShift, kTypeBits); }
+  int TypeField() const { return Bits(kTypeShift, kTypeBits); }
 
-  inline Register RnField() const { return static_cast<Register>(
+  Register RnField() const { return static_cast<Register>(
                                         Bits(kRnShift, kRnBits)); }
-  inline Register RdField() const { return static_cast<Register>(
+  Register RdField() const { return static_cast<Register>(
                                         Bits(kRdShift, kRdBits)); }
 
   // Fields used in Data processing instructions
-  inline Opcode OpcodeField() const {
+  Opcode OpcodeField() const {
     return static_cast<Opcode>(Bits(kOpcodeShift, kOpcodeBits));
   }
-  inline int SField() const { return Bits(kSShift, kSBits); }
+  int SField() const { return Bits(kSShift, kSBits); }
   // with register
-  inline Register RmField() const {
+  Register RmField() const {
     return static_cast<Register>(Bits(kRmShift, kRmBits));
   }
-  inline Shift ShiftField() const { return static_cast<Shift>(
+  Shift ShiftField() const { return static_cast<Shift>(
                                         Bits(kShiftShift, kShiftBits)); }
-  inline int RegShiftField() const { return Bit(4); }
-  inline Register RsField() const {
+  int RegShiftField() const { return Bit(4); }
+  Register RsField() const {
     return static_cast<Register>(Bits(kRsShift, kRsBits));
   }
-  inline int ShiftAmountField() const { return Bits(kShiftImmShift,
+  int ShiftAmountField() const { return Bits(kShiftImmShift,
                                                     kShiftImmBits); }
   // with immediate
-  inline int RotateField() const { return Bits(kRotateShift, kRotateBits); }
-  inline int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
+  int RotateField() const { return Bits(kRotateShift, kRotateBits); }
+  int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
 
   // Fields used in Load/Store instructions
-  inline int PUField() const { return Bits(23, 2); }
-  inline int  BField() const { return Bit(22); }
-  inline int  WField() const { return Bit(21); }
-  inline int  LField() const { return Bit(20); }
+  int PUField() const { return Bits(23, 2); }
+  int  BField() const { return Bit(22); }
+  int  WField() const { return Bit(21); }
+  int  LField() const { return Bit(20); }
   // with register uses same fields as Data processing instructions above
   // with immediate
-  inline int Offset12Field() const { return Bits(kOffset12Shift,
+  int Offset12Field() const { return Bits(kOffset12Shift,
                                                  kOffset12Bits); }
   // multiple
-  inline int RlistField() const { return Bits(0, 16); }
+  int RlistField() const { return Bits(0, 16); }
   // extra loads and stores
-  inline int SignField() const { return Bit(6); }
-  inline int HField() const { return Bit(5); }
-  inline int ImmedHField() const { return Bits(8, 4); }
-  inline int ImmedLField() const { return Bits(0, 4); }
+  int SignField() const { return Bit(6); }
+  int HField() const { return Bit(5); }
+  int ImmedHField() const { return Bits(8, 4); }
+  int ImmedLField() const { return Bits(0, 4); }
 
   // Fields used in Branch instructions
-  inline int LinkField() const { return Bits(kLinkShift, kLinkBits); }
-  inline int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
+  int LinkField() const { return Bits(kLinkShift, kLinkBits); }
+  int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
 
   // Fields used in Supervisor Call instructions
-  inline uint32_t SvcField() const { return Bits(0, 24); }
+  uint32_t SvcField() const { return Bits(0, 24); }
 
   // Field used in Breakpoint instruction
-  inline uint16_t BkptField() const {
+  uint16_t BkptField() const {
     return ((Bits(8, 12) << 4) | Bits(0, 4));
   }
 
   // Field used in 16-bit immediate move instructions
-  inline uint16_t MovwField() const {
+  uint16_t MovwField() const {
     return ((Bits(16, 4) << 12) | Bits(0, 12));
   }
 
   // Field used in VFP float immediate move instruction
-  inline float ImmFloatField() const {
+  float ImmFloatField() const {
     uint32_t imm32 = (Bit(19) << 31) | (((1 << 5) - Bit(18)) << 25) |
                      (Bits(16, 2) << 23) | (Bits(0, 4) << 19);
     return bit_cast<float, uint32_t>(imm32);
   }
 
   // Field used in VFP double immediate move instruction
-  inline double ImmDoubleField() const {
+  double ImmDoubleField() const {
     uint64_t imm64 = (Bit(19)*(1LL << 63)) | (((1LL << 8) - Bit(18)) << 54) |
                      (Bits(16, 2)*(1LL << 52)) | (Bits(0, 4)*(1LL << 48));
     return bit_cast<double, uint64_t>(imm64);
@@ -347,7 +347,7 @@
   // Test for data processing instructions of type 0 or 1.
   // See "ARM Architecture Reference Manual ARMv7-A and ARMv7-R edition",
   // section A5.1 "ARM instruction set encoding".
-  inline bool IsDataProcessing() const {
+  bool IsDataProcessing() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bits(20, 5) & 0x19) != 0x10) &&
@@ -359,47 +359,47 @@
   // Tests for special encodings of type 0 instructions (extra loads and stores,
   // as well as multiplications, synchronization primitives, and miscellaneous).
   // Can only be called for a type 0 or 1 instruction.
-  inline bool IsMiscellaneous() const {
+  bool IsMiscellaneous() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && ((Bits(20, 5) & 0x19) == 0x10) && (Bit(7) == 0));
   }
-  inline bool IsMultiplyOrSyncPrimitive() const {
+  bool IsMultiplyOrSyncPrimitive() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && (Bits(4, 4) == 9));
   }
 
   // Test for Supervisor Call instruction.
-  inline bool IsSvc() const {
+  bool IsSvc() const {
     return ((InstructionBits() & 0xff000000) == 0xef000000);
   }
 
   // Test for Breakpoint instruction.
-  inline bool IsBkpt() const {
+  bool IsBkpt() const {
     return ((InstructionBits() & 0xfff000f0) == 0xe1200070);
   }
 
   // VFP register fields.
-  inline SRegister SnField() const {
+  SRegister SnField() const {
     return static_cast<SRegister>((Bits(kRnShift, kRnBits) << 1) + Bit(7));
   }
-  inline SRegister SdField() const {
+  SRegister SdField() const {
     return static_cast<SRegister>((Bits(kRdShift, kRdBits) << 1) + Bit(22));
   }
-  inline SRegister SmField() const {
+  SRegister SmField() const {
     return static_cast<SRegister>((Bits(kRmShift, kRmBits) << 1) + Bit(5));
   }
-  inline DRegister DnField() const {
+  DRegister DnField() const {
     return static_cast<DRegister>(Bits(kRnShift, kRnBits) + (Bit(7) << 4));
   }
-  inline DRegister DdField() const {
+  DRegister DdField() const {
     return static_cast<DRegister>(Bits(kRdShift, kRdBits) + (Bit(22) << 4));
   }
-  inline DRegister DmField() const {
+  DRegister DmField() const {
     return static_cast<DRegister>(Bits(kRmShift, kRmBits) + (Bit(5) << 4));
   }
 
   // Test for VFP data processing or single transfer instructions of type 7.
-  inline bool IsVFPDataProcessingOrSingleTransfer() const {
+  bool IsVFPDataProcessingOrSingleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 7);
     return ((Bit(24) == 0) && (Bits(9, 3) == 5));
@@ -408,7 +408,7 @@
   }
 
   // Test for VFP 64-bit transfer instructions of type 6.
-  inline bool IsVFPDoubleTransfer() const {
+  bool IsVFPDoubleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(21, 4) == 2) && (Bits(9, 3) == 5) &&
@@ -416,20 +416,20 @@
   }
 
   // Test for VFP load and store instructions of type 6.
-  inline bool IsVFPLoadStore() const {
+  bool IsVFPLoadStore() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(20, 5) & 0x12) == 0x10) && (Bits(9, 3) == 5);
   }
 
   // Special accessors that test for existence of a value.
-  inline bool HasS() const { return SField() == 1; }
-  inline bool HasB() const { return BField() == 1; }
-  inline bool HasW() const { return WField() == 1; }
-  inline bool HasL() const { return LField() == 1; }
-  inline bool HasSign() const { return SignField() == 1; }
-  inline bool HasH() const { return HField() == 1; }
-  inline bool HasLink() const { return LinkField() == 1; }
+  bool HasS() const { return SField() == 1; }
+  bool HasB() const { return BField() == 1; }
+  bool HasW() const { return WField() == 1; }
+  bool HasL() const { return LField() == 1; }
+  bool HasSign() const { return SignField() == 1; }
+  bool HasH() const { return HField() == 1; }
+  bool HasLink() const { return LinkField() == 1; }
 
   // Instructions are read out of a code stream. The only way to get a
   // reference to an instruction is to convert a pointer. There is no way
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index f8b91d7..5b2c8ba 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -155,7 +155,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread(ThreadOffset offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -163,7 +163,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread(ThreadOffset tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -172,7 +172,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread(ThreadOffset tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
   // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
   StoreToOffset(IP1, TR, tr_offs.Int32Value());
@@ -269,7 +269,7 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
 }
 
@@ -294,7 +294,7 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
@@ -322,8 +322,8 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                          ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                          ThreadOffset<4> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -331,7 +331,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread(ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -486,7 +486,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -555,7 +555,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,7 +569,7 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 44eb6ff..3abcaad 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -79,7 +79,7 @@
 
 class Arm64Exception;
 
-class Arm64Assembler : public Assembler {
+class Arm64Assembler FINAL : public Assembler {
  public:
   Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
   vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
@@ -111,105 +111,97 @@
   // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size, ManagedRegister method_reg,
                   const std::vector<ManagedRegister>& callee_save_regs,
-                  const ManagedRegisterEntrySpills& entry_spills);
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size,
-                   const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  void IncreaseFrameSize(size_t adjust);
-  void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines.
-  void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  void StoreRef(FrameOffset dest, ManagedRegister src);
-  void StoreRawPtr(FrameOffset dest, ManagedRegister src);
-  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
-  void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
-  void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
-  void StoreStackPointerToThread(ThreadOffset thr_offs);
-  void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
-  void Load(ManagedRegister dest, FrameOffset src, size_t size);
-  void Load(ManagedRegister dest, ThreadOffset src, size_t size);
-  void LoadRef(ManagedRegister dest, FrameOffset  src);
-  void LoadRef(ManagedRegister dest, ManagedRegister base,
-               MemberOffset offs);
-  void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                  Offset offs);
-  void LoadRawPtrFromThread(ManagedRegister dest,
-                            ThreadOffset offs);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+
   // Copying routines.
-  void Move(ManagedRegister dest, ManagedRegister src, size_t size);
-  void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                            ManagedRegister scratch);
-  void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                          ManagedRegister scratch);
-  void CopyRef(FrameOffset dest, FrameOffset src,
-               ManagedRegister scratch);
-  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-            ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest, Offset dest_offset,
-            ManagedRegister src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
   void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void MemoryBarrier(ManagedRegister scratch);
+            ManagedRegister scratch, size_t size) OVERRIDE;
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension.
-  void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension.
-  void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current().
-  void GetCurrentThread(ManagedRegister tr);
-  void GetCurrentThread(FrameOffset dest_offset,
-                        ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
   void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                       ManagedRegister in_reg, bool null_allowed);
+                       ManagedRegister in_reg, bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
   void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                       ManagedRegister scratch, bool null_allowed);
+                       ManagedRegister scratch, bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst.
-  void LoadReferenceFromSirt(ManagedRegister dst,
-                             ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  void VerifyObject(ManagedRegister src, bool could_be_null);
-  void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset].
-  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch);
-  void Call(FrameOffset base, Offset offset, ManagedRegister scratch);
-  void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   static vixl::Register reg_x(int code) {
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index c05c2f1..ecf9fbe 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -29,7 +29,7 @@
 namespace art {
 namespace arm64 {
 
-  constexpr unsigned int kCalleeSavedRegsSize = 20;
+constexpr unsigned int kCalleeSavedRegsSize = 20;
 
 }  // arm64
 }  // art
diff --git a/compiler/utils/assembler.cc b/compiler/utils/assembler.cc
index 1921b28..26bdceb 100644
--- a/compiler/utils/assembler.cc
+++ b/compiler/utils/assembler.cc
@@ -122,4 +122,78 @@
   }
 }
 
+void Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
 }  // namespace art
diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h
index c23fd44..219c87f 100644
--- a/compiler/utils/assembler.h
+++ b/compiler/utils/assembler.h
@@ -374,14 +374,20 @@
   virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
                                      ManagedRegister scratch) = 0;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch) = 0;
+  virtual void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                        ManagedRegister scratch);
+  virtual void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                        ManagedRegister scratch);
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch) = 0;
+  virtual void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
+  virtual void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs) = 0;
+  virtual void StoreStackPointerToThread32(ThreadOffset<4> thr_offs);
+  virtual void StoreStackPointerToThread64(ThreadOffset<8> thr_offs);
 
   virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
                              FrameOffset in_off, ManagedRegister scratch) = 0;
@@ -389,27 +395,29 @@
   // Load routines
   virtual void Load(ManagedRegister dest, FrameOffset src, size_t size) = 0;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size) = 0;
+  virtual void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size);
+  virtual void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size);
 
   virtual void LoadRef(ManagedRegister dest, FrameOffset  src) = 0;
+  virtual void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) = 0;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs) = 0;
+  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) = 0;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs) = 0;
-
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs) = 0;
+  virtual void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs);
+  virtual void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs);
 
   // Copying routines
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                      ManagedRegister scratch);
+  virtual void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                      ManagedRegister scratch);
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
+  virtual void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
 
   virtual void CopyRef(FrameOffset dest, FrameOffset src,
                        ManagedRegister scratch) = 0;
@@ -471,7 +479,8 @@
                     ManagedRegister scratch) = 0;
   virtual void Call(FrameOffset base, Offset offset,
                     ManagedRegister scratch) = 0;
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch) = 0;
+  virtual void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch);
+  virtual void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index dfd3306..99c29f1 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -633,7 +633,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void MipsAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                            ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -641,7 +641,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), S1, dest.Int32Value());
 }
 
-void MipsAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void MipsAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                              FrameOffset fr_offs,
                                              ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -651,7 +651,7 @@
                 S1, thr_offs.Int32Value());
 }
 
-void MipsAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void MipsAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, S1, thr_offs.Int32Value());
 }
 
@@ -668,7 +668,7 @@
   return EmitLoad(mdest, SP, src.Int32Value(), size);
 }
 
-void MipsAssembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void MipsAssembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   return EmitLoad(mdest, S1, src.Int32Value(), size);
 }
 
@@ -697,8 +697,8 @@
                  base.AsMips().AsCoreRegister(), offs.Int32Value());
 }
 
-void MipsAssembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                         ThreadOffset offs) {
+void MipsAssembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                           ThreadOffset<4> offs) {
   MipsManagedRegister dest = mdest.AsMips();
   CHECK(dest.IsCoreRegister());
   LoadFromOffset(kLoadWord, dest.AsCoreRegister(), S1, offs.Int32Value());
@@ -748,8 +748,8 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                         ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                         ThreadOffset<4> thr_offs,
                                          ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -759,7 +759,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                        FrameOffset fr_offs,
                                        ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -923,7 +923,7 @@
   // TODO: place reference map on call
 }
 
-void MipsAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*mscratch*/) {
+void MipsAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*mscratch*/) {
   UNIMPLEMENTED(FATAL) << "no mips implementation";
 }
 
@@ -941,7 +941,7 @@
   MipsExceptionSlowPath* slow = new MipsExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 S1, Thread::ExceptionOffset().Int32Value());
+                 S1, Thread::ExceptionOffset<4>().Int32Value());
   EmitBranch(scratch.AsCoreRegister(), ZERO, slow->Entry(), false);
 }
 
@@ -956,7 +956,7 @@
   // Don't care about preserving A0 as this call won't return
   __ Move(A0, scratch_.AsCoreRegister());
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ Jr(T9);
   // Call never returns
   __ Break();
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 0d1a94c..75ee8b9 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -29,171 +29,6 @@
 
 namespace art {
 namespace mips {
-#if 0
-class Operand {
- public:
-  uint8_t mod() const {
-    return (encoding_at(0) >> 6) & 3;
-  }
-
-  Register rm() const {
-    return static_cast<Register>(encoding_at(0) & 7);
-  }
-
-  ScaleFactor scale() const {
-    return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
-  }
-
-  Register index() const {
-    return static_cast<Register>((encoding_at(1) >> 3) & 7);
-  }
-
-  Register base() const {
-    return static_cast<Register>(encoding_at(1) & 7);
-  }
-
-  int8_t disp8() const {
-    CHECK_GE(length_, 2);
-    return static_cast<int8_t>(encoding_[length_ - 1]);
-  }
-
-  int32_t disp32() const {
-    CHECK_GE(length_, 5);
-    int32_t value;
-    memcpy(&value, &encoding_[length_ - 4], sizeof(value));
-    return value;
-  }
-
-  bool IsRegister(Register reg) const {
-    return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
-  }
-
- protected:
-  // Operand can be sub classed (e.g: Address).
-  Operand() : length_(0) { }
-
-  void SetModRM(int mod, Register rm) {
-    CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
-    length_ = 1;
-  }
-
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
-    CHECK_EQ(length_, 1);
-    CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
-    length_ = 2;
-  }
-
-  void SetDisp8(int8_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    encoding_[length_++] = static_cast<uint8_t>(disp);
-  }
-
-  void SetDisp32(int32_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    int disp_size = sizeof(disp);
-    memmove(&encoding_[length_], &disp, disp_size);
-    length_ += disp_size;
-  }
-
- private:
-  byte length_;
-  byte encoding_[6];
-  byte padding_;
-
-  explicit Operand(Register reg) { SetModRM(3, reg); }
-
-  // Get the operand encoding byte at the given index.
-  uint8_t encoding_at(int index) const {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, length_);
-    return encoding_[index];
-  }
-
-  friend class MipsAssembler;
-
-  DISALLOW_COPY_AND_ASSIGN(Operand);
-};
-
-
-class Address : public Operand {
- public:
-  Address(Register base, int32_t disp) {
-    Init(base, disp);
-  }
-
-  Address(Register base, Offset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, ESP);
-    Init(ESP, disp.Int32Value());
-  }
-
-  Address(Register base, MemberOffset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp32(disp);
-    }
-  }
-
-
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    SetModRM(0, ESP);
-    SetSIB(scale, index, EBP);
-    SetDisp32(disp);
-  }
-
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, ESP);
-      SetSIB(scale, index, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, ESP);
-      SetSIB(scale, index, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, ESP);
-      SetSIB(scale, index, base);
-      SetDisp32(disp);
-    }
-  }
-
-  static Address Absolute(uword addr) {
-    Address result;
-    result.SetModRM(0, EBP);
-    result.SetDisp32(addr);
-    return result;
-  }
-
-  static Address Absolute(ThreadOffset addr) {
-    return Absolute(addr.Int32Value());
-  }
-
- private:
-  Address() {}
-
-  DISALLOW_COPY_AND_ASSIGN(Address);
-};
-
-#endif
 
 enum LoadOperandType {
   kLoadSignedByte,
@@ -215,7 +50,7 @@
   kStoreDWord
 };
 
-class MipsAssembler : public Assembler {
+class MipsAssembler FINAL : public Assembler {
  public:
   MipsAssembler() {}
   virtual ~MipsAssembler() {}
@@ -310,40 +145,6 @@
   void StoreFToOffset(FRegister reg, Register base, int32_t offset);
   void StoreDToOffset(DRegister reg, Register base, int32_t offset);
 
-#if 0
-  MipsAssembler* lock();
-
-  void mfence();
-
-  MipsAssembler* fs();
-
-  //
-  // Macros for High-level operations.
-  //
-
-  void AddImmediate(Register reg, const Immediate& imm);
-
-  void LoadDoubleConstant(XmmRegister dst, double value);
-
-  void DoubleNegate(XmmRegister d);
-  void FloatNegate(XmmRegister f);
-
-  void DoubleAbs(XmmRegister reg);
-
-  void LockCmpxchgl(const Address& address, Register reg) {
-    lock()->cmpxchgl(address, reg);
-  }
-
-  //
-  // Misc. functionality
-  //
-  int PreferredLoopAlignment() { return 16; }
-  void Align(int alignment, int offset);
-
-  // Debugging and bringup support.
-  void Stop(const char* message);
-#endif
-
   // Emit data (e.g. encoded instruction or immediate) to the instruction stream.
   void Emit(int32_t value);
   void EmitBranch(Register rt, Register rs, Label* label, bool equal);
@@ -355,127 +156,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister msrc, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister msrc);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister msrc);
+  void Store(FrameOffset offs, ManagedRegister msrc, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister mscratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister mscratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister mscratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister mscratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister msrc,
-                             FrameOffset in_off, ManagedRegister mscratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister msrc, FrameOffset in_off,
+                     ManagedRegister mscratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister mdest, FrameOffset src, size_t size);
+  void Load(ManagedRegister mdest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister mdest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister mdest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister mdest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister mdest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister mdest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister mdest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size);
+  void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister mscratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister mscratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                            ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister mscratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister mscratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister mscratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister mscratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister mscratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister mscratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister mscratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
@@ -491,11 +281,11 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class MipsExceptionSlowPath : public SlowPath {
+class MipsExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit MipsExceptionSlowPath(MipsManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  virtual void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const MipsManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index ebbb43a..aac8b01 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1478,12 +1478,12 @@
   movl(Address(ESP, dest), Immediate(imm));
 }
 
-void X86Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void X86Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                           ManagedRegister) {
   fs()->movl(Address::Absolute(dest), Immediate(imm));
 }
 
-void X86Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void X86Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1492,14 +1492,10 @@
   fs()->movl(Address::Absolute(thr_offs), scratch.AsCpuRegister());
 }
 
-void X86Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void X86Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   fs()->movl(Address::Absolute(thr_offs), ESP);
 }
 
-void X86Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  fs()->movl(Address::Absolute(thr_offs), lbl);
-}
-
 void X86Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
                                  FrameOffset /*in_off*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);  // this case only currently exists for ARM
@@ -1532,7 +1528,7 @@
   }
 }
 
-void X86Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86Assembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   X86ManagedRegister dest = mdest.AsX86();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
@@ -1542,7 +1538,7 @@
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     fs()->movl(dest.AsRegisterPairLow(), Address::Absolute(src));
-    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4)));
+    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset<4>(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       fs()->flds(Address::Absolute(src));
@@ -1582,8 +1578,8 @@
   movl(dest.AsCpuRegister(), Address(base.AsX86().AsCpuRegister(), offs));
 }
 
-void X86Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86Assembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                        ThreadOffset<4> offs) {
   X86ManagedRegister dest = mdest.AsX86();
   CHECK(dest.IsCpuRegister());
   fs()->movl(dest.AsCpuRegister(), Address::Absolute(offs));
@@ -1645,8 +1641,8 @@
   movl(Address(ESP, dest), scratch.AsCpuRegister());
 }
 
-void X86Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
   CHECK(scratch.IsCpuRegister());
@@ -1654,7 +1650,7 @@
   Store(fr_offs, scratch, 4);
 }
 
-void X86Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1804,26 +1800,26 @@
   call(Address(scratch, offset));
 }
 
-void X86Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister /*mscratch*/) {
   fs()->call(Address::Absolute(offset));
 }
 
 void X86Assembler::GetCurrentThread(ManagedRegister tr) {
   fs()->movl(tr.AsX86().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset()));
+             Address::Absolute(Thread::SelfOffset<4>()));
 }
 
 void X86Assembler::GetCurrentThread(FrameOffset offset,
                                     ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
-  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset()));
+  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<4>()));
   movl(Address(ESP, offset), scratch.AsCpuRegister());
 }
 
 void X86Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
   X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset()), Immediate(0));
+  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset<4>()), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
@@ -1836,8 +1832,8 @@
     __ DecreaseFrameSize(stack_adjust_);
   }
   // Pass exception as argument in EAX
-  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset()));
-  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException)));
+  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset<4>()));
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException)));
   // this call should never return
   __ int3();
 #undef __
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index f906a6f..f8fc4c0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -117,7 +117,6 @@
  private:
   byte length_;
   byte encoding_[6];
-  byte padding_;
 
   explicit Operand(Register reg) { SetModRM(3, reg); }
 
@@ -192,21 +191,15 @@
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  static Address Absolute(uword addr) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, ESP);
-      result.SetSIB(TIMES_1, ESP, EBP);
-      result.SetDisp32(addr);
-    } else {
-      result.SetModRM(0, EBP);
-      result.SetDisp32(addr);
-    }
+    result.SetModRM(0, EBP);
+    result.SetDisp32(addr);
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  static Address Absolute(ThreadOffset<4> addr) {
+    return Absolute(addr.Int32Value());
   }
 
  private:
@@ -465,129 +458,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   inline void EmitUint8(uint8_t value);
@@ -637,10 +617,10 @@
 }
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
+class X86ExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;  // OVERRIDE already implies virtual.
  private:
   const size_t stack_adjust_;
 };
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index fa302c9..52b9382 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -24,23 +24,29 @@
 namespace art {
 namespace x86_64 {
 
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg) {
+  return os << reg.AsRegister();
+}
+
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg) {
-  return os << "XMM" << static_cast<int>(reg);
+  return os << reg.AsFloatRegister();
 }
 
 std::ostream& operator<<(std::ostream& os, const X87Register& reg) {
   return os << "ST" << static_cast<int>(reg);
 }
 
-void X86_64Assembler::call(Register reg) {
+void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(2, reg);
+  EmitRegisterOperand(2, reg.LowBits());
 }
 
 
 void X86_64Assembler::call(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(2, address);
 }
@@ -54,15 +60,16 @@
 }
 
 
-void X86_64Assembler::pushq(Register reg) {
+void X86_64Assembler::pushq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x50 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x50 + reg.LowBits());
 }
 
 
 void X86_64Assembler::pushq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(6, address);
 }
@@ -80,332 +87,335 @@
 }
 
 
-void X86_64Assembler::popq(Register reg) {
+void X86_64Assembler::popq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x58 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x58 + reg.LowBits());
 }
 
 
 void X86_64Assembler::popq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0x8F);
   EmitOperand(0, address);
 }
 
 
-void X86_64Assembler::movq(Register dst, const Immediate& imm) {
+void X86_64Assembler::movq(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
-  EmitUint8(0xB8 + dst);
+  EmitRex64(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Immediate& imm) {
+void X86_64Assembler::movl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xB8 + dst);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movq(Register dst, Register src) {
+void X86_64Assembler::movq(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(src, dst);  // 0x89 is the MR form: src is the reg field, dst the r/m field.
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movl(Register dst, Register src) {
+void X86_64Assembler::movl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(src, dst);  // 0x89 is the MR form: src is the reg field, dst the r/m.
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movq(Register dst, const Address& src) {
+void X86_64Assembler::movq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Address& src) {
+void X86_64Assembler::movl(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movq(const Address& dst, Register src) {
+void X86_64Assembler::movq(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 8);
+  EmitRex64(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movl(const Address& dst, Register src) {
+void X86_64Assembler::movl(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 4);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
-
 void X86_64Assembler::movl(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitUint8(0xC7);
   EmitOperand(0, dst);
   EmitImmediate(imm);
 }
 
-void X86_64Assembler::movl(const Address& dst, Label* lbl) {
+void X86_64Assembler::movzxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xC7);
-  EmitOperand(0, dst);
-  EmitLabel(lbl, dst.length_ + 5);
-}
-
-void X86_64Assembler::movzxb(Register dst, ByteRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxb(Register dst, const Address& src) {
+void X86_64Assembler::movzxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxb(Register dst, ByteRegister src) {
+void X86_64Assembler::movsxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxb(Register dst, const Address& src) {
+void X86_64Assembler::movsxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movb(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movb(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxb or movsxb instead.";
 }
 
 
-void X86_64Assembler::movb(const Address& dst, ByteRegister src) {
+void X86_64Assembler::movb(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(src, dst);
   EmitUint8(0x88);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movb(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xC6);
-  EmitOperand(RAX, dst);
+  EmitOperand(Register::RAX, dst);
   CHECK(imm.is_int8());
   EmitUint8(imm.value() & 0xFF);
 }
 
 
-void X86_64Assembler::movzxw(Register dst, Register src) {
+void X86_64Assembler::movzxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxw(Register dst, const Address& src) {
+void X86_64Assembler::movzxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxw(Register dst, Register src) {
+void X86_64Assembler::movsxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxw(Register dst, const Address& src) {
+void X86_64Assembler::movsxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movw(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movw(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxw or movsxw instead.";
 }
 
 
-void X86_64Assembler::movw(const Address& dst, Register src) {
+void X86_64Assembler::movw(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOperandSizeOverride();
+  EmitOptionalRex32(src, dst);  // REX must be the last prefix, after the size override.
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::leaq(Register dst, const Address& src) {
+void X86_64Assembler::leaq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8D);
-  EmitOperand(dst, src);
-}
-
-
-void X86_64Assembler::cmovl(Condition condition, Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x40 + condition);
-  EmitRegisterOperand(dst, src);
-}
-
-
-void X86_64Assembler::setb(Condition condition, Register dst) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x90 + condition);
-  EmitOperand(0, Operand(dst));
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);  // 0x11 is the MR (store) form: reg = src, r/m = dst.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movd(XmmRegister dst, Register src) {
+void X86_64Assembler::movd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x6E);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::movd(Register dst, XmmRegister src) {
+void X86_64Assembler::movd(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x7E);
-  EmitOperand(src, Operand(dst));
+  EmitOperand(src.LowBits(), Operand(dst));
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -426,258 +436,287 @@
 void X86_64Assembler::movsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movsd(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);  // Match the (src, dst) order used for ModRM below.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsi2ss(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtsi2sd(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2sd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtss2sd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtsd2ss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtdq2pd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xE6);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::comiss(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::comisd(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::sqrtss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::andpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x54);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -766,92 +805,102 @@
 }
 
 
-void X86_64Assembler::xchgl(Register dst, Register src) {
+void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x87);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
-void X86_64Assembler::xchgl(Register reg, const Address& address) {
+void X86_64Assembler::xchgl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x87);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Immediate& imm) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(7, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::cmpl(Register reg0, Register reg1) {
+void X86_64Assembler::cmpl(CpuRegister reg0, CpuRegister reg1) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg0, reg1);
   EmitUint8(0x3B);
-  EmitOperand(reg0, Operand(reg1));
+  EmitOperand(reg0.LowBits(), Operand(reg1));
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Address& address) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x3B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::addl(Register dst, Register src) {
+void X86_64Assembler::addl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x03);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::addl(Register reg, const Address& address) {
+void X86_64Assembler::addl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x03);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(const Address& address, Register reg) {
+void X86_64Assembler::cmpl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x39);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::cmpl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(7, address, imm);
 }
 
 
-void X86_64Assembler::testl(Register reg1, Register reg2) {
+void X86_64Assembler::testl(CpuRegister reg1, CpuRegister reg2) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(reg1, reg2, 4);
+  EmitOptionalRex32(reg1, reg2);
   EmitUint8(0x85);
-  EmitRegisterOperand(reg1, reg2);
+  EmitRegisterOperand(reg1.LowBits(), reg2.LowBits());
 }
 
 
-void X86_64Assembler::testl(Register reg, const Immediate& immediate) {
+void X86_64Assembler::testl(CpuRegister reg, const Immediate& immediate) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   // For registers that have a byte variant (RAX, RBX, RCX, and RDX)
-  // we only test the byte register to keep the encoding short.
-  if (immediate.is_uint8() && reg < 4) {
+  // we only test the byte register to keep the encoding short.
+  if (immediate.is_uint8() && reg.AsRegister() < 4) {
     // Use zero-extended 8-bit immediate.
-    if (reg == RAX) {
+    if (reg.AsRegister() == RAX) {
       EmitUint8(0xA8);
     } else {
       EmitUint8(0xF6);
-      EmitUint8(0xC0 + reg);
+      EmitUint8(0xC0 + reg.AsRegister());
     }
     EmitUint8(immediate.value() & 0xFF);
-  } else if (reg == RAX) {
+  } else if (reg.AsRegister() == RAX) {
     // Use short form if the destination is RAX.
     EmitUint8(0xA9);
     EmitImmediate(immediate);
   } else {
+    EmitOptionalRex32(reg);
     EmitUint8(0xF7);
     EmitOperand(0, Operand(reg));
     EmitImmediate(immediate);
@@ -859,136 +908,145 @@
 }
 
 
-void X86_64Assembler::andl(Register dst, Register src) {
+void X86_64Assembler::andl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x23);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::andl(Register dst, const Immediate& imm) {
+void X86_64Assembler::andl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(4, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::orl(Register dst, Register src) {
+void X86_64Assembler::orl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::orl(Register dst, const Immediate& imm) {
+void X86_64Assembler::orl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(1, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::xorl(Register dst, Register src) {
+void X86_64Assembler::xorl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(dst, src, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x33);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
-void X86_64Assembler::rex_reg(Register &dst, size_t size) {
-  Register src = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex_rm(Register &src, size_t size) {
-  Register dst = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex(Register &dst, Register &src, size_t size) {
-  uint8_t rex = 0;
+#if 0
+void X86_64Assembler::rex(bool force, bool w, Register* r, Register* x, Register* b) {
   // REX.WRXB
   // W - 64-bit operand
   // R - MODRM.reg
   // X - SIB.index
   // B - MODRM.rm/SIB.base
-  if (size == 8) {
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
     rex |= 0x48;  // REX.W000
   }
-  if (dst >= Register::R8 && dst < Register::kNumberOfCpuRegisters) {
+  if (r != nullptr && *r >= Register::R8 && *r < Register::kNumberOfCpuRegisters) {
     rex |= 0x44;  // REX.0R00
-    dst = static_cast<Register>(dst - 8);
+    *r = static_cast<Register>(*r - 8);
   }
-  if (src >= Register::R8 && src < Register::kNumberOfCpuRegisters) {
+  if (x != nullptr && *x >= Register::R8 && *x < Register::kNumberOfCpuRegisters) {
+    rex |= 0x42;  // REX.00X0
+    *x = static_cast<Register>(*x - 8);
+  }
+  if (b != nullptr && *b >= Register::R8 && *b < Register::kNumberOfCpuRegisters) {
     rex |= 0x41;  // REX.000B
-    src = static_cast<Register>(src - 8);
+    *b = static_cast<Register>(*b - 8);
   }
   if (rex != 0) {
     EmitUint8(rex);
   }
 }
 
-void X86_64Assembler::addl(Register reg, const Immediate& imm) {
+void X86_64Assembler::rex_reg_mem(bool force, bool w, Register* dst, const Address& mem) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = mem->rex();
+  if (force) {
+    rex |= 0x40;  // REX.0000
+  }
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (dst != nullptr && *dst >= Register::R8 && *dst < Register::kNumberOfCpuRegisters) {
+    rex |= 0x44;  // REX.0R00
+    *dst = static_cast<Register>(*dst - 8);
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void rex_mem_reg(bool force, bool w, Address* mem, Register* src);
+#endif
+
+void X86_64Assembler::addl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addq(Register reg, const Immediate& imm) {
+void X86_64Assembler::addq(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addl(const Address& address, Register reg) {
+void X86_64Assembler::addl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x01);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::addl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(0, address, imm);
 }
 
 
-void X86_64Assembler::adcl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(2, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::adcl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, Operand(src));
-}
-
-
-void X86_64Assembler::adcl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::subl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x2B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::subl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitOptionalRex32(reg);
   EmitComplex(5, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::subl(Register reg, const Address& address) {
+void X86_64Assembler::subl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x2B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
@@ -998,39 +1056,44 @@
 }
 
 
-void X86_64Assembler::idivl(Register reg) {
+void X86_64Assembler::idivl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xF8 | reg);
+  EmitUint8(0xF8 | reg.LowBits());
 }
 
 
-void X86_64Assembler::imull(Register dst, Register src) {
+void X86_64Assembler::imull(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::imull(Register reg, const Immediate& imm) {
+void X86_64Assembler::imull(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, reg);  // reg fills both ModRM.reg and the rm operand.
   EmitUint8(0x69);
-  EmitOperand(reg, Operand(reg));
+  EmitOperand(reg.LowBits(), Operand(reg));
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::imull(Register reg, const Address& address) {
+void X86_64Assembler::imull(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::imull(Register reg) {
+void X86_64Assembler::imull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(5, Operand(reg));
 }
@@ -1038,13 +1101,15 @@
 
 void X86_64Assembler::imull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(5, address);
 }
 
 
-void X86_64Assembler::mull(Register reg) {
+void X86_64Assembler::mull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(4, Operand(reg));
 }
@@ -1052,106 +1117,56 @@
 
 void X86_64Assembler::mull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(4, address);
 }
 
 
-void X86_64Assembler::sbbl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, Operand(src));
-}
 
-
-void X86_64Assembler::sbbl(Register reg, const Immediate& imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(3, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::sbbl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::incl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x40 + reg);
-}
-
-
-void X86_64Assembler::incl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(0, address);
-}
-
-
-void X86_64Assembler::decl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48 + reg);
-}
-
-
-void X86_64Assembler::decl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(1, address);
-}
-
-
-void X86_64Assembler::shll(Register reg, const Immediate& imm) {
+void X86_64Assembler::shll(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(4, reg, imm);
 }
 
 
-void X86_64Assembler::shll(Register operand, Register shifter) {
+void X86_64Assembler::shll(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(4, operand, shifter);
 }
 
 
-void X86_64Assembler::shrl(Register reg, const Immediate& imm) {
+void X86_64Assembler::shrl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(5, reg, imm);
 }
 
 
-void X86_64Assembler::shrl(Register operand, Register shifter) {
+void X86_64Assembler::shrl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(5, operand, shifter);
 }
 
 
-void X86_64Assembler::sarl(Register reg, const Immediate& imm) {
+void X86_64Assembler::sarl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(7, reg, imm);
 }
 
 
-void X86_64Assembler::sarl(Register operand, Register shifter) {
+void X86_64Assembler::sarl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(7, operand, shifter);
 }
 
 
-void X86_64Assembler::shld(Register dst, Register src) {
+void X86_64Assembler::negl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0xA5);
-  EmitRegisterOperand(src, dst);
-}
-
-
-void X86_64Assembler::negl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(3, Operand(reg));
 }
 
 
-void X86_64Assembler::notl(Register reg) {
+void X86_64Assembler::notl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xD0 | reg);
+  EmitUint8(0xD0 | reg.LowBits());
 }
 
 
@@ -1228,14 +1243,16 @@
 }
 
 
-void X86_64Assembler::jmp(Register reg) {
+void X86_64Assembler::jmp(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(4, reg);
+  EmitRegisterOperand(4, reg.LowBits());
 }
 
 void X86_64Assembler::jmp(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(4, address);
 }
@@ -1268,11 +1285,11 @@
 }
 
 
-void X86_64Assembler::cmpxchgl(const Address& address, Register reg) {
+void X86_64Assembler::cmpxchgl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
   EmitUint8(0xB1);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 void X86_64Assembler::mfence() {
@@ -1289,19 +1306,12 @@
   return this;
 }
 
-void X86_64Assembler::AddImmediate(Register reg, const Immediate& imm) {
+void X86_64Assembler::AddImmediate(CpuRegister reg, const Immediate& imm) {
   int value = imm.value();
-  if (value > 0) {
-    if (value == 1) {
-      incl(reg);
-    } else if (value != 0) {
+  if (value != 0) {
+    if (value > 0) {
       addl(reg, imm);
-    }
-  } else if (value < 0) {
-    value = -value;
-    if (value == 1) {
-      decl(reg);
-    } else if (value != 0) {
-      subl(reg, Immediate(value));
+    } else {
+      subl(reg, Immediate(-value));  // value is negative here: subtract its magnitude.
     }
   }
@@ -1313,8 +1323,8 @@
   int64_t constant = bit_cast<int64_t, double>(value);
   pushq(Immediate(High32Bits(constant)));
   pushq(Immediate(Low32Bits(constant)));
-  movsd(dst, Address(RSP, 0));
-  addq(RSP, Immediate(2 * kWordSize));
+  movsd(dst, Address(CpuRegister(RSP), 0));
+  addq(CpuRegister(RSP), Immediate(2 * kWordSize));
 }
 
 
@@ -1372,7 +1382,7 @@
 }
 
 
-void X86_64Assembler::EmitOperand(int reg_or_opcode, const Operand& operand) {
+void X86_64Assembler::EmitOperand(uint8_t reg_or_opcode, const Operand& operand) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   const int length = operand.length_;
@@ -1392,9 +1402,9 @@
 }
 
 
-void X86_64Assembler::EmitComplex(int reg_or_opcode,
-                               const Operand& operand,
-                               const Immediate& immediate) {
+void X86_64Assembler::EmitComplex(uint8_t reg_or_opcode,
+                                  const Operand& operand,
+                                  const Immediate& immediate) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   if (immediate.is_int8()) {
@@ -1402,7 +1412,7 @@
     EmitUint8(0x83);
     EmitOperand(reg_or_opcode, operand);
     EmitUint8(immediate.value() & 0xFF);
-  } else if (operand.IsRegister(RAX)) {
+  } else if (operand.IsRegister(CpuRegister(RAX))) {
     // Use short form if the destination is eax.
     EmitUint8(0x05 + (reg_or_opcode << 3));
     EmitImmediate(immediate);
@@ -1434,7 +1444,7 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register reg,
+                                    CpuRegister reg,
                                     const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   CHECK(imm.is_int8());
@@ -1450,14 +1460,89 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register operand,
-                                    Register shifter) {
+                                    CpuRegister operand,
+                                    CpuRegister shifter) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  CHECK_EQ(shifter, RCX);
+  CHECK_EQ(shifter.AsRegister(), RCX);
   EmitUint8(0xD3);
   EmitOperand(reg_or_opcode, Operand(operand));
 }
 
+void X86_64Assembler::EmitOptionalRex(bool force, bool w, bool r, bool x, bool b) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (r) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (x) {
+    rex |= 0x42;  // REX.00X0
+  }
+  if (b) {
+    rex |= 0x41;  // REX.000B
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister reg) {
+  EmitOptionalRex(false, false, false, false, reg.NeedsRex());  // REX.B: reg is in ModRM.rm.
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister reg) {
+  EmitOptionalRex(false, true, false, false, reg.NeedsRex());  // REX.B: reg is in ModRM.rm.
+}
+void X86_64Assembler::EmitRex64(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, true, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(true, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
 void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                               const std::vector<ManagedRegister>& spill_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
@@ -1466,25 +1551,26 @@
     pushq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
   // return address then method on stack
-  addq(RSP, Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
-                      kPointerSize /*method*/ + kPointerSize /*return address*/));
+  addq(CpuRegister(RSP), Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
+                                   kPointerSize /*method*/ + kPointerSize /*return address*/));
   pushq(method_reg.AsX86_64().AsCpuRegister());
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     ManagedRegisterSpill spill = entry_spills.at(i);
     if (spill.AsX86_64().IsCpuRegister()) {
       if (spill.getSize() == 8) {
-        movq(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()),
+             spill.AsX86_64().AsCpuRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movl(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movl(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
       }
     } else {
       if (spill.getSize() == 8) {
-        movsd(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movsd(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movss(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movss(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       }
     }
   }
@@ -1493,7 +1579,7 @@
 void X86_64Assembler::RemoveFrame(size_t frame_size,
                             const std::vector<ManagedRegister>& spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
-  addq(RSP, Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
+  addq(CpuRegister(RSP), Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     popq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
@@ -1502,12 +1588,12 @@
 
 void X86_64Assembler::IncreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(-adjust));
+  addq(CpuRegister(RSP), Immediate(-adjust));
 }
 
 void X86_64Assembler::DecreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(adjust));
+  addq(CpuRegister(RSP), Immediate(adjust));
 }
 
 void X86_64Assembler::Store(FrameOffset offs, ManagedRegister msrc, size_t size) {
@@ -1517,28 +1603,28 @@
   } else if (src.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(Address(RSP, offs), src.AsCpuRegister());
+      movl(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     } else {
       CHECK_EQ(8u, size);
-      movq(Address(RSP, offs), src.AsCpuRegister());
+      movq(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     }
   } else if (src.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(Address(RSP, offs), src.AsRegisterPairLow());
-    movq(Address(RSP, FrameOffset(offs.Int32Value()+4)),
+    movq(Address(CpuRegister(RSP), offs), src.AsRegisterPairLow());
+    movq(Address(CpuRegister(RSP), FrameOffset(offs.Int32Value()+4)),
          src.AsRegisterPairHigh());
   } else if (src.IsX87Register()) {
     if (size == 4) {
-      fstps(Address(RSP, offs));
+      fstps(Address(CpuRegister(RSP), offs));
     } else {
-      fstpl(Address(RSP, offs));
+      fstpl(Address(CpuRegister(RSP), offs));
     }
   } else {
     CHECK(src.IsXmmRegister());
     if (size == 4) {
-      movss(Address(RSP, offs), src.AsXmmRegister());
+      movss(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     } else {
-      movsd(Address(RSP, offs), src.AsXmmRegister());
+      movsd(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     }
   }
 }
@@ -1546,40 +1632,36 @@
 void X86_64Assembler::StoreRef(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreRawPtr(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                         ManagedRegister) {
-  movl(Address(RSP, dest), Immediate(imm));  // TODO(64) movq?
+                                            ManagedRegister) {
+  movl(Address(CpuRegister(RSP), dest), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                          ManagedRegister) {
+void X86_64Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                               ManagedRegister) {
   gs()->movl(Address::Absolute(dest, true), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                            FrameOffset fr_offs,
-                                            ManagedRegister mscratch) {
+void X86_64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                                 FrameOffset fr_offs,
+                                                 ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  leaq(scratch.AsCpuRegister(), Address(RSP, fr_offs));
+  leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), fr_offs));
   gs()->movq(Address::Absolute(thr_offs, true), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
-  gs()->movq(Address::Absolute(thr_offs, true), RSP);
-}
-
-void X86_64Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  gs()->movl(Address::Absolute(thr_offs, true), lbl);  // TODO(64) movq?
+void X86_64Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  gs()->movq(Address::Absolute(thr_offs, true), CpuRegister(RSP));
 }
 
 void X86_64Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
@@ -1594,42 +1676,41 @@
   } else if (dest.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(dest.AsCpuRegister(), Address(RSP, src));
+      movl(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     } else {
       CHECK_EQ(8u, size);
-      movq(dest.AsCpuRegister(), Address(RSP, src));
+      movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     }
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(dest.AsRegisterPairLow(), Address(RSP, src));
-    movq(dest.AsRegisterPairHigh(), Address(RSP, FrameOffset(src.Int32Value()+4)));
+    movq(dest.AsRegisterPairLow(), Address(CpuRegister(RSP), src));
+    movq(dest.AsRegisterPairHigh(), Address(CpuRegister(RSP), FrameOffset(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
-      flds(Address(RSP, src));
+      flds(Address(CpuRegister(RSP), src));
     } else {
-      fldl(Address(RSP, src));
+      fldl(Address(CpuRegister(RSP), src));
     }
   } else {
     CHECK(dest.IsXmmRegister());
     if (size == 4) {
-      movss(dest.AsXmmRegister(), Address(RSP, src));
+      movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     } else {
-      movsd(dest.AsXmmRegister(), Address(RSP, src));
+      movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     }
   }
 }
 
-void X86_64Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86_64Assembler::LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
   } else if (dest.IsCpuRegister()) {
     CHECK_EQ(4u, size);
-    gs()->movq(dest.AsCpuRegister(), Address::Absolute(src, true));
+    gs()->movl(dest.AsCpuRegister(), Address::Absolute(src, true));
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     gs()->movq(dest.AsRegisterPairLow(), Address::Absolute(src, true));
-    gs()->movq(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4), true));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       gs()->flds(Address::Absolute(src, true));
@@ -1649,7 +1730,7 @@
 void X86_64Assembler::LoadRef(ManagedRegister mdest, FrameOffset  src) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
-  movq(dest.AsCpuRegister(), Address(RSP, src));
+  movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
 }
 
 void X86_64Assembler::LoadRef(ManagedRegister mdest, ManagedRegister base,
@@ -1666,8 +1747,7 @@
   movq(dest.AsCpuRegister(), Address(base.AsX86_64().AsCpuRegister(), offs));
 }
 
-void X86_64Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86_64Assembler::LoadRawPtrFromThread64(ManagedRegister mdest, ThreadOffset<8> offs) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
   gs()->movq(dest.AsCpuRegister(), Address::Absolute(offs, true));
@@ -1678,7 +1758,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movsxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movsxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movsxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1689,7 +1769,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movzxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movzxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movzxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1703,17 +1783,17 @@
       movq(dest.AsCpuRegister(), src.AsCpuRegister());
     } else if (src.IsX87Register() && dest.IsXmmRegister()) {
       // Pass via stack and pop X87 register
-      subl(RSP, Immediate(16));
+      subl(CpuRegister(RSP), Immediate(16));
       if (size == 4) {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstps(Address(RSP, 0));
-        movss(dest.AsXmmRegister(), Address(RSP, 0));
+        fstps(Address(CpuRegister(RSP), 0));
+        movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       } else {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstpl(Address(RSP, 0));
-        movsd(dest.AsXmmRegister(), Address(RSP, 0));
+        fstpl(Address(CpuRegister(RSP), 0));
+        movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       }
-      addq(RSP, Immediate(16));
+      addq(CpuRegister(RSP), Immediate(16));
     } else {
       // TODO: x87, SSE
       UNIMPLEMENTED(FATAL) << ": Move " << dest << ", " << src;
@@ -1725,22 +1805,22 @@
                            ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  movl(scratch.AsCpuRegister(), Address(RSP, src));
-  movl(Address(RSP, dest), scratch.AsCpuRegister());
+  movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), src));
+  movl(Address(CpuRegister(RSP), dest), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
-                                        ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                             ThreadOffset<8> thr_offs,
+                                             ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   gs()->movq(scratch.AsCpuRegister(), Address::Absolute(thr_offs, true));
   Store(fr_offs, scratch, 8);
 }
 
-void X86_64Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
-                                      FrameOffset fr_offs,
-                                      ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   Load(scratch, fr_offs, 8);
@@ -1771,17 +1851,17 @@
                         ManagedRegister scratch, size_t size) {
   CHECK(scratch.IsNoRegister());
   CHECK_EQ(size, 4u);
-  pushq(Address(RSP, src));
+  pushq(Address(CpuRegister(RSP), src));
   popq(Address(dest_base.AsX86_64().AsCpuRegister(), dest_offset));
 }
 
 void X86_64Assembler::Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
-  movq(scratch, Address(RSP, src_base));
+  movq(scratch, Address(CpuRegister(RSP), src_base));
   movq(scratch, Address(scratch, src_offset));
-  movq(Address(RSP, dest), scratch);
+  movq(Address(CpuRegister(RSP), dest), scratch);
 }
 
 void X86_64Assembler::Copy(ManagedRegister dest, Offset dest_offset,
@@ -1795,10 +1875,10 @@
 
 void X86_64Assembler::Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
   CHECK_EQ(dest.Int32Value(), src.Int32Value());
-  movq(scratch, Address(RSP, src));
+  movq(scratch, Address(CpuRegister(RSP), src));
   pushq(Address(scratch, src_offset));
   popq(Address(scratch, dest_offset));
 }
@@ -1818,7 +1898,7 @@
     // Use out_reg as indicator of NULL
     in_reg = out_reg;
     // TODO: movzwl
-    movl(in_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(in_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   CHECK(in_reg.IsCpuRegister());
   CHECK(out_reg.IsCpuRegister());
@@ -1830,10 +1910,10 @@
     }
     testl(in_reg.AsCpuRegister(), in_reg.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
 }
 
@@ -1845,13 +1925,13 @@
   CHECK(scratch.IsCpuRegister());
   if (null_allowed) {
     Label null_arg;
-    movl(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     testl(scratch.AsCpuRegister(), scratch.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   Store(out_off, scratch, 8);
 }
@@ -1889,35 +1969,42 @@
 }
 
 void X86_64Assembler::Call(FrameOffset base, Offset offset, ManagedRegister mscratch) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
-  movq(scratch, Address(RSP, base));
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
+  movq(scratch, Address(CpuRegister(RSP), base));
   call(Address(scratch, offset));
 }
 
-void X86_64Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86_64Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister /*mscratch*/) {
   gs()->call(Address::Absolute(offset, true));
 }
 
 void X86_64Assembler::GetCurrentThread(ManagedRegister tr) {
-  gs()->movq(tr.AsX86_64().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset(), true));
+  gs()->movq(tr.AsX86_64().AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
 }
 
-void X86_64Assembler::GetCurrentThread(FrameOffset offset,
-                                    ManagedRegister mscratch) {
+void X86_64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
-  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset(), true));
-  movq(Address(RSP, offset), scratch.AsCpuRegister());
+  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
+  movq(Address(CpuRegister(RSP), offset), scratch.AsCpuRegister());
 }
 
+// Slowpath entered when Thread::Current()->_exception is non-null
+class X86_64ExceptionSlowPath FINAL : public SlowPath {
+ public:
+  explicit X86_64ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
+  virtual void Emit(Assembler *sp_asm) OVERRIDE;
+ private:
+  const size_t stack_adjust_;
+};
+
 void X86_64Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
-  X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
+  X86_64ExceptionSlowPath* slow = new X86_64ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset(), true), Immediate(0));
+  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset<8>(), true), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
-void X86ExceptionSlowPath::Emit(Assembler *sasm) {
+void X86_64ExceptionSlowPath::Emit(Assembler *sasm) {
   X86_64Assembler* sp_asm = down_cast<X86_64Assembler*>(sasm);
 #define __ sp_asm->
   __ Bind(&entry_);
@@ -1925,27 +2012,14 @@
   if (stack_adjust_ != 0) {  // Fix up the frame.
     __ DecreaseFrameSize(stack_adjust_);
   }
-  // Pass exception as argument in RAX
-  __ gs()->movq(RAX, Address::Absolute(Thread::ExceptionOffset(), true));  // TODO(64): Pass argument via RDI
-  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException), true));
+  // Pass exception as argument in RDI
+  __ gs()->movq(CpuRegister(RDI), Address::Absolute(Thread::ExceptionOffset<8>(), true));
+  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(8, pDeliverException), true));
   // this call should never return
   __ int3();
 #undef __
 }
 
-static const char* kRegisterNames[] = {
-  "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
-  "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
-};
-
-std::ostream& operator<<(std::ostream& os, const Register& rhs) {
-  if (rhs >= RAX && rhs <= R15) {
-    os << kRegisterNames[rhs];
-  } else {
-    os << "Register[" << static_cast<int>(rhs) << "]";
-  }
-  return os;
-}
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index d48ba72..1d42d89 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -80,25 +80,30 @@
     return value;
   }
 
-  bool IsRegister(Register reg) const {
+  bool IsRegister(CpuRegister reg) const {
+    CHECK(!reg.NeedsRex()) << "TODO: rex support:" << reg;
     return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
+        && ((encoding_[0] & 0x07) == reg.LowBits());  // Register codes match.
   }
 
  protected:
   // Operand can be sub classed (e.g: Address).
   Operand() : length_(0) { }
 
-  void SetModRM(int mod, Register rm) {
+  void SetModRM(int mod, CpuRegister rm) {
     CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
+    CHECK(!rm.NeedsRex());
+    encoding_[0] = (mod << 6) | static_cast<uint8_t>(rm.AsRegister());
     length_ = 1;
   }
 
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
+  void SetSIB(ScaleFactor scale, CpuRegister index, CpuRegister base) {
+    CHECK(!index.NeedsRex()) << "TODO: rex support: " << index;
+    CHECK(!base.NeedsRex()) << "TODO: rex support: " << base;
     CHECK_EQ(length_, 1);
     CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
+    encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.AsRegister()) << 3) |
+        static_cast<uint8_t>(base.AsRegister());
     length_ = 2;
   }
 
@@ -117,9 +122,8 @@
  private:
   byte length_;
   byte encoding_[6];
-  byte padding_;
 
-  explicit Operand(Register reg) { SetModRM(3, reg); }
+  explicit Operand(CpuRegister reg) { SetModRM(3, reg); }
 
   // Get the operand encoding byte at the given index.
   uint8_t encoding_at(int index) const {
@@ -136,77 +140,85 @@
 
 class Address : public Operand {
  public:
-  Address(Register base, int32_t disp) {
+  Address(CpuRegister base, int32_t disp) {
     Init(base, disp);
   }
 
-  Address(Register base, Offset disp) {
+  Address(CpuRegister base, Offset disp) {
     Init(base, disp.Int32Value());
   }
 
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, RSP);
-    Init(RSP, disp.Int32Value());
+  Address(CpuRegister base, FrameOffset disp) {
+    CHECK_EQ(base.AsRegister(), RSP);
+    Init(CpuRegister(RSP), disp.Int32Value());
   }
 
-  Address(Register base, MemberOffset disp) {
+  Address(CpuRegister base, MemberOffset disp) {
     Init(base, disp.Int32Value());
   }
 
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != RBP) {
+  void Init(CpuRegister base, int32_t disp) {
+    if (disp == 0 && base.AsRegister() != RBP) {
       SetModRM(0, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
     } else if (disp >= -128 && disp <= 127) {
       SetModRM(1, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp8(disp);
     } else {
       SetModRM(2, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp32(disp);
     }
   }
 
 
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    SetModRM(0, RSP);
-    SetSIB(scale, index, RBP);
+  Address(CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    SetModRM(0, CpuRegister(RSP));
+    SetSIB(scale, index, CpuRegister(RBP));
     SetDisp32(disp);
   }
 
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    if (disp == 0 && base != RBP) {
-      SetModRM(0, RSP);
+  Address(CpuRegister base, CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    if (disp == 0 && base.AsRegister() != RBP) {
+      SetModRM(0, CpuRegister(RSP));
       SetSIB(scale, index, base);
     } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, RSP);
+      SetModRM(1, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp8(disp);
     } else {
-      SetModRM(2, RSP);
+      SetModRM(2, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp32(disp);
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(uword addr, bool no_rip = false) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, RSP);
-      result.SetSIB(TIMES_1, RSP, RBP);
+    if (no_rip) {
+      result.SetModRM(0, CpuRegister(RSP));
+      result.SetSIB(TIMES_1, CpuRegister(RSP), CpuRegister(RBP));
       result.SetDisp32(addr);
     } else {
-      result.SetModRM(0, RBP);
+      result.SetModRM(0, CpuRegister(RBP));
       result.SetDisp32(addr);
     }
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(ThreadOffset<8> addr, bool no_rip = false) {
+    return Absolute(addr.Int32Value(), no_rip);
   }
 
  private:
@@ -216,7 +228,7 @@
 };
 
 
-class X86_64Assembler : public Assembler {
+class X86_64Assembler FINAL : public Assembler {
  public:
   X86_64Assembler() {}
   virtual ~X86_64Assembler() {}
@@ -224,56 +236,51 @@
   /*
    * Emit Machine Instructions.
    */
-  void call(Register reg);
+  void call(CpuRegister reg);
   void call(const Address& address);
   void call(Label* label);
 
-  void pushq(Register reg);
+  void pushq(CpuRegister reg);
   void pushq(const Address& address);
   void pushq(const Immediate& imm);
 
-  void popq(Register reg);
+  void popq(CpuRegister reg);
   void popq(const Address& address);
 
-  void movq(Register dst, const Immediate& src);
-  void movl(Register dst, const Immediate& src);
-  void movq(Register dst, Register src);
-  void movl(Register dst, Register src);
+  void movq(CpuRegister dst, const Immediate& src);
+  void movl(CpuRegister dst, const Immediate& src);
+  void movq(CpuRegister dst, CpuRegister src);
+  void movl(CpuRegister dst, CpuRegister src);
 
-  void movq(Register dst, const Address& src);
-  void movl(Register dst, const Address& src);
-  void movq(const Address& dst, Register src);
-  void movl(const Address& dst, Register src);
+  void movq(CpuRegister dst, const Address& src);
+  void movl(CpuRegister dst, const Address& src);
+  void movq(const Address& dst, CpuRegister src);
+  void movl(const Address& dst, CpuRegister src);
   void movl(const Address& dst, const Immediate& imm);
-  void movl(const Address& dst, Label* lbl);
 
-  void movzxb(Register dst, ByteRegister src);
-  void movzxb(Register dst, const Address& src);
-  void movsxb(Register dst, ByteRegister src);
-  void movsxb(Register dst, const Address& src);
-  void movb(Register dst, const Address& src);
-  void movb(const Address& dst, ByteRegister src);
+  void movzxb(CpuRegister dst, CpuRegister src);
+  void movzxb(CpuRegister dst, const Address& src);
+  void movsxb(CpuRegister dst, CpuRegister src);
+  void movsxb(CpuRegister dst, const Address& src);
+  void movb(CpuRegister dst, const Address& src);
+  void movb(const Address& dst, CpuRegister src);
   void movb(const Address& dst, const Immediate& imm);
 
-  void movzxw(Register dst, Register src);
-  void movzxw(Register dst, const Address& src);
-  void movsxw(Register dst, Register src);
-  void movsxw(Register dst, const Address& src);
-  void movw(Register dst, const Address& src);
-  void movw(const Address& dst, Register src);
+  void movzxw(CpuRegister dst, CpuRegister src);
+  void movzxw(CpuRegister dst, const Address& src);
+  void movsxw(CpuRegister dst, CpuRegister src);
+  void movsxw(CpuRegister dst, const Address& src);
+  void movw(CpuRegister dst, const Address& src);
+  void movw(const Address& dst, CpuRegister src);
 
-  void leaq(Register dst, const Address& src);
-
-  void cmovl(Condition condition, Register dst, Register src);
-
-  void setb(Condition condition, Register dst);
+  void leaq(CpuRegister dst, const Address& src);
 
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
 
-  void movd(XmmRegister dst, Register src);
-  void movd(Register dst, XmmRegister src);
+  void movd(XmmRegister dst, CpuRegister src);
+  void movd(CpuRegister dst, XmmRegister src);
 
   void addss(XmmRegister dst, XmmRegister src);
   void addss(XmmRegister dst, const Address& src);
@@ -297,17 +304,17 @@
   void divsd(XmmRegister dst, XmmRegister src);
   void divsd(XmmRegister dst, const Address& src);
 
-  void cvtsi2ss(XmmRegister dst, Register src);
-  void cvtsi2sd(XmmRegister dst, Register src);
+  void cvtsi2ss(XmmRegister dst, CpuRegister src);
+  void cvtsi2sd(XmmRegister dst, CpuRegister src);
 
-  void cvtss2si(Register dst, XmmRegister src);
+  void cvtss2si(CpuRegister dst, XmmRegister src);
   void cvtss2sd(XmmRegister dst, XmmRegister src);
 
-  void cvtsd2si(Register dst, XmmRegister src);
+  void cvtsd2si(CpuRegister dst, XmmRegister src);
   void cvtsd2ss(XmmRegister dst, XmmRegister src);
 
-  void cvttss2si(Register dst, XmmRegister src);
-  void cvttsd2si(Register dst, XmmRegister src);
+  void cvttss2si(CpuRegister dst, XmmRegister src);
+  void cvttsd2si(CpuRegister dst, XmmRegister src);
 
   void cvtdq2pd(XmmRegister dst, XmmRegister src);
 
@@ -344,77 +351,62 @@
   void fcos();
   void fptan();
 
-  void xchgl(Register dst, Register src);
-  void xchgl(Register reg, const Address& address);
+  void xchgl(CpuRegister dst, CpuRegister src);
+  void xchgl(CpuRegister reg, const Address& address);
 
-  void cmpl(Register reg, const Immediate& imm);
-  void cmpl(Register reg0, Register reg1);
-  void cmpl(Register reg, const Address& address);
+  void cmpl(CpuRegister reg, const Immediate& imm);
+  void cmpl(CpuRegister reg0, CpuRegister reg1);
+  void cmpl(CpuRegister reg, const Address& address);
 
-  void cmpl(const Address& address, Register reg);
+  void cmpl(const Address& address, CpuRegister reg);
   void cmpl(const Address& address, const Immediate& imm);
 
-  void testl(Register reg1, Register reg2);
-  void testl(Register reg, const Immediate& imm);
+  void testl(CpuRegister reg1, CpuRegister reg2);
+  void testl(CpuRegister reg, const Immediate& imm);
 
-  void andl(Register dst, const Immediate& imm);
-  void andl(Register dst, Register src);
+  void andl(CpuRegister dst, const Immediate& imm);
+  void andl(CpuRegister dst, CpuRegister src);
 
-  void orl(Register dst, const Immediate& imm);
-  void orl(Register dst, Register src);
+  void orl(CpuRegister dst, const Immediate& imm);
+  void orl(CpuRegister dst, CpuRegister src);
 
-  void xorl(Register dst, Register src);
+  void xorl(CpuRegister dst, CpuRegister src);
 
-  void addl(Register dst, Register src);
-  void addq(Register reg, const Immediate& imm);
-  void addl(Register reg, const Immediate& imm);
-  void addl(Register reg, const Address& address);
+  void addl(CpuRegister dst, CpuRegister src);
+  void addq(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Address& address);
 
-  void addl(const Address& address, Register reg);
+  void addl(const Address& address, CpuRegister reg);
   void addl(const Address& address, const Immediate& imm);
 
-  void adcl(Register dst, Register src);
-  void adcl(Register reg, const Immediate& imm);
-  void adcl(Register dst, const Address& address);
-
-  void subl(Register dst, Register src);
-  void subl(Register reg, const Immediate& imm);
-  void subl(Register reg, const Address& address);
+  void subl(CpuRegister dst, CpuRegister src);
+  void subl(CpuRegister reg, const Immediate& imm);
+  void subl(CpuRegister reg, const Address& address);
 
   void cdq();
 
-  void idivl(Register reg);
+  void idivl(CpuRegister reg);
 
-  void imull(Register dst, Register src);
-  void imull(Register reg, const Immediate& imm);
-  void imull(Register reg, const Address& address);
+  void imull(CpuRegister dst, CpuRegister src);
+  void imull(CpuRegister reg, const Immediate& imm);
+  void imull(CpuRegister reg, const Address& address);
 
-  void imull(Register reg);
+  void imull(CpuRegister reg);
   void imull(const Address& address);
 
-  void mull(Register reg);
+  void mull(CpuRegister reg);
   void mull(const Address& address);
 
-  void sbbl(Register dst, Register src);
-  void sbbl(Register reg, const Immediate& imm);
-  void sbbl(Register reg, const Address& address);
+  void shll(CpuRegister reg, const Immediate& imm);
+  void shll(CpuRegister operand, CpuRegister shifter);
+  void shrl(CpuRegister reg, const Immediate& imm);
+  void shrl(CpuRegister operand, CpuRegister shifter);
+  void sarl(CpuRegister reg, const Immediate& imm);
+  void sarl(CpuRegister operand, CpuRegister shifter);
 
-  void incl(Register reg);
-  void incl(const Address& address);
-
-  void decl(Register reg);
-  void decl(const Address& address);
-
-  void shll(Register reg, const Immediate& imm);
-  void shll(Register operand, Register shifter);
-  void shrl(Register reg, const Immediate& imm);
-  void shrl(Register operand, Register shifter);
-  void sarl(Register reg, const Immediate& imm);
-  void sarl(Register operand, Register shifter);
-  void shld(Register dst, Register src);
-
-  void negl(Register reg);
-  void notl(Register reg);
+  void negl(CpuRegister reg);
+  void notl(CpuRegister reg);
 
   void enter(const Immediate& imm);
   void leave();
@@ -428,12 +420,12 @@
 
   void j(Condition condition, Label* label);
 
-  void jmp(Register reg);
+  void jmp(CpuRegister reg);
   void jmp(const Address& address);
   void jmp(Label* label);
 
   X86_64Assembler* lock();
-  void cmpxchgl(const Address& address, Register reg);
+  void cmpxchgl(const Address& address, CpuRegister reg);
 
   void mfence();
 
@@ -443,7 +435,7 @@
   // Macros for High-level operations.
   //
 
-  void AddImmediate(Register reg, const Immediate& imm);
+  void AddImmediate(CpuRegister reg, const Immediate& imm);
 
   void LoadDoubleConstant(XmmRegister dst, double value);
 
@@ -452,7 +444,7 @@
 
   void DoubleAbs(XmmRegister reg);
 
-  void LockCmpxchgl(const Address& address, Register reg) {
+  void LockCmpxchgl(const Address& address, CpuRegister reg) {  // Shorthand: lock() + cmpxchgl().
     lock()->cmpxchgl(address, reg);
   }
 
@@ -468,109 +460,99 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
   virtual void LoadReferenceFromSirt(ManagedRegister dst,
@@ -578,40 +560,57 @@
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
-  inline void EmitUint8(uint8_t value);
-  inline void EmitInt32(int32_t value);
-  inline void EmitRegisterOperand(int rm, int reg);
-  inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
-  inline void EmitFixup(AssemblerFixup* fixup);
-  inline void EmitOperandSizeOverride();
+  void EmitUint8(uint8_t value);
+  void EmitInt32(int32_t value);
+  void EmitRegisterOperand(uint8_t rm, uint8_t reg);
+  void EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg);
+  void EmitFixup(AssemblerFixup* fixup);
+  void EmitOperandSizeOverride();
 
-  void EmitOperand(int rm, const Operand& operand);
+  void EmitOperand(uint8_t rm, const Operand& operand);
   void EmitImmediate(const Immediate& imm);
-  void EmitComplex(int rm, const Operand& operand, const Immediate& immediate);
+  void EmitComplex(uint8_t rm, const Operand& operand, const Immediate& immediate);
   void EmitLabel(Label* label, int instruction_size);
   void EmitLabelLink(Label* label);
   void EmitNearLabelLink(Label* label);
 
-  void EmitGenericShift(int rm, Register reg, const Immediate& imm);
-  void EmitGenericShift(int rm, Register operand, Register shifter);
-  void rex(Register &dst, Register &src, size_t size = 4);
-  void rex_reg(Register &dst, size_t size = 4);
-  void rex_rm(Register &src, size_t size = 4);
+  void EmitGenericShift(int rm, CpuRegister reg, const Immediate& imm);
+  void EmitGenericShift(int rm, CpuRegister operand, CpuRegister shifter);
+
+  // Emit a REX prefix if any input is true; emit nothing if all are false.
+  void EmitOptionalRex(bool force, bool w, bool r, bool x, bool b);
+
+  // Emit a REX prefix byte if necessary for reg, i.e. if reg is in the range R8 to R15.
+  void EmitOptionalRex32(CpuRegister reg);
+  void EmitOptionalRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalRex32(XmmRegister dst, XmmRegister src);
+  void EmitOptionalRex32(CpuRegister dst, XmmRegister src);
+  void EmitOptionalRex32(XmmRegister dst, CpuRegister src);
+  void EmitOptionalRex32(const Operand& operand);
+  void EmitOptionalRex32(CpuRegister dst, const Operand& operand);
+  void EmitOptionalRex32(XmmRegister dst, const Operand& operand);
+
+  // Emit a REX.W prefix plus necessary register bit encodings.
+  void EmitRex64(CpuRegister reg);
+  void EmitRex64(CpuRegister dst, CpuRegister src);
+  void EmitRex64(CpuRegister dst, const Operand& operand);
+
+  // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
   DISALLOW_COPY_AND_ASSIGN(X86_64Assembler);
 };
@@ -624,14 +623,14 @@
   buffer_.Emit<int32_t>(value);
 }
 
-inline void X86_64Assembler::EmitRegisterOperand(int rm, int reg) {
+inline void X86_64Assembler::EmitRegisterOperand(uint8_t rm, uint8_t reg) {
   CHECK_GE(rm, 0);
   CHECK_LT(rm, 8);
-  buffer_.Emit<uint8_t>(0xC0 + (rm << 3) + reg);
+  buffer_.Emit<uint8_t>((0xC0 | (reg & 7)) + (rm << 3));  // ModRM mod=11; reg bit 3 goes in REX.
 }
 
-inline void X86_64Assembler::EmitXmmRegisterOperand(int rm, XmmRegister reg) {
-  EmitRegisterOperand(rm, static_cast<Register>(reg));
+inline void X86_64Assembler::EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg) {
+  EmitRegisterOperand(rm, reg.LowBits());  // XMM8-15 keep bit 3 in REX, not in this byte.
 }
 
 inline void X86_64Assembler::EmitFixup(AssemblerFixup* fixup) {
@@ -642,15 +641,6 @@
   EmitUint8(0x66);
 }
 
-// Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
- public:
-  explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
- private:
-  const size_t stack_adjust_;
-};
-
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 3340802..58a0379 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -27,30 +27,37 @@
 namespace art {
 namespace x86_64 {
 
-enum ByteRegister {
-  AL = 0,
-  CL = 1,
-  DL = 2,
-  BL = 3,
-  AH = 4,
-  CH = 5,
-  DH = 6,
-  BH = 7,
-  kNoByteRegister = -1  // Signals an illegal register.
+class CpuRegister {  // Value wrapper around a Register enumerator.
+ public:
+  explicit CpuRegister(Register r) : reg_(r) {}
+  Register AsRegister() const {  // The wrapped Register, unchanged.
+    return reg_;
+  }
+  uint8_t LowBits() const {  // Low three bits of the register number.
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {  // True when the register number is above 7.
+    return reg_ > 7;
+  }
+ private:
+  const Register reg_;
+};
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg);
 
-
-enum XmmRegister {
-  _XMM0 = 0,
-  _XMM1 = 1,
-  _XMM2 = 2,
-  _XMM3 = 3,
-  _XMM4 = 4,
-  _XMM5 = 5,
-  _XMM6 = 6,
-  _XMM7 = 7,
-  kNumberOfXmmRegisters = 8,
-  kNoXmmRegister = -1  // Signals an illegal register.
+class XmmRegister {  // Value wrapper around a FloatRegister enumerator.
+ public:
+  explicit XmmRegister(FloatRegister r) : reg_(r) {}
+  FloatRegister AsFloatRegister() const {  // The wrapped FloatRegister, unchanged.
+    return reg_;
+  }
+  uint8_t LowBits() const {  // Low three bits of the register number.
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {  // True when the register number is above 7.
+    return reg_ > 7;
+  }
+ private:
+  const FloatRegister reg_;
+};
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg);
 
diff --git a/compiler/utils/x86_64/managed_register_x86_64.cc b/compiler/utils/x86_64/managed_register_x86_64.cc
index 057a894..b8c2db2 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.cc
+++ b/compiler/utils/x86_64/managed_register_x86_64.cc
@@ -60,8 +60,8 @@
   CHECK(other.IsValidManagedRegister());
   if (Equals(other)) return true;
   if (IsRegisterPair()) {
-    Register low = AsRegisterPairLow();
-    Register high = AsRegisterPairHigh();
+    Register low = AsRegisterPairLow().AsRegister();
+    Register high = AsRegisterPairHigh().AsRegister();
     return X86_64ManagedRegister::FromCpuRegister(low).Overlaps(other) ||
         X86_64ManagedRegister::FromCpuRegister(high).Overlaps(other);
   }
@@ -94,11 +94,11 @@
   if (!IsValidManagedRegister()) {
     os << "No Register";
   } else if (IsXmmRegister()) {
-    os << "XMM: " << static_cast<int>(AsXmmRegister());
+    os << "XMM: " << static_cast<int>(AsXmmRegister().AsFloatRegister());
   } else if (IsX87Register()) {
     os << "X87: " << static_cast<int>(AsX87Register());
   } else if (IsCpuRegister()) {
-    os << "CPU: " << static_cast<int>(AsCpuRegister());
+    os << "CPU: " << static_cast<int>(AsCpuRegister().AsRegister());
   } else if (IsRegisterPair()) {
     os << "Pair: " << AsRegisterPairLow() << ", " << AsRegisterPairHigh();
   } else {
diff --git a/compiler/utils/x86_64/managed_register_x86_64.h b/compiler/utils/x86_64/managed_register_x86_64.h
index d68c59d..822659f 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.h
+++ b/compiler/utils/x86_64/managed_register_x86_64.h
@@ -46,8 +46,8 @@
 const int kNumberOfCpuRegIds = kNumberOfCpuRegisters;
 const int kNumberOfCpuAllocIds = kNumberOfCpuRegisters;
 
-const int kNumberOfXmmRegIds = kNumberOfXmmRegisters;
-const int kNumberOfXmmAllocIds = kNumberOfXmmRegisters;
+const int kNumberOfXmmRegIds = kNumberOfFloatRegisters;
+const int kNumberOfXmmAllocIds = kNumberOfFloatRegisters;
 
 const int kNumberOfX87RegIds = kNumberOfX87Registers;
 const int kNumberOfX87AllocIds = kNumberOfX87Registers;
@@ -87,20 +87,14 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class X86_64ManagedRegister : public ManagedRegister {
  public:
-  ByteRegister AsByteRegister() const {
+  CpuRegister AsCpuRegister() const {
     CHECK(IsCpuRegister());
-    CHECK_LT(AsCpuRegister(), RSP);  // RSP, RBP, ESI and RDI cannot be encoded as byte registers.
-    return static_cast<ByteRegister>(id_);
-  }
-
-  Register AsCpuRegister() const {
-    CHECK(IsCpuRegister());
-    return static_cast<Register>(id_);
+    return CpuRegister(static_cast<Register>(id_));
   }
 
   XmmRegister AsXmmRegister() const {
     CHECK(IsXmmRegister());
-    return static_cast<XmmRegister>(id_ - kNumberOfCpuRegIds);
+    return XmmRegister(static_cast<FloatRegister>(id_ - kNumberOfCpuRegIds));
   }
 
   X87Register AsX87Register() const {
@@ -109,13 +103,13 @@
                                     (kNumberOfCpuRegIds + kNumberOfXmmRegIds));
   }
 
-  Register AsRegisterPairLow() const {
+  CpuRegister AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdLow().
     return FromRegId(AllocIdLow()).AsCpuRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  CpuRegister AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCpuRegister();
@@ -157,8 +151,7 @@
     return FromRegId(r);
   }
 
-  static X86_64ManagedRegister FromXmmRegister(XmmRegister r) {
-    CHECK_NE(r, kNoXmmRegister);
+  static X86_64ManagedRegister FromXmmRegister(FloatRegister r) {
     return FromRegId(r + kNumberOfCpuRegIds);
   }
 
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 55fd52f..899aa78 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -305,7 +305,7 @@
           }
           if (rn.r == 9) {
             args << "  ; ";
-            Thread::DumpThreadOffset(args, offset, 4);
+            Thread::DumpThreadOffset<4>(args, offset);
           }
         }
       }
@@ -1291,7 +1291,7 @@
               args << Rt << ", [" << Rn << ", #" << imm12 << "]";
               if (Rn.r == 9) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, imm12, 4);
+                Thread::DumpThreadOffset<4>(args, imm12);
               } else if (Rn.r == 15) {
                 intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
                 lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
@@ -1304,7 +1304,7 @@
               args << Rt << ", [" << Rn << ", #" << imm12 << "]";
               if (Rn.r == 9) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, imm12, 4);
+                Thread::DumpThreadOffset<4>(args, imm12);
               } else if (Rn.r == 15) {
                 intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
                 lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
@@ -1361,7 +1361,7 @@
             args << Rt << ", [" << Rn << ", #" << imm12 << "]";
             if (Rn.r == 9) {
               args << "  ; ";
-              Thread::DumpThreadOffset(args, imm12, 4);
+              Thread::DumpThreadOffset<4>(args, imm12);
             } else if (Rn.r == 15) {
               intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
               lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index 72ff761..5e89f6f 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -237,7 +237,7 @@
               args << StringPrintf("%+d(r%d)", offset, rs);
               if (rs == 17) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, offset, 4);
+                Thread::DumpThreadOffset<4>(args, offset);
               }
             }
             break;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 4a03ebe..68e77d4 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -849,9 +849,13 @@
     }
     args << StringPrintf("%+d (%p)", displacement, instr + displacement);
   }
-  if (prefix[1] == kFs) {
+  if (prefix[1] == kFs && !supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset(args, address_bits, 4);
+    Thread::DumpThreadOffset<4>(args, address_bits);
+  }
+  if (prefix[1] == kGs && supports_rex_) {
+    args << "  ; ";
+    Thread::DumpThreadOffset<8>(args, address_bits);
   }
   std::stringstream hex;
   for (size_t i = 0; begin_instr + i < instr; ++i) {
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 1576905..e8224cd 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -294,6 +294,7 @@
 
 
 LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \
+	arch/x86_64/registers_x86_64.h \
 	base/mutex.h \
 	dex_file.h \
 	dex_instruction.h \
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index cfffbea..4b64076 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -23,13 +23,13 @@
 #define rSUSPEND r4
 // Register holding Thread::Current().
 #define rSELF r9
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
-// Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 60
+// Offset of field Thread::tls32_.thin_lock_thread_id verified in InitCpu
+#define THREAD_ID_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_H_
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index abce838..65a4952 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -109,7 +109,7 @@
 bool SuspensionHandler::Action(int sig, siginfo_t* info, void* context) {
   // These are the instructions to check for.  The first one is the ldr r0,[r9,#xxx]
   // where xxx is the offset of the suspend trigger.
-  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset().Int32Value();
+  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset<4>().Int32Value();
   uint16_t checkinst2 = 0x6800;
 
   struct ucontext *uc = (struct ucontext *)context;
diff --git a/runtime/arch/arm/thread_arm.cc b/runtime/arch/arm/thread_arm.cc
index df4a04a..2a551a8 100644
--- a/runtime/arch/arm/thread_arm.cc
+++ b/runtime/arch/arm/thread_arm.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());  // THREAD_* vs. computed.
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/arm64/thread_arm64.cc b/runtime/arch/arm64/thread_arm64.cc
index 4eebb85..564dced 100644
--- a/runtime/arch/arm64/thread_arm64.cc
+++ b/runtime/arch/arm64/thread_arm64.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<8>().Int32Value());  // THREAD_* vs. computed.
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 5307997..36ce1b6 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -23,11 +23,11 @@
 #define rSUSPEND $s0
 // Register holding Thread::Current().
 #define rSELF $s1
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
diff --git a/runtime/arch/mips/thread_mips.cc b/runtime/arch/mips/thread_mips.cc
index f5d211f..a451496 100644
--- a/runtime/arch/mips/thread_mips.cc
+++ b/runtime/arch/mips/thread_mips.cc
@@ -22,9 +22,9 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());  // THREAD_* vs. computed.
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index e817ff7..e986c41 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -20,12 +20,12 @@
 #include "asm_support.h"
 
-// Offset of field Thread::self_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.self verified in InitCpu
-#define THREAD_SELF_OFFSET 40
+#define THREAD_SELF_OFFSET 148
-// Offset of field Thread::card_table_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
-// Offset of field Thread::exception_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+#define THREAD_EXCEPTION_OFFSET 116
-// Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
+// Offset of field Thread::tls32_.thin_lock_thread_id verified in InitCpu
-#define THREAD_ID_OFFSET 60
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/thread_x86.cc b/runtime/arch/x86/thread_x86.cc
index 235da99..26cd864 100644
--- a/runtime/arch/x86/thread_x86.cc
+++ b/runtime/arch/x86/thread_x86.cc
@@ -120,11 +120,11 @@
       :);  // clobber
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %fs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<4>().Int32Value());
   __asm__ __volatile__("movl %%fs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -132,9 +132,9 @@
   CHECK_EQ(self_check, this);
 
   // Sanity check other offsets.
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index 03d9e24..70ef3ef 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -27,12 +27,12 @@
 #define RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET 16
 
-// Offset of field Thread::self_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.self verified in InitCpu
-#define THREAD_SELF_OFFSET 72
+#define THREAD_SELF_OFFSET 184
-// Offset of field Thread::card_table_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
-// Offset of field Thread::exception_ verified in InitCpu
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 16
+#define THREAD_EXCEPTION_OFFSET 120
-// Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
+// Offset of field Thread::tls32_.thin_lock_thread_id verified in InitCpu
-#define THREAD_ID_OFFSET 112
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_H_
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0d75a89..17b8556 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -146,7 +146,6 @@
     // Outgoing argument set up
     mov %rsp, %rdx                    // pass SP
     mov %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
-    mov %rax, %rdi                    // pass arg1
     call PLT_VAR(cxx_name, 1)     // cxx_name(arg1, Thread*, SP)
     int3                          // unreached
     END_FUNCTION VAR(c_name, 0)
diff --git a/runtime/arch/x86_64/registers_x86_64.h b/runtime/arch/x86_64/registers_x86_64.h
index b9d06b5..8b0dc07 100644
--- a/runtime/arch/x86_64/registers_x86_64.h
+++ b/runtime/arch/x86_64/registers_x86_64.h
@@ -67,7 +67,7 @@
   XMM15 = 15,
   kNumberOfFloatRegisters = 16
 };
-std::ostream& operator<<(std::ostream& os, const Register& rhs);
+std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs);
 
 }  // namespace x86_64
 }  // namespace art
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index b74fc5d..de4c56a 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -36,11 +36,11 @@
   arch_prctl(ARCH_SET_GS, this);
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -54,15 +54,15 @@
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsOnly));
   CHECK_EQ(static_cast<size_t>(RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET),
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsAndArgs));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 024f830..2872a02 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1924,7 +1924,7 @@
   if (error != JDWP::ERR_NONE) {
     return error;
   }
-  thread->Interrupt();
+  thread->Interrupt(soa.Self());
   return JDWP::ERR_NONE;
 }
 
diff --git a/runtime/entrypoints/interpreter/interpreter_entrypoints.h b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
index c7df4e6..d8b2204 100644
--- a/runtime/entrypoints/interpreter/interpreter_entrypoints.h
+++ b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
@@ -21,9 +21,8 @@
 #include "dex_file.h"
 #include "offsets.h"
 
-#define INTERPRETER_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, interpreter_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(InterpreterEntryPoints, x)))
+#define INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::InterpreterEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(InterpreterEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/jni/jni_entrypoints.h b/runtime/entrypoints/jni/jni_entrypoints.h
index 0a53447..6fb0560 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.h
+++ b/runtime/entrypoints/jni/jni_entrypoints.h
@@ -20,9 +20,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define JNI_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, jni_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(JniEntryPoints, x)))
+#define JNI_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::JniEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(JniEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/portable/portable_entrypoints.h b/runtime/entrypoints/portable/portable_entrypoints.h
index dbea707..6f77e1c 100644
--- a/runtime/entrypoints/portable/portable_entrypoints.h
+++ b/runtime/entrypoints/portable/portable_entrypoints.h
@@ -27,9 +27,8 @@
 }  // namespace mirror
 class Thread;
 
-#define PORTABLE_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, portable_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(PortableEntryPoints, x)))
+#define PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::PortableEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(PortableEntryPoints, x))
 
 // Pointers to functions that are called by code generated by compiler's adhering to the portable
 // compiler ABI.
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 5c3b824..ec69e28 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -22,9 +22,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define QUICK_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, quick_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(QuickEntryPoints, x)))
+#define QUICK_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::QuickEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(QuickEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 19fdc63..f5f6f16 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -565,7 +565,7 @@
 
   if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) {
     // Use a thread-local run.
-    Run* thread_local_run = reinterpret_cast<Run*>(self->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
     if (UNLIKELY(thread_local_run == NULL)) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
       thread_local_run = RefillRun(self, idx);
@@ -575,7 +575,7 @@
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
       thread_local_run->is_thread_local_ = 1;
-      self->rosalloc_runs_[idx] = thread_local_run;
+      self->SetRosAllocRun(idx, thread_local_run);
       DCHECK(!thread_local_run->IsFull());
     }
 
@@ -600,7 +600,7 @@
       } else {
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
-        self->rosalloc_runs_[idx] = NULL;
+        self->SetRosAllocRun(idx, nullptr);
         thread_local_run->is_thread_local_ = 0;
         if (kIsDebugBuild) {
           full_runs_[idx].insert(thread_local_run);
@@ -619,7 +619,7 @@
         DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
         DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
         thread_local_run->is_thread_local_ = 1;
-        self->rosalloc_runs_[idx] = thread_local_run;
+        self->SetRosAllocRun(idx, thread_local_run);
         DCHECK(!thread_local_run->IsFull());
       }
 
@@ -1602,11 +1602,11 @@
   WriterMutexLock wmu(self, bulk_free_lock_);
   for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
-    Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
     if (thread_local_run != NULL) {
       DCHECK_EQ(thread_local_run->magic_num_, kMagicNum);
       DCHECK_NE(thread_local_run->is_thread_local_, 0);
-      thread->rosalloc_runs_[idx] = NULL;
+      thread->SetRosAllocRun(idx, nullptr);
       // Note the thread local run may not be full here.
       bool dont_care;
       thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
@@ -1659,7 +1659,7 @@
     WriterMutexLock wmu(self, bulk_free_lock_);
     for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
-      Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
       DCHECK(thread_local_run == nullptr);
     }
   }
@@ -1924,7 +1924,7 @@
       Thread* thread = *it;
       for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
         MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
-        Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[i]);
+        Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
         if (thread_local_run == this) {
           CHECK(!owner_found)
               << "A thread local run has more than one owner thread " << Dump();
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 6148894..a955cc8 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -213,7 +213,7 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_pos_ - thread->thread_local_start_;
+      total += thread->GetThreadLocalBytesAllocated();
     }
   }
   return total;
@@ -231,15 +231,15 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_objects_;
+      total += thread->GetThreadLocalObjectsAllocated();
     }
   }
   return total;
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.FetchAndAdd(thread->thread_local_objects_);
-  bytes_allocated_.FetchAndAdd(thread->thread_local_pos_ - thread->thread_local_start_);
+  objects_allocated_.FetchAndAdd(thread->GetThreadLocalObjectsAllocated());
+  bytes_allocated_.FetchAndAdd(thread->GetThreadLocalBytesAllocated());
   thread->SetTlab(nullptr, nullptr);
 }
 
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 13aa77f..f7aeffd 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2829,7 +2829,7 @@
       local_ref_cookie(IRT_FIRST_SEGMENT),
       locals(kLocalsInitial, kLocalsMax, kLocal),
       check_jni(false),
-      critical(false),
+      critical(0),
       monitors("monitors", kMonitorsInitial, kMonitorsMax) {
   functions = unchecked_functions = &gJniNativeInterface;
   if (vm->check_jni) {
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 332aef0..2d3d318 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -157,7 +157,7 @@
 void Monitor::AppendToWaitSet(Thread* thread) {
   DCHECK(owner_ == Thread::Current());
   DCHECK(thread != NULL);
-  DCHECK(thread->wait_next_ == NULL) << thread->wait_next_;
+  DCHECK(thread->GetWaitNext() == nullptr) << thread->GetWaitNext();
   if (wait_set_ == NULL) {
     wait_set_ = thread;
     return;
@@ -165,10 +165,10 @@
 
   // push_back.
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    t = t->wait_next_;
+  while (t->GetWaitNext() != nullptr) {
+    t = t->GetWaitNext();
   }
-  t->wait_next_ = thread;
+  t->SetWaitNext(thread);
 }
 
 /*
@@ -182,19 +182,19 @@
     return;
   }
   if (wait_set_ == thread) {
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     return;
   }
 
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    if (t->wait_next_ == thread) {
-      t->wait_next_ = thread->wait_next_;
-      thread->wait_next_ = NULL;
+  while (t->GetWaitNext() != nullptr) {
+    if (t->GetWaitNext() == thread) {
+      t->SetWaitNext(thread->GetWaitNext());
+      thread->SetWaitNext(nullptr);
       return;
     }
-    t = t->wait_next_;
+    t = t->GetWaitNext();
   }
 }
 
@@ -226,6 +226,7 @@
     monitor_lock_.Unlock(self);  // Let go of locks in order.
     {
       ScopedThreadStateChange tsc(self, kBlocked);  // Change to blocked and give up mutator_lock_.
+      self->SetMonitorEnterObject(obj_);
       MutexLock mu2(self, monitor_lock_);  // Reacquire monitor_lock_ without mutator_lock_ for Wait.
       if (owner_ != NULL) {  // Did the owner_ give the lock up?
         ++num_waiters_;
@@ -248,6 +249,7 @@
           }
         }
       }
+      self->SetMonitorEnterObject(nullptr);
     }
     monitor_lock_.Lock(self);  // Reacquire locks in order.
   }
@@ -447,33 +449,33 @@
   bool was_interrupted = false;
   {
     // Pseudo-atomically wait on self's wait_cond_ and release the monitor lock.
-    MutexLock mu(self, *self->wait_mutex_);
+    MutexLock mu(self, *self->GetWaitMutex());
 
     // Set wait_monitor_ to the monitor object we will be waiting on. When wait_monitor_ is
     // non-NULL a notifying or interrupting thread must signal the thread's wait_cond_ to wake it
     // up.
-    DCHECK(self->wait_monitor_ == NULL);
-    self->wait_monitor_ = this;
+    DCHECK(self->GetWaitMonitor() == nullptr);
+    self->SetWaitMonitor(this);
 
     // Release the monitor lock.
     monitor_contenders_.Signal(self);
     monitor_lock_.Unlock(self);
 
     // Handle the case where the thread was interrupted before we called wait().
-    if (self->interrupted_) {
+    if (self->IsInterruptedLocked()) {
       was_interrupted = true;
     } else {
       // Wait for a notification or a timeout to occur.
       if (why == kWaiting) {
-        self->wait_cond_->Wait(self);
+        self->GetWaitConditionVariable()->Wait(self);
       } else {
         DCHECK(why == kTimedWaiting || why == kSleeping) << why;
-        self->wait_cond_->TimedWait(self, ms, ns);
+        self->GetWaitConditionVariable()->TimedWait(self, ms, ns);
       }
-      if (self->interrupted_) {
+      if (self->IsInterruptedLocked()) {
         was_interrupted = true;
       }
-      self->interrupted_ = false;
+      self->SetInterruptedLocked(false);
     }
   }
 
@@ -485,15 +487,15 @@
     // that a thread in a waiting/sleeping state has a non-null wait_monitor_ for debugging
     // and diagnostic purposes. (If you reset this earlier, stack dumps will claim that threads
     // are waiting on "null".)
-    MutexLock mu(self, *self->wait_mutex_);
-    DCHECK(self->wait_monitor_ != NULL);
-    self->wait_monitor_ = NULL;
+    MutexLock mu(self, *self->GetWaitMutex());
+    DCHECK(self->GetWaitMonitor() != nullptr);
+    self->SetWaitMonitor(nullptr);
   }
 
   // Re-acquire the monitor and lock.
   Lock(self);
   monitor_lock_.Lock(self);
-  self->wait_mutex_->AssertNotHeld(self);
+  self->GetWaitMutex()->AssertNotHeld(self);
 
   /*
    * We remove our thread from wait set after restoring the count
@@ -516,8 +518,8 @@
      * cleared when this exception is thrown."
      */
     {
-      MutexLock mu(self, *self->wait_mutex_);
-      self->interrupted_ = false;
+      MutexLock mu(self, *self->GetWaitMutex());
+      self->SetInterruptedLocked(false);
     }
     if (interruptShouldThrow) {
       ThrowLocation throw_location = self->GetCurrentLocationForThrow();
@@ -538,13 +540,13 @@
   // Signal the first waiting thread in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
 
     // Check to see if the thread is still waiting.
-    MutexLock mu(self, *thread->wait_mutex_);
-    if (thread->wait_monitor_ != NULL) {
-      thread->wait_cond_->Signal(self);
+    MutexLock mu(self, *thread->GetWaitMutex());
+    if (thread->GetWaitMonitor() != nullptr) {
+      thread->GetWaitConditionVariable()->Signal(self);
       return;
     }
   }
@@ -561,8 +563,8 @@
   // Signal all threads in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     thread->Notify();
   }
 }
@@ -633,6 +635,7 @@
     ThreadList* thread_list = Runtime::Current()->GetThreadList();
     // Suspend the owner, inflate. First change to blocked and give up mutator_lock_.
     ScopedThreadStateChange tsc(self, kBlocked);
+    self->SetMonitorEnterObject(obj.get());
     if (lock_word == obj->GetLockWord()) {  // If lock word hasn't changed.
       bool timed_out;
       Thread* owner = thread_list->SuspendThreadByThreadId(owner_thread_id, false, &timed_out);
@@ -647,6 +650,7 @@
         thread_list->Resume(owner, false);
       }
     }
+    self->SetMonitorEnterObject(nullptr);
   }
 }
 
@@ -880,8 +884,8 @@
     }
     {
       Thread* self = Thread::Current();
-      MutexLock mu(self, *thread->wait_mutex_);
-      Monitor* monitor = thread->wait_monitor_;
+      MutexLock mu(self, *thread->GetWaitMutex());
+      Monitor* monitor = thread->GetWaitMonitor();
       if (monitor != NULL) {
         mirror::Object* object = monitor->obj_;
         object_identity_hashcode = object->IdentityHashCode();
@@ -890,7 +894,7 @@
     }
   } else if (state == kBlocked) {
     os << "  - waiting to lock ";
-    mirror::Object* object = thread->monitor_enter_object_;
+    mirror::Object* object = thread->GetMonitorEnterObject();
     if (object != NULL) {
       object_identity_hashcode = object->IdentityHashCode();
       lock_owner = object->GetLockOwnerThreadId();
@@ -915,11 +919,11 @@
 mirror::Object* Monitor::GetContendedMonitor(Thread* thread) {
   // This is used to implement JDWP's ThreadReference.CurrentContendedMonitor, and has a bizarre
   // definition of contended that includes a monitor a thread is trying to enter...
-  mirror::Object* result = thread->monitor_enter_object_;
+  mirror::Object* result = thread->GetMonitorEnterObject();
   if (result == NULL) {
     // ...but also a monitor that the thread is waiting on.
-    MutexLock mu(Thread::Current(), *thread->wait_mutex_);
-    Monitor* monitor = thread->wait_monitor_;
+    MutexLock mu(Thread::Current(), *thread->GetWaitMutex());
+    Monitor* monitor = thread->GetWaitMonitor();
     if (monitor != NULL) {
       result = monitor->GetObject();
     }
@@ -1118,7 +1122,7 @@
       Monitor* mon = lock_word.FatLockMonitor();
       owner_ = mon->owner_;
       entry_count_ = 1 + mon->lock_count_;
-      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->wait_next_) {
+      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->GetWaitNext()) {
         waiters_.push_back(waiter);
       }
       break;
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index de1b593..0b84005 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -104,11 +104,11 @@
 }
 
 static void Thread_nativeInterrupt(JNIEnv* env, jobject java_thread) {
-  ScopedObjectAccess soa(env);
+  ScopedFastNativeObjectAccess soa(env);
   MutexLock mu(soa.Self(), *Locks::thread_list_lock_);
   Thread* thread = Thread::FromManagedThread(soa, java_thread);
   if (thread != NULL) {
-    thread->Interrupt();
+    thread->Interrupt(soa.Self());
   }
 }
 
@@ -175,7 +175,7 @@
   NATIVE_METHOD(Thread, nativeCreate, "(Ljava/lang/Thread;JZ)V"),
   NATIVE_METHOD(Thread, nativeGetStatus, "(Z)I"),
   NATIVE_METHOD(Thread, nativeHoldsLock, "(Ljava/lang/Object;)Z"),
-  NATIVE_METHOD(Thread, nativeInterrupt, "()V"),
+  NATIVE_METHOD(Thread, nativeInterrupt, "!()V"),
   NATIVE_METHOD(Thread, nativeSetName, "(Ljava/lang/String;)V"),
   NATIVE_METHOD(Thread, nativeSetPriority, "(I)V"),
   NATIVE_METHOD(Thread, sleep, "!(Ljava/lang/Object;JI)V"),
diff --git a/runtime/offsets.h b/runtime/offsets.h
index ed4e49e..72a6b0f 100644
--- a/runtime/offsets.h
+++ b/runtime/offsets.h
@@ -50,6 +50,7 @@
 };
 
 // Offsets relative to the current running thread.
+template<size_t pointer_size>
 class ThreadOffset : public Offset {
  public:
   explicit ThreadOffset(size_t val) : Offset(val) {}
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a8da2f8..f016189 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -576,7 +576,7 @@
   // objects. We can't supply a thread group yet; it will be fixed later. Since we are the main
   // thread, we do not get a java peer.
   Thread* self = Thread::Attach("main", false, NULL, false);
-  CHECK_EQ(self->thin_lock_thread_id_, ThreadList::kMainThreadId);
   CHECK(self != NULL);
+  CHECK_EQ(self->GetThreadId(), ThreadList::kMainThreadId);
 
   // Set us to runnable so tools using a runtime can allocate and GC by default
diff --git a/runtime/runtime_stats.h b/runtime/runtime_stats.h
index 05d3fbb..6ed7fd5 100644
--- a/runtime/runtime_stats.h
+++ b/runtime/runtime_stats.h
@@ -89,20 +89,20 @@
   }
 
   // Number of objects allocated.
-  int allocated_objects;
+  uint64_t allocated_objects;
   // Cumulative size of all objects allocated.
-  int allocated_bytes;
+  uint64_t allocated_bytes;
 
   // Number of objects freed.
-  int freed_objects;
+  uint64_t freed_objects;
   // Cumulative size of all freed objects.
-  int freed_bytes;
+  uint64_t freed_bytes;
 
   // Number of times an allocation triggered a GC.
-  int gc_for_alloc_count;
+  uint64_t gc_for_alloc_count;
 
   // Number of initialized classes.
-  int class_init_count;
+  uint64_t class_init_count;
   // Cumulative time spent in class initialization.
   uint64_t class_init_time_ns;
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 66077f9..fc886d5 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -51,8 +51,8 @@
   DCHECK_NE(new_state, kRunnable);
   DCHECK_EQ(this, Thread::Current());
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
-  state_and_flags_.as_struct.state = new_state;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
+  tls32_.state_and_flags.as_struct.state = new_state;
   return static_cast<ThreadState>(old_state_and_flags.as_struct.state);
 }
 
@@ -60,7 +60,7 @@
 #ifdef NDEBUG
   UNUSED(check_locks);  // Keep GCC happy about unused parameters.
 #else
-  CHECK_EQ(0u, no_thread_suspension_) << last_no_thread_suspension_cause_;
+  CHECK_EQ(0u, tls32_.no_thread_suspension) << tlsPtr_.last_no_thread_suspension_cause;
   if (check_locks) {
     bool bad_mutexes_held = false;
     for (int i = kLockLevelCount - 1; i >= 0; --i) {
@@ -88,7 +88,7 @@
   union StateAndFlags old_state_and_flags;
   union StateAndFlags new_state_and_flags;
   while (true) {
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0)) {
       RunCheckpointFunction();
       continue;
@@ -98,7 +98,7 @@
     new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
     int status = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                       &state_and_flags_.as_int);
+                                       &tls32_.state_and_flags.as_int);
     if (LIKELY(status == 0)) {
       break;
     }
@@ -110,22 +110,22 @@
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
   bool done = false;
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   int16_t old_state = old_state_and_flags.as_struct.state;
   DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
   do {
     Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0)) {
       // Wait while our suspend count is non-zero.
       MutexLock mu(this, *Locks::thread_suspend_count_lock_);
-      old_state_and_flags.as_int = state_and_flags_.as_int;
+      old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
       DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
         // Re-check when Thread::resume_cond_ is notified.
         Thread::resume_cond_->Wait(this);
-        old_state_and_flags.as_int = state_and_flags_.as_int;
+        old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
         DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       }
       DCHECK_EQ(GetSuspendCount(), 0);
@@ -133,7 +133,7 @@
     // Re-acquire shared mutator_lock_ access.
     Locks::mutator_lock_->SharedLock(this);
     // Atomically change from suspended to runnable if no suspend request pending.
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (LIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) == 0)) {
       union StateAndFlags new_state_and_flags;
@@ -141,7 +141,7 @@
       new_state_and_flags.as_struct.state = kRunnable;
       // CAS the value without a memory barrier, that occurred in the lock above.
       done = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                &state_and_flags_.as_int) == 0;
+                                &tls32_.state_and_flags.as_int) == 0;
     }
     if (UNLIKELY(!done)) {
       // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
@@ -161,26 +161,27 @@
 }
 
 inline size_t Thread::TlabSize() const {
-  return thread_local_end_ - thread_local_pos_;
+  return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
 }
 
 inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   DCHECK_GE(TlabSize(), bytes);
-  ++thread_local_objects_;
-  mirror::Object* ret = reinterpret_cast<mirror::Object*>(thread_local_pos_);
-  thread_local_pos_ += bytes;
+  ++tlsPtr_.thread_local_objects;
+  mirror::Object* ret = reinterpret_cast<mirror::Object*>(tlsPtr_.thread_local_pos);
+  tlsPtr_.thread_local_pos += bytes;
   return ret;
 }
 
 inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
-  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
-  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+  DCHECK_LE(tlsPtr_.thread_local_alloc_stack_top, tlsPtr_.thread_local_alloc_stack_end);
+  if (tlsPtr_.thread_local_alloc_stack_top < tlsPtr_.thread_local_alloc_stack_end) {
     // There's room.
-    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
-              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
-    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
-    *thread_local_alloc_stack_top_ = obj;
-    ++thread_local_alloc_stack_top_;
+    DCHECK_LE(reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_top) +
+                  sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_end));
+    DCHECK(*tlsPtr_.thread_local_alloc_stack_top == nullptr);
+    *tlsPtr_.thread_local_alloc_stack_top = obj;
+    ++tlsPtr_.thread_local_alloc_stack_top;
     return true;
   }
   return false;
@@ -193,8 +194,8 @@
   DCHECK_ALIGNED(start, sizeof(mirror::Object*));
   DCHECK_ALIGNED(end, sizeof(mirror::Object*));
   DCHECK_LT(start, end);
-  thread_local_alloc_stack_end_ = end;
-  thread_local_alloc_stack_top_ = start;
+  tlsPtr_.thread_local_alloc_stack_end = end;
+  tlsPtr_.thread_local_alloc_stack_top = start;
 }
 
 inline void Thread::RevokeThreadLocalAllocationStack() {
@@ -204,8 +205,8 @@
     DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
         << GetState() << " thread " << this << " self " << self;
   }
-  thread_local_alloc_stack_end_ = nullptr;
-  thread_local_alloc_stack_top_ = nullptr;
+  tlsPtr_.thread_local_alloc_stack_end = nullptr;
+  tlsPtr_.thread_local_alloc_stack_top = nullptr;
 }
 
 }  // namespace art
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3692b9f..fd5b599 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -79,57 +79,49 @@
 static const char* kThreadNameDuringStartup = "<native thread without managed peer>";
 
 void Thread::InitCardTable() {
-  card_table_ = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
+  tlsPtr_.card_table = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
 }
 
-#if !defined(__APPLE__)
 static void UnimplementedEntryPoint() {
   UNIMPLEMENTED(FATAL);
 }
-#endif
 
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      PortableEntryPoints* ppoints, QuickEntryPoints* qpoints);
 
 void Thread::InitTlsEntryPoints() {
-#if !defined(__APPLE__)  // The Mac GCC is too old to accept this code.
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
-  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(quick_entrypoints_));
+  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.interpreter_entrypoints);
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(
+      reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
-  begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(portable_entrypoints_));
-  for (uintptr_t* it = begin; it != end; ++it) {
-    *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
-  }
-#endif
-  InitEntryPoints(&interpreter_entrypoints_, &jni_entrypoints_, &portable_entrypoints_,
-                  &quick_entrypoints_);
+  InitEntryPoints(&tlsPtr_.interpreter_entrypoints, &tlsPtr_.jni_entrypoints,
+                  &tlsPtr_.portable_entrypoints, &tlsPtr_.quick_entrypoints);
 }
 
 void Thread::ResetQuickAllocEntryPointsForThread() {
-  ResetQuickAllocEntryPoints(&quick_entrypoints_);
+  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
 void Thread::SetDeoptimizationShadowFrame(ShadowFrame* sf) {
-  deoptimization_shadow_frame_ = sf;
+  tlsPtr_.deoptimization_shadow_frame = sf;
 }
 
 void Thread::SetDeoptimizationReturnValue(const JValue& ret_val) {
-  deoptimization_return_value_.SetJ(ret_val.GetJ());
+  tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
 }
 
 ShadowFrame* Thread::GetAndClearDeoptimizationShadowFrame(JValue* ret_val) {
-  ShadowFrame* sf = deoptimization_shadow_frame_;
-  deoptimization_shadow_frame_ = nullptr;
-  ret_val->SetJ(deoptimization_return_value_.GetJ());
+  ShadowFrame* sf = tlsPtr_.deoptimization_shadow_frame;
+  tlsPtr_.deoptimization_shadow_frame = nullptr;
+  ret_val->SetJ(tls64_.deoptimization_return_value.GetJ());
   return sf;
 }
 
 void Thread::InitTid() {
-  tid_ = ::art::GetTid();
+  tls32_.tid = ::art::GetTid();
 }
 
 void Thread::InitAfterFork() {
@@ -159,10 +151,10 @@
     ScopedObjectAccess soa(self);
 
     // Copy peer into self, deleting global reference when done.
-    CHECK(self->jpeer_ != nullptr);
-    self->opeer_ = soa.Decode<mirror::Object*>(self->jpeer_);
-    self->GetJniEnv()->DeleteGlobalRef(self->jpeer_);
-    self->jpeer_ = nullptr;
+    CHECK(self->tlsPtr_.jpeer != nullptr);
+    self->tlsPtr_.opeer = soa.Decode<mirror::Object*>(self->tlsPtr_.jpeer);
+    self->GetJniEnv()->DeleteGlobalRef(self->tlsPtr_.jpeer);
+    self->tlsPtr_.jpeer = nullptr;
 
     {
       SirtRef<mirror::String> thread_name(self, self->GetThreadName(soa));
@@ -171,7 +163,7 @@
     Dbg::PostThreadStart(self);
 
     // Invoke the 'run' method of our java.lang.Thread.
-    mirror::Object* receiver = self->opeer_;
+    mirror::Object* receiver = self->tlsPtr_.opeer;
     jmethodID mid = WellKnownClasses::java_lang_Thread_run;
     InvokeVirtualOrInterfaceWithJValues(soa, receiver, mid, nullptr);
   }
@@ -237,7 +229,7 @@
 // is the StackOverflow reserved region used when creating the StackOverflow
 // exception.
 void Thread::InstallImplicitProtection(bool is_main_stack) {
-  byte* pregion = stack_end_;
+  byte* pregion = tlsPtr_.stack_end;
 
   constexpr uint32_t kMarker = 0xdadadada;
   uintptr_t *marker = reinterpret_cast<uintptr_t*>(pregion);
@@ -288,7 +280,7 @@
 
   Thread* child_thread = new Thread(is_daemon);
   // Use global JNI ref to hold peer live while child thread starts.
-  child_thread->jpeer_ = env->NewGlobalRef(java_peer);
+  child_thread->tlsPtr_.jpeer = env->NewGlobalRef(java_peer);
   stack_size = FixStackSize(stack_size);
 
   // Thread.start is synchronized, so we know that nativePeer is 0, and know that we're not racing to
@@ -311,8 +303,8 @@
       runtime->EndThreadBirth();
     }
     // Manually delete the global reference since Thread::Init will not have been run.
-    env->DeleteGlobalRef(child_thread->jpeer_);
-    child_thread->jpeer_ = nullptr;
+    env->DeleteGlobalRef(child_thread->tlsPtr_.jpeer);
+    child_thread->tlsPtr_.jpeer = nullptr;
     delete child_thread;
     child_thread = nullptr;
     // TODO: remove from thread group?
@@ -340,15 +332,15 @@
   InitTid();
   // Set pthread_self_ ahead of pthread_setspecific, that makes Thread::Current function, this
   // avoids pthread_self_ ever being invalid when discovered from Thread::Current().
-  pthread_self_ = pthread_self();
+  tlsPtr_.pthread_self = pthread_self();
   CHECK(is_started_);
   CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, this), "attach self");
   DCHECK_EQ(Thread::Current(), this);
 
-  thin_lock_thread_id_ = thread_list->AllocThreadId(this);
+  tls32_.thin_lock_thread_id = thread_list->AllocThreadId(this);
   InitStackHwm();
 
-  jni_env_ = new JNIEnvExt(this, java_vm);
+  tlsPtr_.jni_env = new JNIEnvExt(this, java_vm);
   thread_list->Register(this);
 }
 
@@ -385,7 +377,7 @@
   } else {
     // These aren't necessary, but they improve diagnostics for unit tests & command-line tools.
     if (thread_name != nullptr) {
-      self->name_->assign(thread_name);
+      self->tlsPtr_.name->assign(thread_name);
       ::art::SetThreadName(thread_name);
     }
   }
@@ -396,7 +388,7 @@
 void Thread::CreatePeer(const char* name, bool as_daemon, jobject thread_group) {
   Runtime* runtime = Runtime::Current();
   CHECK(runtime->IsStarted());
-  JNIEnv* env = jni_env_;
+  JNIEnv* env = tlsPtr_.jni_env;
 
   if (thread_group == nullptr) {
     thread_group = runtime->GetMainThreadGroup();
@@ -412,7 +404,7 @@
   }
   {
     ScopedObjectAccess soa(this);
-    opeer_ = soa.Decode<mirror::Object*>(peer.get());
+    tlsPtr_.opeer = soa.Decode<mirror::Object*>(peer.get());
   }
   env->CallNonvirtualVoidMethod(peer.get(),
                                 WellKnownClasses::java_lang_Thread,
@@ -422,8 +414,8 @@
 
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
-  jni_env_->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
-                         reinterpret_cast<jlong>(self));
+  env->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
+                    reinterpret_cast<jlong>(self));
 
   ScopedObjectAccess soa(self);
   SirtRef<mirror::String> peer_thread_name(soa.Self(), GetThreadName(soa));
@@ -449,34 +441,36 @@
 void Thread::InitPeer(ScopedObjectAccess& soa, jboolean thread_is_daemon, jobject thread_group,
                       jobject thread_name, jint thread_priority) {
   soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->
-      SetBoolean<kTransactionActive>(opeer_, thread_is_daemon);
+      SetBoolean<kTransactionActive>(tlsPtr_.opeer, thread_is_daemon);
   soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_group));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_group));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_name)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_name));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_name));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->
-      SetInt<kTransactionActive>(opeer_, thread_priority);
+      SetInt<kTransactionActive>(tlsPtr_.opeer, thread_priority);
 }
 
 void Thread::SetThreadName(const char* name) {
-  name_->assign(name);
+  tlsPtr_.name->assign(name);
   ::art::SetThreadName(name);
   Dbg::DdmSendThreadNotification(this, CHUNK_TYPE("THNM"));
 }
 
 void Thread::InitStackHwm() {
-  void* stack_base;
-  size_t stack_size;
-  GetThreadStack(pthread_self_, &stack_base, &stack_size);
+  void* read_stack_base;
+  size_t read_stack_size;
+  GetThreadStack(tlsPtr_.pthread_self, &read_stack_base, &read_stack_size);
 
   // TODO: include this in the thread dumps; potentially useful in SIGQUIT output?
-  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", stack_base, PrettySize(stack_size).c_str());
+  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", read_stack_base,
+                                PrettySize(read_stack_size).c_str());
 
-  stack_begin_ = reinterpret_cast<byte*>(stack_base);
-  stack_size_ = stack_size;
+  tlsPtr_.stack_begin = reinterpret_cast<byte*>(read_stack_base);
+  tlsPtr_.stack_size = read_stack_size;
 
-  if (stack_size_ <= kStackOverflowReservedBytes) {
-    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << stack_size_ << " bytes)";
+  if (read_stack_size <= kStackOverflowReservedBytes) {
+    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << read_stack_size
+        << " bytes)";
   }
 
   // TODO: move this into the Linux GetThreadStack implementation.
@@ -500,12 +494,12 @@
       CHECK_PTHREAD_CALL(pthread_attr_destroy, (&default_attributes), "default stack size query");
 
       // ...and use that as our limit.
-      size_t old_stack_size = stack_size_;
-      stack_size_ = default_stack_size;
-      stack_begin_ += (old_stack_size - stack_size_);
+      size_t old_stack_size = read_stack_size;
+      tlsPtr_.stack_size = default_stack_size;
+      tlsPtr_.stack_begin += (old_stack_size - default_stack_size);  // Base rises by bytes trimmed.
       VLOG(threads) << "Limiting unlimited stack (reported as " << PrettySize(old_stack_size) << ")"
-                    << " to " << PrettySize(stack_size_)
-                    << " with base " << reinterpret_cast<void*>(stack_begin_);
+                    << " to " << PrettySize(default_stack_size)
+                    << " with base " << reinterpret_cast<void*>(tlsPtr_.stack_begin);
     }
   }
 #endif
@@ -521,16 +515,16 @@
       // to install our own region so we need to move the limits
       // of the stack to make room for it.
       constexpr uint32_t kDelta = 16 * KB;
-      stack_begin_ += kDelta;
-      stack_end_ += kDelta;
-      stack_size_ -= kDelta;
+      tlsPtr_.stack_begin += kDelta;
+      tlsPtr_.stack_end += kDelta;
+      tlsPtr_.stack_size -= kDelta;
     }
     InstallImplicitProtection(is_main_thread);
   }
 
   // Sanity check.
   int stack_variable;
-  CHECK_GT(&stack_variable, reinterpret_cast<void*>(stack_end_));
+  CHECK_GT(&stack_variable, reinterpret_cast<void*>(tlsPtr_.stack_end));
 }
 
 void Thread::ShortDump(std::ostream& os) const {
@@ -542,8 +536,8 @@
   }
   os << GetState()
            << ",Thread*=" << this
-           << ",peer=" << opeer_
-           << ",\"" << *name_ << "\""
+           << ",peer=" << tlsPtr_.opeer
+           << ",\"" << *tlsPtr_.name << "\""
            << "]";
 }
 
@@ -554,17 +548,17 @@
 
 mirror::String* Thread::GetThreadName(const ScopedObjectAccessUnchecked& soa) const {
   mirror::ArtField* f = soa.DecodeField(WellKnownClasses::java_lang_Thread_name);
-  return (opeer_ != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(opeer_)) : nullptr;
+  return (tlsPtr_.opeer != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(tlsPtr_.opeer)) : nullptr;
 }
 
 void Thread::GetThreadName(std::string& name) const {
-  name.assign(*name_);
+  name.assign(*tlsPtr_.name);
 }
 
 uint64_t Thread::GetCpuMicroTime() const {
 #if defined(HAVE_POSIX_CLOCKS)
   clockid_t cpu_clock_id;
-  pthread_getcpuclockid(pthread_self_, &cpu_clock_id);
+  pthread_getcpuclockid(tlsPtr_.pthread_self, &cpu_clock_id);
   timespec now;
   clock_gettime(cpu_clock_id, &now);
   return static_cast<uint64_t>(now.tv_sec) * UINT64_C(1000000) + now.tv_nsec / UINT64_C(1000);
@@ -575,11 +569,11 @@
 }
 
 void Thread::AtomicSetFlag(ThreadFlag flag) {
-  android_atomic_or(flag, &state_and_flags_.as_int);
+  android_atomic_or(flag, &tls32_.state_and_flags.as_int);
 }
 
 void Thread::AtomicClearFlag(ThreadFlag flag) {
-  android_atomic_and(-1 ^ flag, &state_and_flags_.as_int);
+  android_atomic_and(-1 ^ flag, &tls32_.state_and_flags.as_int);
 }
 
 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
@@ -604,24 +598,24 @@
 }
 
 void Thread::ModifySuspendCount(Thread* self, int delta, bool for_debugger) {
-  DCHECK(delta == -1 || delta == +1 || delta == -debug_suspend_count_)
-      << delta << " " << debug_suspend_count_ << " " << this;
-  DCHECK_GE(suspend_count_, debug_suspend_count_) << this;
+  DCHECK(delta == -1 || delta == +1 || delta == -tls32_.debug_suspend_count)
+      << delta << " " << tls32_.debug_suspend_count << " " << this;
+  DCHECK_GE(tls32_.suspend_count, tls32_.debug_suspend_count) << this;
   Locks::thread_suspend_count_lock_->AssertHeld(self);
   if (this != self && !IsSuspended()) {
     Locks::thread_list_lock_->AssertHeld(self);
   }
-  if (UNLIKELY(delta < 0 && suspend_count_ <= 0)) {
+  if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {  // Would make the count negative.
     UnsafeLogFatalForSuspendCount(self, this);
     return;
   }
 
-  suspend_count_ += delta;
+  tls32_.suspend_count += delta;
   if (for_debugger) {
-    debug_suspend_count_ += delta;
+    tls32_.debug_suspend_count += delta;
   }
 
-  if (suspend_count_ == 0) {
+  if (tls32_.suspend_count == 0) {  // kSuspendRequest is set exactly while the count is non-zero.
     AtomicClearFlag(kSuspendRequest);
   } else {
     AtomicSetFlag(kSuspendRequest);
@@ -639,8 +633,8 @@
   {
     MutexLock mu(this, *Locks::thread_suspend_count_lock_);
     for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-      checkpoints[i] = checkpoint_functions_[i];
-      checkpoint_functions_[i] = nullptr;
+      checkpoints[i] = tlsPtr_.checkpoint_functions[i];
+      tlsPtr_.checkpoint_functions[i] = nullptr;
     }
     AtomicClearFlag(kCheckpointRequest);
   }
@@ -661,14 +655,14 @@
 
 bool Thread::RequestCheckpoint(Closure* function) {
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   if (old_state_and_flags.as_struct.state != kRunnable) {
     return false;  // Fail, thread is suspended and so can't run a checkpoint.
   }
 
   uint32_t available_checkpoint = kMaxCheckpoints;
   for (uint32_t i = 0 ; i < kMaxCheckpoints; ++i) {
-    if (checkpoint_functions_[i] == nullptr) {
+    if (tlsPtr_.checkpoint_functions[i] == nullptr) {
       available_checkpoint = i;
       break;
     }
@@ -677,7 +671,7 @@
     // No checkpoint functions available, we can't run a checkpoint
     return false;
   }
-  checkpoint_functions_[available_checkpoint] = function;
+  tlsPtr_.checkpoint_functions[available_checkpoint] = function;
 
   // Checkpoint function installed now install flag bit.
   // We must be runnable to request a checkpoint.
@@ -686,11 +680,11 @@
   new_state_and_flags.as_int = old_state_and_flags.as_int;
   new_state_and_flags.as_struct.flags |= kCheckpointRequest;
   int succeeded = android_atomic_acquire_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                         &state_and_flags_.as_int);
+                                             &tls32_.state_and_flags.as_int);
   if (UNLIKELY(succeeded != 0)) {
     // The thread changed state before the checkpoint was installed.
-    CHECK_EQ(checkpoint_functions_[available_checkpoint], function);
-    checkpoint_functions_[available_checkpoint] = nullptr;
+    CHECK_EQ(tlsPtr_.checkpoint_functions[available_checkpoint], function);
+    tlsPtr_.checkpoint_functions[available_checkpoint] = nullptr;
   } else {
     CHECK_EQ(ReadFlag(kCheckpointRequest), true);
     TriggerSuspend();
@@ -715,13 +709,15 @@
   bool is_daemon = false;
   Thread* self = Thread::Current();
 
-  if (self != nullptr && thread != nullptr && thread->opeer_ != nullptr) {
+  if (self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
     ScopedObjectAccessUnchecked soa(self);
-    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->GetInt(thread->opeer_);
-    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->GetBoolean(thread->opeer_);
+    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)
+        ->GetInt(thread->tlsPtr_.opeer);
+    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)
+        ->GetBoolean(thread->tlsPtr_.opeer);
 
     mirror::Object* thread_group =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->tlsPtr_.opeer);
 
     if (thread_group != nullptr) {
       mirror::ArtField* group_name_field =
@@ -740,7 +736,7 @@
   }
 
   if (thread != nullptr) {
-    os << '"' << *thread->name_ << '"';
+    os << '"' << *thread->tlsPtr_.name << '"';
     if (is_daemon) {
       os << " daemon";
     }
@@ -760,9 +756,9 @@
   if (thread != nullptr) {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     os << "  | group=\"" << group_name << "\""
-       << " sCount=" << thread->suspend_count_
-       << " dsCount=" << thread->debug_suspend_count_
-       << " obj=" << reinterpret_cast<void*>(thread->opeer_)
+       << " sCount=" << thread->tls32_.suspend_count
+       << " dsCount=" << thread->tls32_.debug_suspend_count
+       << " obj=" << reinterpret_cast<void*>(thread->tlsPtr_.opeer)
        << " self=" << reinterpret_cast<const void*>(thread) << "\n";
   }
 
@@ -772,9 +768,10 @@
   if (thread != nullptr) {
     int policy;
     sched_param sp;
-    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->pthread_self_, &policy, &sp), __FUNCTION__);
+    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->tlsPtr_.pthread_self, &policy, &sp),
+                       __FUNCTION__);
     os << " sched=" << policy << "/" << sp.sched_priority
-       << " handle=" << reinterpret_cast<void*>(thread->pthread_self_);
+       << " handle=" << reinterpret_cast<void*>(thread->tlsPtr_.pthread_self);
   }
   os << "\n";
 
@@ -799,8 +796,9 @@
      << " core=" << task_cpu
      << " HZ=" << sysconf(_SC_CLK_TCK) << "\n";
   if (thread != nullptr) {
-    os << "  | stack=" << reinterpret_cast<void*>(thread->stack_begin_) << "-" << reinterpret_cast<void*>(thread->stack_end_)
-       << " stackSize=" << PrettySize(thread->stack_size_) << "\n";
+    os << "  | stack=" << reinterpret_cast<void*>(thread->tlsPtr_.stack_begin) << "-"
+        << reinterpret_cast<void*>(thread->tlsPtr_.stack_end) << " stackSize="
+        << PrettySize(thread->tlsPtr_.stack_size) << "\n";
   }
 }
 
@@ -919,7 +917,8 @@
       DumpNativeStack(os, GetTid(), "  native: ", false, method_ref.get());
     }
     UniquePtr<Context> context(Context::Create());
-    StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(), !throwing_OutOfMemoryError_);
+    StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
+                            !tls32_.throwing_OutOfMemoryError);
     dumper.WalkStack();
   } else {
     os << "Not able to dump stack of thread that isn't suspended";
@@ -928,11 +927,12 @@
 
 void Thread::ThreadExitCallback(void* arg) {
   Thread* self = reinterpret_cast<Thread*>(arg);
-  if (self->thread_exit_check_count_ == 0) {
-    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's going to use a pthread_key_create destructor?): " << *self;
+  if (self->tls32_.thread_exit_check_count == 0) {  // First callback: warn and reattach self.
+    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's "
+        "going to use a pthread_key_create destructor?): " << *self;
     CHECK(is_started_);
     CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, self), "reattach self");
-    self->thread_exit_check_count_ = 1;
+    self->tls32_.thread_exit_check_count = 1;  // Any later callback takes the FATAL branch.
   } else {
     LOG(FATAL) << "Native thread exited without calling DetachCurrentThread: " << *self;
   }
@@ -984,58 +984,21 @@
   }
 }
 
-Thread::Thread(bool daemon)
-    : suspend_count_(0),
-      card_table_(nullptr),
-      exception_(nullptr),
-      stack_end_(nullptr),
-      managed_stack_(),
-      jni_env_(nullptr),
-      self_(nullptr),
-      opeer_(nullptr),
-      jpeer_(nullptr),
-      stack_begin_(nullptr),
-      stack_size_(0),
-      thin_lock_thread_id_(0),
-      stack_trace_sample_(nullptr),
-      trace_clock_base_(0),
-      tid_(0),
-      wait_mutex_(new Mutex("a thread wait mutex")),
-      wait_cond_(new ConditionVariable("a thread wait condition variable", *wait_mutex_)),
-      wait_monitor_(nullptr),
-      interrupted_(false),
-      wait_next_(nullptr),
-      monitor_enter_object_(nullptr),
-      top_sirt_(nullptr),
-      runtime_(nullptr),
-      class_loader_override_(nullptr),
-      long_jump_context_(nullptr),
-      throwing_OutOfMemoryError_(false),
-      debug_suspend_count_(0),
-      debug_invoke_req_(new DebugInvokeReq),
-      single_step_control_(new SingleStepControl),
-      deoptimization_shadow_frame_(nullptr),
-      instrumentation_stack_(new std::deque<instrumentation::InstrumentationStackFrame>),
-      name_(new std::string(kThreadNameDuringStartup)),
-      daemon_(daemon),
-      pthread_self_(0),
-      no_thread_suspension_(0),
-      last_no_thread_suspension_cause_(nullptr),
-      suspend_trigger_(reinterpret_cast<uintptr_t*>(&suspend_trigger_)),
-      thread_exit_check_count_(0),
-      thread_local_start_(nullptr),
-      thread_local_pos_(nullptr),
-      thread_local_end_(nullptr),
-      thread_local_objects_(0),
-      thread_local_alloc_stack_top_(nullptr),
-      thread_local_alloc_stack_end_(nullptr) {
+Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), interrupted_(false) {
+  wait_mutex_ = new Mutex("a thread wait mutex");
+  wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
+  tlsPtr_.debug_invoke_req = new DebugInvokeReq;
+  tlsPtr_.single_step_control = new SingleStepControl;
+  tlsPtr_.instrumentation_stack = new std::deque<instrumentation::InstrumentationStackFrame>;
+  tlsPtr_.name = new std::string(kThreadNameDuringStartup);
+
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
-  state_and_flags_.as_struct.flags = 0;
-  state_and_flags_.as_struct.state = kNative;
-  memset(&held_mutexes_[0], 0, sizeof(held_mutexes_));
-  memset(rosalloc_runs_, 0, sizeof(rosalloc_runs_));
+  tls32_.state_and_flags.as_struct.flags = 0;
+  tls32_.state_and_flags.as_struct.state = kNative;
+  memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
+  memset(tlsPtr_.rosalloc_runs, 0, sizeof(tlsPtr_.rosalloc_runs));
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-    checkpoint_functions_[i] = nullptr;
+    tlsPtr_.checkpoint_functions[i] = nullptr;
   }
 }
 
@@ -1046,7 +1009,8 @@
   // assigned fairly early on, and needs to be.
   // It turns out that the last thing to change is the thread name; that's a good proxy for "has
   // this thread _ever_ entered kRunnable".
-  return (jpeer_ == nullptr && opeer_ == nullptr) || (*name_ == kThreadNameDuringStartup);
+  return (tlsPtr_.jpeer == nullptr && tlsPtr_.opeer == nullptr) ||
+      (*tlsPtr_.name == kThreadNameDuringStartup);
 }
 
 void Thread::AssertNoPendingException() const {
@@ -1084,7 +1048,7 @@
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
 
-  if (opeer_ != nullptr) {
+  if (tlsPtr_.opeer != nullptr) {
     ScopedObjectAccess soa(self);
     // We may need to call user-supplied managed code, do this before final clean-up.
     HandleUncaughtExceptions(soa);
@@ -1092,16 +1056,18 @@
 
     // this.nativePeer = 0;
     if (Runtime::Current()->IsActiveTransaction()) {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<true>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<true>(tlsPtr_.opeer, 0);
     } else {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<false>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<false>(tlsPtr_.opeer, 0);
     }
     Dbg::PostThreadDeath(self);
 
     // Thread.join() is implemented as an Object.wait() on the Thread.lock object. Signal anyone
     // who is waiting.
     mirror::Object* lock =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(tlsPtr_.opeer);
     // (This conditional is only needed for tests, where Thread.lock won't have been set.)
     if (lock != nullptr) {
       SirtRef<mirror::Object> sirt_obj(self, lock);
@@ -1111,29 +1077,29 @@
   }
 
   // On thread detach, all monitors entered with JNI MonitorEnter are automatically exited.
-  if (jni_env_ != nullptr) {
-    jni_env_->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
+  if (tlsPtr_.jni_env != nullptr) {
+    tlsPtr_.jni_env->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
   }
 }
 
 Thread::~Thread() {
-  if (jni_env_ != nullptr && jpeer_ != nullptr) {
+  if (tlsPtr_.jni_env != nullptr && tlsPtr_.jpeer != nullptr) {
     // If pthread_create fails we don't have a jni env here.
-    jni_env_->DeleteGlobalRef(jpeer_);
-    jpeer_ = nullptr;
+    tlsPtr_.jni_env->DeleteGlobalRef(tlsPtr_.jpeer);
+    tlsPtr_.jpeer = nullptr;
   }
-  opeer_ = nullptr;
+  tlsPtr_.opeer = nullptr;
 
-  bool initialized = (jni_env_ != nullptr);  // Did Thread::Init run?
+  bool initialized = (tlsPtr_.jni_env != nullptr);  // Did Thread::Init run?
   if (initialized) {
-    delete jni_env_;
-    jni_env_ = nullptr;
+    delete tlsPtr_.jni_env;
+    tlsPtr_.jni_env = nullptr;
   }
   CHECK_NE(GetState(), kRunnable);
   CHECK_NE(ReadFlag(kCheckpointRequest), true);
-  CHECK(checkpoint_functions_[0] == nullptr);
-  CHECK(checkpoint_functions_[1] == nullptr);
-  CHECK(checkpoint_functions_[2] == nullptr);
+  for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {  // No checkpoint slot may still be set.
+    CHECK(tlsPtr_.checkpoint_functions[i] == nullptr) << "checkpoint slot " << i;
+  }
 
   // We may be deleting a still born thread.
   SetStateUnsafe(kTerminated);
@@ -1141,19 +1107,19 @@
   delete wait_cond_;
   delete wait_mutex_;
 
-  if (long_jump_context_ != nullptr) {
-    delete long_jump_context_;
+  if (tlsPtr_.long_jump_context != nullptr) {
+    delete tlsPtr_.long_jump_context;
   }
 
   if (initialized) {
     CleanupCpu();
   }
 
-  delete debug_invoke_req_;
-  delete single_step_control_;
-  delete instrumentation_stack_;
-  delete name_;
-  delete stack_trace_sample_;
+  delete tlsPtr_.debug_invoke_req;
+  delete tlsPtr_.single_step_control;
+  delete tlsPtr_.instrumentation_stack;
+  delete tlsPtr_.name;
+  delete tlsPtr_.stack_trace_sample;
 
   Runtime::Current()->GetHeap()->RevokeThreadLocalBuffers(this);
 
@@ -1164,47 +1130,50 @@
   if (!IsExceptionPending()) {
     return;
   }
-  ScopedLocalRef<jobject> peer(jni_env_, soa.AddLocalReference<jobject>(opeer_));
+  ScopedLocalRef<jobject> peer(tlsPtr_.jni_env, soa.AddLocalReference<jobject>(tlsPtr_.opeer));
   ScopedThreadStateChange tsc(this, kNative);
 
   // Get and clear the exception.
-  ScopedLocalRef<jthrowable> exception(jni_env_, jni_env_->ExceptionOccurred());
-  jni_env_->ExceptionClear();
+  ScopedLocalRef<jthrowable> exception(tlsPtr_.jni_env, tlsPtr_.jni_env->ExceptionOccurred());
+  tlsPtr_.jni_env->ExceptionClear();
 
   // If the thread has its own handler, use that.
-  ScopedLocalRef<jobject> handler(jni_env_,
-                                  jni_env_->GetObjectField(peer.get(),
-                                                           WellKnownClasses::java_lang_Thread_uncaughtHandler));
+  ScopedLocalRef<jobject> handler(tlsPtr_.jni_env,
+                                  tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                      WellKnownClasses::java_lang_Thread_uncaughtHandler));
   if (handler.get() == nullptr) {
     // Otherwise use the thread group's default handler.
-    handler.reset(jni_env_->GetObjectField(peer.get(), WellKnownClasses::java_lang_Thread_group));
+    handler.reset(tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                                  WellKnownClasses::java_lang_Thread_group));
   }
 
   // Call the handler.
-  jni_env_->CallVoidMethod(handler.get(),
-                           WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
-                           peer.get(), exception.get());
+  tlsPtr_.jni_env->CallVoidMethod(handler.get(),
+      WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
+      peer.get(), exception.get());
 
   // If the handler threw, clear that exception too.
-  jni_env_->ExceptionClear();
+  tlsPtr_.jni_env->ExceptionClear();
 }
 
 void Thread::RemoveFromThreadGroup(ScopedObjectAccess& soa) {
   // this.group.removeThread(this);
   // group can be null if we're in the compiler or a test.
-  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(opeer_);
+  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)
+      ->GetObject(tlsPtr_.opeer);
   if (ogroup != nullptr) {
     ScopedLocalRef<jobject> group(soa.Env(), soa.AddLocalReference<jobject>(ogroup));
-    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(opeer_));
+    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(tlsPtr_.opeer));
     ScopedThreadStateChange tsc(soa.Self(), kNative);
-    jni_env_->CallVoidMethod(group.get(), WellKnownClasses::java_lang_ThreadGroup_removeThread,
-                             peer.get());
+    tlsPtr_.jni_env->CallVoidMethod(group.get(),
+                                    WellKnownClasses::java_lang_ThreadGroup_removeThread,
+                                    peer.get());
   }
 }
 
 size_t Thread::NumSirtReferences() {
   size_t count = 0;
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     count += cur->NumberOfReferences();
   }
   return count;
@@ -1213,17 +1182,17 @@
 bool Thread::SirtContains(jobject obj) const {
   StackReference<mirror::Object>* sirt_entry =
       reinterpret_cast<StackReference<mirror::Object>*>(obj);
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     if (cur->Contains(sirt_entry)) {
       return true;
     }
   }
   // JNI code invoked from portable code uses shadow frames rather than the SIRT.
-  return managed_stack_.ShadowFramesContain(sirt_entry);
+  return tlsPtr_.managed_stack.ShadowFramesContain(sirt_entry);
 }
 
 void Thread::SirtVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id) {
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     size_t num_refs = cur->NumberOfReferences();
     for (size_t j = 0; j < num_refs; ++j) {
       mirror::Object* object = cur->GetReference(j);
@@ -1248,7 +1217,7 @@
   mirror::Object* result;
   // The "kinds" below are sorted by the frequency we expect to encounter them.
   if (kind == kLocal) {
-    IndirectReferenceTable& locals = jni_env_->locals;
+    IndirectReferenceTable& locals = tlsPtr_.jni_env->locals;
     result = locals.Get(ref);
   } else if (kind == kSirtOrInvalid) {
     // TODO: make stack indirect reference table lookup more efficient.
@@ -1287,19 +1256,18 @@
 // Implements java.lang.Thread.interrupted.
 bool Thread::Interrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  bool interrupted = interrupted_;
-  interrupted_ = false;
+  bool interrupted = IsInterruptedLocked();
+  SetInterruptedLocked(false);
   return interrupted;
 }
 
 // Implements java.lang.Thread.isInterrupted.
 bool Thread::IsInterrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  return interrupted_;
+  return IsInterruptedLocked();
 }
 
-void Thread::Interrupt() {
-  Thread* self = Thread::Current();
+void Thread::Interrupt(Thread* self) {
   MutexLock mu(self, *wait_mutex_);
   if (interrupted_) {
     return;
@@ -1677,12 +1645,12 @@
 
 void Thread::ThrowOutOfMemoryError(const char* msg) {
   LOG(ERROR) << StringPrintf("Throwing OutOfMemoryError \"%s\"%s",
-      msg, (throwing_OutOfMemoryError_ ? " (recursive case)" : ""));
+      msg, (tls32_.throwing_OutOfMemoryError ? " (recursive case)" : ""));
   ThrowLocation throw_location = GetCurrentLocationForThrow();
-  if (!throwing_OutOfMemoryError_) {
-    throwing_OutOfMemoryError_ = true;
+  if (!tls32_.throwing_OutOfMemoryError) {
+    tls32_.throwing_OutOfMemoryError = true;
     ThrowNewException(throw_location, "Ljava/lang/OutOfMemoryError;", msg);
-    throwing_OutOfMemoryError_ = false;
+    tls32_.throwing_OutOfMemoryError = false;
   } else {
     Dump(LOG(ERROR));  // The pre-allocated OOME has no stack, so help out and log one.
     SetException(throw_location, Runtime::Current()->GetPreAllocatedOutOfMemoryError());
@@ -1705,140 +1673,146 @@
 #endif
 }
 
-struct EntryPointInfo {
-  uint32_t offset;
-  const char* name;
-};
-#define INTERPRETER_ENTRY_POINT_INFO(x) { INTERPRETER_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define JNI_ENTRY_POINT_INFO(x)         { JNI_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define PORTABLE_ENTRY_POINT_INFO(x)    { PORTABLE_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define QUICK_ENTRY_POINT_INFO(x)       { QUICK_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-static const EntryPointInfo gThreadEntryPointInfo[] = {
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge),
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge),
-  JNI_ENTRY_POINT_INFO(pDlsymLookup),
-  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pAllocArray),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pAllocObject),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial),
-  QUICK_ENTRY_POINT_INFO(pCheckCast),
-  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage),
-  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess),
-  QUICK_ENTRY_POINT_INFO(pInitializeType),
-  QUICK_ENTRY_POINT_INFO(pResolveString),
-  QUICK_ENTRY_POINT_INFO(pSet32Instance),
-  QUICK_ENTRY_POINT_INFO(pSet32Static),
-  QUICK_ENTRY_POINT_INFO(pSet64Instance),
-  QUICK_ENTRY_POINT_INFO(pSet64Static),
-  QUICK_ENTRY_POINT_INFO(pSetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pSetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pGet32Instance),
-  QUICK_ENTRY_POINT_INFO(pGet32Static),
-  QUICK_ENTRY_POINT_INFO(pGet64Instance),
-  QUICK_ENTRY_POINT_INFO(pGet64Static),
-  QUICK_ENTRY_POINT_INFO(pGetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pGetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObject),
-  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStart),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEnd),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized),
-  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline),
-  QUICK_ENTRY_POINT_INFO(pLockObject),
-  QUICK_ENTRY_POINT_INFO(pUnlockObject),
-  QUICK_ENTRY_POINT_INFO(pCmpgDouble),
-  QUICK_ENTRY_POINT_INFO(pCmpgFloat),
-  QUICK_ENTRY_POINT_INFO(pCmplDouble),
-  QUICK_ENTRY_POINT_INFO(pCmplFloat),
-  QUICK_ENTRY_POINT_INFO(pFmod),
-  QUICK_ENTRY_POINT_INFO(pSqrt),
-  QUICK_ENTRY_POINT_INFO(pL2d),
-  QUICK_ENTRY_POINT_INFO(pFmodf),
-  QUICK_ENTRY_POINT_INFO(pL2f),
-  QUICK_ENTRY_POINT_INFO(pD2iz),
-  QUICK_ENTRY_POINT_INFO(pF2iz),
-  QUICK_ENTRY_POINT_INFO(pIdivmod),
-  QUICK_ENTRY_POINT_INFO(pD2l),
-  QUICK_ENTRY_POINT_INFO(pF2l),
-  QUICK_ENTRY_POINT_INFO(pLdiv),
-  QUICK_ENTRY_POINT_INFO(pLmod),
-  QUICK_ENTRY_POINT_INFO(pLmul),
-  QUICK_ENTRY_POINT_INFO(pShlLong),
-  QUICK_ENTRY_POINT_INFO(pShrLong),
-  QUICK_ENTRY_POINT_INFO(pUshrLong),
-  QUICK_ENTRY_POINT_INFO(pIndexOf),
-  QUICK_ENTRY_POINT_INFO(pMemcmp16),
-  QUICK_ENTRY_POINT_INFO(pStringCompareTo),
-  QUICK_ENTRY_POINT_INFO(pMemcpy),
-  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckSuspend),
-  QUICK_ENTRY_POINT_INFO(pTestSuspend),
-  QUICK_ENTRY_POINT_INFO(pDeliverException),
-  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds),
-  QUICK_ENTRY_POINT_INFO(pThrowDivZero),
-  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod),
-  QUICK_ENTRY_POINT_INFO(pThrowNullPointer),
-  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow),
-};
-#undef QUICK_ENTRY_POINT_INFO
+// Explicitly instantiate 32 and 64bit thread offset dumping support.
+template void Thread::DumpThreadOffset<4>(std::ostream& os, uint32_t offset);
+template void Thread::DumpThreadOffset<8>(std::ostream& os, uint32_t offset);
 
-void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers) {
-  CHECK_EQ(size_of_pointers, 4U);  // TODO: support 64-bit targets.
-
-#define DO_THREAD_OFFSET(x) \
-    if (offset == static_cast<uint32_t>(OFFSETOF_VOLATILE_MEMBER(Thread, x))) { \
-      os << # x; \
+template<size_t ptr_size>
+void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset) {
+#define DO_THREAD_OFFSET(x, y) \
+    if (offset == x.Uint32Value()) { \
+      os << y; \
       return; \
     }
-  DO_THREAD_OFFSET(state_and_flags_);
-  DO_THREAD_OFFSET(card_table_);
-  DO_THREAD_OFFSET(exception_);
-  DO_THREAD_OFFSET(opeer_);
-  DO_THREAD_OFFSET(jni_env_);
-  DO_THREAD_OFFSET(self_);
-  DO_THREAD_OFFSET(stack_end_);
-  DO_THREAD_OFFSET(suspend_count_);
-  DO_THREAD_OFFSET(thin_lock_thread_id_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_pc_);
-  DO_THREAD_OFFSET(top_sirt_);
-  DO_THREAD_OFFSET(suspend_trigger_);
+  DO_THREAD_OFFSET(ThreadFlagsOffset<ptr_size>(), "state_and_flags")
+  DO_THREAD_OFFSET(CardTableOffset<ptr_size>(), "card_table")
+  DO_THREAD_OFFSET(ExceptionOffset<ptr_size>(), "exception")
+  DO_THREAD_OFFSET(PeerOffset<ptr_size>(), "peer");
+  DO_THREAD_OFFSET(JniEnvOffset<ptr_size>(), "jni_env")
+  DO_THREAD_OFFSET(SelfOffset<ptr_size>(), "self")
+  DO_THREAD_OFFSET(StackEndOffset<ptr_size>(), "stack_end")
+  DO_THREAD_OFFSET(ThinLockIdOffset<ptr_size>(), "thin_lock_thread_id")
+  DO_THREAD_OFFSET(TopOfManagedStackOffset<ptr_size>(), "top_quick_frame_method")
+  DO_THREAD_OFFSET(TopOfManagedStackPcOffset<ptr_size>(), "top_quick_frame_pc")
+  DO_THREAD_OFFSET(TopShadowFrameOffset<ptr_size>(), "top_shadow_frame")
+  DO_THREAD_OFFSET(TopSirtOffset<ptr_size>(), "top_sirt")
+  DO_THREAD_OFFSET(ThreadSuspendTriggerOffset<ptr_size>(), "suspend_trigger")
 #undef DO_THREAD_OFFSET
 
-  size_t entry_point_count = arraysize(gThreadEntryPointInfo);
-  CHECK_EQ(entry_point_count * size_of_pointers,
-           sizeof(InterpreterEntryPoints) + sizeof(JniEntryPoints) + sizeof(PortableEntryPoints) +
-           sizeof(QuickEntryPoints));
-  uint32_t expected_offset = OFFSETOF_MEMBER(Thread, interpreter_entrypoints_);
-  for (size_t i = 0; i < entry_point_count; ++i) {
-    CHECK_EQ(gThreadEntryPointInfo[i].offset, expected_offset) << gThreadEntryPointInfo[i].name;
-    expected_offset += size_of_pointers;
-    if (gThreadEntryPointInfo[i].offset == offset) {
-      os << gThreadEntryPointInfo[i].name;
-      return;
+#define INTERPRETER_ENTRY_POINT_INFO(x) \
+    if (INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
     }
-  }
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge)
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge)
+#undef INTERPRETER_ENTRY_POINT_INFO
+
+#define JNI_ENTRY_POINT_INFO(x) \
+    if (JNI_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  JNI_ENTRY_POINT_INFO(pDlsymLookup)
+#undef JNI_ENTRY_POINT_INFO
+
+#define PORTABLE_ENTRY_POINT_INFO(x) \
+    if (PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge)
+#undef PORTABLE_ENTRY_POINT_INFO
+
+#define QUICK_ENTRY_POINT_INFO(x) \
+    if (QUICK_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  QUICK_ENTRY_POINT_INFO(pAllocArray)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pAllocObject)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial)
+  QUICK_ENTRY_POINT_INFO(pCheckCast)
+  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage)
+  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess)
+  QUICK_ENTRY_POINT_INFO(pInitializeType)
+  QUICK_ENTRY_POINT_INFO(pResolveString)
+  QUICK_ENTRY_POINT_INFO(pSet32Instance)
+  QUICK_ENTRY_POINT_INFO(pSet32Static)
+  QUICK_ENTRY_POINT_INFO(pSet64Instance)
+  QUICK_ENTRY_POINT_INFO(pSet64Static)
+  QUICK_ENTRY_POINT_INFO(pSetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pSetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pGet32Instance)
+  QUICK_ENTRY_POINT_INFO(pGet32Static)
+  QUICK_ENTRY_POINT_INFO(pGet64Instance)
+  QUICK_ENTRY_POINT_INFO(pGet64Static)
+  QUICK_ENTRY_POINT_INFO(pGetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pGetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObject)
+  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStart)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEnd)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized)
+  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline)
+  QUICK_ENTRY_POINT_INFO(pLockObject)
+  QUICK_ENTRY_POINT_INFO(pUnlockObject)
+  QUICK_ENTRY_POINT_INFO(pCmpgDouble)
+  QUICK_ENTRY_POINT_INFO(pCmpgFloat)
+  QUICK_ENTRY_POINT_INFO(pCmplDouble)
+  QUICK_ENTRY_POINT_INFO(pCmplFloat)
+  QUICK_ENTRY_POINT_INFO(pFmod)
+  QUICK_ENTRY_POINT_INFO(pSqrt)
+  QUICK_ENTRY_POINT_INFO(pL2d)
+  QUICK_ENTRY_POINT_INFO(pFmodf)
+  QUICK_ENTRY_POINT_INFO(pL2f)
+  QUICK_ENTRY_POINT_INFO(pD2iz)
+  QUICK_ENTRY_POINT_INFO(pF2iz)
+  QUICK_ENTRY_POINT_INFO(pIdivmod)
+  QUICK_ENTRY_POINT_INFO(pD2l)
+  QUICK_ENTRY_POINT_INFO(pF2l)
+  QUICK_ENTRY_POINT_INFO(pLdiv)
+  QUICK_ENTRY_POINT_INFO(pLmod)
+  QUICK_ENTRY_POINT_INFO(pLmul)
+  QUICK_ENTRY_POINT_INFO(pShlLong)
+  QUICK_ENTRY_POINT_INFO(pShrLong)
+  QUICK_ENTRY_POINT_INFO(pUshrLong)
+  QUICK_ENTRY_POINT_INFO(pIndexOf)
+  QUICK_ENTRY_POINT_INFO(pMemcmp16)
+  QUICK_ENTRY_POINT_INFO(pStringCompareTo)
+  QUICK_ENTRY_POINT_INFO(pMemcpy)
+  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge)
+  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckSuspend)
+  QUICK_ENTRY_POINT_INFO(pTestSuspend)
+  QUICK_ENTRY_POINT_INFO(pDeliverException)
+  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds)
+  QUICK_ENTRY_POINT_INFO(pThrowDivZero)
+  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod)
+  QUICK_ENTRY_POINT_INFO(pThrowNullPointer)
+  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow)
+#undef QUICK_ENTRY_POINT_INFO
+
   os << offset;
 }
 
@@ -1869,11 +1843,11 @@
 }
 
 Context* Thread::GetLongJumpContext() {
-  Context* result = long_jump_context_;
+  Context* result = tlsPtr_.long_jump_context;
   if (result == nullptr) {
     result = Context::Create();
   } else {
-    long_jump_context_ = nullptr;  // Avoid context being shared.
+    tlsPtr_.long_jump_context = nullptr;  // Avoid context being shared.
     result->Reset();
   }
   return result;
@@ -1918,11 +1892,11 @@
   return ThrowLocation(visitor.this_object_, visitor.method_, visitor.dex_pc_);
 }
 
-bool Thread::HoldsLock(mirror::Object* object) {
+bool Thread::HoldsLock(mirror::Object* object) const {
   if (object == nullptr) {
     return false;
   }
-  return object->GetLockOwnerThreadId() == thin_lock_thread_id_;
+  return object->GetLockOwnerThreadId() == GetThreadId();
 }
 
 // RootVisitor parameters are: (const Object* obj, size_t vreg, const StackVisitor* visitor).
@@ -2061,30 +2035,30 @@
 
 void Thread::SetClassLoaderOverride(mirror::ClassLoader* class_loader_override) {
   VerifyObject(class_loader_override);
-  class_loader_override_ = class_loader_override;
+  tlsPtr_.class_loader_override = class_loader_override;
 }
 
 void Thread::VisitRoots(RootCallback* visitor, void* arg) {
   uint32_t thread_id = GetThreadId();
-  if (opeer_ != nullptr) {
-    visitor(&opeer_, arg, thread_id, kRootThreadObject);
+  if (tlsPtr_.opeer != nullptr) {
+    visitor(&tlsPtr_.opeer, arg, thread_id, kRootThreadObject);
   }
-  if (exception_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&exception_), arg, thread_id, kRootNativeStack);
+  if (tlsPtr_.exception != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception), arg, thread_id, kRootNativeStack);
   }
-  throw_location_.VisitRoots(visitor, arg);
-  if (class_loader_override_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&class_loader_override_), arg, thread_id,
+  tlsPtr_.throw_location.VisitRoots(visitor, arg);
+  if (tlsPtr_.class_loader_override != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.class_loader_override), arg, thread_id,
             kRootNativeStack);
   }
-  jni_env_->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
-  jni_env_->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
+  tlsPtr_.jni_env->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
+  tlsPtr_.jni_env->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
   SirtVisitRoots(visitor, arg, thread_id);
-  if (debug_invoke_req_ != nullptr) {
-    debug_invoke_req_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.debug_invoke_req != nullptr) {
+    tlsPtr_.debug_invoke_req->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
-  if (single_step_control_ != nullptr) {
-    single_step_control_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.single_step_control != nullptr) {
+    tlsPtr_.single_step_control->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
@@ -2116,7 +2090,7 @@
 // Set the stack end to that to be used during a stack overflow
 void Thread::SetStackEndForStackOverflow() {
   // During stack overflow we allow use of the full stack.
-  if (stack_end_ == stack_begin_) {
+  if (tlsPtr_.stack_end == tlsPtr_.stack_begin) {
     // However, we seem to have already extended to use the full stack.
     LOG(ERROR) << "Need to increase kStackOverflowReservedBytes (currently "
                << kStackOverflowReservedBytes << ")?";
@@ -2124,23 +2098,23 @@
     LOG(FATAL) << "Recursive stack overflow.";
   }
 
-  stack_end_ = stack_begin_;
+  tlsPtr_.stack_end = tlsPtr_.stack_begin;
 }
 
 void Thread::SetTlab(byte* start, byte* end) {
   DCHECK_LE(start, end);
-  thread_local_start_ = start;
-  thread_local_pos_  = thread_local_start_;
-  thread_local_end_ = end;
-  thread_local_objects_ = 0;
+  tlsPtr_.thread_local_start = start;
+  tlsPtr_.thread_local_pos  = tlsPtr_.thread_local_start;
+  tlsPtr_.thread_local_end = end;
+  tlsPtr_.thread_local_objects = 0;
 }
 
 bool Thread::HasTlab() const {
-  bool has_tlab = thread_local_pos_ != nullptr;
+  bool has_tlab = tlsPtr_.thread_local_pos != nullptr;
   if (has_tlab) {
-    DCHECK(thread_local_start_ != nullptr && thread_local_end_ != nullptr);
+    DCHECK(tlsPtr_.thread_local_start != nullptr && tlsPtr_.thread_local_end != nullptr);
   } else {
-    DCHECK(thread_local_start_ == nullptr && thread_local_end_ == nullptr);
+    DCHECK(tlsPtr_.thread_local_start == nullptr && tlsPtr_.thread_local_end == nullptr);
   }
   return has_tlab;
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index 63d22c5..59fe724 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -92,7 +92,7 @@
   kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
 };
 
-class PACKED(4) Thread {
+class Thread {
  public:
   // Space to throw a StackOverflowError in.
   // TODO: shrink reserved space, in particular for 64bit.
@@ -145,7 +145,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Translates 172 to pAllocArrayFromCode and so on.
-  static void DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers);
+  template<size_t size_of_pointers>
+  static void DumpThreadOffset(std::ostream& os, uint32_t offset);
 
   // Dumps a one-line summary of thread state (used for operator<<).
   void ShortDump(std::ostream& os) const;
@@ -162,32 +163,24 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ThreadState GetState() const {
-    DCHECK(state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended);
-    return static_cast<ThreadState>(state_and_flags_.as_struct.state);
-  }
-
-  // This function can be used to make sure a thread's state is valid.
-  void CheckState(int id) const {
-    if (state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended) {
-      return;
-    }
-    LOG(INFO) << "Thread " << this << " state is invalid: " << state_and_flags_.as_struct.state << " id=" << id;
-    CHECK(false);
+    DCHECK_GE(tls32_.state_and_flags.as_struct.state, kTerminated);
+    DCHECK_LE(tls32_.state_and_flags.as_struct.state, kSuspended);
+    return static_cast<ThreadState>(tls32_.state_and_flags.as_struct.state);
   }
 
   ThreadState SetState(ThreadState new_state);
 
   int GetSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return suspend_count_;
+    return tls32_.suspend_count;
   }
 
   int GetDebugSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return debug_suspend_count_;
+    return tls32_.debug_suspend_count;
   }
 
   bool IsSuspended() const {
     union StateAndFlags state_and_flags;
-    state_and_flags.as_int = state_and_flags_.as_int;
+    state_and_flags.as_int = tls32_.state_and_flags.as_int;
     return state_and_flags.as_struct.state != kRunnable &&
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
@@ -221,9 +214,9 @@
   const char* StartAssertNoThreadSuspension(const char* cause) {
     if (kIsDebugBuild) {
       CHECK(cause != NULL);
-      const char* previous_cause = last_no_thread_suspension_cause_;
-      no_thread_suspension_++;
-      last_no_thread_suspension_cause_ = cause;
+      const char* previous_cause = tlsPtr_.last_no_thread_suspension_cause;
+      tls32_.no_thread_suspension++;
+      tlsPtr_.last_no_thread_suspension_cause = cause;
       return previous_cause;
     } else {
       return nullptr;
@@ -233,20 +226,20 @@
   // End region where no thread suspension is expected.
   void EndAssertNoThreadSuspension(const char* old_cause) {
     if (kIsDebugBuild) {
-      CHECK(old_cause != NULL || no_thread_suspension_ == 1);
-      CHECK_GT(no_thread_suspension_, 0U);
-      no_thread_suspension_--;
-      last_no_thread_suspension_cause_ = old_cause;
+      CHECK(old_cause != nullptr || tls32_.no_thread_suspension == 1);
+      CHECK_GT(tls32_.no_thread_suspension, 0U);
+      tls32_.no_thread_suspension--;
+      tlsPtr_.last_no_thread_suspension_cause = old_cause;
     }
   }
 
   void AssertThreadSuspensionIsAllowable(bool check_locks = true) const;
 
   bool IsDaemon() const {
-    return daemon_;
+    return tls32_.daemon;
   }
 
-  bool HoldsLock(mirror::Object*) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool HoldsLock(mirror::Object*) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
    * Changes the priority of this thread to match that of the java.lang.Thread object.
@@ -265,11 +258,11 @@
   static int GetNativePriority();
 
   uint32_t GetThreadId() const {
-    return thin_lock_thread_id_;
+    return tls32_.thin_lock_thread_id;
   }
 
   pid_t GetTid() const {
-    return tid_;
+    return tls32_.tid;
   }
 
   // Returns the java.lang.Thread's name, or NULL if this Thread* doesn't have a peer.
@@ -287,30 +280,30 @@
   uint64_t GetCpuMicroTime() const;
 
   mirror::Object* GetPeer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CHECK(jpeer_ == NULL);
-    return opeer_;
+    CHECK(tlsPtr_.jpeer == nullptr);
+    return tlsPtr_.opeer;
   }
 
   bool HasPeer() const {
-    return jpeer_ != NULL || opeer_ != NULL;
+    return tlsPtr_.jpeer != nullptr || tlsPtr_.opeer != nullptr;
   }
 
   RuntimeStats* GetStats() {
-    return &stats_;
+    return &tls64_.stats;
   }
 
   bool IsStillStarting() const;
 
   bool IsExceptionPending() const {
-    return exception_ != NULL;
+    return tlsPtr_.exception != nullptr;
   }
 
   mirror::Throwable* GetException(ThrowLocation* throw_location) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (throw_location != NULL) {
-      *throw_location = throw_location_;
+    if (throw_location != nullptr) {
+      *throw_location = tlsPtr_.throw_location;
     }
-    return exception_;
+    return tlsPtr_.exception;
   }
 
   void AssertNoPendingException() const;
@@ -320,13 +313,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(new_exception != NULL);
     // TODO: DCHECK(!IsExceptionPending());
-    exception_ = new_exception;
-    throw_location_ = throw_location;
+    tlsPtr_.exception = new_exception;
+    tlsPtr_.throw_location = throw_location;
   }
 
   void ClearException() {
-    exception_ = NULL;
-    throw_location_.Clear();
+    tlsPtr_.exception = nullptr;
+    tlsPtr_.throw_location.Clear();
   }
 
   // Find catch block and perform long jump to appropriate exception handle
@@ -334,8 +327,8 @@
 
   Context* GetLongJumpContext();
   void ReleaseLongJumpContext(Context* context) {
-    DCHECK(long_jump_context_ == NULL);
-    long_jump_context_ = context;
+    DCHECK(tlsPtr_.long_jump_context == nullptr);
+    tlsPtr_.long_jump_context = context;
   }
 
   mirror::ArtMethod* GetCurrentMethod(uint32_t* dex_pc) const
@@ -344,16 +337,17 @@
   ThrowLocation GetCurrentLocationForThrow() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void SetTopOfStack(mirror::ArtMethod** top_method, uintptr_t pc) {
-    managed_stack_.SetTopQuickFrame(top_method);
-    managed_stack_.SetTopQuickFramePc(pc);
+    tlsPtr_.managed_stack.SetTopQuickFrame(top_method);
+    tlsPtr_.managed_stack.SetTopQuickFramePc(pc);
   }
 
   void SetTopOfShadowStack(ShadowFrame* top) {
-    managed_stack_.SetTopShadowFrame(top);
+    tlsPtr_.managed_stack.SetTopShadowFrame(top);
   }
 
   bool HasManagedStack() const {
-    return managed_stack_.GetTopQuickFrame() != NULL || managed_stack_.GetTopShadowFrame() != NULL;
+    return (tlsPtr_.managed_stack.GetTopQuickFrame() != nullptr) ||
+        (tlsPtr_.managed_stack.GetTopShadowFrame() != nullptr);
   }
 
   // If 'msg' is NULL, no detail message is set.
@@ -387,21 +381,65 @@
 
   // JNI methods
   JNIEnvExt* GetJniEnv() const {
-    return jni_env_;
+    return tlsPtr_.jni_env;
   }
 
   // Convert a jobject into a Object*
   mirror::Object* DecodeJObject(jobject obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  mirror::Object* GetMonitorEnterObject() const {
+    return tlsPtr_.monitor_enter_object;
+  }
+
+  void SetMonitorEnterObject(mirror::Object* obj) {
+    tlsPtr_.monitor_enter_object = obj;
+  }
+
   // Implements java.lang.Thread.interrupted.
-  bool Interrupted();
+  bool Interrupted() LOCKS_EXCLUDED(wait_mutex_);
   // Implements java.lang.Thread.isInterrupted.
-  bool IsInterrupted();
-  void Interrupt();
-  void Notify();
+  bool IsInterrupted() LOCKS_EXCLUDED(wait_mutex_);
+  bool IsInterruptedLocked() EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return interrupted_;
+  }
+  void Interrupt(Thread* self) LOCKS_EXCLUDED(wait_mutex_);
+  void SetInterruptedLocked(bool i) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    interrupted_ = i;
+  }
+  void Notify() LOCKS_EXCLUDED(wait_mutex_);
+
+ private:
+  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
+
+ public:
+  Mutex* GetWaitMutex() const LOCK_RETURNED(wait_mutex_) {
+    return wait_mutex_;
+  }
+
+  ConditionVariable* GetWaitConditionVariable() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_cond_;
+  }
+
+  Monitor* GetWaitMonitor() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_monitor_;
+  }
+
+  void SetWaitMonitor(Monitor* mon) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    wait_monitor_ = mon;
+  }
+
+
+  // Waiter linked-list support.
+  Thread* GetWaitNext() const {
+    return tlsPtr_.wait_next;
+  }
+
+  void SetWaitNext(Thread* next) {
+    tlsPtr_.wait_next = next;
+  }
 
   mirror::ClassLoader* GetClassLoaderOverride() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return class_loader_override_;
+    return tlsPtr_.class_loader_override;
   }
 
   void SetClassLoaderOverride(mirror::ClassLoader* class_loader_override)
@@ -428,41 +466,99 @@
   // Offsets of various members of native Thread class, used by compiled code.
   //
 
-  static ThreadOffset SelfOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, self_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThinLockIdOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, thin_lock_thread_id));
   }
 
-  static ThreadOffset ExceptionOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, exception_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadFlagsOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, state_and_flags));
   }
 
-  static ThreadOffset PeerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, opeer_));
+ private:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadOffsetFromTlsPtr(size_t tls_ptr_offset) {
+    size_t base = OFFSETOF_MEMBER(Thread, tlsPtr_);
+    size_t scale;
+    size_t shrink;
+    if (pointer_size == sizeof(void*)) {
+      scale = 1;
+      shrink = 1;
+    } else if (pointer_size > sizeof(void*)) {
+      scale = pointer_size / sizeof(void*);
+      shrink = 1;
+    } else {
+      DCHECK_GT(sizeof(void*), pointer_size);
+      scale = 1;
+      shrink = sizeof(void*) / pointer_size;
+    }
+    return ThreadOffset<pointer_size>(base + ((tls_ptr_offset * scale) / shrink));
   }
 
-  static ThreadOffset ThinLockIdOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+ public:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> QuickEntryPointOffset(size_t quick_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, quick_entrypoints) + quick_entrypoint_offset);
   }
 
-  static ThreadOffset CardTableOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, card_table_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> InterpreterEntryPointOffset(size_t interp_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, interpreter_entrypoints) + interp_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadFlagsOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, state_and_flags_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEntryPointOffset(size_t jni_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadSuspendTriggerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, suspend_trigger_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PortableEntryPointOffset(size_t port_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, portable_entrypoints) + port_entrypoint_offset);
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> SelfOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ExceptionOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, exception));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PeerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, opeer));
+  }
+
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> CardTableOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, card_table));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadSuspendTriggerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, suspend_trigger));
   }
 
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
-    return stack_size_ - (stack_end_ - stack_begin_);
+    return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
   }
 
   byte* GetStackEnd() const {
-    return stack_end_;
+    return tlsPtr_.stack_end;
   }
 
   // Set the stack end to that to be used during a stack overflow
@@ -475,9 +571,9 @@
     if (implicit_overflow_check) {
       // For implicit checks we also need to add in the protected region above the
       // overflow region.
-      stack_end_ = stack_begin_ + kStackOverflowImplicitCheckSize;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowImplicitCheckSize;
     } else {
-      stack_end_ = stack_begin_ + kStackOverflowReservedBytes;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowReservedBytes;
     }
   }
 
@@ -485,55 +581,65 @@
   void InstallImplicitProtection(bool is_main_stack);
 
   bool IsHandlingStackOverflow() const {
-    return stack_end_ == stack_begin_;
+    return tlsPtr_.stack_end == tlsPtr_.stack_begin;
   }
 
-  static ThreadOffset StackEndOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, stack_end_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> StackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, stack_end));
   }
 
-  static ThreadOffset JniEnvOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, jni_env_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEnvOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_env));
   }
 
-  static ThreadOffset TopOfManagedStackOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFrameOffset());
   }
 
-  static ThreadOffset TopOfManagedStackPcOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFramePcOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackPcOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFramePcOffset());
   }
 
   const ManagedStack* GetManagedStack() const {
-    return &managed_stack_;
+    return &tlsPtr_.managed_stack;
   }
 
   // Linked list recording fragments of managed stack.
   void PushManagedStackFragment(ManagedStack* fragment) {
-    managed_stack_.PushManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PushManagedStackFragment(fragment);
   }
   void PopManagedStackFragment(const ManagedStack& fragment) {
-    managed_stack_.PopManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PopManagedStackFragment(fragment);
   }
 
   ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    return managed_stack_.PushShadowFrame(new_top_frame);
+    return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
   }
 
   ShadowFrame* PopShadowFrame() {
-    return managed_stack_.PopShadowFrame();
+    return tlsPtr_.managed_stack.PopShadowFrame();
   }
 
-  static ThreadOffset TopShadowFrameOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopShadowFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopShadowFrameOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopShadowFrameOffset());
   }
 
   // Number of references allocated in JNI ShadowFrames on this thread.
   size_t NumJniShadowFrameReferences() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return managed_stack_.NumJniShadowFrameReferences();
+    return tlsPtr_.managed_stack.NumJniShadowFrameReferences();
   }
 
   // Number of references in SIRTs on this thread.
@@ -551,27 +657,28 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void PushSirt(StackIndirectReferenceTable* sirt) {
-    sirt->SetLink(top_sirt_);
-    top_sirt_ = sirt;
+    sirt->SetLink(tlsPtr_.top_sirt);
+    tlsPtr_.top_sirt = sirt;
   }
 
   StackIndirectReferenceTable* PopSirt() {
-    StackIndirectReferenceTable* sirt = top_sirt_;
+    StackIndirectReferenceTable* sirt = tlsPtr_.top_sirt;
     DCHECK(sirt != NULL);
-    top_sirt_ = top_sirt_->GetLink();
+    tlsPtr_.top_sirt = tlsPtr_.top_sirt->GetLink();
     return sirt;
   }
 
-  static ThreadOffset TopSirtOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, top_sirt_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopSirtOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, top_sirt));
   }
 
-  DebugInvokeReq* GetInvokeReq() {
-    return debug_invoke_req_;
+  DebugInvokeReq* GetInvokeReq() const {
+    return tlsPtr_.debug_invoke_req;
   }
 
   SingleStepControl* GetSingleStepControl() const {
-    return single_step_control_;
+    return tlsPtr_.single_step_control;
   }
 
   void SetDeoptimizationShadowFrame(ShadowFrame* sf);
@@ -580,41 +687,41 @@
   ShadowFrame* GetAndClearDeoptimizationShadowFrame(JValue* ret_val);
 
   std::deque<instrumentation::InstrumentationStackFrame>* GetInstrumentationStack() {
-    return instrumentation_stack_;
+    return tlsPtr_.instrumentation_stack;
   }
 
   std::vector<mirror::ArtMethod*>* GetStackTraceSample() const {
-    return stack_trace_sample_;
+    return tlsPtr_.stack_trace_sample;
   }
 
   void SetStackTraceSample(std::vector<mirror::ArtMethod*>* sample) {
-    stack_trace_sample_ = sample;
+    tlsPtr_.stack_trace_sample = sample;
   }
 
   uint64_t GetTraceClockBase() const {
-    return trace_clock_base_;
+    return tls64_.trace_clock_base;
   }
 
   void SetTraceClockBase(uint64_t clock_base) {
-    trace_clock_base_ = clock_base;
+    tls64_.trace_clock_base = clock_base;
   }
 
   BaseMutex* GetHeldMutex(LockLevel level) const {
-    return held_mutexes_[level];
+    return tlsPtr_.held_mutexes[level];
   }
 
   void SetHeldMutex(LockLevel level, BaseMutex* mutex) {
-    held_mutexes_[level] = mutex;
+    tlsPtr_.held_mutexes[level] = mutex;
   }
 
   void RunCheckpointFunction();
 
   bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
+    return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
   }
 
   bool TestAllFlags() const {
-    return (state_and_flags_.as_struct.flags != 0);
+    return (tls32_.state_and_flags.as_struct.flags != 0);
   }
 
   void AtomicSetFlag(ThreadFlag flag);
@@ -623,11 +730,57 @@
 
   void ResetQuickAllocEntryPointsForThread();
 
- private:
-  // We have no control over the size of 'bool', but want our boolean fields
-  // to be 4-byte quantities.
-  typedef uint32_t bool32_t;
+  // Returns the remaining space in the TLAB.
+  size_t TlabSize() const;
+  // Doesn't check that there is room.
+  mirror::Object* AllocTlab(size_t bytes);
+  void SetTlab(byte* start, byte* end);
+  bool HasTlab() const;
 
+  // Remove the suspend trigger for this thread by making the suspend_trigger TLS value
+  // equal to a valid pointer (its own address).
+  // TODO: does this need to be atomic?  I don't think so.
+  void RemoveSuspendTrigger() {
+    tlsPtr_.suspend_trigger = reinterpret_cast<uintptr_t*>(&tlsPtr_.suspend_trigger);
+  }
+
+  // Trigger a suspend check by making the suspend_trigger TLS value an invalid pointer.
+  // The next time a suspend check is done, it will load through this (now null) pointer
+  // and trigger a SIGSEGV.
+  void TriggerSuspend() {
+    tlsPtr_.suspend_trigger = nullptr;
+  }
+
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
+  size_t GetThreadLocalBytesAllocated() const {
+    return tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start;
+  }
+
+  size_t GetThreadLocalObjectsAllocated() const {
+    return tlsPtr_.thread_local_objects;
+  }
+
+  // ROS alloc TLS.
+  static constexpr size_t kRosAllocNumOfSizeBrackets = 34;
+
+  void* GetRosAllocRun(size_t index) const {
+    return tlsPtr_.rosalloc_runs[index];
+  }
+
+  void SetRosAllocRun(size_t index, void* run) {
+    tlsPtr_.rosalloc_runs[index] = run;
+  }
+
+ private:
   explicit Thread(bool daemon);
   ~Thread() LOCKS_EXCLUDED(Locks::mutator_lock_,
                            Locks::thread_suspend_count_lock_);
@@ -644,7 +797,7 @@
   // Dbg::Disconnected.
   ThreadState SetStateUnsafe(ThreadState new_state) {
     ThreadState old_state = GetState();
-    state_and_flags_.as_struct.state = new_state;
+    tls32_.state_and_flags.as_struct.state = new_state;
     return old_state;
   }
 
@@ -678,22 +831,6 @@
   void SetUpAlternateSignalStack();
   void TearDownAlternateSignalStack();
 
-  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
-
-  static void ThreadExitCallback(void* arg);
-
-  // Has Thread::Startup been called?
-  static bool is_started_;
-
-  // TLS key used to retrieve the Thread*.
-  static pthread_key_t pthread_key_self_;
-
-  // Used to notify threads that they should attempt to resume, they will suspend again if
-  // their suspend count is > 0.
-  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // --- Frequently accessed fields first for short offsets ---
-
   // 32 bits of atomically changed state and flags. Keeping as 32 bits allows and atomic CAS to
   // change from being Suspended to Runnable without a suspend request occurring.
   union PACKED(4) StateAndFlags {
@@ -715,206 +852,225 @@
     // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47409
     DISALLOW_COPY_AND_ASSIGN(StateAndFlags);
   };
-  union StateAndFlags state_and_flags_;
-  COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
-                 sizeof_state_and_flags_and_int32_are_different);
 
-  // A non-zero value is used to tell the current thread to enter a safe point
-  // at the next poll.
-  int suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // The biased card table, see CardTable for details
-  byte* card_table_;
-
-  // The pending exception or NULL.
-  mirror::Throwable* exception_;
-
-  // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
-  // We leave extra space so there's room for the code that throws StackOverflowError.
-  byte* stack_end_;
-
-  // The top of the managed stack often manipulated directly by compiler generated code.
-  ManagedStack managed_stack_;
-
-  // Every thread may have an associated JNI environment
-  JNIEnvExt* jni_env_;
-
-  // Initialized to "this". On certain architectures (such as x86) reading
-  // off of Thread::Current is easy but getting the address of Thread::Current
-  // is hard. This field can be read off of Thread::Current to give the address.
-  Thread* self_;
-
-  // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
-  // start up, until the thread is registered and the local opeer_ is used.
-  mirror::Object* opeer_;
-  jobject jpeer_;
-
-  // The "lowest addressable byte" of the stack
-  byte* stack_begin_;
-
-  // Size of the stack
-  size_t stack_size_;
-
-  // Thin lock thread id. This is a small integer used by the thin lock implementation.
-  // This is not to be confused with the native thread's tid, nor is it the value returned
-  // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
-  // important difference between this id and the ids visible to managed code is that these
-  // ones get reused (to ensure that they fit in the number of bits available).
-  uint32_t thin_lock_thread_id_;
-
-  // Pointer to previous stack trace captured by sampling profiler.
-  std::vector<mirror::ArtMethod*>* stack_trace_sample_;
-
-  // The clock base used for tracing.
-  uint64_t trace_clock_base_;
-
-  // System thread id.
-  pid_t tid_;
-
-  ThrowLocation throw_location_;
-
-  // Guards the 'interrupted_' and 'wait_monitor_' members.
-  mutable Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // Condition variable waited upon during a wait.
-  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
-  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
-  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
-  // Thread "interrupted" status; stays raised until queried or thrown.
-  bool32_t interrupted_ GUARDED_BY(wait_mutex_);
-  // The next thread in the wait set this thread is part of or NULL if not waiting.
-  Thread* wait_next_;
-
-
-  // If we're blocked in MonitorEnter, this is the object we're trying to lock.
-  mirror::Object* monitor_enter_object_;
-
-  // Top of linked list of stack indirect reference tables or NULL for none
-  StackIndirectReferenceTable* top_sirt_;
-
-  Runtime* runtime_;
-
-  RuntimeStats stats_;
-
-  // Needed to get the right ClassLoader in JNI_OnLoad, but also
-  // useful for testing.
-  mirror::ClassLoader* class_loader_override_;
-
-  // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
-  Context* long_jump_context_;
-
-  // A boolean telling us whether we're recursively throwing OOME.
-  bool32_t throwing_OutOfMemoryError_;
-
-  // How much of 'suspend_count_' is by request of the debugger, used to set things right
-  // when the debugger detaches. Must be <= suspend_count_.
-  int debug_suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // JDWP invoke-during-breakpoint support.
-  DebugInvokeReq* debug_invoke_req_;
-
-  // JDWP single-stepping support.
-  SingleStepControl* single_step_control_;
-
-  // Shadow frame that is used temporarily during the deoptimization of a method.
-  ShadowFrame* deoptimization_shadow_frame_;
-  JValue deoptimization_return_value_;
-
-  // Additional stack used by method instrumentation to store method and return pc values.
-  // Stored as a pointer since std::deque is not PACKED.
-  std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack_;
-
-  // A cached copy of the java.lang.Thread's name.
-  std::string* name_;
-
-  // Is the thread a daemon?
-  const bool32_t daemon_;
-
-  // A cached pthread_t for the pthread underlying this Thread*.
-  pthread_t pthread_self_;
-
-  // Support for Mutex lock hierarchy bug detection.
-  BaseMutex* held_mutexes_[kLockLevelCount];
-
-  // A positive value implies we're in a region where thread suspension isn't expected.
-  uint32_t no_thread_suspension_;
-
-  // If no_thread_suspension_ is > 0, what is causing that assertion.
-  const char* last_no_thread_suspension_cause_;
+  static void ThreadExitCallback(void* arg);
 
   // Maximum number of checkpoint functions.
   static constexpr uint32_t kMaxCheckpoints = 3;
 
-  // Pending checkpoint function or NULL if non-pending. Installation guarding by
-  // Locks::thread_suspend_count_lock_.
-  Closure* checkpoint_functions_[kMaxCheckpoints];
+  // Has Thread::Startup been called?
+  static bool is_started_;
 
- public:
-  // Entrypoint function pointers
-  // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
-  InterpreterEntryPoints interpreter_entrypoints_;
-  JniEntryPoints jni_entrypoints_;
-  PortableEntryPoints portable_entrypoints_;
-  QuickEntryPoints quick_entrypoints_;
+  // TLS key used to retrieve the Thread*.
+  static pthread_key_t pthread_key_self_;
 
-  // Setting this to 0 will trigger a SEGV and thus a suspend check.  It is normally
-  // set to the address of itself.
-  uintptr_t* suspend_trigger_;
+  // Used to notify threads that they should attempt to resume, they will suspend again if
+  // their suspend count is > 0.
+  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // How many times has our pthread key's destructor been called?
-  uint32_t thread_exit_check_count_;
+  /***********************************************************************************************/
+  // Thread local storage. Fields are grouped by size so that offsets can be translated between
+  // 32-bit and 64-bit layouts despite pointer size differences. To encourage shorter encodings,
+  // more frequently used values appear first where possible.
+  /***********************************************************************************************/
 
-  // Thread-local allocation pointer.
-  byte* thread_local_start_;
-  byte* thread_local_pos_;
-  byte* thread_local_end_;
-  size_t thread_local_objects_;
-  // Returns the remaining space in the TLAB.
-  size_t TlabSize() const;
-  // Doesn't check that there is room.
-  mirror::Object* AllocTlab(size_t bytes);
-  void SetTlab(byte* start, byte* end);
-  bool HasTlab() const;
+  struct PACKED(4)  tls_32bit_sized_values {
+    // We have no control over the size of 'bool', but want our boolean fields
+    // to be 4-byte quantities.
+    typedef uint32_t bool32_t;
 
-  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
-  // equal to a valid pointer.
-  // TODO: does this need to atomic?  I don't think so.
-  void RemoveSuspendTrigger() {
-    suspend_trigger_ = reinterpret_cast<uintptr_t*>(&suspend_trigger_);
-  }
+    explicit tls_32bit_sized_values(bool is_daemon) :
+      suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
+      daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
+      thread_exit_check_count(0) {
+    }
 
-  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
-  // The next time a suspend check is done, it will load from the value at this address
-  // and trigger a SIGSEGV.
-  void TriggerSuspend() {
-    suspend_trigger_ = nullptr;
-  }
+    union StateAndFlags state_and_flags;
+    COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
+                   sizeof_state_and_flags_and_int32_are_different);
 
-  // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
-  // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
-  // RosAlloc class due to a header file circular dependency issue.
-  // To compensate, we check that the two values match at RosAlloc
-  // initialization time.
-  static const size_t kRosAllocNumOfSizeBrackets = 34;
-  void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+    // A non-zero value is used to tell the current thread to enter a safe point
+    // at the next poll.
+    int suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Thread-local allocation stack data/routines.
-  mirror::Object** thread_local_alloc_stack_top_;
-  mirror::Object** thread_local_alloc_stack_end_;
+    // How much of 'suspend_count' is by request of the debugger, used to set things right
+    // when the debugger detaches. Must be <= suspend_count.
+    int debug_suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Push an object onto the allocation stack.
-  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+    // Thin lock thread id. This is a small integer used by the thin lock implementation.
+    // This is not to be confused with the native thread's tid, nor is it the value returned
+    // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
+    // important difference between this id and the ids visible to managed code is that these
+    // ones get reused (to ensure that they fit in the number of bits available).
+    uint32_t thin_lock_thread_id;
 
-  // Set the thread local allocation pointers to the given pointers.
-  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+    // System thread id.
+    uint32_t tid;
 
-  // Resets the thread local allocation pointers.
-  void RevokeThreadLocalAllocationStack();
+    // Is the thread a daemon?
+    const bool32_t daemon;
 
- private:
+    // A boolean telling us whether we're recursively throwing OOME.
+    bool32_t throwing_OutOfMemoryError;
+
+    // A positive value implies we're in a region where thread suspension isn't expected.
+    uint32_t no_thread_suspension;
+
+    // How many times has our pthread key's destructor been called?
+    uint32_t thread_exit_check_count;
+  } tls32_;
+
+  struct PACKED(8) tls_64bit_sized_values {
+    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    }
+
+    // The clock base used for tracing.
+    uint64_t trace_clock_base;
+
+    // Return value used by deoptimization.
+    JValue deoptimization_return_value;
+
+    RuntimeStats stats;
+  } tls64_;
+
+  struct PACKED(4) tls_ptr_sized_values {
+      tls_ptr_sized_values() : card_table(nullptr), exception(nullptr), stack_end(nullptr),
+      managed_stack(), suspend_trigger(nullptr), jni_env(nullptr), self(nullptr), opeer(nullptr),
+      jpeer(nullptr), stack_begin(nullptr), stack_size(0), throw_location(),
+      stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
+      top_sirt(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
+      instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
+      deoptimization_shadow_frame(nullptr), name(nullptr), pthread_self(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
+      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr) {
+    }
+
+    // The biased card table, see CardTable for details.
+    byte* card_table;
+
+    // The pending exception or NULL.
+    mirror::Throwable* exception;
+
+    // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
+    // We leave extra space so there's room for the code that throws StackOverflowError.
+    byte* stack_end;
+
+    // The top of the managed stack often manipulated directly by compiler generated code.
+    ManagedStack managed_stack;
+
+    // In certain modes, setting this to 0 will trigger a SEGV and thus a suspend check.  It is
+    // normally set to the address of itself.
+    uintptr_t* suspend_trigger;
+
+    // Every thread may have an associated JNI environment
+    JNIEnvExt* jni_env;
+
+    // Initialized to "this". On certain architectures (such as x86) reading off of Thread::Current
+    // is easy but getting the address of Thread::Current is hard. This field can be read off of
+    // Thread::Current to give the address.
+    Thread* self;
+
+    // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
+    // start up, until the thread is registered and the local opeer is used.
+    mirror::Object* opeer;
+    jobject jpeer;
+
+    // The "lowest addressable byte" of the stack.
+    byte* stack_begin;
+
+    // Size of the stack.
+    size_t stack_size;
+
+    // The location the current exception was thrown from.
+    ThrowLocation throw_location;
+
+    // Pointer to previous stack trace captured by sampling profiler.
+    std::vector<mirror::ArtMethod*>* stack_trace_sample;
+
+    // The next thread in the wait set this thread is part of or NULL if not waiting.
+    Thread* wait_next;
+
+    // If we're blocked in MonitorEnter, this is the object we're trying to lock.
+    mirror::Object* monitor_enter_object;
+
+    // Top of linked list of stack indirect reference tables or NULL for none.
+    StackIndirectReferenceTable* top_sirt;
+
+    // Needed to get the right ClassLoader in JNI_OnLoad, but also
+    // useful for testing.
+    mirror::ClassLoader* class_loader_override;
+
+    // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
+    Context* long_jump_context;
+
+    // Additional stack used by method instrumentation to store method and return pc values.
+    // Stored as a pointer since std::deque is not PACKED.
+    std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack;
+
+    // JDWP invoke-during-breakpoint support.
+    DebugInvokeReq* debug_invoke_req;
+
+    // JDWP single-stepping support.
+    SingleStepControl* single_step_control;
+
+    // Shadow frame stack that is used temporarily during the deoptimization of a method.
+    ShadowFrame* deoptimization_shadow_frame;
+
+    // A cached copy of the java.lang.Thread's name.
+    std::string* name;
+
+    // A cached pthread_t for the pthread underlying this Thread*.
+    pthread_t pthread_self;
+
+    // Support for Mutex lock hierarchy bug detection.
+    BaseMutex* held_mutexes[kLockLevelCount];
+
+    // If no_thread_suspension is > 0, what is causing that assertion.
+    const char* last_no_thread_suspension_cause;
+
+    // Pending checkpoint functions, or NULL where none is pending. Installation is guarded by
+    // Locks::thread_suspend_count_lock_.
+    Closure* checkpoint_functions[kMaxCheckpoints];
+
+    // Entrypoint function pointers.
+    // TODO: move this to more of a global offset table model to avoid per-thread duplication.
+    InterpreterEntryPoints interpreter_entrypoints;
+    JniEntryPoints jni_entrypoints;
+    PortableEntryPoints portable_entrypoints;
+    QuickEntryPoints quick_entrypoints;
+
+    // Thread-local allocation pointer.
+    byte* thread_local_start;
+    byte* thread_local_pos;
+    byte* thread_local_end;
+    size_t thread_local_objects;
+
+    // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
+    // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
+    // RosAlloc class due to a header file circular dependency issue.
+    // To compensate, we check that the two values match at RosAlloc
+    // initialization time.
+    void* rosalloc_runs[kRosAllocNumOfSizeBrackets];
+
+    // Thread-local allocation stack data/routines.
+    mirror::Object** thread_local_alloc_stack_top;
+    mirror::Object** thread_local_alloc_stack_end;
+  } tlsPtr_;
+
+  // Guards the 'interrupted_' and 'wait_monitor_' members.
+  Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // Condition variable waited upon during a wait.
+  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
+  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
+  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
+
+  // Thread "interrupted" status; stays raised until queried or thrown.
+  bool interrupted_ GUARDED_BY(wait_mutex_);
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
-  friend class Monitor;
-  friend class MonitorInfo;
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index ec610e1..0933780 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -63,7 +63,7 @@
 
 bool ThreadList::Contains(pid_t tid) {
   for (const auto& thread : list_) {
-    if (thread->tid_ == tid) {
+    if (thread->GetTid() == tid) {
       return true;
     }
   }
@@ -77,8 +77,8 @@
 void ThreadList::DumpNativeStacks(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
-    os << "DUMPING THREAD " << thread->tid_ << "\n";
-    DumpNativeStack(os, thread->tid_, "\t", true);
+    os << "DUMPING THREAD " << thread->GetTid() << "\n";
+    DumpNativeStack(os, thread->GetTid(), "\t", true);
     os << "\n";
   }
 }
@@ -607,7 +607,7 @@
     // though.
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     self->ModifySuspendCount(self, +1, true);
-    CHECK_GT(self->suspend_count_, 0);
+    CHECK_GT(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-suspending (debugger)";
@@ -631,18 +631,18 @@
 
   {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
-    while (self->suspend_count_ != 0) {
+    while (self->GetSuspendCount() != 0) {
       Thread::resume_cond_->Wait(self);
-      if (self->suspend_count_ != 0) {
+      if (self->GetSuspendCount() != 0) {
         // The condition was signaled but we're still suspended. This
         // can happen if the debugger lets go while a SIGQUIT thread
         // dump event is pending (assuming SignalCatcher was resumed for
         // just long enough to try to grab the thread-suspend lock).
         LOG(DEBUG) << *self << " still suspended after undo "
-                   << "(suspend count=" << self->suspend_count_ << ")";
+                   << "(suspend count=" << self->GetSuspendCount() << ")";
       }
     }
-    CHECK_EQ(self->suspend_count_, 0);
+    CHECK_EQ(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-reviving (debugger)";
@@ -661,10 +661,10 @@
     debug_suspend_all_count_ = 0;
     // Update running threads.
     for (const auto& thread : list_) {
-      if (thread == self || thread->debug_suspend_count_ == 0) {
+      if (thread == self || thread->GetDebugSuspendCount() == 0) {
         continue;
       }
-      thread->ModifySuspendCount(self, -thread->debug_suspend_count_, true);
+      thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), true);
     }
   }
 
@@ -749,11 +749,15 @@
   // SuspendAll requests.
   MutexLock mu(self, *Locks::thread_list_lock_);
   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
-  self->suspend_count_ = suspend_all_count_;
-  self->debug_suspend_count_ = debug_suspend_all_count_;
-  if (self->suspend_count_ > 0) {
-    self->AtomicSetFlag(kSuspendRequest);
-    self->TriggerSuspend();
+  CHECK_GE(suspend_all_count_, debug_suspend_all_count_);
+  if (debug_suspend_all_count_ > 0) {
+    self->ModifySuspendCount(self, debug_suspend_all_count_, true);
+  }
+  if (suspend_all_count_ > 0) {
+    int delta = suspend_all_count_ - debug_suspend_all_count_;
+    if (delta > 0) {
+      self->ModifySuspendCount(self, delta, false);
+    }
   }
   CHECK(!Contains(self));
   list_.push_back(self);
@@ -768,7 +772,7 @@
   // suspend and so on, must happen at this point, and not in ~Thread.
   self->Destroy();
 
-  uint32_t thin_lock_id = self->thin_lock_thread_id_;
+  uint32_t thin_lock_id = self->GetThreadId();
   while (self != nullptr) {
     // Remove and delete the Thread* while holding the thread_list_lock_ and
     // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
diff --git a/runtime/throw_location.h b/runtime/throw_location.h
index c171b07..b36eb67 100644
--- a/runtime/throw_location.h
+++ b/runtime/throw_location.h
@@ -41,7 +41,16 @@
                 uint32_t throw_dex_pc) :
       this_object_(throw_this_object),
       method_(throw_method),
-      dex_pc_(throw_dex_pc) {}
+      dex_pc_(throw_dex_pc)
+#ifdef __LP64__
+      , pad_(0)
+#endif
+
+  {
+#ifdef __LP64__
+    UNUSED(pad_);
+#endif
+  }
 
   mirror::Object* GetThis() const {
     return this_object_;
@@ -72,6 +81,10 @@
   mirror::ArtMethod* method_;
   // The instruction within the throwing method.
   uint32_t dex_pc_;
+  // Explicit padding so that the size is a multiple of 8 bytes on 64bit.
+#ifdef __LP64__
+  uint32_t pad_;
+#endif
 };
 
 }  // namespace art