Merge "Profile: renamed kThresholdPercent and make it configurable"
diff --git a/Android.mk b/Android.mk
index 6c388e5..6139cb9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -191,7 +191,7 @@
 
 endef
 
-$(foreach test, $(wildcard art/test/[0-9]*), $(eval $(call declare-test-art-host-run-test,$(notdir $(test)))))
+$(foreach test, $(TEST_ART_RUN_TESTS), $(eval $(call declare-test-art-host-run-test,$(test))))
 
 .PHONY: test-art-host-run-test-default
 test-art-host-run-test-default: $(TEST_ART_HOST_RUN_TEST_DEFAULT_TARGETS)
@@ -229,19 +229,32 @@
 test-art-target-oat: $(ART_TEST_TARGET_OAT_TARGETS)
 	@echo test-art-target-oat PASSED
 
-define declare-test-art-target-run-test
-.PHONY: test-art-target-run-test-$(1)
-test-art-target-run-test-$(1): test-art-target-sync $(DX) $(HOST_OUT_EXECUTABLES)/jasmin
-	DX=$(abspath $(DX)) JASMIN=$(abspath $(HOST_OUT_EXECUTABLES)/jasmin) art/test/run-test $(DALVIKVM_FLAGS) $(1)
-	@echo test-art-target-run-test-$(1) PASSED
+define declare-test-art-target-run-test-impl
+.PHONY: test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX)
+test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX): test-art-target-sync $(DX) $(HOST_OUT_EXECUTABLES)/jasmin
+	DX=$(abspath $(DX)) JASMIN=$(abspath $(HOST_OUT_EXECUTABLES)/jasmin) art/test/run-test $(DALVIKVM_FLAGS) $(1) $(3)
+	@echo test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX) PASSED
+endef
 
-TEST_ART_TARGET_RUN_TEST_TARGETS += test-art-target-run-test-$(1)
+define declare-test-art-target-run-test
+
+  ifdef TARGET_2ND_ARCH
+    $(call declare-test-art-target-run-test-impl,$(1),2ND_,)
+
+    ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+      # Alias the unsuffixed target name to the primary-arch (suffixed) target
+test-art-target-run-test-$(1): test-art-target-run-test-$(1)$(ART_PHONY_TEST_TARGET_SUFFIX)
+    endif
+  endif
+  $(call declare-test-art-target-run-test-impl,$(1),,--$(ART_TARGET_BINARY_SUFFIX))
+
+  TEST_ART_TARGET_RUN_TEST_TARGETS += test-art-target-run-test-$(1)
 
 test-art-run-test-$(1): test-art-host-run-test-$(1) test-art-target-run-test-$(1)
 
 endef
 
-$(foreach test, $(wildcard art/test/[0-9]*), $(eval $(call declare-test-art-target-run-test,$(notdir $(test)))))
+$(foreach test, $(TEST_ART_RUN_TESTS), $(eval $(call declare-test-art-target-run-test,$(test))))
 
 .PHONY: test-art-target-run-test
 test-art-target-run-test: $(TEST_ART_TARGET_RUN_TEST_TARGETS)
@@ -271,9 +284,9 @@
 .PHONY: oat-target-$(1)
 oat-target-$(1): $$(OUT_OAT_FILE)
 
-$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OAT_DEPENDENCY)
+$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
 	@mkdir -p $$(dir $$@)
-	$(DEX2OAT) --runtime-arg -Xms64m --runtime-arg -Xmx64m --boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) --dex-location=/$(1) --oat-file=$$@ --instruction-set=$(TARGET_ARCH) --instruction-set-features=$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$(PRODUCT_OUT)/system
+	$(DEX2OATD) --runtime-arg -Xms64m --runtime-arg -Xmx64m --boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) --dex-location=/$(1) --oat-file=$$@ --instruction-set=$(TARGET_ARCH) --instruction-set-features=$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$(PRODUCT_OUT)/system
 
 endif
 
@@ -422,6 +435,21 @@
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
 	adb shell start
 
+# Switch the attached device to libart.so with --compiler-filter=verify-none set in both
+# dalvik.vm.dex2oat-flags and dalvik.vm.image-dex2oat-flags, after clearing the dalvik-cache.
+# rm -f keeps the recipe going when the cache holds no file matching one of the globs.
+.PHONY: use-art-verify-none
+use-art-verify-none:
+	adb root && sleep 3
+	adb shell stop
+	adb shell rm -f $(ART_DALVIK_CACHE_DIR)/*.dex
+	adb shell rm -f $(ART_DALVIK_CACHE_DIR)/*.oat
+	adb shell rm -f $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=verify-none"
+	adb shell setprop dalvik.vm.image-dex2oat-flags "--compiler-filter=verify-none"
+	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
+	adb shell start
+
 ########################################################################
 
 endif # !art_dont_bother
diff --git a/build/Android.common.mk b/build/Android.common.mk
index d80d039..b9a297b 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -127,12 +127,48 @@
 ART_DALVIK_CACHE_DIR := /data/dalvik-cache
 
 # directory used for gtests on device
-ART_NATIVETEST_DIR := /data/nativetest/art
-ART_NATIVETEST_OUT := $(TARGET_OUT_DATA_NATIVE_TESTS)/art
+ART_BASE_NATIVETEST_DIR := /data/nativetest/art
+ART_BASE_NATIVETEST_OUT := $(TARGET_OUT_DATA_NATIVE_TESTS)/art
 
 # directory used for tests on device
-ART_TEST_DIR := /data/art-test
-ART_TEST_OUT := $(TARGET_OUT_DATA)/art-test
+ART_BASE_TEST_DIR := /data/art-test
+ART_BASE_TEST_OUT := $(TARGET_OUT_DATA)/art-test
+
+# Per-arch test naming: primary-arch variables are unprefixed, secondary-arch ones use 2ND_.
+2ND_TARGET_ARCH := $(TARGET_2ND_ARCH)
+ART_PHONY_TEST_TARGET_SUFFIX :=
+2ND_ART_PHONY_TEST_TARGET_SUFFIX :=
+ART_TARGET_BINARY_SUFFIX :=
+2ND_ART_TARGET_BINARY_SUFFIX :=
+ifdef TARGET_2ND_ARCH
+  art_test_primary_suffix :=
+  art_test_secondary_suffix :=
+  ifneq ($(filter %64,$(TARGET_ARCH)),)
+    art_test_primary_suffix := 64
+    ART_PHONY_TEST_TARGET_SUFFIX := 64
+    2ND_ART_PHONY_TEST_TARGET_SUFFIX := 32
+    ART_TARGET_BINARY_SUFFIX := 64
+  else
+    # TODO: support a multi-target configuration whose primary arch name does not end in 64.
+    $(error Do not know what to do with this multi-target configuration!)
+  endif
+  # Primary arch: base locations with the primary suffix (64) appended.
+  ART_NATIVETEST_DIR := $(ART_BASE_NATIVETEST_DIR)$(art_test_primary_suffix)
+  ART_NATIVETEST_OUT := $(ART_BASE_NATIVETEST_OUT)$(art_test_primary_suffix)
+  ART_TEST_DIR := $(ART_BASE_TEST_DIR)$(art_test_primary_suffix)
+  ART_TEST_OUT := $(ART_BASE_TEST_OUT)$(art_test_primary_suffix)
+  # Secondary arch: 2ND_-prefixed names; the secondary suffix is left empty above, so base locations.
+  2ND_ART_NATIVETEST_DIR := $(ART_BASE_NATIVETEST_DIR)$(art_test_secondary_suffix)
+  2ND_ART_NATIVETEST_OUT := $(ART_BASE_NATIVETEST_OUT)$(art_test_secondary_suffix)
+  2ND_ART_TEST_DIR := $(ART_BASE_TEST_DIR)$(art_test_secondary_suffix)
+  2ND_ART_TEST_OUT := $(ART_BASE_TEST_OUT)$(art_test_secondary_suffix)
+else
+  ART_NATIVETEST_DIR := $(ART_BASE_NATIVETEST_DIR)
+  ART_NATIVETEST_OUT := $(ART_BASE_NATIVETEST_OUT)
+  ART_TEST_DIR := $(ART_BASE_TEST_DIR)
+  ART_TEST_OUT := $(ART_BASE_TEST_OUT)
+  # No secondary arch: no 2ND_ART_*_DIR / 2ND_ART_*_OUT variables are set here.
+endif
 
 ART_CPP_EXTENSION := .cc
 
diff --git a/build/Android.executable.mk b/build/Android.executable.mk
index 551b03c..27d687c 100644
--- a/build/Android.executable.mk
+++ b/build/Android.executable.mk
@@ -30,6 +30,7 @@
 # $(4): extra include directories
 # $(5): target or host
 # $(6): ndebug or debug
+# $(7): value for LOCAL_MULTILIB (empty means default)
 define build-art-executable
   ifneq ($(5),target)
     ifneq ($(5),host)
@@ -48,6 +49,7 @@
   art_c_includes := $(4)
   art_target_or_host := $(5)
   art_ndebug_or_debug := $(6)
+  art_multilib := $(7)
 
   include $(CLEAR_VARS)
   ifeq ($$(art_target_or_host),target)
@@ -98,8 +100,7 @@
 
   ifeq ($$(art_target_or_host),target)
     LOCAL_MODULE_TARGET_ARCH := $(ART_SUPPORTED_ARCH)
-    #HACK: force 32-bit until 64-bit dex2oat can handle 32-bit
-    LOCAL_32_BIT_ONLY := true
+    LOCAL_MULTILIB := $$(art_multilib)
   endif
 
   ifeq ($$(art_target_or_host),target)
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index da0b500..5b83056 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -113,6 +113,22 @@
   ART_TEST_CFLAGS += -DART_USE_PORTABLE_COMPILER=1
 endif
 
+# Define the phony on-device gtest rule: run the test over adb, pass if its marker file appears.
+# $(1): variable prefix selecting the architecture (empty for primary, 2ND_ for secondary)
+define build-art-test-make-target
+.PHONY: $$(art_gtest_target)$($(1)ART_PHONY_TEST_TARGET_SUFFIX)
+$$(art_gtest_target)$($(1)ART_PHONY_TEST_TARGET_SUFFIX): $($(1)ART_NATIVETEST_OUT)/$$(LOCAL_MODULE) test-art-target-sync
+	adb shell touch $($(1)ART_TEST_DIR)/$$@
+	adb shell rm $($(1)ART_TEST_DIR)/$$@
+	adb shell chmod 755 $($(1)ART_NATIVETEST_DIR)/$$(notdir $$<)
+	adb shell sh -c "$($(1)ART_NATIVETEST_DIR)/$$(notdir $$<) && touch $($(1)ART_TEST_DIR)/$$@"
+	$(hide) (adb pull $($(1)ART_TEST_DIR)/$$@ /tmp/ && echo $$@ PASSED) || (echo $$@ FAILED && exit 1)
+	$(hide) rm /tmp/$$@
+
+    ART_TARGET_GTEST_TARGETS += $$(art_gtest_target)$($(1)ART_PHONY_TEST_TARGET_SUFFIX)
+endef
+
+
 # $(1): target or host
 # $(2): file name
 # $(3): extra C includes
@@ -163,10 +179,23 @@
     LOCAL_CFLAGS_x86 := $(ART_TARGET_CFLAGS_x86)
     LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils
     LOCAL_STATIC_LIBRARIES += libgtest
-    LOCAL_MODULE_PATH := $(ART_NATIVETEST_OUT)
+    LOCAL_MODULE_PATH_32 := $(ART_BASE_NATIVETEST_OUT)
+    LOCAL_MODULE_PATH_64 := $(ART_BASE_NATIVETEST_OUT)64
+    LOCAL_MULTILIB := both
     include $(BUILD_EXECUTABLE)
-    art_gtest_exe := $$(LOCAL_MODULE_PATH)/$$(LOCAL_MODULE)
     ART_TARGET_GTEST_EXECUTABLES += $$(art_gtest_exe)
+    art_gtest_target := test-art-$$(art_target_or_host)-gtest-$$(art_gtest_name)
+
+    ifdef TARGET_2ND_ARCH
+      $(call build-art-test-make-target,2ND_)
+
+      # Bind the primary to the non-suffix rule
+      ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+$$(art_gtest_target): $$(art_gtest_target)$(ART_PHONY_TEST_TARGET_SUFFIX)
+      endif
+    endif
+    $(call build-art-test-make-target,)
+
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_CFLAGS += $(ART_HOST_CFLAGS) $(ART_HOST_DEBUG_CFLAGS)
@@ -180,34 +209,21 @@
     include $(BUILD_HOST_EXECUTABLE)
     art_gtest_exe := $(HOST_OUT_EXECUTABLES)/$$(LOCAL_MODULE)
     ART_HOST_GTEST_EXECUTABLES += $$(art_gtest_exe)
-  endif
-art_gtest_target := test-art-$$(art_target_or_host)-gtest-$$(art_gtest_name)
-ifeq ($$(art_target_or_host),target)
-.PHONY: $$(art_gtest_target)
-$$(art_gtest_target): $$(art_gtest_exe) test-art-target-sync
-	adb shell touch $(ART_TEST_DIR)/$$@
-	adb shell rm $(ART_TEST_DIR)/$$@
-	adb shell chmod 755 $(ART_NATIVETEST_DIR)/$$(notdir $$<)
-	adb shell sh -c "$(ART_NATIVETEST_DIR)/$$(notdir $$<) && touch $(ART_TEST_DIR)/$$@"
-	$(hide) (adb pull $(ART_TEST_DIR)/$$@ /tmp/ && echo $$@ PASSED) || (echo $$@ FAILED && exit 1)
-	$(hide) rm /tmp/$$@
-
-ART_TARGET_GTEST_TARGETS += $$(art_gtest_target)
-else
+    art_gtest_target := test-art-$$(art_target_or_host)-gtest-$$(art_gtest_name)
 .PHONY: $$(art_gtest_target)
 $$(art_gtest_target): $$(art_gtest_exe) test-art-host-dependencies
 	$$<
 	@echo $$@ PASSED
 
-ART_HOST_GTEST_TARGETS += $$(art_gtest_target)
+    ART_HOST_GTEST_TARGETS += $$(art_gtest_target)
 
 .PHONY: valgrind-$$(art_gtest_target)
 valgrind-$$(art_gtest_target): $$(art_gtest_exe) test-art-host-dependencies
 	valgrind --leak-check=full --error-exitcode=1 $$<
 	@echo $$@ PASSED
 
-ART_HOST_VALGRIND_GTEST_TARGETS += valgrind-$$(art_gtest_target)
-endif
+    ART_HOST_VALGRIND_GTEST_TARGETS += valgrind-$$(art_gtest_target)
+  endif
 endef
 
 ifeq ($(ART_BUILD_TARGET),true)
diff --git a/build/Android.libarttest.mk b/build/Android.libarttest.mk
index d807a9c..14d16ac 100644
--- a/build/Android.libarttest.mk
+++ b/build/Android.libarttest.mk
@@ -51,7 +51,9 @@
     LOCAL_CFLAGS_x86 := $(ART_TARGET_CFLAGS_x86)
     LOCAL_SHARED_LIBRARIES += libdl libcutils
     LOCAL_STATIC_LIBRARIES := libgtest
-    LOCAL_MODULE_PATH := $(ART_TEST_OUT)
+    LOCAL_MULTILIB := both
+    LOCAL_MODULE_PATH_32 := $(ART_BASE_TEST_OUT)
+    LOCAL_MODULE_PATH_64 := $(ART_BASE_TEST_OUT)64
     LOCAL_MODULE_TARGET_ARCH := $(ART_SUPPORTED_ARCH)
     include $(BUILD_SHARED_LIBRARY)
   else # host
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index def585b..cb45a85 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -31,34 +31,50 @@
 
 HOST_CORE_OAT := $(HOST_OUT_JAVA_LIBRARIES)/core.oat
 TARGET_CORE_OAT := $(ART_TEST_DIR)/core.oat
+2ND_TARGET_CORE_OAT := $(2ND_ART_TEST_DIR)/core.oat
 
 HOST_CORE_OAT_OUT := $(HOST_OUT_JAVA_LIBRARIES)/core.oat
 TARGET_CORE_OAT_OUT := $(ART_TEST_OUT)/core.oat
+2ND_TARGET_CORE_OAT_OUT := $(2ND_ART_TEST_OUT)/core.oat
 
 HOST_CORE_IMG_OUT := $(HOST_OUT_JAVA_LIBRARIES)/core.art
 TARGET_CORE_IMG_OUT := $(ART_TEST_OUT)/core.art
+2ND_TARGET_CORE_IMG_OUT := $(2ND_ART_TEST_OUT)/core.art
 
 TARGET_INSTRUCTION_SET_FEATURES := $(DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES)
 
-$(HOST_CORE_IMG_OUT): $(HOST_CORE_DEX_FILES) $(DEX2OAT_DEPENDENCY)
+# Use dex2oat debug version for better error reporting
+$(HOST_CORE_IMG_OUT): $(HOST_CORE_DEX_FILES) $(DEX2OATD_DEPENDENCY)
 	@echo "host dex2oat: $@ ($?)"
 	@mkdir -p $(dir $@)
-	$(hide) $(DEX2OAT) --runtime-arg -Xms16m --runtime-arg -Xmx16m --image-classes=$(PRELOADED_CLASSES) $(addprefix \
+	$(hide) $(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m --image-classes=$(PRELOADED_CLASSES) $(addprefix \
 		--dex-file=,$(HOST_CORE_DEX_FILES)) $(addprefix --dex-location=,$(HOST_CORE_DEX_LOCATIONS)) --oat-file=$(HOST_CORE_OAT_OUT) \
 		--oat-location=$(HOST_CORE_OAT) --image=$(HOST_CORE_IMG_OUT) --base=$(LIBART_IMG_HOST_BASE_ADDRESS) \
 		--instruction-set=$(ART_HOST_ARCH) --host --android-root=$(HOST_OUT)
 
-$(TARGET_CORE_IMG_OUT): $(TARGET_CORE_DEX_FILES) $(DEX2OAT_DEPENDENCY)
-	@echo "target dex2oat: $@ ($?)"
-	@mkdir -p $(dir $@)
-	$(hide) $(DEX2OAT) --runtime-arg -Xms16m --runtime-arg -Xmx16m --image-classes=$(PRELOADED_CLASSES) $(addprefix \
-		--dex-file=,$(TARGET_CORE_DEX_FILES)) $(addprefix --dex-location=,$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$(TARGET_CORE_OAT_OUT) \
-		--oat-location=$(TARGET_CORE_OAT) --image=$(TARGET_CORE_IMG_OUT) --base=$(LIBART_IMG_TARGET_BASE_ADDRESS) \
-		--instruction-set=$(TARGET_ARCH) --instruction-set-features=$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$(PRODUCT_OUT)/system
-
 $(HOST_CORE_OAT_OUT): $(HOST_CORE_IMG_OUT)
 
-$(TARGET_CORE_OAT_OUT): $(TARGET_CORE_IMG_OUT)
+# Define the rules that build the target core image and oat file for one architecture.
+# $(1): variable prefix selecting the architecture (empty for primary, 2ND_ for secondary)
+# The recipe compiles the unprefixed TARGET_CORE_DEX_FILES for every prefix, so that same
+# list is the prerequisite; a prefixed list would leave the image blind to dex changes.
+define create-oat-target-targets
+$$($(1)TARGET_CORE_IMG_OUT): $$(TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
+	@echo "target dex2oat: $$@ ($$?)"
+	@mkdir -p $$(dir $$@)
+	$$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m --image-classes=$$(PRELOADED_CLASSES) $$(addprefix \
+		--dex-file=,$$(TARGET_CORE_DEX_FILES)) $$(addprefix --dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \
+		--oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) --base=$$(LIBART_IMG_TARGET_BASE_ADDRESS) \
+		--instruction-set=$$($(1)TARGET_ARCH) --instruction-set-features=$$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$$(PRODUCT_OUT)/system
+
+$$($(1)TARGET_CORE_OAT_OUT): $$($(1)TARGET_CORE_IMG_OUT)
+endef
+
+ifdef TARGET_2ND_ARCH
+$(eval $(call create-oat-target-targets,2ND_))
+endif
+$(eval $(call create-oat-target-targets,))
+
 
 ifeq ($(ART_BUILD_HOST),true)
 include $(CLEAR_VARS)
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 6aa85d4..9a21da0 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -291,7 +291,7 @@
 
       // Take the default set of instruction features from the build.
       InstructionSetFeatures instruction_set_features =
-          ParseFeatureList(STRINGIFY(ART_DEFAULT_INSTRUCTION_SET_FEATURES));
+          ParseFeatureList(Runtime::GetDefaultInstructionSetFeatures());
 
 #if defined(__arm__)
       instruction_set = kThumb2;
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 964a222..6c8c85d 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -401,7 +401,11 @@
   kRegUseSP,
   kSetsCCodes,
   kUsesCCodes,
-  kUseFpStack
+  kUseFpStack,
+  kUseHi,
+  kUseLo,
+  kDefHi,
+  kDefLo
 };
 
 enum SelectInstructionKind {
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index 3f122de..cc616f6 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -145,9 +145,7 @@
     return NULL;
   }
 
-  const CompilerOptions& compiler_options = driver.GetCompilerOptions();
-  CompilerOptions::CompilerFilter compiler_filter = compiler_options.GetCompilerFilter();
-  if (compiler_filter == CompilerOptions::kInterpretOnly) {
+  if (!driver.GetCompilerOptions().IsCompilationEnabled()) {
     return nullptr;
   }
 
@@ -230,10 +228,8 @@
                               class_loader, dex_file);
 
   cu.NewTimingSplit("MIROpt:CheckFilters");
-  if (compiler_filter != CompilerOptions::kInterpretOnly) {
-    if (cu.mir_graph->SkipCompilation()) {
-      return NULL;
-    }
+  if (cu.mir_graph->SkipCompilation()) {
+    return NULL;
   }
 
   /* Create the pass driver and launch it */
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index b96c40d..200795e 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -1013,7 +1013,7 @@
     return true;
   }
 
-  if (compiler_filter == CompilerOptions::kInterpretOnly || compiler_filter == CompilerOptions::kProfiled) {
+  if (!compiler_options.IsCompilationEnabled() || compiler_filter == CompilerOptions::kProfiled) {
     return true;
   }
 
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index 7955d6c..1c35018 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -408,6 +408,19 @@
                  kFmtBitBlt, 2, 0, kFmtBitBlt, 5, 3, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_UNARY_OP | REG_USE01 | SETS_CCODES,
                  "tst", "!0C, !1C", 2, kFixupNone),
+    /*
+     * Note: The encoding map entries for vldrd and vldrs include REG_DEF_LR, even though
+     * these instructions don't define lr.  The reason is that these instructions
+     * are used for loading values from the literal pool, and the displacement may be found
+     * to be insufficient at assembly time.  In that case, we need to materialize a new base
+     * register - and will use lr as the temp register.  This works because lr is used as
+     * a temp register in very limited situations, and never in conjunction with a floating
+     * point constant load.  However, it is possible that during instruction scheduling,
+     * another use of lr could be moved across a vldrd/vldrs.  By setting REG_DEF_LR, we
+     * prevent that from happening.  Note that we set REG_DEF_LR on all vldrd/vldrs - even those
+     * not used in a pc-relative case.  It is really only needed on the pc-relative loads, but
+     * the case we're handling is rare enough that it seemed not worth the trouble to distinguish.
+     */
     ENCODING_MAP(kThumb2Vldrs,       0xed900a00,
                  kFmtSfp, 22, 12, kFmtBitBlt, 19, 16, kFmtBitBlt, 7, 0,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1 | IS_LOAD |
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index d6724f1..8c9f8ea 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -164,7 +164,7 @@
   // Making a call - use explicit registers
   FlushAllRegs();   /* Everything to home location */
   LoadValueDirectFixed(rl_src, rs_r0);
-  LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData).Int32Value(),
+  LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData).Int32Value(),
                rs_rARM_LR);
   // Materialize a pointer to the fill data image
   NewLIR3(kThumb2Adr, r1, 0, WrapPointer(tab_rec));
@@ -192,7 +192,7 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_r0, 0, NULL);
       }
     }
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_r1, 0, NULL);
@@ -207,7 +207,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artLockObjectFromCode(obj);
-    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pLockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pLockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -218,7 +218,7 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_r0, opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     OpRegImm(kOpCmp, rs_r1, 0);
@@ -227,7 +227,7 @@
     OpRegImm(kOpCmp, rs_r1, 0);
     OpIT(kCondNe, "T");
     // Go expensive route - artLockObjectFromCode(self, obj);
-    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pLockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pLockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -245,7 +245,7 @@
   LoadValueDirectFixed(rl_src, rs_r0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   LIR* null_check_branch = nullptr;
-  LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+  LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
   constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
   if (kArchVariantHasGoodBranchPredictor) {
     if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
@@ -270,7 +270,7 @@
     }
     // TODO: move to a slow path.
     // Go expensive route - artUnlockObjectFromCode(obj);
-    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pUnlockObject).Int32Value(), rs_rARM_LR);
+    LoadWordDisp(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(), rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx, rs_rARM_LR);
     MarkSafepointPC(call_inst);
@@ -283,14 +283,14 @@
     GenNullCheck(rs_r0, opt_flags);
     LoadWordDisp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
     MarkPossibleNullPointerException(opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset().Int32Value(), rs_r2);
+    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     LoadConstantNoClobber(rs_r3, 0);
     // Is lock unheld on lock or held by us (==thread_id) on unlock?
     OpRegReg(kOpCmp, rs_r1, rs_r2);
     OpIT(kCondEq, "EE");
     StoreWordDisp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
     // Go expensive route - UnlockObjectFromCode(obj);
-    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(pUnlockObject).Int32Value(),
+    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
                        rs_rARM_LR);
     ClobberCallerSave();
     LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
@@ -300,7 +300,7 @@
 }
 
 void ArmMir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
   LoadWordDisp(rs_rARM_SELF, ex_offset, rl_result.reg);
@@ -317,7 +317,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  LoadWordDisp(rs_rARM_SELF, Thread::CardTableOffset().Int32Value(), reg_card_base);
+  LoadWordDisp(rs_rARM_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -350,7 +350,7 @@
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       /* Load stack limit */
-      LoadWordDisp(rs_rARM_SELF, Thread::StackEndOffset().Int32Value(), rs_r12);
+      LoadWordDisp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
     }
   }
   /* Spill core callee saves */
@@ -365,8 +365,8 @@
     NewLIR1(kThumb2VPushCS, num_fp_spills_);
   }
 
-  // TODO: 64 bit will be different code.
-  const int frame_size_without_spills = frame_size_ - spill_count * 4;
+  const int spill_size = spill_count * 4;
+  const int frame_size_without_spills = frame_size_ - spill_size;
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       class StackOverflowSlowPath : public LIRSlowPath {
@@ -384,7 +384,7 @@
           }
           m2l_->OpRegImm(kOpAdd, rs_rARM_SP, sp_displace_);
           m2l_->ClobberCallerSave();
-          ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+          ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
           // Load the entrypoint directly into the pc instead of doing a load + branch. Assumes
           // codegen and target are in thumb2 mode.
           m2l_->LoadWordDisp(rs_rARM_SELF, func_offset.Int32Value(), rs_rARM_PC);
@@ -398,8 +398,7 @@
         OpRegRegImm(kOpSub, rs_rARM_LR, rs_rARM_SP, frame_size_without_spills);
         LIR* branch = OpCmpBranch(kCondUlt, rs_rARM_LR, rs_r12, nullptr);
         // Need to restore LR since we used it as a temp.
-        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, true,
-                                                     frame_size_without_spills));
+        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, true, spill_size));
         OpRegCopy(rs_rARM_SP, rs_rARM_LR);     // Establish stack
       } else {
         // If the frame is small enough we are guaranteed to have enough space that remains to
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 8bfdb6a..0ad2b70 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -22,15 +22,16 @@
 
 namespace art {
 
-class ArmMir2Lir : public Mir2Lir {
+class ArmMir2Lir FINAL : public Mir2Lir {
   public:
     ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
 
     // Required for target - codegen helpers.
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
+    bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
     LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest, int s_reg);
@@ -171,18 +172,19 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                           int s_reg);
     LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
-    LIR* OpRegRegRegShift(OpKind op, int r_dest, int r_src1, int r_src2, int shift);
-    LIR* OpRegRegShift(OpKind op, int r_dest_src1, int r_src2, int shift);
+    LIR* OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
+                          int shift);
+    LIR* OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift);
     static const ArmEncodingMap EncodingMap[kArmLast];
     int EncodeShift(int code, int amount);
     int ModifiedImmediate(uint32_t value);
@@ -202,6 +204,13 @@
     RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
                           bool is_div, bool check_zero);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div);
+    typedef struct {
+      OpKind op;
+      uint32_t shift;
+    } EasyMultiplyOp;
+    bool GetEasyMultiplyOp(int lit, EasyMultiplyOp* op);
+    bool GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops);
+    void GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index 398bf96..07a13ce 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -49,7 +49,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -92,7 +92,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -162,7 +162,7 @@
       return;
     }
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_FLOAT: {
       rl_src = LoadValueWide(rl_src, kFPReg);
@@ -192,7 +192,7 @@
       return;
     }
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
@@ -359,7 +359,7 @@
   branch = NewLIR2(kThumbBCond, 0, kArmCondEq);
   ClobberCallerSave();
   LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pSqrt));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pSqrt));
   NewLIR3(kThumb2Fmrrd, r0, r1, S2d(rl_src.reg.GetLowReg(), rl_src.reg.GetHighReg()));
   NewLIR1(kThumbBlxR, r_tgt.GetReg());
   NewLIR3(kThumb2Fmdrr, S2d(rl_result.reg.GetLowReg(), rl_result.reg.GetHighReg()), r0, r1);
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 46db466..8177999 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -439,19 +439,18 @@
   NewLIR4(kThumb2Smull, r_lo.GetReg(), r_hi.GetReg(), r_magic.GetReg(), rl_src.reg.GetReg());
   switch (pattern) {
     case Divide3:
-      OpRegRegRegShift(kOpSub, rl_result.reg.GetReg(), r_hi.GetReg(),
-               rl_src.reg.GetReg(), EncodeShift(kArmAsr, 31));
+      OpRegRegRegShift(kOpSub, rl_result.reg, r_hi, rl_src.reg, EncodeShift(kArmAsr, 31));
       break;
     case Divide5:
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg.GetReg(), r_lo.GetReg(), r_hi.GetReg(),
-               EncodeShift(kArmAsr, magic_table[lit].shift));
+      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+                       EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     case Divide7:
       OpRegReg(kOpAdd, r_hi, rl_src.reg);
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg.GetReg(), r_lo.GetReg(), r_hi.GetReg(),
-               EncodeShift(kArmAsr, magic_table[lit].shift));
+      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+                       EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     default:
       LOG(FATAL) << "Unexpected pattern: " << pattern;
@@ -460,6 +459,108 @@
   return true;
 }
 
+// Try to convert a multiply by lit to one RegRegRegShift/RegRegShift form, described in *op.
+bool ArmMir2Lir::GetEasyMultiplyOp(int lit, ArmMir2Lir::EasyMultiplyOp* op) {
+  if (IsPowerOfTwo(lit)) {  // Shift-only form.
+    op->op = kOpLsl;
+    op->shift = LowestSetBit(lit);
+    return true;
+  }
+
+  if (IsPowerOfTwo(lit - 1)) {  // lit - 1 qualifies: add form, shift taken from lit - 1.
+    op->op = kOpAdd;
+    op->shift = LowestSetBit(lit - 1);
+    return true;
+  }
+
+  if (IsPowerOfTwo(lit + 1)) {  // lit + 1 qualifies: rsub form, shift taken from lit + 1.
+    op->op = kOpRsub;
+    op->shift = LowestSetBit(lit + 1);
+    return true;
+  }
+
+  op->op = kOpInvalid;  // No single-op form; op->shift is not written on this path.
+  return false;
+}
+
+// Try to convert a multiply by lit to 1~2 RegRegRegShift/RegRegShift forms in ops[0..1].
+bool ArmMir2Lir::GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops) {
+  // Probe once for a single-op form; if found, mark the second slot as unused.
+  if (GetEasyMultiplyOp(lit, &ops[0])) {
+    ops[1].op = kOpInvalid;
+    return true;
+  }
+
+  int lit1 = lit;  // lit == m << shift: ops[0] encodes m, ops[1] shifts the result left.
+  uint32_t shift = LowestSetBit(lit1);
+  if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
+    ops[1].op = kOpLsl;
+    ops[1].shift = shift;
+    return true;
+  }
+
+  lit1 = lit - 1;  // lit == (m << shift) + 1: ops[0] encodes m, ops[1] shifts and adds.
+  shift = LowestSetBit(lit1);
+  if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
+    ops[1].op = kOpAdd;
+    ops[1].shift = shift;
+    return true;
+  }
+
+  lit1 = lit + 1;  // lit == (m << shift) - 1: ops[0] encodes m, ops[1] shifts and rsubs.
+  shift = LowestSetBit(lit1);
+  if (GetEasyMultiplyOp(lit1 >> shift, &ops[0])) {
+    ops[1].op = kOpRsub;
+    ops[1].shift = shift;
+    return true;
+  }
+
+  return false;  // No 1~2 op form; the last failed probe left ops[0].op == kOpInvalid.
+}
+
+void ArmMir2Lir::GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops) {
+  // dest = ( src << shift1) + [ src | -src | 0 ]
+  // dest = (dest << shift2) + [ src | -src | 0 ]
+  // NOTE(review): the second step still reads r_src after the first step wrote r_dest; if a
+  // caller can pass the same register for both, add/rsub would see a clobbered value - confirm.
+  for (int step = 0; step < 2; step++) {
+    // The shifted operand is the source on the first step and the partial result afterwards.
+    RegStorage shifted = (step == 0) ? r_src : r_dest;
+    const EasyMultiplyOp& cur = ops[step];
+    switch (cur.op) {
+    case kOpLsl:
+      OpRegRegImm(kOpLsl, r_dest, shifted, cur.shift);
+      break;
+    case kOpAdd:
+      OpRegRegRegShift(kOpAdd, r_dest, r_src, shifted, EncodeShift(kArmLsl, cur.shift));
+      break;
+    case kOpRsub:
+      OpRegRegRegShift(kOpRsub, r_dest, r_src, shifted, EncodeShift(kArmLsl, cur.shift));
+      break;
+    default:
+      // Nothing is emitted for an unused slot; only the second slot may be kOpInvalid.
+      DCHECK_NE(step, 0);
+      DCHECK_EQ(cur.op, kOpInvalid);
+      break;
+    }
+  }
+}
+
+bool ArmMir2Lir::EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) {
+  EasyMultiplyOp ops[2];  // Filled by GetEasyMultiplyTwoOps with at most two steps for lit.
+
+  if (!GetEasyMultiplyTwoOps(lit, ops)) {
+    return false;  // lit has no 1~2 op form; no operand has been loaded at this point.
+  }
+
+  rl_src = LoadValue(rl_src, kCoreReg);
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+
+  GenEasyMultiplyTwoOps(rl_result.reg, rl_src.reg, ops);  // Emit the steps into rl_result.reg.
+  StoreValue(rl_dest, rl_result);
+  return true;
+}
+
 LIR* ArmMir2Lir::GenRegMemCheck(ConditionCode c_code, RegStorage reg1, RegStorage base,
                                 int offset, ThrowKind kind) {
   LOG(FATAL) << "Unexpected use of GenRegMemCheck for Arm";
@@ -578,7 +679,7 @@
   LOG(FATAL) << "Unexpected use of OpLea for Arm";
 }
 
-void ArmMir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
+void ArmMir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
 }
 
@@ -715,6 +816,8 @@
 
   // Still one conditional left from OpIT(kCondEq, "T") from either branch
   OpRegImm(kOpCmp /* eq */, r_tmp, 1);
+  GenBarrier();
+
   OpCondBranch(kCondEq, target);
 
   if (!load_early) {
@@ -728,6 +831,7 @@
   OpIT(kCondUlt, "");
   LoadConstant(rl_result.reg, 0); /* cc */
   FreeTemp(r_tmp);  // Now unneeded.
+  GenBarrier();     // Barrier to terminate OpIT.
 
   StoreValue(rl_dest, rl_result);
 
@@ -752,7 +856,7 @@
 void ArmMir2Lir::GenMultiplyByTwoBitMultiplier(RegLocation rl_src,
                                                RegLocation rl_result, int lit,
                                                int first_bit, int second_bit) {
-  OpRegRegRegShift(kOpAdd, rl_result.reg.GetReg(), rl_src.reg.GetReg(), rl_src.reg.GetReg(),
+  OpRegRegRegShift(kOpAdd, rl_result.reg, rl_src.reg, rl_src.reg,
                    EncodeShift(kArmLsl, second_bit - first_bit));
   if (first_bit != 0) {
     OpRegRegImm(kOpLsl, rl_result.reg, rl_result.reg, first_bit);
@@ -848,7 +952,7 @@
      */
     RegLocation rl_result;
     if (BadOverlap(rl_src1, rl_dest) || (BadOverlap(rl_src2, rl_dest))) {
-      ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pLmul);
+      ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmul);
       FlushAllRegs();
       CallRuntimeHelperRegLocationRegLocation(func_offset, rl_src1, rl_src2, false);
       rl_result = GetReturnWide(false);
@@ -898,8 +1002,7 @@
       NewLIR3(kThumb2MulRRR, tmp1.GetReg(), rl_src1.reg.GetLowReg(), rl_src1.reg.GetHighReg());
       NewLIR4(kThumb2Umull, res_lo.GetReg(), res_hi.GetReg(), rl_src1.reg.GetLowReg(),
               rl_src1.reg.GetLowReg());
-      OpRegRegRegShift(kOpAdd, res_hi.GetReg(), res_hi.GetReg(), tmp1.GetReg(),
-                       EncodeShift(kArmLsl, 1));
+      OpRegRegRegShift(kOpAdd, res_hi, res_hi, tmp1, EncodeShift(kArmLsl, 1));
     } else {
       NewLIR3(kThumb2MulRRR, tmp1.GetReg(), rl_src2.reg.GetLowReg(), rl_src1.reg.GetHighReg());
       if (reg_status == 2) {
@@ -1009,8 +1112,7 @@
     } else {
       // No special indexed operation, lea + load w/ displacement
       reg_ptr = AllocTemp();
-      OpRegRegRegShift(kOpAdd, reg_ptr.GetReg(), rl_array.reg.GetReg(), rl_index.reg.GetReg(),
-                       EncodeShift(kArmLsl, scale));
+      OpRegRegRegShift(kOpAdd, reg_ptr, rl_array.reg, rl_index.reg, EncodeShift(kArmLsl, scale));
       FreeTemp(rl_index.reg.GetReg());
     }
     rl_result = EvalLoc(rl_dest, reg_class, true);
@@ -1117,8 +1219,7 @@
       rl_src = LoadValue(rl_src, reg_class);
     }
     if (!constant_index) {
-      OpRegRegRegShift(kOpAdd, reg_ptr.GetReg(), rl_array.reg.GetReg(), rl_index.reg.GetReg(),
-                       EncodeShift(kArmLsl, scale));
+      OpRegRegRegShift(kOpAdd, reg_ptr, rl_array.reg, rl_index.reg, EncodeShift(kArmLsl, scale));
     }
     if (needs_range_check) {
       if (constant_index) {
@@ -1183,7 +1284,7 @@
         LoadConstant(rl_result.reg.GetLow(), 0);
       } else {
         OpRegRegImm(kOpLsl, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetHighReg(), rl_result.reg.GetHighReg(), rl_src.reg.GetLowReg(),
+        OpRegRegRegShift(kOpOr, rl_result.reg.GetHigh(), rl_result.reg.GetHigh(), rl_src.reg.GetLow(),
                          EncodeShift(kArmLsr, 32 - shift_amount));
         OpRegRegImm(kOpLsl, rl_result.reg.GetLow(), rl_src.reg.GetLow(), shift_amount);
       }
@@ -1199,7 +1300,7 @@
       } else {
         RegStorage t_reg = AllocTemp();
         OpRegRegImm(kOpLsr, t_reg, rl_src.reg.GetLow(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetLowReg(), t_reg.GetReg(), rl_src.reg.GetHighReg(),
+        OpRegRegRegShift(kOpOr, rl_result.reg.GetLow(), t_reg, rl_src.reg.GetHigh(),
                          EncodeShift(kArmLsl, 32 - shift_amount));
         FreeTemp(t_reg);
         OpRegRegImm(kOpAsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
@@ -1216,7 +1317,7 @@
       } else {
         RegStorage t_reg = AllocTemp();
         OpRegRegImm(kOpLsr, t_reg, rl_src.reg.GetLow(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetLowReg(), t_reg.GetReg(), rl_src.reg.GetHighReg(),
+        OpRegRegRegShift(kOpOr, rl_result.reg.GetLow(), t_reg, rl_src.reg.GetHigh(),
                          EncodeShift(kArmLsl, 32 - shift_amount));
         FreeTemp(t_reg);
         OpRegRegImm(kOpLsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 5bab0e3..5e9a8b0 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -559,10 +559,13 @@
       (arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), kArenaAllocRegAlloc));
   CompilerInitPool(reg_pool_->core_regs, core_regs, reg_pool_->num_core_regs);
   CompilerInitPool(reg_pool_->FPRegs, FpRegs, reg_pool_->num_fp_regs);
+
   // Keep special registers from being allocated
+  // Don't reserve r4 if we are doing implicit suspend checks.
+  bool no_suspend = NO_SUSPEND || !Runtime::Current()->ExplicitSuspendChecks();
   for (int i = 0; i < num_reserved; i++) {
-    if (NO_SUSPEND && (ReservedRegs[i] == rARM_SUSPEND)) {
-      // To measure cost of suspend check
+    if (no_suspend && (ReservedRegs[i] == rARM_SUSPEND)) {
+      // Don't reserve the suspend register.
       continue;
     }
     MarkInUse(ReservedRegs[i]);
@@ -727,14 +730,14 @@
   FreeTemp(r3);
 }
 
-RegStorage ArmMir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage ArmMir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LoadWordDisp(rs_rARM_SELF, offset.Int32Value(), rs_rARM_LR);
   return rs_rARM_LR;
 }
 
 LIR* ArmMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = rs_r0;
-  LoadWordDisp(rs_rARM_SELF, Thread::ThreadSuspendTriggerOffset().Int32Value(), tmp);
+  LoadWordDisp(rs_rARM_SELF, Thread::ThreadSuspendTriggerOffset<4>().Int32Value(), tmp);
   LIR* load2 = LoadWordDisp(tmp, 0, tmp);
   return load2;
 }
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 8df5b25..70cbdd2 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -234,9 +234,10 @@
   return NewLIR1(opcode, r_dest_src.GetReg());
 }
 
-LIR* ArmMir2Lir::OpRegRegShift(OpKind op, int r_dest_src1, int r_src2,
+LIR* ArmMir2Lir::OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2,
                                int shift) {
-  bool thumb_form = ((shift == 0) && ARM_LOWREG(r_dest_src1) && ARM_LOWREG(r_src2));
+  bool thumb_form =
+      ((shift == 0) && ARM_LOWREG(r_dest_src1.GetReg()) && ARM_LOWREG(r_src2.GetReg()));
   ArmOpcode opcode = kThumbBkpt;
   switch (op) {
     case kOpAdc:
@@ -255,9 +256,9 @@
     case kOpCmp:
       if (thumb_form)
         opcode = kThumbCmpRR;
-      else if ((shift == 0) && !ARM_LOWREG(r_dest_src1) && !ARM_LOWREG(r_src2))
+      else if ((shift == 0) && !ARM_LOWREG(r_dest_src1.GetReg()) && !ARM_LOWREG(r_src2.GetReg()))
         opcode = kThumbCmpHH;
-      else if ((shift == 0) && ARM_LOWREG(r_dest_src1))
+      else if ((shift == 0) && ARM_LOWREG(r_dest_src1.GetReg()))
         opcode = kThumbCmpLH;
       else if (shift == 0)
         opcode = kThumbCmpHL;
@@ -269,11 +270,11 @@
       break;
     case kOpMov:
       DCHECK_EQ(shift, 0);
-      if (ARM_LOWREG(r_dest_src1) && ARM_LOWREG(r_src2))
+      if (ARM_LOWREG(r_dest_src1.GetReg()) && ARM_LOWREG(r_src2.GetReg()))
         opcode = kThumbMovRR;
-      else if (!ARM_LOWREG(r_dest_src1) && !ARM_LOWREG(r_src2))
+      else if (!ARM_LOWREG(r_dest_src1.GetReg()) && !ARM_LOWREG(r_src2.GetReg()))
         opcode = kThumbMovRR_H2H;
-      else if (ARM_LOWREG(r_dest_src1))
+      else if (ARM_LOWREG(r_dest_src1.GetReg()))
         opcode = kThumbMovRR_H2L;
       else
         opcode = kThumbMovRR_L2H;
@@ -324,7 +325,7 @@
       DCHECK_EQ(shift, 0);
       if (!thumb_form) {
         // Binary, but rm is encoded twice.
-        return NewLIR3(kThumb2RevRR, r_dest_src1, r_src2, r_src2);
+        return NewLIR3(kThumb2RevRR, r_dest_src1.GetReg(), r_src2.GetReg(), r_src2.GetReg());
       }
       opcode = kThumbRev;
       break;
@@ -332,34 +333,34 @@
       DCHECK_EQ(shift, 0);
       if (!thumb_form) {
         // Binary, but rm is encoded twice.
-        return NewLIR3(kThumb2RevshRR, r_dest_src1, r_src2, r_src2);
+        return NewLIR3(kThumb2RevshRR, r_dest_src1.GetReg(), r_src2.GetReg(), r_src2.GetReg());
       }
       opcode = kThumbRevsh;
       break;
     case kOp2Byte:
       DCHECK_EQ(shift, 0);
-      return NewLIR4(kThumb2Sbfx, r_dest_src1, r_src2, 0, 8);
+      return NewLIR4(kThumb2Sbfx, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 8);
     case kOp2Short:
       DCHECK_EQ(shift, 0);
-      return NewLIR4(kThumb2Sbfx, r_dest_src1, r_src2, 0, 16);
+      return NewLIR4(kThumb2Sbfx, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 16);
     case kOp2Char:
       DCHECK_EQ(shift, 0);
-      return NewLIR4(kThumb2Ubfx, r_dest_src1, r_src2, 0, 16);
+      return NewLIR4(kThumb2Ubfx, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 16);
     default:
       LOG(FATAL) << "Bad opcode: " << op;
       break;
   }
   DCHECK(!IsPseudoLirOp(opcode));
   if (EncodingMap[opcode].flags & IS_BINARY_OP) {
-    return NewLIR2(opcode, r_dest_src1, r_src2);
+    return NewLIR2(opcode, r_dest_src1.GetReg(), r_src2.GetReg());
   } else if (EncodingMap[opcode].flags & IS_TERTIARY_OP) {
     if (EncodingMap[opcode].field_loc[2].kind == kFmtShift) {
-      return NewLIR3(opcode, r_dest_src1, r_src2, shift);
+      return NewLIR3(opcode, r_dest_src1.GetReg(), r_src2.GetReg(), shift);
     } else {
-      return NewLIR3(opcode, r_dest_src1, r_dest_src1, r_src2);
+      return NewLIR3(opcode, r_dest_src1.GetReg(), r_dest_src1.GetReg(), r_src2.GetReg());
     }
   } else if (EncodingMap[opcode].flags & IS_QUAD_OP) {
-    return NewLIR4(opcode, r_dest_src1, r_dest_src1, r_src2, shift);
+    return NewLIR4(opcode, r_dest_src1.GetReg(), r_dest_src1.GetReg(), r_src2.GetReg(), shift);
   } else {
     LOG(FATAL) << "Unexpected encoding operand count";
     return NULL;
@@ -367,7 +368,7 @@
 }
 
 LIR* ArmMir2Lir::OpRegReg(OpKind op, RegStorage r_dest_src1, RegStorage r_src2) {
-  return OpRegRegShift(op, r_dest_src1.GetReg(), r_src2.GetReg(), 0);
+  return OpRegRegShift(op, r_dest_src1, r_src2, 0);
 }
 
 LIR* ArmMir2Lir::OpMovRegMem(RegStorage r_dest, RegStorage r_base, int offset, MoveType move_type) {
@@ -385,11 +386,11 @@
   return NULL;
 }
 
-LIR* ArmMir2Lir::OpRegRegRegShift(OpKind op, int r_dest, int r_src1,
-                                  int r_src2, int shift) {
+LIR* ArmMir2Lir::OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1,
+                                  RegStorage r_src2, int shift) {
   ArmOpcode opcode = kThumbBkpt;
-  bool thumb_form = (shift == 0) && ARM_LOWREG(r_dest) && ARM_LOWREG(r_src1) &&
-      ARM_LOWREG(r_src2);
+  bool thumb_form = (shift == 0) && ARM_LOWREG(r_dest.GetReg()) && ARM_LOWREG(r_src1.GetReg()) &&
+      ARM_LOWREG(r_src2.GetReg());
   switch (op) {
     case kOpAdd:
       opcode = (thumb_form) ? kThumbAddRRR : kThumb2AddRRR;
@@ -448,15 +449,15 @@
   }
   DCHECK(!IsPseudoLirOp(opcode));
   if (EncodingMap[opcode].flags & IS_QUAD_OP) {
-    return NewLIR4(opcode, r_dest, r_src1, r_src2, shift);
+    return NewLIR4(opcode, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg(), shift);
   } else {
     DCHECK(EncodingMap[opcode].flags & IS_TERTIARY_OP);
-    return NewLIR3(opcode, r_dest, r_src1, r_src2);
+    return NewLIR3(opcode, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg());
   }
 }
 
 LIR* ArmMir2Lir::OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2) {
-  return OpRegRegRegShift(op, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg(), 0);
+  return OpRegRegRegShift(op, r_dest, r_src1, r_src2, 0);
 }
 
 LIR* ArmMir2Lir::OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value) {
@@ -1109,7 +1110,7 @@
   return res;
 }
 
-LIR* ArmMir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* ArmMir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   LOG(FATAL) << "Unexpected use of OpThreadMem for Arm";
   return NULL;
 }
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 866ce5f..7af9d57 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "dex/compiler_ir.h"
 #include "dex/compiler_internals.h"
 #include "dex/quick/arm/arm_lir.h"
@@ -251,7 +250,7 @@
 void Mir2Lir::GenNewArray(uint32_t type_idx, RegLocation rl_dest,
                           RegLocation rl_src) {
   FlushAllRegs();  /* Everything to home location */
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
   if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *dex_file,
@@ -265,22 +264,22 @@
       // The fast path.
       if (!use_direct_type_ptr) {
         LoadClassType(type_idx, kArg0);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArrayResolved);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
         CallRuntimeHelperRegMethodRegLocation(func_offset, TargetReg(kArg0), rl_src, true);
       } else {
         // Use the direct pointer.
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArrayResolved);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayResolved);
         CallRuntimeHelperImmMethodRegLocation(func_offset, direct_type_ptr, rl_src, true);
       }
     } else {
       // The slow path.
       DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocArray);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocArray);
       CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset= QUICK_ENTRYPOINT_OFFSET(pAllocArrayWithAccessCheck);
+    func_offset= QUICK_ENTRYPOINT_OFFSET(4, pAllocArrayWithAccessCheck);
     CallRuntimeHelperImmMethodRegLocation(func_offset, type_idx, rl_src, true);
   }
   RegLocation rl_result = GetReturn(false);
@@ -297,12 +296,12 @@
   int elems = info->num_arg_words;
   int type_idx = info->index;
   FlushAllRegs();  /* Everything to home location */
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   if (cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *cu_->dex_file,
                                                        type_idx)) {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pCheckAndAllocArray);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArray);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pCheckAndAllocArrayWithAccessCheck);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pCheckAndAllocArrayWithAccessCheck);
   }
   CallRuntimeHelperImmMethodImm(func_offset, type_idx, elems, true);
   FreeTemp(TargetReg(kArg2));
@@ -410,7 +409,7 @@
   void Compile() {
     LIR* unresolved_target = GenerateTargetLabel();
     uninit_->target = unresolved_target;
-    m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeStaticStorage),
+    m2l_->CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeStaticStorage),
                                storage_index_, true);
     // Copy helper's result into r_base, a no-op on all but MIPS.
     m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0));
@@ -502,10 +501,10 @@
     FreeTemp(r_base);
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pSet64Static)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pSetObjStatic)
-                                       : QUICK_ENTRYPOINT_OFFSET(pSet32Static));
+    ThreadOffset<4> setter_offset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Static)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjStatic)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Static));
     CallRuntimeHelperImmRegLocation(setter_offset, field_info.FieldIndex(), rl_src, true);
   }
 }
@@ -583,10 +582,10 @@
     }
   } else {
     FlushAllRegs();  // Everything to home locations
-    ThreadOffset getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pGet64Static)
-                          :(is_object ? QUICK_ENTRYPOINT_OFFSET(pGetObjStatic)
-                                      : QUICK_ENTRYPOINT_OFFSET(pGet32Static));
+    ThreadOffset<4> getterOffset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Static)
+                          :(is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjStatic)
+                                      : QUICK_ENTRYPOINT_OFFSET(4, pGet32Static));
     CallRuntimeHelperImm(getterOffset, field_info.FieldIndex(), true);
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
@@ -610,7 +609,7 @@
 
 void Mir2Lir::HandleSuspendLaunchPads() {
   int num_elems = suspend_launchpads_.Size();
-  ThreadOffset helper_offset = QUICK_ENTRYPOINT_OFFSET(pTestSuspend);
+  ThreadOffset<4> helper_offset = QUICK_ENTRYPOINT_OFFSET(4, pTestSuspend);
   for (int i = 0; i < num_elems; i++) {
     ResetRegPool();
     ResetDefTracking();
@@ -632,13 +631,13 @@
     LIR* lab = throw_launchpads_.Get(i);
     current_dalvik_offset_ = lab->operands[1];
     AppendLIR(lab);
-    ThreadOffset func_offset(-1);
+    ThreadOffset<4> func_offset(-1);
     int v1 = lab->operands[2];
     int v2 = lab->operands[3];
     const bool target_x86 = cu_->instruction_set == kX86;
     switch (lab->operands[0]) {
       case kThrowNullPointer:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowNullPointer);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer);
         break;
       case kThrowConstantArrayBounds:  // v1 is length reg (for Arm/Mips), v2 constant index
         // v1 holds the constant array index.  Mips/Arm uses v2 for length, x86 reloads.
@@ -651,7 +650,7 @@
         // Make sure the following LoadConstant doesn't mess with kArg1.
         LockTemp(TargetReg(kArg1));
         LoadConstant(TargetReg(kArg0), v2);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowArrayBounds);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
         break;
       case kThrowArrayBounds:
         // Move v1 (array index) to kArg0 and v2 (array length) to kArg1
@@ -687,15 +686,15 @@
             OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
           }
         }
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowArrayBounds);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
         break;
       case kThrowDivZero:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowDivZero);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero);
         break;
       case kThrowNoSuchMethod:
         OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
         func_offset =
-          QUICK_ENTRYPOINT_OFFSET(pThrowNoSuchMethod);
+          QUICK_ENTRYPOINT_OFFSET(4, pThrowNoSuchMethod);
         break;
       default:
         LOG(FATAL) << "Unexpected throw kind: " << lab->operands[0];
@@ -762,10 +761,10 @@
       StoreValue(rl_dest, rl_result);
     }
   } else {
-    ThreadOffset getterOffset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pGet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pGetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(pGet32Instance));
+    ThreadOffset<4> getterOffset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pGet64Instance)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pGetObjInstance)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pGet32Instance));
     CallRuntimeHelperImmRegLocation(getterOffset, field_info.FieldIndex(), rl_obj, true);
     if (is_long_or_double) {
       RegLocation rl_result = GetReturnWide(rl_dest.fp);
@@ -820,10 +819,10 @@
       }
     }
   } else {
-    ThreadOffset setter_offset =
-        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(pSet64Instance)
-                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(pSetObjInstance)
-                                       : QUICK_ENTRYPOINT_OFFSET(pSet32Instance));
+    ThreadOffset<4> setter_offset =
+        is_long_or_double ? QUICK_ENTRYPOINT_OFFSET(4, pSet64Instance)
+                          : (is_object ? QUICK_ENTRYPOINT_OFFSET(4, pSetObjInstance)
+                                       : QUICK_ENTRYPOINT_OFFSET(4, pSet32Instance));
     CallRuntimeHelperImmRegLocationRegLocation(setter_offset, field_info.FieldIndex(),
                                                rl_obj, rl_src, true);
   }
@@ -834,10 +833,10 @@
   bool needs_range_check = !(opt_flags & MIR_IGNORE_RANGE_CHECK);
   bool needs_null_check = !((cu_->disable_opt & (1 << kNullCheckElimination)) &&
       (opt_flags & MIR_IGNORE_NULL_CHECK));
-  ThreadOffset helper = needs_range_check
-      ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(pAputObjectWithNullAndBoundCheck)
-                          : QUICK_ENTRYPOINT_OFFSET(pAputObjectWithBoundCheck))
-      : QUICK_ENTRYPOINT_OFFSET(pAputObject);
+  ThreadOffset<4> helper = needs_range_check
+      ? (needs_null_check ? QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithNullAndBoundCheck)
+                          : QUICK_ENTRYPOINT_OFFSET(4, pAputObjectWithBoundCheck))
+      : QUICK_ENTRYPOINT_OFFSET(4, pAputObject);
   CallRuntimeHelperRegLocationRegLocationRegLocation(helper, rl_array, rl_index, rl_src, true);
 }
 
@@ -850,7 +849,7 @@
                                                    type_idx)) {
     // Call out to helper which resolves type and verifies access.
     // Resolved type returned in kRet0.
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                             type_idx, rl_method.reg, true);
     RegLocation rl_result = GetReturn(false);
     StoreValue(rl_dest, rl_result);
@@ -882,7 +881,7 @@
         void Compile() {
           GenerateTargetLabel();
 
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx_,
+          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
                                         rl_method_.reg, true);
           m2l_->OpRegCopy(rl_result_.reg,  m2l_->TargetReg(kRet0));
 
@@ -950,7 +949,7 @@
         void Compile() {
           GenerateTargetLabel();
 
-          RegStorage r_tgt = m2l_->CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(pResolveString));
+          RegStorage r_tgt = m2l_->CallHelperSetup(QUICK_ENTRYPOINT_OFFSET(4, pResolveString));
 
           m2l_->OpRegCopy(m2l_->TargetReg(kArg0), r_method_);   // .eq
           LIR* call_inst = m2l_->OpReg(kOpBlx, r_tgt);
@@ -970,7 +969,7 @@
       DCHECK_EQ(cu_->instruction_set, kX86);
       LIR* branch = OpCmpImmBranch(kCondNe, TargetReg(kRet0), 0, NULL);
       LoadConstant(TargetReg(kArg1), string_idx);
-      CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pResolveString), r_method, TargetReg(kArg1),
+      CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pResolveString), r_method, TargetReg(kArg1),
                               true);
       LIR* target = NewLIR0(kPseudoTargetLabel);
       branch->target = target;
@@ -995,7 +994,7 @@
   FlushAllRegs();  /* Everything to home location */
   // alloc will always check for resolution, do we also need to verify
   // access because the verifier was unable to?
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
   if (driver->CanAccessInstantiableTypeWithoutChecks(
@@ -1010,31 +1009,31 @@
       if (!use_direct_type_ptr) {
         LoadClassType(type_idx, kArg0);
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectResolved);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
           CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectInitialized);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
           CallRuntimeHelperRegMethod(func_offset, TargetReg(kArg0), true);
         }
       } else {
         // Use the direct pointer.
         if (!is_type_initialized) {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectResolved);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectResolved);
           CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         } else {
-          func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectInitialized);
+          func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectInitialized);
           CallRuntimeHelperImmMethod(func_offset, direct_type_ptr, true);
         }
       }
     } else {
       // The slow path.
       DCHECK_EQ(func_offset.Int32Value(), -1);
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObject);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObject);
       CallRuntimeHelperImmMethod(func_offset, type_idx, true);
     }
     DCHECK_NE(func_offset.Int32Value(), -1);
   } else {
-    func_offset = QUICK_ENTRYPOINT_OFFSET(pAllocObjectWithAccessCheck);
+    func_offset = QUICK_ENTRYPOINT_OFFSET(4, pAllocObjectWithAccessCheck);
     CallRuntimeHelperImmMethod(func_offset, type_idx, true);
   }
   RegLocation rl_result = GetReturn(false);
@@ -1043,7 +1042,7 @@
 
 void Mir2Lir::GenThrow(RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pDeliverException), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException), rl_src, true);
 }
 
 // For final classes there are no sub-classes to check and so we can answer the instance-of
@@ -1084,6 +1083,7 @@
     OpRegReg(kOpCmp, check_class, object_class);  // Same?
     OpIT(kCondEq, "");   // if-convert the test
     LoadConstant(result_reg, 1);     // .eq case - load true
+    GenBarrier();
   } else {
     ne_branchover = OpCmpBranch(kCondNe, check_class, object_class, NULL);
     LoadConstant(result_reg, 1);     // eq case - load true
@@ -1118,7 +1118,7 @@
   if (needs_access_check) {
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kArg0
-    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                          type_idx, true);
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
     LoadValueDirectFixed(rl_src, TargetReg(kArg0));  // kArg0 <= ref
@@ -1140,7 +1140,7 @@
       LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL);
       // Not resolved
       // Call out to helper, which will return resolved type in kRet0
-      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx, true);
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
       OpRegCopy(TargetReg(kArg2), TargetReg(kRet0));  // Align usage with fast path
       LoadValueDirectFixed(rl_src, TargetReg(kArg0));  /* reload Ref */
       // Rejoin code paths
@@ -1168,6 +1168,7 @@
       OpIT(kCondEq, "E");   // if-convert the test
       LoadConstant(rl_result.reg, 1);     // .eq case - load true
       LoadConstant(rl_result.reg, 0);     // .ne case - load false
+      GenBarrier();
     } else {
       LoadConstant(rl_result.reg, 0);     // ne case - load false
       branchover = OpCmpBranch(kCondNe, TargetReg(kArg1), TargetReg(kArg2), NULL);
@@ -1175,7 +1176,7 @@
     }
   } else {
     if (cu_->instruction_set == kThumb2) {
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       if (!type_known_abstract) {
       /* Uses conditional nullification */
         OpRegReg(kOpCmp, TargetReg(kArg1), TargetReg(kArg2));  // Same?
@@ -1184,6 +1185,7 @@
       }
       OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));    // .ne case - arg0 <= class
       OpReg(kOpBlx, r_tgt);    // .ne case: helper(class, ref->class)
+      GenBarrier();
       FreeTemp(r_tgt);
     } else {
       if (!type_known_abstract) {
@@ -1191,7 +1193,7 @@
         LoadConstant(rl_result.reg, 1);     // assume true
         branchover = OpCmpBranch(kCondEq, TargetReg(kArg1), TargetReg(kArg2), NULL);
       }
-      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+      RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
       OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));    // .ne case - arg0 <= class
       OpReg(kOpBlx, r_tgt);    // .ne case: helper(class, ref->class)
       FreeTemp(r_tgt);
@@ -1252,7 +1254,7 @@
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // returns Class* in kRet0
     // InitializeTypeAndVerifyAccess(idx, method)
-    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                             type_idx, TargetReg(kArg1), true);
     OpRegCopy(class_reg, TargetReg(kRet0));  // Align usage with fast path
   } else if (use_declaring_class) {
@@ -1285,7 +1287,7 @@
 
           // Call out to helper, which will return resolved type in kArg0
           // InitializeTypeFromCode(idx, method)
-          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx_,
+          m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_,
                                         m2l_->TargetReg(kArg1), true);
           m2l_->OpRegCopy(class_reg_, m2l_->TargetReg(kRet0));  // Align usage with fast path
           m2l_->OpUnconditionalBranch(cont_);
@@ -1316,7 +1318,7 @@
         m2l_->LoadWordDisp(m2l_->TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(),
                            m2l_->TargetReg(kArg1));
       }
-      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pCheckCast), m2l_->TargetReg(kArg2),
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pCheckCast), m2l_->TargetReg(kArg2),
                                     m2l_->TargetReg(kArg1), true);
 
       m2l_->OpUnconditionalBranch(cont_);
@@ -1401,20 +1403,20 @@
 
 void Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
                              RegLocation rl_src1, RegLocation rl_shift) {
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
 
   switch (opcode) {
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pShlLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShlLong);
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pShrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pShrLong);
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pUshrLong);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pUshrLong);
       break;
     default:
       LOG(FATAL) << "Unexpected case";
@@ -1547,7 +1549,7 @@
 
     // If we haven't already generated the code use the callout function.
     if (!done) {
-      ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
+      ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
       FlushAllRegs();   /* Send everything to home location */
       LoadValueDirectFixed(rl_src2, TargetReg(kArg1));
       RegStorage r_tgt = CallHelperSetup(func_offset);
@@ -1632,14 +1634,32 @@
 // Returns true if it added instructions to 'cu' to multiply 'rl_src' by 'lit'
 // and store the result in 'rl_dest'.
 bool Mir2Lir::HandleEasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) {
+  if (lit < 0) {
+    return false;
+  }
+  if (lit == 0) {
+    RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    LoadConstant(rl_result.reg, 0);
+    StoreValue(rl_dest, rl_result);
+    return true;
+  }
+  if (lit == 1) {
+    rl_src = LoadValue(rl_src, kCoreReg);
+    RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    OpRegCopy(rl_result.reg, rl_src.reg);
+    StoreValue(rl_dest, rl_result);
+    return true;
+  }
+  // There is RegRegRegShift on Arm, so check for more special cases.
+  // TODO: disabled, need to handle case of "dest == src" properly.
+  if (false && cu_->instruction_set == kThumb2) {
+    return EasyMultiply(rl_src, rl_dest, lit);
+  }
   // Can we simplify this multiplication?
   bool power_of_two = false;
   bool pop_count_le2 = false;
   bool power_of_two_minus_one = false;
-  if (lit < 2) {
-    // Avoid special cases.
-    return false;
-  } else if (IsPowerOfTwo(lit)) {
+  if (IsPowerOfTwo(lit)) {
     power_of_two = true;
   } else if (IsPopCountLE2(lit)) {
     pop_count_le2 = true;
@@ -1798,7 +1818,7 @@
         FlushAllRegs();   /* Everything to home location. */
         LoadValueDirectFixed(rl_src, TargetReg(kArg0));
         Clobber(TargetReg(kArg0));
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pIdivmod);
         CallRuntimeHelperRegImm(func_offset, TargetReg(kArg0), lit, false);
         if (is_div)
           rl_result = GetReturn(false);
@@ -1829,7 +1849,7 @@
   OpKind second_op = kOpBkpt;
   bool call_out = false;
   bool check_zero = false;
-  ThreadOffset func_offset(-1);
+  ThreadOffset<4> func_offset(-1);
   int ret_reg = TargetReg(kRet0).GetReg();
 
   switch (opcode) {
@@ -1875,7 +1895,7 @@
       } else {
         call_out = true;
         ret_reg = TargetReg(kRet0).GetReg();
-        func_offset = QUICK_ENTRYPOINT_OFFSET(pLmul);
+        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmul);
       }
       break;
     case Instruction::DIV_LONG:
@@ -1883,13 +1903,13 @@
       call_out = true;
       check_zero = true;
       ret_reg = TargetReg(kRet0).GetReg();
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pLdiv);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLdiv);
       break;
     case Instruction::REM_LONG:
     case Instruction::REM_LONG_2ADDR:
       call_out = true;
       check_zero = true;
-      func_offset = QUICK_ENTRYPOINT_OFFSET(pLmod);
+      func_offset = QUICK_ENTRYPOINT_OFFSET(4, pLmod);
       /* NOTE - for Arm, result is in kArg2/kArg3 instead of kRet0/kRet1 */
       ret_reg = (cu_->instruction_set == kThumb2) ? TargetReg(kArg2).GetReg() : TargetReg(kRet0).GetReg();
       break;
@@ -1951,7 +1971,7 @@
   }
 }
 
-void Mir2Lir::GenConversionCall(ThreadOffset func_offset,
+void Mir2Lir::GenConversionCall(ThreadOffset<4> func_offset,
                                 RegLocation rl_dest, RegLocation rl_src) {
   /*
    * Don't optimize the register usage since it calls out to support
@@ -2024,13 +2044,13 @@
 /* Call out to helper assembly routine that will null check obj and then lock it. */
 void Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pLockObject), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pLockObject), rl_src, true);
 }
 
 /* Call out to helper assembly routine that will null check obj and then unlock it. */
 void Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
-  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pUnlockObject), rl_src, true);
+  CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject), rl_src, true);
 }
 
 /* Generic code for generating a wide constant into a VR. */
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 7689b51..352130d 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -66,12 +66,12 @@
  * has a memory call operation, part 1 is a NOP for x86.  For other targets,
  * load arguments between the two parts.
  */
-RegStorage Mir2Lir::CallHelperSetup(ThreadOffset helper_offset) {
+RegStorage Mir2Lir::CallHelperSetup(ThreadOffset<4> helper_offset) {
   return (cu_->instruction_set == kX86) ? RegStorage::InvalidReg() : LoadHelper(helper_offset);
 }
 
 /* NOTE: if r_tgt is a temp, it will be freed following use */
-LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset helper_offset, bool safepoint_pc,
+LIR* Mir2Lir::CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
                          bool use_link) {
   LIR* call_inst;
   OpKind op = use_link ? kOpBlx : kOpBx;
@@ -87,21 +87,22 @@
   return call_inst;
 }
 
-void Mir2Lir::CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperReg(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0,
+                                   bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
+void Mir2Lir::CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
                                            bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg0.wide == 0) {
@@ -114,7 +115,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset helper_offset, int arg0, int arg1,
+void Mir2Lir::CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
@@ -123,7 +124,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset helper_offset, int arg0,
+void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                               RegLocation arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg1.wide == 0) {
@@ -137,8 +138,8 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset helper_offset, RegLocation arg0, int arg1,
-                                              bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
+                                              int arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg0, TargetReg(kArg0));
   LoadConstant(TargetReg(kArg1), arg1);
@@ -146,7 +147,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset helper_offset, int arg0, RegStorage arg1,
+void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg1), arg1);
@@ -155,7 +156,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset helper_offset, RegStorage arg0, int arg1,
+void Mir2Lir::CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
                                       bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   OpRegCopy(TargetReg(kArg0), arg0);
@@ -164,7 +165,8 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset helper_offset, int arg0, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
+                                         bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
   LoadConstant(TargetReg(kArg0), arg0);
@@ -172,7 +174,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
                                          bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
@@ -184,7 +186,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
                                                     RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg1).GetReg(), arg0.GetReg());
@@ -197,8 +199,9 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset helper_offset, RegLocation arg0,
-                                                      RegLocation arg1, bool safepoint_pc) {
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
+                                                      RegLocation arg0, RegLocation arg1,
+                                                      bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   if (arg0.wide == 0) {
     LoadValueDirectFixed(arg0, arg0.fp ? TargetReg(kFArg0) : TargetReg(kArg0));
@@ -246,7 +249,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0,
                                       RegStorage arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
@@ -256,7 +259,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset helper_offset, RegStorage arg0,
+void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0,
                                          RegStorage arg1, int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
@@ -267,7 +270,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset,
                                                     int arg0, RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadValueDirectFixed(arg2, TargetReg(kArg2));
@@ -277,7 +280,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset helper_offset, int arg0,
+void Mir2Lir::CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0,
                                             int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadCurrMethodDirect(TargetReg(kArg1));
@@ -287,7 +290,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                          int arg0, RegLocation arg1,
                                                          RegLocation arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
@@ -304,7 +307,7 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
-void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset helper_offset,
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                                  RegLocation arg0, RegLocation arg1,
                                                                  RegLocation arg2,
                                                                  bool safepoint_pc) {
@@ -597,7 +600,7 @@
   return state + 1;
 }
 
-static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info, ThreadOffset trampoline,
+static int NextInvokeInsnSP(CompilationUnit* cu, CallInfo* info, ThreadOffset<4> trampoline,
                             int state, const MethodReference& target_method,
                             uint32_t method_idx) {
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
@@ -623,7 +626,7 @@
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -631,7 +634,7 @@
                                 const MethodReference& target_method,
                                 uint32_t unused, uintptr_t unused2,
                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -639,7 +642,7 @@
                                const MethodReference& target_method,
                                uint32_t unused, uintptr_t unused2,
                                uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -647,7 +650,7 @@
                            const MethodReference& target_method,
                            uint32_t unused, uintptr_t unused2,
                            uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -656,7 +659,8 @@
                                                 const MethodReference& target_method,
                                                 uint32_t unused, uintptr_t unused2,
                                                 uintptr_t unused3, InvokeType unused4) {
-  ThreadOffset trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+  ThreadOffset<4> trampoline =
+      QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
   return NextInvokeInsnSP(cu, info, trampoline, state, target_method, 0);
 }
 
@@ -986,7 +990,7 @@
     // Generate memcpy
     OpRegRegImm(kOpAdd, TargetReg(kArg0), TargetReg(kSp), outs_offset);
     OpRegRegImm(kOpAdd, TargetReg(kArg1), TargetReg(kSp), start_offset);
-    CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(pMemcpy), TargetReg(kArg0),
+    CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(4, pMemcpy), TargetReg(kArg0),
                                TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
   }
 
@@ -1318,7 +1322,7 @@
     RegLocation rl_start = info->args[2];     // 3rd arg only present in III flavor of IndexOf.
     LoadValueDirectFixed(rl_start, reg_start);
   }
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pIndexOf));
   GenExplicitNullCheck(reg_ptr, info->opt_flags);
   LIR* high_code_point_branch =
       rl_char.is_const ? nullptr : OpCmpImmBranch(kCondGt, reg_char, 0xFFFF, nullptr);
@@ -1356,7 +1360,7 @@
   LoadValueDirectFixed(rl_this, reg_this);
   LoadValueDirectFixed(rl_cmp, reg_cmp);
   RegStorage r_tgt = (cu_->instruction_set != kX86) ?
-      LoadHelper(QUICK_ENTRYPOINT_OFFSET(pStringCompareTo)) : RegStorage::InvalidReg();
+      LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo)) : RegStorage::InvalidReg();
   GenExplicitNullCheck(reg_this, info->opt_flags);
   info->opt_flags |= MIR_IGNORE_NULL_CHECK;  // Record that we've null checked.
   // TUNING: check if rl_cmp.s_reg_low is already null checked
@@ -1366,7 +1370,7 @@
   if (cu_->instruction_set != kX86) {
     OpReg(kOpBlx, r_tgt);
   } else {
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pStringCompareTo));
+    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pStringCompareTo));
   }
   RegLocation rl_return = GetReturn(false);
   RegLocation rl_dest = InlineTarget(info);
@@ -1377,7 +1381,7 @@
 bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
   RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  ThreadOffset offset = Thread::PeerOffset();
+  ThreadOffset<4> offset = Thread::PeerOffset<4>();
   if (cu_->instruction_set == kThumb2 || cu_->instruction_set == kMips) {
     LoadWordDisp(TargetReg(kSelf), offset.Int32Value(), rl_result.reg);
   } else {
@@ -1404,8 +1408,15 @@
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (is_long) {
-    OpRegReg(kOpAdd, rl_object.reg, rl_offset.reg);
-    LoadBaseDispWide(rl_object.reg, 0, rl_result.reg, INVALID_SREG);
+    if (cu_->instruction_set == kX86) {
+      LoadBaseIndexedDisp(rl_object.reg, rl_offset.reg, 1, 0, rl_result.reg.GetLow(),
+                          rl_result.reg.GetHigh(), kLong, INVALID_SREG);
+    } else {
+      RegStorage rl_temp_offset = AllocTemp();
+      OpRegRegReg(kOpAdd, rl_temp_offset, rl_object.reg, rl_offset.reg);
+      LoadBaseDispWide(rl_temp_offset, 0, rl_result.reg, INVALID_SREG);
+      FreeTemp(rl_temp_offset.GetReg());
+    }
   } else {
     LoadBaseIndexed(rl_object.reg, rl_offset.reg, rl_result.reg, 0, kWord);
   }
@@ -1445,8 +1456,15 @@
   RegLocation rl_value;
   if (is_long) {
     rl_value = LoadValueWide(rl_src_value, kCoreReg);
-    OpRegReg(kOpAdd, rl_object.reg, rl_offset.reg);
-    StoreBaseDispWide(rl_object.reg, 0, rl_value.reg);
+    if (cu_->instruction_set == kX86) {
+      StoreBaseIndexedDisp(rl_object.reg, rl_offset.reg, 1, 0, rl_value.reg.GetLow(),
+                           rl_value.reg.GetHigh(), kLong, INVALID_SREG);
+    } else {
+      RegStorage rl_temp_offset = AllocTemp();
+      OpRegRegReg(kOpAdd, rl_temp_offset, rl_object.reg, rl_offset.reg);
+      StoreBaseDispWide(rl_temp_offset, 0, rl_value.reg);
+      FreeTemp(rl_temp_offset.GetReg());
+    }
   } else {
     rl_value = LoadValue(rl_src_value, kCoreReg);
     StoreBaseIndexed(rl_object.reg, rl_offset.reg, rl_value.reg, 0, kWord);
@@ -1551,22 +1569,22 @@
                           mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value());
       }
     } else {
-      ThreadOffset trampoline(-1);
+      ThreadOffset<4> trampoline(-1);
       switch (info->type) {
       case kInterface:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeInterfaceTrampolineWithAccessCheck);
         break;
       case kDirect:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeDirectTrampolineWithAccessCheck);
         break;
       case kStatic:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeStaticTrampolineWithAccessCheck);
         break;
       case kSuper:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeSuperTrampolineWithAccessCheck);
         break;
       case kVirtual:
-        trampoline = QUICK_ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+        trampoline = QUICK_ENTRYPOINT_OFFSET(4, pInvokeVirtualTrampolineWithAccessCheck);
         break;
       default:
         LOG(FATAL) << "Unexpected invoke type";
diff --git a/compiler/dex/quick/mips/assemble_mips.cc b/compiler/dex/quick/mips/assemble_mips.cc
index ee142e5..a579254 100644
--- a/compiler/dex/quick/mips/assemble_mips.cc
+++ b/compiler/dex/quick/mips/assemble_mips.cc
@@ -143,9 +143,9 @@
                  kFmtUnused, -1, -1, IS_BINARY_OP | IS_BRANCH | REG_USE01 |
                  NEEDS_FIXUP, "bne", "!0r,!1r,!2t!0N", 8),
     ENCODING_MAP(kMipsDiv, 0x0000001a,
-                 kFmtUnused, -1, -1, kFmtUnused, -1, -1, kFmtBitBlt, 25, 21,
-                 kFmtBitBlt, 20, 16, IS_QUAD_OP | REG_DEF01 | REG_USE23,
-                 "div", "!2r,!3r", 4),
+                 kFmtBitBlt, 25, 21, kFmtBitBlt, 20, 16, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF_HI | REG_DEF_LO | REG_USE01,
+                 "div", "!0r,!1r", 4),
 #if __mips_isa_rev >= 2
     ENCODING_MAP(kMipsExt, 0x7c000000,
                  kFmtBitBlt, 20, 16, kFmtBitBlt, 25, 21, kFmtBitBlt, 10, 6,
@@ -198,11 +198,11 @@
                  "lw", "!0r,!1d(!2r)", 4),
     ENCODING_MAP(kMipsMfhi, 0x00000010,
                  kFmtBitBlt, 15, 11, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+                 kFmtUnused, -1, -1, IS_UNARY_OP | REG_DEF0 | REG_USE_HI,
                  "mfhi", "!0r", 4),
     ENCODING_MAP(kMipsMflo, 0x00000012,
                  kFmtBitBlt, 15, 11, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
+                 kFmtUnused, -1, -1, IS_UNARY_OP | REG_DEF0 | REG_USE_LO,
                  "mflo", "!0r", 4),
     ENCODING_MAP(kMipsMove, 0x00000025, /* or using zero reg */
                  kFmtBitBlt, 15, 11, kFmtBitBlt, 25, 21, kFmtUnused, -1, -1,
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 972457a..a938478 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -244,7 +244,7 @@
   GenBarrier();
   NewLIR0(kMipsCurrPC);  // Really a jal to .+8
   // Now, fill the branch delay slot with the helper load
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData));
+  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData));
   GenBarrier();  // Scheduling barrier
 
   // Construct BaseLabel and set up table base register
@@ -260,7 +260,7 @@
 }
 
 void MipsMir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
   LoadWordDisp(rs_rMIPS_SELF, ex_offset, rl_result.reg);
@@ -277,7 +277,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  LoadWordDisp(rs_rMIPS_SELF, Thread::CardTableOffset().Int32Value(), reg_card_base);
+  LoadWordDisp(rs_rMIPS_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -310,7 +310,7 @@
   RegStorage new_sp = AllocTemp();
   if (!skip_overflow_check) {
     /* Load stack limit */
-    LoadWordDisp(rs_rMIPS_SELF, Thread::StackEndOffset().Int32Value(), check_reg);
+    LoadWordDisp(rs_rMIPS_SELF, Thread::StackEndOffset<4>().Int32Value(), check_reg);
   }
   /* Spill core callee saves */
   SpillCoreRegs();
@@ -331,7 +331,7 @@
         m2l_->LoadWordDisp(rs_rMIPS_SP, 0, rs_rRA);
         m2l_->OpRegImm(kOpAdd, rs_rMIPS_SP, sp_displace_);
         m2l_->ClobberCallerSave();
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
         RegStorage r_tgt = m2l_->CallHelperSetup(func_offset);  // Doesn't clobber LR.
         m2l_->CallHelper(r_tgt, func_offset, false /* MarkSafepointPC */, false /* UseLink */);
       }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index bc1ad02..c962ea3 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -22,15 +22,16 @@
 
 namespace art {
 
-class MipsMir2Lir : public Mir2Lir {
+class MipsMir2Lir FINAL : public Mir2Lir {
   public:
     MipsMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
 
     // Required for target - codegen utilities.
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
+    bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(int r_base, int displacement, int r_dest, OpSize size, int s_reg);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
@@ -170,12 +171,12 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
     // TODO: collapse r_dest.
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/mips/fp_mips.cc b/compiler/dex/quick/mips/fp_mips.cc
index 2bc5540..a479dc7 100644
--- a/compiler/dex/quick/mips/fp_mips.cc
+++ b/compiler/dex/quick/mips/fp_mips.cc
@@ -50,7 +50,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -93,7 +93,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -135,22 +135,22 @@
       op = kMipsFcvtdw;
       break;
     case Instruction::FLOAT_TO_INT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2iz), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2iz), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_INT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2iz), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2iz), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_DOUBLE:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2d), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pL2d), rl_dest, rl_src);
       return;
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::LONG_TO_FLOAT:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2f), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pL2f), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
@@ -176,22 +176,22 @@
 void MipsMir2Lir::GenCmpFP(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2) {
   bool wide = true;
-  ThreadOffset offset(-1);
+  ThreadOffset<4> offset(-1);
 
   switch (opcode) {
     case Instruction::CMPL_FLOAT:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmplFloat);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmplFloat);
       wide = false;
       break;
     case Instruction::CMPG_FLOAT:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmpgFloat);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmpgFloat);
       wide = false;
       break;
     case Instruction::CMPL_DOUBLE:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmplDouble);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmplDouble);
       break;
     case Instruction::CMPG_DOUBLE:
-      offset = QUICK_ENTRYPOINT_OFFSET(pCmpgDouble);
+      offset = QUICK_ENTRYPOINT_OFFSET(4, pCmpgDouble);
       break;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index dfe8b35..f394185 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -229,12 +229,12 @@
 
 RegLocation MipsMir2Lir::GenDivRem(RegLocation rl_dest, RegStorage reg1, RegStorage reg2,
                                     bool is_div) {
-  NewLIR4(kMipsDiv, rHI, rLO, reg1.GetReg(), reg2.GetReg());
+  NewLIR2(kMipsDiv, reg1.GetReg(), reg2.GetReg());
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (is_div) {
-    NewLIR2(kMipsMflo, rl_result.reg.GetReg(), rLO);
+    NewLIR1(kMipsMflo, rl_result.reg.GetReg());
   } else {
-    NewLIR2(kMipsMfhi, rl_result.reg.GetReg(), rHI);
+    NewLIR1(kMipsMfhi, rl_result.reg.GetReg());
   }
   return rl_result;
 }
@@ -243,12 +243,12 @@
                                        bool is_div) {
   int t_reg = AllocTemp().GetReg();
   NewLIR3(kMipsAddiu, t_reg, rZERO, lit);
-  NewLIR4(kMipsDiv, rHI, rLO, reg1.GetReg(), t_reg);
+  NewLIR2(kMipsDiv, reg1.GetReg(), t_reg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (is_div) {
-    NewLIR2(kMipsMflo, rl_result.reg.GetReg(), rLO);
+    NewLIR1(kMipsMflo, rl_result.reg.GetReg());
   } else {
-    NewLIR2(kMipsMfhi, rl_result.reg.GetReg(), rHI);
+    NewLIR1(kMipsMfhi, rl_result.reg.GetReg());
   }
   FreeTemp(t_reg);
   return rl_result;
@@ -270,7 +270,7 @@
   LOG(FATAL) << "Unexpected use of OpLea for Arm";
 }
 
-void MipsMir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
+void MipsMir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
   LOG(FATAL) << "Unexpected use of OpTlsCmp for Arm";
 }
 
@@ -368,6 +368,11 @@
   return false;
 }
 
+bool MipsMir2Lir::EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) {
+  LOG(FATAL) << "Unexpected use of easyMultiply in Mips";
+  return false;
+}
+
 LIR* MipsMir2Lir::OpIT(ConditionCode cond, const char* guide) {
   LOG(FATAL) << "Unexpected use of OpIT in Mips";
   return NULL;
diff --git a/compiler/dex/quick/mips/mips_lir.h b/compiler/dex/quick/mips/mips_lir.h
index 96cd3d8..c5150ee 100644
--- a/compiler/dex/quick/mips/mips_lir.h
+++ b/compiler/dex/quick/mips/mips_lir.h
@@ -90,14 +90,12 @@
 #define MIPS_FP_REG_OFFSET 32
 // Offset to distinguish DP FP regs.
 #define MIPS_FP_DOUBLE 64
-// Offset to distingish the extra regs.
-#define MIPS_EXTRA_REG_OFFSET 128
 // Reg types.
 #define MIPS_REGTYPE(x) (x & (MIPS_FP_REG_OFFSET | MIPS_FP_DOUBLE))
 #define MIPS_FPREG(x) ((x & MIPS_FP_REG_OFFSET) == MIPS_FP_REG_OFFSET)
-#define MIPS_EXTRAREG(x) ((x & MIPS_EXTRA_REG_OFFSET) == MIPS_EXTRA_REG_OFFSET)
 #define MIPS_DOUBLEREG(x) ((x & MIPS_FP_DOUBLE) == MIPS_FP_DOUBLE)
 #define MIPS_SINGLEREG(x) (MIPS_FPREG(x) && !MIPS_DOUBLEREG(x))
+// FIXME: the comment below is out of date.
 /*
  * Note: the low register of a floating point pair is sufficient to
  * create the name of a double, but require both names to be passed to
@@ -157,6 +155,8 @@
 #define ENCODE_MIPS_REG_SP           (1ULL << kMipsRegSP)
 #define ENCODE_MIPS_REG_LR           (1ULL << kMipsRegLR)
 #define ENCODE_MIPS_REG_PC           (1ULL << kMipsRegPC)
+#define ENCODE_MIPS_REG_HI           (1ULL << kMipsRegHI)
+#define ENCODE_MIPS_REG_LO           (1ULL << kMipsRegLO)
 
 enum MipsNativeRegisterPool {
   rZERO = 0,
@@ -248,9 +248,6 @@
   rDF14 = rF28 + MIPS_FP_DOUBLE,
   rDF15 = rF30 + MIPS_FP_DOUBLE,
 #endif
-  rHI = MIPS_EXTRA_REG_OFFSET,
-  rLO,
-  rPC,
 };
 
 const RegStorage rs_rZERO(RegStorage::k32BitSolo, rZERO);
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index 67a44fa..3e02fae 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -151,6 +151,22 @@
   if (flags & REG_DEF_LR) {
     lir->u.m.def_mask |= ENCODE_MIPS_REG_LR;
   }
+
+  if (flags & REG_DEF_HI) {
+    lir->u.m.def_mask |= ENCODE_MIPS_REG_HI;
+  }
+
+  if (flags & REG_DEF_LO) {
+    lir->u.m.def_mask |= ENCODE_MIPS_REG_LO;
+  }
+
+  if (flags & REG_USE_HI) {
+    lir->u.m.use_mask |= ENCODE_MIPS_REG_HI;
+  }
+
+  if (flags & REG_USE_LO) {
+    lir->u.m.use_mask |= ENCODE_MIPS_REG_LO;
+  }
 }
 
 /* For dumping instructions */
@@ -514,14 +530,14 @@
  * ensure that all branch instructions can be restarted if
  * there is a trap in the shadow.  Allocate a temp register.
  */
-RegStorage MipsMir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage MipsMir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LoadWordDisp(rs_rMIPS_SELF, offset.Int32Value(), rs_rT9);
   return rs_rT9;
 }
 
 LIR* MipsMir2Lir::CheckSuspendUsingLoad() {
   RegStorage tmp = AllocTemp();
-  LoadWordDisp(rs_rMIPS_SELF, Thread::ThreadSuspendTriggerOffset().Int32Value(), tmp);
+  LoadWordDisp(rs_rMIPS_SELF, Thread::ThreadSuspendTriggerOffset<4>().Int32Value(), tmp);
   LIR *inst = LoadWordDisp(tmp, 0, tmp);
   FreeTemp(tmp);
   return inst;
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index 4f31341..c959510 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -642,7 +642,7 @@
   return StoreBaseDispBody(r_base, displacement, r_src.GetLow(), r_src.GetHigh(), kLong);
 }
 
-LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   LOG(FATAL) << "Unexpected use of OpThreadMem for MIPS";
   return NULL;
 }
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 10f431f..1ad636b 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -84,6 +84,10 @@
 #define SETS_CCODES          (1ULL << kSetsCCodes)
 #define USES_CCODES          (1ULL << kUsesCCodes)
 #define USE_FP_STACK         (1ULL << kUseFpStack)
+#define REG_USE_LO           (1ULL << kUseLo)
+#define REG_USE_HI           (1ULL << kUseHi)
+#define REG_DEF_LO           (1ULL << kDefLo)
+#define REG_DEF_HI           (1ULL << kDefHi)
 
 // Common combo register usage patterns.
 #define REG_DEF01            (REG_DEF0 | REG_DEF1)
@@ -601,7 +605,7 @@
                           RegLocation rl_src, int lit);
     void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_src2);
-    void GenConversionCall(ThreadOffset func_offset, RegLocation rl_dest,
+    void GenConversionCall(ThreadOffset<4> func_offset, RegLocation rl_dest,
                            RegLocation rl_src);
     void GenSuspendTest(int opt_flags);
     void GenSuspendTestAndBranch(int opt_flags, LIR* target);
@@ -612,43 +616,44 @@
                        RegLocation rl_src1, RegLocation rl_src2);
 
     // Shared by all targets - implemented in gen_invoke.cc.
-    LIR* CallHelper(RegStorage r_tgt, ThreadOffset helper_offset, bool safepoint_pc,
+    LIR* CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
                     bool use_link = true);
-    RegStorage CallHelperSetup(ThreadOffset helper_offset);
-    void CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
-    void CallRuntimeHelperReg(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc);
-    void CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
+    RegStorage CallHelperSetup(ThreadOffset<4> helper_offset);
+    void CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc);
+    void CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0, bool safepoint_pc);
+    void CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
                                       bool safepoint_pc);
-    void CallRuntimeHelperImmImm(ThreadOffset helper_offset, int arg0, int arg1,
+    void CallRuntimeHelperImmImm(ThreadOffset<4> helper_offset, int arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocation(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                          RegLocation arg1, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationImm(ThreadOffset helper_offset, RegLocation arg0,
+    void CallRuntimeHelperRegLocationImm(ThreadOffset<4> helper_offset, RegLocation arg0,
                                          int arg1, bool safepoint_pc);
-    void CallRuntimeHelperImmReg(ThreadOffset helper_offset, int arg0, RegStorage arg1,
+    void CallRuntimeHelperImmReg(ThreadOffset<4> helper_offset, int arg0, RegStorage arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperRegImm(ThreadOffset helper_offset, RegStorage arg0, int arg1,
+    void CallRuntimeHelperRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, int arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperImmMethod(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmMethod(ThreadOffset<4> helper_offset, int arg0,
                                     bool safepoint_pc);
-    void CallRuntimeHelperRegMethod(ThreadOffset helper_offset, RegStorage arg0, bool safepoint_pc);
-    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset helper_offset, RegStorage arg0,
+    void CallRuntimeHelperRegMethod(ThreadOffset<4> helper_offset, RegStorage arg0,
+                                    bool safepoint_pc);
+    void CallRuntimeHelperRegMethodRegLocation(ThreadOffset<4> helper_offset, RegStorage arg0,
                                                RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                  RegLocation arg0, RegLocation arg1,
                                                  bool safepoint_pc);
-    void CallRuntimeHelperRegReg(ThreadOffset helper_offset, RegStorage arg0, RegStorage arg1,
+    void CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
                                  bool safepoint_pc);
-    void CallRuntimeHelperRegRegImm(ThreadOffset helper_offset, RegStorage arg0, RegStorage arg1,
+    void CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0, RegStorage arg1,
                                     int arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset helper_offset, int arg0,
+    void CallRuntimeHelperImmMethodRegLocation(ThreadOffset<4> helper_offset, int arg0,
                                                RegLocation arg2, bool safepoint_pc);
-    void CallRuntimeHelperImmMethodImm(ThreadOffset helper_offset, int arg0, int arg2,
+    void CallRuntimeHelperImmMethodImm(ThreadOffset<4> helper_offset, int arg0, int arg2,
                                        bool safepoint_pc);
-    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                     int arg0, RegLocation arg1, RegLocation arg2,
                                                     bool safepoint_pc);
-    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset helper_offset,
+    void CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<4> helper_offset,
                                                             RegLocation arg0, RegLocation arg1,
                                                             RegLocation arg2,
                                                             bool safepoint_pc);
@@ -670,7 +675,8 @@
 
     /**
      * @brief Used to determine the register location of destination.
-     * @details This is needed during generation of inline intrinsics because it finds destination of return,
+     * @details This is needed during generation of inline intrinsics because it finds destination
+     *  of return,
      * either the physical register or the target of move-result.
      * @param info Information about the invoke.
      * @return Returns the destination location.
@@ -731,7 +737,8 @@
      * @brief Used to do the final store in a wide destination as per bytecode semantics.
      * @see StoreValue
      * @param rl_dest The destination dalvik register location.
-     * @param rl_src The source register location. Can be either physical register or dalvik register.
+     * @param rl_src The source register location. Can be either physical register or dalvik
+     *  register.
      */
     void StoreValueWide(RegLocation rl_dest, RegLocation rl_src);
 
@@ -811,8 +818,9 @@
     // Required for target - codegen helpers.
     virtual bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div,
                                     RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
+    virtual bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
-    virtual RegStorage LoadHelper(ThreadOffset offset) = 0;
+    virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0;
     virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                               int s_reg) = 0;
     virtual LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest,
@@ -948,7 +956,8 @@
 
     /**
      * @brief Used for generating code that throws ArithmeticException if both registers are zero.
-     * @details This is used for generating DivideByZero checks when divisor is held in two separate registers.
+     * @details This is used for generating DivideByZero checks when divisor is held in two
+     *  separate registers.
      * @param reg_lo The register holding the lower 32-bits.
      * @param reg_hi The register holding the upper 32-bits.
      */
@@ -1047,13 +1056,13 @@
     virtual LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1,
                              RegStorage r_src2) = 0;
     virtual LIR* OpTestSuspend(LIR* target) = 0;
-    virtual LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset) = 0;
+    virtual LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) = 0;
     virtual LIR* OpVldm(RegStorage r_base, int count) = 0;
     virtual LIR* OpVstm(RegStorage r_base, int count) = 0;
     virtual void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale,
                        int offset) = 0;
     virtual void OpRegCopyWide(RegStorage dest, RegStorage src) = 0;
-    virtual void OpTlsCmp(ThreadOffset offset, int val) = 0;
+    virtual void OpTlsCmp(ThreadOffset<4> offset, int val) = 0;
     virtual bool InexpensiveConstantInt(int32_t value) = 0;
     virtual bool InexpensiveConstantFloat(int32_t value) = 0;
     virtual bool InexpensiveConstantLong(int64_t value) = 0;
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index d97cf4d..729b30d 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -156,12 +156,12 @@
   }
   NewLIR2(kX86PcRelAdr, rX86_ARG1, WrapPointer(tab_rec));
   NewLIR2(kX86Add32RR, rX86_ARG1, rX86_ARG2);
-  CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(pHandleFillArrayData), rs_rX86_ARG0,
+  CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pHandleFillArrayData), rs_rX86_ARG0,
                           rs_rX86_ARG1, true);
 }
 
 void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
-  int ex_offset = Thread::ExceptionOffset().Int32Value();
+  int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
   NewLIR2(kX86Mov32TI, ex_offset, 0);
@@ -175,7 +175,7 @@
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
-  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), Thread::CardTableOffset().Int32Value());
+  NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), Thread::CardTableOffset<4>().Int32Value());
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
@@ -222,7 +222,7 @@
         GenerateTargetLabel();
         m2l_->OpRegImm(kOpAdd, rs_rX86_SP, sp_displace_);
         m2l_->ClobberCallerSave();
-        ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+        ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
         // Assumes codegen and target are in thumb2 mode.
         m2l_->CallHelper(RegStorage::InvalidReg(), func_offset, false /* MarkSafepointPC */,
                          false /* UseLink */);
@@ -240,7 +240,7 @@
     // in case a signal comes in that's not using an alternate signal stack and the large frame may
     // have moved us outside of the reserved area at the end of the stack.
     // cmp rX86_SP, fs:[stack_end_]; jcc throw_launchpad
-    OpRegThreadMem(kOpCmp, rX86_SP, Thread::StackEndOffset());
+    OpRegThreadMem(kOpCmp, rX86_SP, Thread::StackEndOffset<4>());
     LIR* branch = OpCondBranch(kCondUlt, nullptr);
     AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_ - 4));
   }
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6d427e7..940a1da 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -22,15 +22,16 @@
 
 namespace art {
 
-class X86Mir2Lir : public Mir2Lir {
+class X86Mir2Lir FINAL : public Mir2Lir {
   public:
     X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena);
 
     // Required for target - codegen helpers.
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
+    bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
-    RegStorage LoadHelper(ThreadOffset offset);
+    RegStorage LoadHelper(ThreadOffset<4> offset);
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size,
                       int s_reg);
     LIR* LoadBaseDispWide(RegStorage r_base, int displacement, RegStorage r_dest, int s_reg);
@@ -245,14 +246,14 @@
     LIR* OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value);
     LIR* OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2);
     LIR* OpTestSuspend(LIR* target);
-    LIR* OpThreadMem(OpKind op, ThreadOffset thread_offset);
+    LIR* OpThreadMem(OpKind op, ThreadOffset<4> thread_offset);
     LIR* OpVldm(RegStorage r_base, int count);
     LIR* OpVstm(RegStorage r_base, int count);
     void OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset);
     void OpRegCopyWide(RegStorage dest, RegStorage src);
-    void OpTlsCmp(ThreadOffset offset, int val);
+    void OpTlsCmp(ThreadOffset<4> offset, int val);
 
-    void OpRegThreadMem(OpKind op, int r_dest, ThreadOffset thread_offset);
+    void OpRegThreadMem(OpKind op, int r_dest, ThreadOffset<4> thread_offset);
     void SpillCoreRegs();
     void UnSpillCoreRegs();
     static const X86EncodingMap EncodingMap[kX86Last];
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index ec4d9db..ee5387f 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -49,7 +49,7 @@
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmodf), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturn(true);
       StoreValue(rl_dest, rl_result);
@@ -100,7 +100,7 @@
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE:
       FlushAllRegs();   // Send everything to home location
-      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(pFmod), rl_src1, rl_src2,
+      CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
                                               false);
       rl_result = GetReturnWide(true);
       StoreValueWide(rl_dest, rl_result);
@@ -146,6 +146,12 @@
     if (lo_info != nullptr && lo_info->is_temp) {
       // Calling FlushSpecificReg because it will only write back VR if it is dirty.
       FlushSpecificReg(lo_info);
+      // ResetDef for low/high to prevent NullifyRange from removing stores.
+      ResetDef(rl_src.reg.GetLowReg());
+      if (rl_src.reg.GetLowReg() != rl_src.reg.GetHighReg() &&
+          GetRegInfo(rl_src.reg.GetHighReg()) != nullptr) {
+        ResetDef(rl_src.reg.GetHighReg());
+      }
     } else {
       // It must have been register promoted if it is not a temp but is still in physical
       // register. Since we need it to be in memory to convert, we place it there now.
@@ -269,10 +275,10 @@
       GenLongToFP(rl_dest, rl_src, false /* is_double */);
       return;
     case Instruction::FLOAT_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pF2l), rl_dest, rl_src);
       return;
     case Instruction::DOUBLE_TO_LONG:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pD2l), rl_dest, rl_src);
+      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(4, pD2l), rl_dest, rl_src);
       return;
     default:
       LOG(INFO) << "Unexpected opcode: " << opcode;
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 851f448..4310d6e 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -742,7 +742,7 @@
   NewLIR5(kX86Lea32RA, r_base.GetReg(), reg1.GetReg(), reg2.GetReg(), scale, offset);
 }
 
-void X86Mir2Lir::OpTlsCmp(ThreadOffset offset, int val) {
+void X86Mir2Lir::OpTlsCmp(ThreadOffset<4> offset, int val) {
   NewLIR2(kX86Cmp16TI8, offset.Int32Value(), val);
 }
 
@@ -893,7 +893,7 @@
 
 // Test suspend flag, return target of taken suspend branch
 LIR* X86Mir2Lir::OpTestSuspend(LIR* target) {
-  OpTlsCmp(Thread::ThreadFlagsOffset(), 0);
+  OpTlsCmp(Thread::ThreadFlagsOffset<4>(), 0);
   return OpCondBranch((target == NULL) ? kCondNe : kCondEq, target);
 }
 
@@ -909,6 +909,11 @@
   return false;
 }
 
+bool X86Mir2Lir::EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) {
+  LOG(FATAL) << "Unexpected use of easyMultiply in x86";
+  return false;
+}
+
 LIR* X86Mir2Lir::OpIT(ConditionCode cond, const char* guide) {
   LOG(FATAL) << "Unexpected use of OpIT in x86";
   return NULL;
@@ -1293,7 +1298,7 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
-void X86Mir2Lir::OpRegThreadMem(OpKind op, int r_dest, ThreadOffset thread_offset) {
+void X86Mir2Lir::OpRegThreadMem(OpKind op, int r_dest, ThreadOffset<4> thread_offset) {
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
   case kOpCmp: opcode = kX86Cmp32RT;  break;
@@ -1834,7 +1839,7 @@
   if (needs_access_check) {
     // Check we have access to type_idx and if not throw IllegalAccessError,
     // Caller function returns Class* in kArg0.
-    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeTypeAndVerifyAccess),
+    CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess),
                          type_idx, true);
     OpRegCopy(class_reg, TargetReg(kRet0));
     LoadValueDirectFixed(rl_src, TargetReg(kArg0));
@@ -1855,7 +1860,7 @@
       // Need to test presence of type in dex cache at runtime.
       LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL);
       // Type is not resolved. Call out to helper, which will return resolved type in kRet0/kArg0.
-      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(pInitializeType), type_idx, true);
+      CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true);
       OpRegCopy(TargetReg(kArg2), TargetReg(kRet0));  // Align usage with fast path.
       LoadValueDirectFixed(rl_src, TargetReg(kArg0));  /* Reload Ref. */
       // Rejoin code paths
@@ -1889,7 +1894,7 @@
       branchover = OpCmpBranch(kCondEq, TargetReg(kArg1), TargetReg(kArg2), NULL);
     }
     OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pInstanceofNonTrivial));
+    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(4, pInstanceofNonTrivial));
   }
   // TODO: only clobber when type isn't final?
   ClobberCallerSave();
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index da64250..925e736 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -581,16 +581,18 @@
 
 X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena),
+      base_of_code_(nullptr), store_method_addr_(false), store_method_addr_used_(false),
       method_address_insns_(arena, 100, kGrowableArrayMisc),
       class_type_address_insns_(arena, 100, kGrowableArrayMisc),
       call_method_insns_(arena, 100, kGrowableArrayMisc),
       stack_decrement_(nullptr), stack_increment_(nullptr) {
-  store_method_addr_used_ = false;
-  for (int i = 0; i < kX86Last; i++) {
-    if (X86Mir2Lir::EncodingMap[i].opcode != i) {
-      LOG(FATAL) << "Encoding order for " << X86Mir2Lir::EncodingMap[i].name
-                 << " is wrong: expecting " << i << ", seeing "
-                 << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
+  if (kIsDebugBuild) {
+    for (int i = 0; i < kX86Last; i++) {
+      if (X86Mir2Lir::EncodingMap[i].opcode != i) {
+        LOG(FATAL) << "Encoding order for " << X86Mir2Lir::EncodingMap[i].name
+            << " is wrong: expecting " << i << ", seeing "
+            << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
+      }
     }
   }
 }
@@ -601,7 +603,7 @@
 }
 
 // Not used in x86
-RegStorage X86Mir2Lir::LoadHelper(ThreadOffset offset) {
+RegStorage X86Mir2Lir::LoadHelper(ThreadOffset<4> offset) {
   LOG(FATAL) << "Unexpected use of LoadHelper in x86";
   return RegStorage::InvalidReg();
 }
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index bb5d387..e9faa7f 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -468,7 +468,7 @@
   return OpRegImm(op, r_dest, value);
 }
 
-LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset thread_offset) {
+LIR* X86Mir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) {
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
     case kOpBlx: opcode = kX86CallT;  break;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 797bc82..1759cbe 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -106,12 +106,9 @@
 #define X86_FP_REG_OFFSET 32
 // Offset to distinguish DP FP regs.
 #define X86_FP_DOUBLE (X86_FP_REG_OFFSET + 16)
-// Offset to distingish the extra regs.
-#define X86_EXTRA_REG_OFFSET (X86_FP_DOUBLE + 16)
 // Reg types.
 #define X86_REGTYPE(x) (x & (X86_FP_REG_OFFSET | X86_FP_DOUBLE))
 #define X86_FPREG(x) ((x & X86_FP_REG_OFFSET) == X86_FP_REG_OFFSET)
-#define X86_EXTRAREG(x) ((x & X86_EXTRA_REG_OFFSET) == X86_EXTRA_REG_OFFSET)
 #define X86_DOUBLEREG(x) ((x & X86_FP_DOUBLE) == X86_FP_DOUBLE)
 #define X86_SINGLEREG(x) (X86_FPREG(x) && !X86_DOUBLEREG(x))
 
@@ -135,7 +132,6 @@
   kX86RegEnd   = kX86FPStack,
 };
 
-#define ENCODE_X86_REG_LIST(N)      (static_cast<uint64_t>(N))
 #define ENCODE_X86_REG_SP           (1ULL << kX86RegSP)
 #define ENCODE_X86_FP_STACK         (1ULL << kX86FPStack)
 
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 7fadfc9..fc1332a 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -341,6 +341,7 @@
       compiler_(Compiler::Create(compiler_kind)),
       instruction_set_(instruction_set),
       instruction_set_features_(instruction_set_features),
+      instruction_set_is_64_bit_(instruction_set == kX86_64 || instruction_set == kArm64),
       freezing_constructor_lock_("freezing constructor lock"),
       compiled_classes_lock_("compiled classes lock"),
       compiled_methods_lock_("compiled method lock"),
@@ -448,54 +449,55 @@
   return res;
 }
 
+#define CREATE_TRAMPOLINE(type, abi, offset) \
+    if (instruction_set_is_64_bit_) { \
+      return CreateTrampoline64(instruction_set_, abi, \
+                                type ## _ENTRYPOINT_OFFSET(8, offset)); \
+    } else { \
+      return CreateTrampoline32(instruction_set_, abi, \
+                                type ## _ENTRYPOINT_OFFSET(4, offset)); \
+    }
+
 const std::vector<uint8_t>* CompilerDriver::CreateInterpreterToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kInterpreterAbi,
-                          INTERPRETER_ENTRYPOINT_OFFSET(pInterpreterToInterpreterBridge));
+  CREATE_TRAMPOLINE(INTERPRETER, kInterpreterAbi, pInterpreterToInterpreterBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateInterpreterToCompiledCodeBridge() const {
-  return CreateTrampoline(instruction_set_, kInterpreterAbi,
-                          INTERPRETER_ENTRYPOINT_OFFSET(pInterpreterToCompiledCodeBridge));
+  CREATE_TRAMPOLINE(INTERPRETER, kInterpreterAbi, pInterpreterToCompiledCodeBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateJniDlsymLookup() const {
-  return CreateTrampoline(instruction_set_, kJniAbi, JNI_ENTRYPOINT_OFFSET(pDlsymLookup));
+  CREATE_TRAMPOLINE(JNI, kJniAbi, pDlsymLookup)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableImtConflictTrampoline() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableImtConflictTrampoline));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableImtConflictTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableResolutionTrampoline() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableResolutionTrampoline));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableResolutionTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreatePortableToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kPortableAbi,
-                          PORTABLE_ENTRYPOINT_OFFSET(pPortableToInterpreterBridge));
+  CREATE_TRAMPOLINE(PORTABLE, kPortableAbi, pPortableToInterpreterBridge)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickGenericJniTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickGenericJniTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickGenericJniTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickImtConflictTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickImtConflictTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickImtConflictTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickResolutionTrampoline() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickResolutionTrampoline));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickResolutionTrampoline)
 }
 
 const std::vector<uint8_t>* CompilerDriver::CreateQuickToInterpreterBridge() const {
-  return CreateTrampoline(instruction_set_, kQuickAbi,
-                          QUICK_ENTRYPOINT_OFFSET(pQuickToInterpreterBridge));
+  CREATE_TRAMPOLINE(QUICK, kQuickAbi, pQuickToInterpreterBridge)
 }
+#undef CREATE_TRAMPOLINE
 
 void CompilerDriver::CompileAll(jobject class_loader,
                                 const std::vector<const DexFile*>& dex_files,
@@ -601,6 +603,11 @@
                                 ThreadPool* thread_pool, TimingLogger* timings) {
   LoadImageClasses(timings);
 
+  if (!compiler_options_->IsVerificationEnabled()) {
+    VLOG(compiler) << "Verify none mode specified, skipping pre-compilation";
+    return;
+  }
+
   Resolve(class_loader, dex_files, thread_pool, timings);
 
   Verify(class_loader, dex_files, thread_pool, timings);
@@ -1875,7 +1882,7 @@
 
   if ((access_flags & kAccNative) != 0) {
     // Are we interpreting only and have support for generic JNI down calls?
-    if ((compiler_options_->GetCompilerFilter() == CompilerOptions::kInterpretOnly) &&
+    if (!compiler_options_->IsCompilationEnabled() &&
         (instruction_set_ == kX86_64 || instruction_set_ == kArm64)) {
       // Leaving this empty will trigger the generic JNI version
     } else {
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 4257241..802f859 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -725,6 +725,7 @@
 
   const InstructionSet instruction_set_;
   const InstructionSetFeatures instruction_set_features_;
+  const bool instruction_set_is_64_bit_;
 
   // All class references that require
   mutable ReaderWriterMutex freezing_constructor_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 0cca1e9..20c6bc8 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -22,7 +22,8 @@
 class CompilerOptions {
  public:
   enum CompilerFilter {
-    kInterpretOnly,       // Compile nothing.
+    kVerifyNone,          // Skip verification and compile nothing except JNI stubs.
+    kInterpretOnly,       // Compile nothing except JNI stubs.
     kProfiled,            // Compile based on profile.
     kSpace,               // Maximize space savings.
     kBalanced,            // Try to get the best performance return on compilation investment.
@@ -86,6 +87,15 @@
     compiler_filter_ = compiler_filter;
   }
 
+  bool IsCompilationEnabled() const {
+    return ((compiler_filter_ != CompilerOptions::kVerifyNone) &&
+            (compiler_filter_ != CompilerOptions::kInterpretOnly));
+  }
+
+  bool IsVerificationEnabled() const {
+    return (compiler_filter_ != CompilerOptions::kVerifyNone);
+  }
+
   size_t GetHugeMethodThreshold() const {
     return huge_method_threshold_;
   }
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 6824183..0405198 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -610,11 +610,13 @@
 void ImageWriter::FixupObject(Object* orig, Object* copy) {
   DCHECK(orig != nullptr);
   DCHECK(copy != nullptr);
-  if (kUseBrooksPointer) {
-    orig->AssertSelfBrooksPointer();
-    // Note the address 'copy' isn't the same as the image address of 'orig'.
-    copy->SetBrooksPointer(GetImageAddress(orig));
-    DCHECK_EQ(copy->GetBrooksPointer(), GetImageAddress(orig));
+  if (kUseBakerOrBrooksReadBarrier) {
+    orig->AssertReadBarrierPointer();
+    if (kUseBrooksReadBarrier) {
+      // Note the address 'copy' isn't the same as the image address of 'orig'.
+      copy->SetReadBarrierPointer(GetImageAddress(orig));
+      DCHECK_EQ(copy->GetReadBarrierPointer(), GetImageAddress(orig));
+    }
   }
   FixupVisitor visitor(this, copy);
   orig->VisitReferences<true /*visit class*/>(visitor, visitor);
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 31acb69..3204282 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -590,7 +590,7 @@
     ScopedObjectAccess soa(env);
 
     // Build stack trace
-    jobject internal = Thread::Current()->CreateInternalStackTrace(soa);
+    jobject internal = Thread::Current()->CreateInternalStackTrace<false>(soa);
     jobjectArray ste_array = Thread::InternalStackTraceToStackTraceElementArray(soa, internal);
     mirror::ObjectArray<mirror::StackTraceElement>* trace_array =
         soa.Decode<mirror::ObjectArray<mirror::StackTraceElement>*>(ste_array);
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index 28b438e..ab39d6b 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -79,9 +79,9 @@
 FrameOffset ArmManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +   // displacement
-                  kPointerSize +                 // Method*
-                  (itr_slots_ * kPointerSize));  // offset into in args
+      FrameOffset(displacement_.Int32Value() +        // displacement
+                  kFramePointerSize +                 // Method*
+                  (itr_slots_ * kFramePointerSize));  // offset into in args
   return result;
 }
 
@@ -106,7 +106,7 @@
 
 ArmJniCallingConvention::ArmJniCallingConvention(bool is_static, bool is_synchronized,
                                                  const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty) {
+    : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
   // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
   // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
   size_t padding = 0;
@@ -143,15 +143,15 @@
 
 size_t ArmJniCallingConvention::FrameSize() {
   // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kPointerSize;
+  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * kPointerSize;
+  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t ArmJniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kPointerSize + padding_,
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
                  kStackAlignment);
 }
 
@@ -195,7 +195,7 @@
 
 FrameOffset ArmJniCallingConvention::CurrentParamStackOffset() {
   CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kPointerSize);
+  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 96bbb7e..00a239b 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -22,10 +22,12 @@
 namespace art {
 namespace arm {
 
+constexpr size_t kFramePointerSize = 4;
+
 class ArmManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
   ArmManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~ArmManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index ff899b7..c408fa9 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -79,9 +79,9 @@
 FrameOffset Arm64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +   // displacement
-                  kPointerSize +                 // Method*
-                  (itr_slots_ * kPointerSize));  // offset into in args
+      FrameOffset(displacement_.Int32Value() +         // displacement
+                  kFramePointerSize +                 // Method*
+                  (itr_slots_ * kFramePointerSize));  // offset into in args
   return result;
 }
 
@@ -119,8 +119,8 @@
 // JNI calling convention
 
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
-                                                 const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty) {
+                                                     const char* shorty)
+    : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
   // TODO This needs to be converted to 64bit.
   // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
   // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
@@ -135,7 +135,7 @@
 //    }
 //    cur_reg++;  // bump the iterator for every argument
 //  }
-//  padding_ =0;
+  padding_ =0;
 
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X19));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X20));
@@ -173,15 +173,15 @@
 
 size_t Arm64JniCallingConvention::FrameSize() {
   // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kPointerSize;
+  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * kPointerSize;
+  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t Arm64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kPointerSize + padding_,
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
                  kStackAlignment);
 }
 
@@ -228,7 +228,7 @@
 
 FrameOffset Arm64JniCallingConvention::CurrentParamStackOffset() {
   CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kPointerSize);
+  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 7e33830..c18cd2b 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -22,10 +22,12 @@
 namespace art {
 namespace arm64 {
 
+constexpr size_t kFramePointerSize = 8;
+
 class Arm64ManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
   Arm64ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~Arm64ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index 043bcea..8efdcda 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -26,11 +26,6 @@
 
 namespace art {
 
-// Offset of Method within the frame
-FrameOffset CallingConvention::MethodStackOffset() {
-  return displacement_;
-}
-
 // Managed runtime calling convention
 
 ManagedRuntimeCallingConvention* ManagedRuntimeCallingConvention::Create(
@@ -123,7 +118,7 @@
 }
 
 FrameOffset JniCallingConvention::SavedLocalReferenceCookieOffset() const {
-  size_t references_size = kSirtPointerSize * ReferenceCount();  // size excluding header
+  size_t references_size = sirt_pointer_size_ * ReferenceCount();  // size excluding header
   return FrameOffset(SirtReferencesOffset().Int32Value() + references_size);
 }
 
@@ -191,14 +186,14 @@
 FrameOffset JniCallingConvention::CurrentParamSirtEntryOffset() {
   CHECK(IsCurrentParamAReference());
   CHECK_LT(SirtLinkOffset(), SirtNumRefsOffset());
-  int result = SirtReferencesOffset().Int32Value() + itr_refs_ * kSirtPointerSize;
+  int result = SirtReferencesOffset().Int32Value() + itr_refs_ * sirt_pointer_size_;
   CHECK_GT(result, SirtNumRefsOffset().Int32Value());
   return FrameOffset(result);
 }
 
 size_t JniCallingConvention::CurrentParamSize() {
   if (itr_args_ <= kObjectOrClass) {
-    return kPointerSize;  // JNIEnv or jobject/jclass
+    return frame_pointer_size_;  // JNIEnv or jobject/jclass
   } else {
     int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
     return ParamSize(arg_pos);
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index fe3d1cd..7e1cf63 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -24,7 +24,7 @@
 
 namespace art {
 
-// Top-level abstraction for different calling conventions
+// Top-level abstraction for different calling conventions.
 class CallingConvention {
  public:
   bool IsReturnAReference() const { return shorty_[0] == 'L'; }
@@ -46,8 +46,10 @@
   // Register reserved for scratch usage during procedure calls.
   virtual ManagedRegister InterproceduralScratchRegister() = 0;
 
-  // Offset of Method within the frame
-  FrameOffset MethodStackOffset();
+  // Offset of Method within the frame.
+  FrameOffset MethodStackOffset() {
+    return displacement_;
+  }
 
   // Iterator interface
 
@@ -66,8 +68,13 @@
   virtual ~CallingConvention() {}
 
  protected:
-  CallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : displacement_(0), kSirtPointerSize(sizeof(StackReference<mirror::Object>)), is_static_(is_static), is_synchronized_(is_synchronized),
+  CallingConvention(bool is_static, bool is_synchronized, const char* shorty,
+                    size_t frame_pointer_size)
+      : itr_slots_(0), itr_refs_(0), itr_args_(0), itr_longs_and_doubles_(0),
+        itr_float_and_doubles_(0), displacement_(0),
+        frame_pointer_size_(frame_pointer_size),
+        sirt_pointer_size_(sizeof(StackReference<mirror::Object>)),
+        is_static_(is_static), is_synchronized_(is_synchronized),
         shorty_(shorty) {
     num_args_ = (is_static ? 0 : 1) + strlen(shorty) - 1;
     num_ref_args_ = is_static ? 0 : 1;  // The implicit this pointer.
@@ -145,7 +152,7 @@
     if (IsStatic()) {
       param++;  // 0th argument must skip return value at start of the shorty
     } else if (param == 0) {
-      return kPointerSize;  // this argument
+      return frame_pointer_size_;  // this argument
     }
     size_t result = Primitive::ComponentSize(Primitive::GetType(shorty_[param]));
     if (result >= 1 && result < 4) {
@@ -160,17 +167,20 @@
   // Note that each slot is 32-bit. When the current argument is bigger
   // than 32 bits, return the first slot number for this argument.
   unsigned int itr_slots_;
-  // The number of references iterated past
+  // The number of references iterated past.
   unsigned int itr_refs_;
-  // The argument number along argument list for current argument
+  // The argument number along argument list for current argument.
   unsigned int itr_args_;
-  // Number of longs and doubles seen along argument list
+  // Number of longs and doubles seen along argument list.
   unsigned int itr_longs_and_doubles_;
-  // Number of float and doubles seen along argument list
+  // Number of floats and doubles seen along argument list.
   unsigned int itr_float_and_doubles_;
-  // Space for frames below this on the stack
+  // Space for frames below this on the stack.
   FrameOffset displacement_;
-  size_t kSirtPointerSize;
+  // The size of a pointer in the frame (e.g. the Method* slot), set per architecture.
+  const size_t frame_pointer_size_;
+  // The size of a reference entry within the SIRT.
+  const size_t sirt_pointer_size_;
 
  private:
   const bool is_static_;
@@ -218,8 +228,9 @@
   virtual const ManagedRegisterEntrySpills& EntrySpills() = 0;
 
  protected:
-  ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : CallingConvention(is_static, is_synchronized, shorty) {}
+  ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty,
+                                  size_t frame_pointer_size)
+      : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size) {}
 };
 
 // Abstraction for JNI calling conventions
@@ -283,8 +294,7 @@
 
   // Position of SIRT and interior fields
   FrameOffset SirtOffset() const {
-    return FrameOffset(displacement_.Int32Value() +
-                       kPointerSize);  // above Method*
+    return FrameOffset(this->displacement_.Int32Value() + frame_pointer_size_);  // above Method*
   }
 
   FrameOffset SirtLinkOffset() const {
@@ -298,9 +308,8 @@
   }
 
   FrameOffset SirtReferencesOffset() const {
-    // The StackIndirectReferenceTable::number_of_references_ type is uint32_t
-    return FrameOffset(SirtNumRefsOffset().Int32Value() +
-                       sizeof(uint32_t));
+    return FrameOffset(SirtOffset().Int32Value() +
+                       StackIndirectReferenceTable::ReferencesOffset());
   }
 
   virtual ~JniCallingConvention() {}
@@ -312,8 +321,9 @@
     kObjectOrClass = 1
   };
 
-  explicit JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : CallingConvention(is_static, is_synchronized, shorty) {}
+  explicit JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty,
+                                size_t frame_pointer_size)
+      : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size) {}
 
   // Number of stack slots for outgoing arguments, above which the SIRT is
   // located
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index c89bc40..dcdcdd1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -101,10 +101,10 @@
   __ StoreImmediateToFrame(main_jni_conv->SirtNumRefsOffset(),
                            main_jni_conv->ReferenceCount(),
                            mr_conv->InterproceduralScratchRegister());
-  __ CopyRawPtrFromThread(main_jni_conv->SirtLinkOffset(),
-                          Thread::TopSirtOffset(),
+  __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
+                          Thread::TopSirtOffset<4>(),
                           mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread(Thread::TopSirtOffset(),
+  __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
                               main_jni_conv->SirtOffset(),
                               mr_conv->InterproceduralScratchRegister());
 
@@ -154,8 +154,8 @@
   }
 
   // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset());
-  __ StoreImmediateToThread(Thread::TopOfManagedStackPcOffset(), 0,
+  __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
+  __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
                             mr_conv->InterproceduralScratchRegister());
 
   // 5. Move frame down to allow space for out going args.
@@ -169,8 +169,8 @@
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodStartSynchronized)
-                                           : QUICK_ENTRYPOINT_OFFSET(pJniMethodStart);
+  ThreadOffset<4> jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
+                                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_sirt_offset(0);
   if (is_synchronized) {
@@ -197,7 +197,7 @@
   } else {
     __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
                         main_jni_conv->InterproceduralScratchRegister());
-    __ Call(ThreadOffset(jni_start), main_jni_conv->InterproceduralScratchRegister());
+    __ CallFromThread32(jni_start, main_jni_conv->InterproceduralScratchRegister());
   }
   if (is_synchronized) {  // Check for exceptions from monitor enter.
     __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
@@ -259,10 +259,10 @@
   if (main_jni_conv->IsCurrentParamInRegister()) {
     ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
     DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread(jni_env, Thread::JniEnvOffset());
+    __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
   } else {
     FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread(jni_env, Thread::JniEnvOffset(),
+    __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
                             main_jni_conv->InterproceduralScratchRegister());
   }
 
@@ -298,16 +298,16 @@
   // 12. Call into JNI method end possibly passing a returned reference, the method and the current
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-  ThreadOffset jni_end(-1);
+  ThreadOffset<4> jni_end(-1);
   if (reference_return) {
     // Pass result.
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodEndWithReferenceSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(pJniMethodEndWithReference);
+    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
+                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
     SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
     end_jni_conv->Next();
   } else {
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(pJniMethodEndSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(pJniMethodEnd);
+    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
+                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -339,7 +339,7 @@
   } else {
     __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
                         end_jni_conv->InterproceduralScratchRegister());
-    __ Call(ThreadOffset(jni_end), end_jni_conv->InterproceduralScratchRegister());
+    __ CallFromThread32(ThreadOffset<4>(jni_end), end_jni_conv->InterproceduralScratchRegister());
   }
 
   // 13. Reload return value
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index ea39d60..51a3f54 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -79,9 +79,9 @@
 FrameOffset MipsManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +   // displacement
-                  kPointerSize +                 // Method*
-                  (itr_slots_ * kPointerSize));  // offset into in args
+      FrameOffset(displacement_.Int32Value() +        // displacement
+                  kFramePointerSize +                 // Method*
+                  (itr_slots_ * kFramePointerSize));  // offset into in args
   return result;
 }
 
@@ -105,8 +105,8 @@
 // JNI calling convention
 
 MipsJniCallingConvention::MipsJniCallingConvention(bool is_static, bool is_synchronized,
-                                                 const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty) {
+                                                   const char* shorty)
+    : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
   // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
   // or jclass for static methods and the JNIEnv. We start at the aligned register A2.
   size_t padding = 0;
@@ -147,16 +147,15 @@
 
 size_t MipsJniCallingConvention::FrameSize() {
   // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kPointerSize;
+  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * kPointerSize;
+  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t MipsJniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kPointerSize + padding_,
-                 kStackAlignment);
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_, kStackAlignment);
 }
 
 // JniCallingConvention ABI follows AAPCS where longs and doubles must occur
@@ -199,7 +198,7 @@
 
 FrameOffset MipsJniCallingConvention::CurrentParamStackOffset() {
   CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kPointerSize);
+  size_t offset = displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
diff --git a/compiler/jni/quick/mips/calling_convention_mips.h b/compiler/jni/quick/mips/calling_convention_mips.h
index 1a9053a..e33fbad 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.h
+++ b/compiler/jni/quick/mips/calling_convention_mips.h
@@ -21,10 +21,13 @@
 
 namespace art {
 namespace mips {
+
+constexpr size_t kFramePointerSize = 4;
+
 class MipsManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
   MipsManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~MipsManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 8d22fe6..8b440ed 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -86,8 +86,8 @@
 
 FrameOffset X86ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   return FrameOffset(displacement_.Int32Value() +   // displacement
-                     kPointerSize +                 // Method*
-                     (itr_slots_ * kPointerSize));  // offset into in args
+                     kFramePointerSize +                 // Method*
+                     (itr_slots_ * kFramePointerSize));  // offset into in args
 }
 
 const ManagedRegisterEntrySpills& X86ManagedRuntimeCallingConvention::EntrySpills() {
@@ -112,7 +112,7 @@
 
 X86JniCallingConvention::X86JniCallingConvention(bool is_static, bool is_synchronized,
                                                  const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty) {
+    : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
   callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(EBP));
   callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(ESI));
   callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(EDI));
@@ -124,15 +124,15 @@
 
 size_t X86JniCallingConvention::FrameSize() {
   // Method*, return address and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kPointerSize;
+  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * kPointerSize;
+  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t X86JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kPointerSize, kStackAlignment);
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 bool X86JniCallingConvention::IsCurrentParamInRegister() {
@@ -149,7 +149,7 @@
 }
 
 FrameOffset X86JniCallingConvention::CurrentParamStackOffset() {
-  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kPointerSize));
+  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize));
 }
 
 size_t X86JniCallingConvention::NumberOfOutgoingStackArgs() {
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 2dab059..5b9069c 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -22,11 +22,13 @@
 namespace art {
 namespace x86 {
 
+constexpr size_t kFramePointerSize = 4;  // Per-entry size in x86 frame-size/offset math.
+
 class X86ManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
   explicit X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
                                               const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~X86ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 8ebea46..21e0bd7 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -39,7 +39,7 @@
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
-    return X86_64ManagedRegister::FromXmmRegister(_XMM0);
+    return X86_64ManagedRegister::FromXmmRegister(XMM0);
   } else if (shorty[0] == 'J') {
     return X86_64ManagedRegister::FromCpuRegister(RAX);
   } else if (shorty[0] == 'V') {
@@ -89,14 +89,14 @@
   } else if (itr_float_and_doubles_ < 8) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
-                                 static_cast<XmmRegister>(_XMM0 + itr_float_and_doubles_));
+                                 static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
   }
   return res;
 }
 
 FrameOffset X86_64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   return FrameOffset(displacement_.Int32Value() +   // displacement
-                     kPointerSize +                 // Method*
+                     kFramePointerSize +                 // Method*
                      (itr_slots_ * sizeof(uint32_t)));  // offset into in args
 }
 
@@ -122,8 +122,8 @@
 // JNI calling convention
 
 X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static, bool is_synchronized,
-                                                 const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty) {
+                                                       const char* shorty)
+    : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(RBX));
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(RBP));
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R12));
@@ -133,20 +133,21 @@
 }
 
 uint32_t X86_64JniCallingConvention::CoreSpillMask() const {
-  return 1 << RBX | 1 << RBP | 1 << R12 | 1 << R13 | 1 << R14 | 1 << R15 | 1 << R13 | 1 << kNumberOfCpuRegisters;
+  return 1 << RBX | 1 << RBP | 1 << R12 | 1 << R13 | 1 << R14 | 1 << R15 |  // One bit per register.
+      1 << kNumberOfCpuRegisters;  // Extra bit; presumably the return PC slot - confirm.
 }
 
 size_t X86_64JniCallingConvention::FrameSize() {
   // Method*, return address and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kPointerSize;
+  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus link_ (pointer) and number_of_references_ (uint32_t) for SIRT header
-  size_t sirt_size = kPointerSize + sizeof(uint32_t) + ReferenceCount()*kSirtPointerSize;
+  size_t sirt_size = kFramePointerSize + sizeof(uint32_t) + (ReferenceCount() * sirt_pointer_size_);
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t X86_64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kPointerSize, kStackAlignment);
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 bool X86_64JniCallingConvention::IsCurrentParamInRegister() {
@@ -171,16 +172,16 @@
   } else if (itr_float_and_doubles_ < 8) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
-                                 static_cast<XmmRegister>(_XMM0 + itr_float_and_doubles_));
+                                 static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
   }
   return res;
 }
 
 FrameOffset X86_64JniCallingConvention::CurrentParamStackOffset() {
   size_t offset = itr_args_
-                  - std::min(8U, itr_float_and_doubles_)               // Float arguments passed through Xmm0..Xmm7
-                  - std::min(6U, itr_args_ - itr_float_and_doubles_);  // Integer arguments passed through GPR
-  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (offset * kPointerSize));
+      - std::min(8U, itr_float_and_doubles_)               // Float arguments passed through Xmm0..Xmm7
+      - std::min(6U, itr_args_ - itr_float_and_doubles_);  // Integer arguments passed through GPR
+  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (offset * kFramePointerSize));
 }
 
 size_t X86_64JniCallingConvention::NumberOfOutgoingStackArgs() {
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index d7f7762..d545774 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -22,11 +22,13 @@
 namespace art {
 namespace x86_64 {
 
+constexpr size_t kFramePointerSize = 8;  // Per-entry size in x86-64 frame-size/offset math.
+
 class X86_64ManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
   explicit X86_64ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
                                               const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~X86_64ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index 32980cb..fb909a8 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -21,6 +21,7 @@
 #include "utils/arm64/assembler_arm64.h"
 #include "utils/mips/assembler_mips.h"
 #include "utils/x86/assembler_x86.h"
+#include "utils/x86_64/assembler_x86_64.h"
 
 #define __ assembler->
 
@@ -28,7 +29,7 @@
 
 namespace arm {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<4> offset) {
   UniquePtr<ArmAssembler> assembler(static_cast<ArmAssembler*>(Assembler::Create(kArm)));
 
   switch (abi) {
@@ -56,7 +57,7 @@
 
 namespace arm64 {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<8> offset) {
   UniquePtr<Arm64Assembler> assembler(static_cast<Arm64Assembler*>(Assembler::Create(kArm64)));
 
   switch (abi) {
@@ -96,7 +97,7 @@
 
 namespace mips {
 static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi,
-                                                    ThreadOffset offset) {
+                                                    ThreadOffset<4> offset) {
   UniquePtr<MipsAssembler> assembler(static_cast<MipsAssembler*>(Assembler::Create(kMips)));
 
   switch (abi) {
@@ -125,7 +126,7 @@
 }  // namespace mips
 
 namespace x86 {
-static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset offset) {
+static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<4> offset) {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
   // All x86 trampolines call via the Thread* held in fs.
@@ -142,11 +143,12 @@
 }  // namespace x86
 
 namespace x86_64 {
-static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset offset) {
-  UniquePtr<x86::X86Assembler> assembler(static_cast<x86::X86Assembler*>(Assembler::Create(kX86_64)));
+static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<8> offset) {
+  UniquePtr<x86_64::X86_64Assembler>
+      assembler(static_cast<x86_64::X86_64Assembler*>(Assembler::Create(kX86_64)));
 
   // All x86 trampolines call via the Thread* held in gs.
-  __ gs()->jmp(x86::Address::Absolute(offset, true));
+  __ gs()->jmp(x86_64::Address::Absolute(offset, true));
   __ int3();
 
   size_t cs = assembler->CodeSize();
@@ -158,23 +160,32 @@
 }
 }  // namespace x86_64
 
-const std::vector<uint8_t>* CreateTrampoline(InstructionSet isa, EntryPointCallingConvention abi,
-                                             ThreadOffset offset) {
+const std::vector<uint8_t>* CreateTrampoline64(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<8> offset) {
+  // Route to the 64-bit back end for `isa`; anything else is reported via LOG(FATAL).
+  if (isa == kArm64) {
+    return arm64::CreateTrampoline(abi, offset);
+  }
+  if (isa == kX86_64) {
+    return x86_64::CreateTrampoline(offset);
+  }
+  LOG(FATAL) << "Unexpected InstructionSet: " << isa;
+  return nullptr;
+}
+
+const std::vector<uint8_t>* CreateTrampoline32(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<4> offset) {
   switch (isa) {
     case kArm:
     case kThumb2:
       return arm::CreateTrampoline(abi, offset);
-    case kArm64:
-      return arm64::CreateTrampoline(abi, offset);
     case kMips:
       return mips::CreateTrampoline(abi, offset);
     case kX86:
       return x86::CreateTrampoline(offset);
-    case kX86_64:
-      return x86_64::CreateTrampoline(offset);
     default:
-      LOG(FATAL) << "Unknown InstructionSet: " << isa;
-      return NULL;
+      LOG(FATAL) << "Unexpected InstructionSet: " << isa;
+      return nullptr;  // Unlisted ISA; kArm64/kX86_64 are served by CreateTrampoline64.
   }
 }
 
diff --git a/compiler/trampolines/trampoline_compiler.h b/compiler/trampolines/trampoline_compiler.h
index cb5aa27..bdab279 100644
--- a/compiler/trampolines/trampoline_compiler.h
+++ b/compiler/trampolines/trampoline_compiler.h
@@ -25,8 +25,11 @@
 namespace art {
 
 // Create code that will invoke the function held in thread local storage.
-const std::vector<uint8_t>* CreateTrampoline(InstructionSet isa, EntryPointCallingConvention abi,
-                                             ThreadOffset entry_point_offset)
+const std::vector<uint8_t>* CreateTrampoline32(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<4> entry_point_offset)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+const std::vector<uint8_t>* CreateTrampoline64(InstructionSet isa, EntryPointCallingConvention abi,
+                                               ThreadOffset<8> entry_point_offset)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 }  // namespace art
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index 872a557..effc38e 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -1436,6 +1436,8 @@
   mov(rd, ShifterOperand(rm, ROR, 0), cond);
 }
 
+constexpr size_t kFramePointerSize = 4;  // Size of each pushed value in BuildFrame/RemoveFrame.
+
 void ArmAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                               const std::vector<ManagedRegister>& callee_save_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
@@ -1453,8 +1455,8 @@
   PushList(push_list);
 
   // Increase frame to required size.
-  CHECK_GT(frame_size, pushed_values * kPointerSize);  // Must be at least space to push Method*
-  size_t adjust = frame_size - (pushed_values * kPointerSize);
+  CHECK_GT(frame_size, pushed_values * kFramePointerSize);  // Must at least have space for Method*.
+  size_t adjust = frame_size - (pushed_values * kFramePointerSize);
   IncreaseFrameSize(adjust);
 
   // Write out Method*.
@@ -1463,7 +1465,7 @@
   // Write out entry spills.
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     Register reg = entry_spills.at(i).AsArm().AsCoreRegister();
-    StoreToOffset(kStoreWord, reg, SP, frame_size + kPointerSize + (i * kPointerSize));
+    StoreToOffset(kStoreWord, reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
   }
 }
 
@@ -1480,8 +1482,8 @@
   }
 
   // Decrease frame to start of callee saves
-  CHECK_GT(frame_size, pop_values * kPointerSize);
-  size_t adjust = frame_size - (pop_values * kPointerSize);
+  CHECK_GT(frame_size, pop_values * kFramePointerSize);
+  size_t adjust = frame_size - (pop_values * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
   // Pop callee saves and PC
@@ -1577,7 +1579,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void ArmAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void ArmAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                        ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1609,18 +1611,18 @@
   return EmitLoad(this, m_dst, SP, src.Int32Value(), size);
 }
 
-void ArmAssembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void ArmAssembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return EmitLoad(this, m_dst, TR, src.Int32Value(), size);
 }
 
-void ArmAssembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void ArmAssembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   ArmManagedRegister dst = m_dst.AsArm();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(kLoadWord, dst.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1630,7 +1632,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1641,7 +1643,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void ArmAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1651,7 +1653,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void ArmAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, TR, thr_offs.Int32Value());
 }
 
@@ -1844,7 +1846,7 @@
   // TODO: place reference map on call
 }
 
-void ArmAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void ArmAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);
 }
 
@@ -1862,7 +1864,7 @@
   ArmExceptionSlowPath* slow = new ArmExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 TR, Thread::ExceptionOffset().Int32Value());
+                 TR, Thread::ExceptionOffset<4>().Int32Value());
   cmp(scratch.AsCoreRegister(), ShifterOperand(0));
   b(slow->Entry(), NE);
 }
@@ -1878,7 +1880,7 @@
   // Don't care about preserving R0 as this call won't return
   __ mov(R0, ShifterOperand(scratch_.AsCoreRegister()));
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ blx(R12);
   // Call never returns
   __ bkpt(0);
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index bb9207c..f5be04a 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -35,6 +35,7 @@
   // Data-processing operands - Uninitialized
   ShifterOperand() {
     type_ = -1;
+    encoding_ = 0;  // Keep encoding_ defined even for this uninitialized operand.
   }
 
   // Data-processing operands - Immediate
@@ -210,7 +211,7 @@
 };
 
 
-class ArmAssembler : public Assembler {
+class ArmAssembler FINAL : public Assembler {
  public:
   ArmAssembler() {}
   virtual ~ArmAssembler() {}
@@ -438,127 +439,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister scratch);
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitType01(Condition cond,
@@ -642,12 +632,12 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class ArmExceptionSlowPath : public SlowPath {
+class ArmExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit ArmExceptionSlowPath(ArmManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {
   }
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const ArmManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/arm/constants_arm.h b/compiler/utils/arm/constants_arm.h
index cc795b1..058f945 100644
--- a/compiler/utils/arm/constants_arm.h
+++ b/compiler/utils/arm/constants_arm.h
@@ -242,22 +242,22 @@
   }
 
   // Get the raw instruction bits.
-  inline int32_t InstructionBits() const {
+  int32_t InstructionBits() const {
     return *reinterpret_cast<const int32_t*>(this);
   }
 
   // Set the raw instruction bits to value.
-  inline void SetInstructionBits(int32_t value) {
+  void SetInstructionBits(int32_t value) {
     *reinterpret_cast<int32_t*>(this) = value;
   }
 
   // Read one particular bit out of the instruction bits.
-  inline int Bit(int nr) const {
+  int Bit(int nr) const {
     return (InstructionBits() >> nr) & 1;
   }
 
   // Read a bit field out of the instruction bits.
-  inline int Bits(int shift, int count) const {
+  int Bits(int shift, int count) const {
     return (InstructionBits() >> shift) & ((1 << count) - 1);
   }
 
@@ -265,80 +265,80 @@
   // Accessors for the different named fields used in the ARM encoding.
   // The naming of these accessor corresponds to figure A3-1.
   // Generally applicable fields
-  inline Condition ConditionField() const {
+  Condition ConditionField() const {
     return static_cast<Condition>(Bits(kConditionShift, kConditionBits));
   }
-  inline int TypeField() const { return Bits(kTypeShift, kTypeBits); }
+  int TypeField() const { return Bits(kTypeShift, kTypeBits); }
 
-  inline Register RnField() const { return static_cast<Register>(
+  Register RnField() const { return static_cast<Register>(
                                         Bits(kRnShift, kRnBits)); }
-  inline Register RdField() const { return static_cast<Register>(
+  Register RdField() const { return static_cast<Register>(
                                         Bits(kRdShift, kRdBits)); }
 
   // Fields used in Data processing instructions
-  inline Opcode OpcodeField() const {
+  Opcode OpcodeField() const {
     return static_cast<Opcode>(Bits(kOpcodeShift, kOpcodeBits));
   }
-  inline int SField() const { return Bits(kSShift, kSBits); }
+  int SField() const { return Bits(kSShift, kSBits); }
   // with register
-  inline Register RmField() const {
+  Register RmField() const {
     return static_cast<Register>(Bits(kRmShift, kRmBits));
   }
-  inline Shift ShiftField() const { return static_cast<Shift>(
+  Shift ShiftField() const { return static_cast<Shift>(
                                         Bits(kShiftShift, kShiftBits)); }
-  inline int RegShiftField() const { return Bit(4); }
-  inline Register RsField() const {
+  int RegShiftField() const { return Bit(4); }
+  Register RsField() const {
     return static_cast<Register>(Bits(kRsShift, kRsBits));
   }
-  inline int ShiftAmountField() const { return Bits(kShiftImmShift,
+  int ShiftAmountField() const { return Bits(kShiftImmShift,
                                                     kShiftImmBits); }
   // with immediate
-  inline int RotateField() const { return Bits(kRotateShift, kRotateBits); }
-  inline int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
+  int RotateField() const { return Bits(kRotateShift, kRotateBits); }
+  int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
 
   // Fields used in Load/Store instructions
-  inline int PUField() const { return Bits(23, 2); }
-  inline int  BField() const { return Bit(22); }
-  inline int  WField() const { return Bit(21); }
-  inline int  LField() const { return Bit(20); }
+  int PUField() const { return Bits(23, 2); }
+  int  BField() const { return Bit(22); }
+  int  WField() const { return Bit(21); }
+  int  LField() const { return Bit(20); }
   // with register uses same fields as Data processing instructions above
   // with immediate
-  inline int Offset12Field() const { return Bits(kOffset12Shift,
+  int Offset12Field() const { return Bits(kOffset12Shift,
                                                  kOffset12Bits); }
   // multiple
-  inline int RlistField() const { return Bits(0, 16); }
+  int RlistField() const { return Bits(0, 16); }
   // extra loads and stores
-  inline int SignField() const { return Bit(6); }
-  inline int HField() const { return Bit(5); }
-  inline int ImmedHField() const { return Bits(8, 4); }
-  inline int ImmedLField() const { return Bits(0, 4); }
+  int SignField() const { return Bit(6); }
+  int HField() const { return Bit(5); }
+  int ImmedHField() const { return Bits(8, 4); }
+  int ImmedLField() const { return Bits(0, 4); }
 
   // Fields used in Branch instructions
-  inline int LinkField() const { return Bits(kLinkShift, kLinkBits); }
-  inline int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
+  int LinkField() const { return Bits(kLinkShift, kLinkBits); }
+  int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
 
   // Fields used in Supervisor Call instructions
-  inline uint32_t SvcField() const { return Bits(0, 24); }
+  uint32_t SvcField() const { return Bits(0, 24); }
 
   // Field used in Breakpoint instruction
-  inline uint16_t BkptField() const {
+  uint16_t BkptField() const {
     return ((Bits(8, 12) << 4) | Bits(0, 4));
   }
 
   // Field used in 16-bit immediate move instructions
-  inline uint16_t MovwField() const {
+  uint16_t MovwField() const {
     return ((Bits(16, 4) << 12) | Bits(0, 12));
   }
 
   // Field used in VFP float immediate move instruction
-  inline float ImmFloatField() const {
+  float ImmFloatField() const {
     uint32_t imm32 = (Bit(19) << 31) | (((1 << 5) - Bit(18)) << 25) |
                      (Bits(16, 2) << 23) | (Bits(0, 4) << 19);
     return bit_cast<float, uint32_t>(imm32);
   }
 
   // Field used in VFP double immediate move instruction
-  inline double ImmDoubleField() const {
+  double ImmDoubleField() const {
     uint64_t imm64 = (Bit(19)*(1LL << 63)) | (((1LL << 8) - Bit(18)) << 54) |
                      (Bits(16, 2)*(1LL << 52)) | (Bits(0, 4)*(1LL << 48));
     return bit_cast<double, uint64_t>(imm64);
@@ -347,7 +347,7 @@
   // Test for data processing instructions of type 0 or 1.
   // See "ARM Architecture Reference Manual ARMv7-A and ARMv7-R edition",
   // section A5.1 "ARM instruction set encoding".
-  inline bool IsDataProcessing() const {
+  bool IsDataProcessing() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bits(20, 5) & 0x19) != 0x10) &&
@@ -359,47 +359,47 @@
   // Tests for special encodings of type 0 instructions (extra loads and stores,
   // as well as multiplications, synchronization primitives, and miscellaneous).
   // Can only be called for a type 0 or 1 instruction.
-  inline bool IsMiscellaneous() const {
+  bool IsMiscellaneous() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && ((Bits(20, 5) & 0x19) == 0x10) && (Bit(7) == 0));
   }
-  inline bool IsMultiplyOrSyncPrimitive() const {
+  bool IsMultiplyOrSyncPrimitive() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && (Bits(4, 4) == 9));
   }
 
   // Test for Supervisor Call instruction.
-  inline bool IsSvc() const {
+  bool IsSvc() const {
     return ((InstructionBits() & 0xff000000) == 0xef000000);
   }
 
   // Test for Breakpoint instruction.
-  inline bool IsBkpt() const {
+  bool IsBkpt() const {
     return ((InstructionBits() & 0xfff000f0) == 0xe1200070);
   }
 
   // VFP register fields.
-  inline SRegister SnField() const {
+  SRegister SnField() const {
     return static_cast<SRegister>((Bits(kRnShift, kRnBits) << 1) + Bit(7));
   }
-  inline SRegister SdField() const {
+  SRegister SdField() const {
     return static_cast<SRegister>((Bits(kRdShift, kRdBits) << 1) + Bit(22));
   }
-  inline SRegister SmField() const {
+  SRegister SmField() const {
     return static_cast<SRegister>((Bits(kRmShift, kRmBits) << 1) + Bit(5));
   }
-  inline DRegister DnField() const {
+  DRegister DnField() const {
     return static_cast<DRegister>(Bits(kRnShift, kRnBits) + (Bit(7) << 4));
   }
-  inline DRegister DdField() const {
+  DRegister DdField() const {
     return static_cast<DRegister>(Bits(kRdShift, kRdBits) + (Bit(22) << 4));
   }
-  inline DRegister DmField() const {
+  DRegister DmField() const {
     return static_cast<DRegister>(Bits(kRmShift, kRmBits) + (Bit(5) << 4));
   }
 
   // Test for VFP data processing or single transfer instructions of type 7.
-  inline bool IsVFPDataProcessingOrSingleTransfer() const {
+  bool IsVFPDataProcessingOrSingleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 7);
     return ((Bit(24) == 0) && (Bits(9, 3) == 5));
@@ -408,7 +408,7 @@
   }
 
   // Test for VFP 64-bit transfer instructions of type 6.
-  inline bool IsVFPDoubleTransfer() const {
+  bool IsVFPDoubleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(21, 4) == 2) && (Bits(9, 3) == 5) &&
@@ -416,20 +416,20 @@
   }
 
   // Test for VFP load and store instructions of type 6.
-  inline bool IsVFPLoadStore() const {
+  bool IsVFPLoadStore() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(20, 5) & 0x12) == 0x10) && (Bits(9, 3) == 5);
   }
 
   // Special accessors that test for existence of a value.
-  inline bool HasS() const { return SField() == 1; }
-  inline bool HasB() const { return BField() == 1; }
-  inline bool HasW() const { return WField() == 1; }
-  inline bool HasL() const { return LField() == 1; }
-  inline bool HasSign() const { return SignField() == 1; }
-  inline bool HasH() const { return HField() == 1; }
-  inline bool HasLink() const { return LinkField() == 1; }
+  bool HasS() const { return SField() == 1; }
+  bool HasB() const { return BField() == 1; }
+  bool HasW() const { return WField() == 1; }
+  bool HasL() const { return LField() == 1; }
+  bool HasSign() const { return SignField() == 1; }
+  bool HasH() const { return HField() == 1; }
+  bool HasLink() const { return LinkField() == 1; }
 
   // Instructions are read out of a code stream. The only way to get a
   // reference to an instruction is to convert a pointer. There is no way
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index f8b91d7..a11c2da 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -155,7 +155,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread(ThreadOffset offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -163,7 +163,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread(ThreadOffset tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -172,7 +172,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread(ThreadOffset tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
   // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
   StoreToOffset(IP1, TR, tr_offs.Int32Value());
@@ -269,7 +269,7 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
 }
 
@@ -294,7 +294,7 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
@@ -322,8 +322,8 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                          ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                          ThreadOffset<4> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -331,7 +331,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread(ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -486,7 +486,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -555,7 +555,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<4>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,12 +569,14 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
 }
 
+constexpr size_t kFramePointerSize = 8;
+
 void Arm64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                         const std::vector<ManagedRegister>& callee_save_regs,
                         const ManagedRegisterEntrySpills& entry_spills) {
@@ -589,8 +591,8 @@
   ___ PushCalleeSavedRegisters();
 
   // Increate frame to required size - must be at least space to push Method*.
-  CHECK_GT(frame_size, kCalleeSavedRegsSize * kPointerSize);
-  size_t adjust = frame_size - (kCalleeSavedRegsSize * kPointerSize);
+  CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
+  size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   IncreaseFrameSize(adjust);
 
   // Write Method*.
@@ -600,7 +602,7 @@
   // TODO: we can implement a %2 STRP variant of StoreToOffset.
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     Register reg = entry_spills.at(i).AsArm64().AsCoreRegister();
-    StoreToOffset(reg, SP, frame_size + kPointerSize + (i * kPointerSize));
+    StoreToOffset(reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
   }
 }
 
@@ -610,10 +612,10 @@
   // For now we only check that the size of the frame is greater than the
   // no of APCS callee saved regs [X19, X30] [D8, D15].
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
-  CHECK_GT(frame_size, kCalleeSavedRegsSize * kPointerSize);
+  CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
 
   // Decrease frame size to start of callee saved regs.
-  size_t adjust = frame_size - (kCalleeSavedRegsSize * kPointerSize);
+  size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
   // Pop callee saved and return to LR.
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 44eb6ff..3abcaad 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -79,7 +79,7 @@
 
 class Arm64Exception;
 
-class Arm64Assembler : public Assembler {
+class Arm64Assembler FINAL : public Assembler {
  public:
   Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
   vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
@@ -111,105 +111,97 @@
   // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size, ManagedRegister method_reg,
                   const std::vector<ManagedRegister>& callee_save_regs,
-                  const ManagedRegisterEntrySpills& entry_spills);
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size,
-                   const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  void IncreaseFrameSize(size_t adjust);
-  void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines.
-  void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  void StoreRef(FrameOffset dest, ManagedRegister src);
-  void StoreRawPtr(FrameOffset dest, ManagedRegister src);
-  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
-  void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
-  void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
-  void StoreStackPointerToThread(ThreadOffset thr_offs);
-  void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
-  void Load(ManagedRegister dest, FrameOffset src, size_t size);
-  void Load(ManagedRegister dest, ThreadOffset src, size_t size);
-  void LoadRef(ManagedRegister dest, FrameOffset  src);
-  void LoadRef(ManagedRegister dest, ManagedRegister base,
-               MemberOffset offs);
-  void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                  Offset offs);
-  void LoadRawPtrFromThread(ManagedRegister dest,
-                            ThreadOffset offs);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+
   // Copying routines.
-  void Move(ManagedRegister dest, ManagedRegister src, size_t size);
-  void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                            ManagedRegister scratch);
-  void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                          ManagedRegister scratch);
-  void CopyRef(FrameOffset dest, FrameOffset src,
-               ManagedRegister scratch);
-  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-            ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest, Offset dest_offset,
-            ManagedRegister src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
   void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void MemoryBarrier(ManagedRegister scratch);
+            ManagedRegister scratch, size_t size) OVERRIDE;
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension.
-  void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension.
-  void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current().
-  void GetCurrentThread(ManagedRegister tr);
-  void GetCurrentThread(FrameOffset dest_offset,
-                        ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
   void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                       ManagedRegister in_reg, bool null_allowed);
+                       ManagedRegister in_reg, bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
   void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                       ManagedRegister scratch, bool null_allowed);
+                       ManagedRegister scratch, bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst.
-  void LoadReferenceFromSirt(ManagedRegister dst,
-                             ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  void VerifyObject(ManagedRegister src, bool could_be_null);
-  void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset].
-  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch);
-  void Call(FrameOffset base, Offset offset, ManagedRegister scratch);
-  void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   static vixl::Register reg_x(int code) {
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index c05c2f1..ecf9fbe 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -29,7 +29,7 @@
 namespace art {
 namespace arm64 {
 
-  constexpr unsigned int kCalleeSavedRegsSize = 20;
+constexpr unsigned int kCalleeSavedRegsSize = 20;
 
 }  // arm64
 }  // art
diff --git a/compiler/utils/assembler.cc b/compiler/utils/assembler.cc
index 1921b28..26bdceb 100644
--- a/compiler/utils/assembler.cc
+++ b/compiler/utils/assembler.cc
@@ -122,4 +122,78 @@
   }
 }
 
+void Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
 }  // namespace art
diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h
index c23fd44..219c87f 100644
--- a/compiler/utils/assembler.h
+++ b/compiler/utils/assembler.h
@@ -374,14 +374,20 @@
   virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
                                      ManagedRegister scratch) = 0;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch) = 0;
+  virtual void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                        ManagedRegister scratch);
+  virtual void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                        ManagedRegister scratch);
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch) = 0;
+  virtual void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
+  virtual void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs) = 0;
+  virtual void StoreStackPointerToThread32(ThreadOffset<4> thr_offs);
+  virtual void StoreStackPointerToThread64(ThreadOffset<8> thr_offs);
 
   virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
                              FrameOffset in_off, ManagedRegister scratch) = 0;
@@ -389,27 +395,29 @@
   // Load routines
   virtual void Load(ManagedRegister dest, FrameOffset src, size_t size) = 0;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size) = 0;
+  virtual void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size);
+  virtual void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size);
 
   virtual void LoadRef(ManagedRegister dest, FrameOffset  src) = 0;
+  virtual void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) = 0;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs) = 0;
+  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) = 0;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs) = 0;
-
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs) = 0;
+  virtual void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs);
+  virtual void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs);
 
   // Copying routines
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                      ManagedRegister scratch);
+  virtual void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                      ManagedRegister scratch);
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
+  virtual void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
 
   virtual void CopyRef(FrameOffset dest, FrameOffset src,
                        ManagedRegister scratch) = 0;
@@ -471,7 +479,8 @@
                     ManagedRegister scratch) = 0;
   virtual void Call(FrameOffset base, Offset offset,
                     ManagedRegister scratch) = 0;
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch) = 0;
+  virtual void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch);
+  virtual void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index dfd3306..45d3a97 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -536,6 +536,8 @@
   Sdc1(reg, base, offset);
 }
 
+constexpr size_t kFramePointerSize = 4;
+
 void MipsAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                                const std::vector<ManagedRegister>& callee_save_regs,
                                const ManagedRegisterEntrySpills& entry_spills) {
@@ -545,10 +547,10 @@
   IncreaseFrameSize(frame_size);
 
   // Push callee saves and return address
-  int stack_offset = frame_size - kPointerSize;
+  int stack_offset = frame_size - kFramePointerSize;
   StoreToOffset(kStoreWord, RA, SP, stack_offset);
   for (int i = callee_save_regs.size() - 1; i >= 0; --i) {
-    stack_offset -= kPointerSize;
+    stack_offset -= kFramePointerSize;
     Register reg = callee_save_regs.at(i).AsMips().AsCoreRegister();
     StoreToOffset(kStoreWord, reg, SP, stack_offset);
   }
@@ -559,7 +561,7 @@
   // Write out entry spills.
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     Register reg = entry_spills.at(i).AsMips().AsCoreRegister();
-    StoreToOffset(kStoreWord, reg, SP, frame_size + kPointerSize + (i * kPointerSize));
+    StoreToOffset(kStoreWord, reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
   }
 }
 
@@ -568,11 +570,11 @@
   CHECK_ALIGNED(frame_size, kStackAlignment);
 
   // Pop callee saves and return address
-  int stack_offset = frame_size - (callee_save_regs.size() * kPointerSize) - kPointerSize;
+  int stack_offset = frame_size - (callee_save_regs.size() * kFramePointerSize) - kFramePointerSize;
   for (size_t i = 0; i < callee_save_regs.size(); ++i) {
     Register reg = callee_save_regs.at(i).AsMips().AsCoreRegister();
     LoadFromOffset(kLoadWord, reg, SP, stack_offset);
-    stack_offset += kPointerSize;
+    stack_offset += kFramePointerSize;
   }
   LoadFromOffset(kLoadWord, RA, SP, stack_offset);
 
@@ -633,7 +635,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void MipsAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                            ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -641,7 +643,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), S1, dest.Int32Value());
 }
 
-void MipsAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void MipsAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                              FrameOffset fr_offs,
                                              ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -651,7 +653,7 @@
                 S1, thr_offs.Int32Value());
 }
 
-void MipsAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void MipsAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, S1, thr_offs.Int32Value());
 }
 
@@ -668,7 +670,7 @@
   return EmitLoad(mdest, SP, src.Int32Value(), size);
 }
 
-void MipsAssembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void MipsAssembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   return EmitLoad(mdest, S1, src.Int32Value(), size);
 }
 
@@ -697,8 +699,8 @@
                  base.AsMips().AsCoreRegister(), offs.Int32Value());
 }
 
-void MipsAssembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                         ThreadOffset offs) {
+void MipsAssembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                           ThreadOffset<4> offs) {
   MipsManagedRegister dest = mdest.AsMips();
   CHECK(dest.IsCoreRegister());
   LoadFromOffset(kLoadWord, dest.AsCoreRegister(), S1, offs.Int32Value());
@@ -748,8 +750,8 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                         ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                         ThreadOffset<4> thr_offs,
                                          ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -759,7 +761,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                        FrameOffset fr_offs,
                                        ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -923,7 +925,7 @@
   // TODO: place reference map on call
 }
 
-void MipsAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*mscratch*/) {
+void MipsAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*mscratch*/) {
   UNIMPLEMENTED(FATAL) << "no mips implementation";
 }
 
@@ -941,7 +943,7 @@
   MipsExceptionSlowPath* slow = new MipsExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 S1, Thread::ExceptionOffset().Int32Value());
+                 S1, Thread::ExceptionOffset<4>().Int32Value());
   EmitBranch(scratch.AsCoreRegister(), ZERO, slow->Entry(), false);
 }
 
@@ -956,7 +958,7 @@
   // Don't care about preserving A0 as this call won't return
   __ Move(A0, scratch_.AsCoreRegister());
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ Jr(T9);
   // Call never returns
   __ Break();
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 0d1a94c..75ee8b9 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -29,171 +29,6 @@
 
 namespace art {
 namespace mips {
-#if 0
-class Operand {
- public:
-  uint8_t mod() const {
-    return (encoding_at(0) >> 6) & 3;
-  }
-
-  Register rm() const {
-    return static_cast<Register>(encoding_at(0) & 7);
-  }
-
-  ScaleFactor scale() const {
-    return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
-  }
-
-  Register index() const {
-    return static_cast<Register>((encoding_at(1) >> 3) & 7);
-  }
-
-  Register base() const {
-    return static_cast<Register>(encoding_at(1) & 7);
-  }
-
-  int8_t disp8() const {
-    CHECK_GE(length_, 2);
-    return static_cast<int8_t>(encoding_[length_ - 1]);
-  }
-
-  int32_t disp32() const {
-    CHECK_GE(length_, 5);
-    int32_t value;
-    memcpy(&value, &encoding_[length_ - 4], sizeof(value));
-    return value;
-  }
-
-  bool IsRegister(Register reg) const {
-    return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
-  }
-
- protected:
-  // Operand can be sub classed (e.g: Address).
-  Operand() : length_(0) { }
-
-  void SetModRM(int mod, Register rm) {
-    CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
-    length_ = 1;
-  }
-
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
-    CHECK_EQ(length_, 1);
-    CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
-    length_ = 2;
-  }
-
-  void SetDisp8(int8_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    encoding_[length_++] = static_cast<uint8_t>(disp);
-  }
-
-  void SetDisp32(int32_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    int disp_size = sizeof(disp);
-    memmove(&encoding_[length_], &disp, disp_size);
-    length_ += disp_size;
-  }
-
- private:
-  byte length_;
-  byte encoding_[6];
-  byte padding_;
-
-  explicit Operand(Register reg) { SetModRM(3, reg); }
-
-  // Get the operand encoding byte at the given index.
-  uint8_t encoding_at(int index) const {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, length_);
-    return encoding_[index];
-  }
-
-  friend class MipsAssembler;
-
-  DISALLOW_COPY_AND_ASSIGN(Operand);
-};
-
-
-class Address : public Operand {
- public:
-  Address(Register base, int32_t disp) {
-    Init(base, disp);
-  }
-
-  Address(Register base, Offset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, ESP);
-    Init(ESP, disp.Int32Value());
-  }
-
-  Address(Register base, MemberOffset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp32(disp);
-    }
-  }
-
-
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    SetModRM(0, ESP);
-    SetSIB(scale, index, EBP);
-    SetDisp32(disp);
-  }
-
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, ESP);
-      SetSIB(scale, index, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, ESP);
-      SetSIB(scale, index, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, ESP);
-      SetSIB(scale, index, base);
-      SetDisp32(disp);
-    }
-  }
-
-  static Address Absolute(uword addr) {
-    Address result;
-    result.SetModRM(0, EBP);
-    result.SetDisp32(addr);
-    return result;
-  }
-
-  static Address Absolute(ThreadOffset addr) {
-    return Absolute(addr.Int32Value());
-  }
-
- private:
-  Address() {}
-
-  DISALLOW_COPY_AND_ASSIGN(Address);
-};
-
-#endif
 
 enum LoadOperandType {
   kLoadSignedByte,
@@ -215,7 +50,7 @@
   kStoreDWord
 };
 
-class MipsAssembler : public Assembler {
+class MipsAssembler FINAL : public Assembler {
  public:
   MipsAssembler() {}
   virtual ~MipsAssembler() {}
@@ -310,40 +145,6 @@
   void StoreFToOffset(FRegister reg, Register base, int32_t offset);
   void StoreDToOffset(DRegister reg, Register base, int32_t offset);
 
-#if 0
-  MipsAssembler* lock();
-
-  void mfence();
-
-  MipsAssembler* fs();
-
-  //
-  // Macros for High-level operations.
-  //
-
-  void AddImmediate(Register reg, const Immediate& imm);
-
-  void LoadDoubleConstant(XmmRegister dst, double value);
-
-  void DoubleNegate(XmmRegister d);
-  void FloatNegate(XmmRegister f);
-
-  void DoubleAbs(XmmRegister reg);
-
-  void LockCmpxchgl(const Address& address, Register reg) {
-    lock()->cmpxchgl(address, reg);
-  }
-
-  //
-  // Misc. functionality
-  //
-  int PreferredLoopAlignment() { return 16; }
-  void Align(int alignment, int offset);
-
-  // Debugging and bringup support.
-  void Stop(const char* message);
-#endif
-
   // Emit data (e.g. encoded instruction or immediate) to the instruction stream.
   void Emit(int32_t value);
   void EmitBranch(Register rt, Register rs, Label* label, bool equal);
@@ -355,127 +156,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister msrc, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister msrc);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister msrc);
+  void Store(FrameOffset offs, ManagedRegister msrc, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister mscratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister mscratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister mscratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister mscratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister msrc,
-                             FrameOffset in_off, ManagedRegister mscratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister msrc, FrameOffset in_off,
+                     ManagedRegister mscratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister mdest, FrameOffset src, size_t size);
+  void Load(ManagedRegister mdest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister mdest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister mdest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister mdest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister mdest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister mdest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister mdest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size);
+  void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister mscratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister mscratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                            ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister mscratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister mscratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister mscratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister mscratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister mscratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister mscratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister mscratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
@@ -491,11 +281,11 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class MipsExceptionSlowPath : public SlowPath {
+class MipsExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit MipsExceptionSlowPath(MipsManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const MipsManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index ebbb43a..6043c17 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1396,6 +1396,8 @@
   EmitOperand(reg_or_opcode, Operand(operand));
 }
 
+constexpr size_t kFramePointerSize = 4;
+
 void X86Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                               const std::vector<ManagedRegister>& spill_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
@@ -1404,11 +1406,11 @@
     pushl(spill_regs.at(i).AsX86().AsCpuRegister());
   }
   // return address then method on stack
-  addl(ESP, Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
-                      kPointerSize /*method*/ + kPointerSize /*return address*/));
+  addl(ESP, Immediate(-frame_size + (spill_regs.size() * kFramePointerSize) +
+                      kFramePointerSize /*method*/ + kFramePointerSize /*return address*/));
   pushl(method_reg.AsX86().AsCpuRegister());
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    movl(Address(ESP, frame_size + kPointerSize + (i * kPointerSize)),
+    movl(Address(ESP, frame_size + kFramePointerSize + (i * kFramePointerSize)),
          entry_spills.at(i).AsX86().AsCpuRegister());
   }
 }
@@ -1416,7 +1418,7 @@
 void X86Assembler::RemoveFrame(size_t frame_size,
                             const std::vector<ManagedRegister>& spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
-  addl(ESP, Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
+  addl(ESP, Immediate(frame_size - (spill_regs.size() * kFramePointerSize) - kFramePointerSize));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     popl(spill_regs.at(i).AsX86().AsCpuRegister());
   }
@@ -1478,12 +1480,12 @@
   movl(Address(ESP, dest), Immediate(imm));
 }
 
-void X86Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void X86Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                           ManagedRegister) {
   fs()->movl(Address::Absolute(dest), Immediate(imm));
 }
 
-void X86Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void X86Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1492,14 +1494,10 @@
   fs()->movl(Address::Absolute(thr_offs), scratch.AsCpuRegister());
 }
 
-void X86Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void X86Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   fs()->movl(Address::Absolute(thr_offs), ESP);
 }
 
-void X86Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  fs()->movl(Address::Absolute(thr_offs), lbl);
-}
-
 void X86Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
                                  FrameOffset /*in_off*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);  // this case only currently exists for ARM
@@ -1532,7 +1530,7 @@
   }
 }
 
-void X86Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86Assembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   X86ManagedRegister dest = mdest.AsX86();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
@@ -1542,7 +1540,7 @@
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     fs()->movl(dest.AsRegisterPairLow(), Address::Absolute(src));
-    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4)));
+    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset<4>(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       fs()->flds(Address::Absolute(src));
@@ -1582,8 +1580,8 @@
   movl(dest.AsCpuRegister(), Address(base.AsX86().AsCpuRegister(), offs));
 }
 
-void X86Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86Assembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                          ThreadOffset<4> offs) {
   X86ManagedRegister dest = mdest.AsX86();
   CHECK(dest.IsCpuRegister());
   fs()->movl(dest.AsCpuRegister(), Address::Absolute(offs));
@@ -1645,8 +1643,8 @@
   movl(Address(ESP, dest), scratch.AsCpuRegister());
 }
 
-void X86Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
   CHECK(scratch.IsCpuRegister());
@@ -1654,7 +1652,7 @@
   Store(fr_offs, scratch, 4);
 }
 
-void X86Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1804,26 +1802,26 @@
   call(Address(scratch, offset));
 }
 
-void X86Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister /*mscratch*/) {
   fs()->call(Address::Absolute(offset));
 }
 
 void X86Assembler::GetCurrentThread(ManagedRegister tr) {
   fs()->movl(tr.AsX86().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset()));
+             Address::Absolute(Thread::SelfOffset<4>()));
 }
 
 void X86Assembler::GetCurrentThread(FrameOffset offset,
                                     ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
-  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset()));
+  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<4>()));
   movl(Address(ESP, offset), scratch.AsCpuRegister());
 }
 
 void X86Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
   X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset()), Immediate(0));
+  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset<4>()), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
@@ -1836,8 +1834,8 @@
     __ DecreaseFrameSize(stack_adjust_);
   }
   // Pass exception as argument in EAX
-  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset()));
-  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException)));
+  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset<4>()));
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException)));
   // this call should never return
   __ int3();
 #undef __
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index f906a6f..f8fc4c0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -117,7 +117,6 @@
  private:
   byte length_;
   byte encoding_[6];
-  byte padding_;
 
   explicit Operand(Register reg) { SetModRM(3, reg); }
 
@@ -192,21 +191,15 @@
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  static Address Absolute(uword addr) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, ESP);
-      result.SetSIB(TIMES_1, ESP, EBP);
-      result.SetDisp32(addr);
-    } else {
-      result.SetModRM(0, EBP);
-      result.SetDisp32(addr);
-    }
+    result.SetModRM(0, EBP);
+    result.SetDisp32(addr);
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  static Address Absolute(ThreadOffset<4> addr) {
+    return Absolute(addr.Int32Value());
   }
 
  private:
@@ -465,129 +458,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   inline void EmitUint8(uint8_t value);
@@ -637,10 +617,10 @@
 }
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
+class X86ExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  virtual void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const size_t stack_adjust_;
 };
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index fa302c9..7d02c7c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -24,23 +24,29 @@
 namespace art {
 namespace x86_64 {
 
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg) {
+  return os << reg.AsRegister();
+}
+
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg) {
-  return os << "XMM" << static_cast<int>(reg);
+  return os << reg.AsFloatRegister();
 }
 
 std::ostream& operator<<(std::ostream& os, const X87Register& reg) {
   return os << "ST" << static_cast<int>(reg);
 }
 
-void X86_64Assembler::call(Register reg) {
+void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(2, reg);
+  EmitRegisterOperand(2, reg.LowBits());
 }
 
 
 void X86_64Assembler::call(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(2, address);
 }
@@ -54,15 +60,16 @@
 }
 
 
-void X86_64Assembler::pushq(Register reg) {
+void X86_64Assembler::pushq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x50 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x50 + reg.LowBits());
 }
 
 
 void X86_64Assembler::pushq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(6, address);
 }
@@ -80,332 +87,335 @@
 }
 
 
-void X86_64Assembler::popq(Register reg) {
+void X86_64Assembler::popq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x58 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x58 + reg.LowBits());
 }
 
 
 void X86_64Assembler::popq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0x8F);
   EmitOperand(0, address);
 }
 
 
-void X86_64Assembler::movq(Register dst, const Immediate& imm) {
+void X86_64Assembler::movq(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
-  EmitUint8(0xB8 + dst);
+  EmitRex64(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Immediate& imm) {
+void X86_64Assembler::movl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xB8 + dst);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movq(Register dst, Register src) {
+void X86_64Assembler::movq(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(src, dst);  // 0x89 is MR-encoded: src is ModRM.reg, dst is ModRM.rm.
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movl(Register dst, Register src) {
+void X86_64Assembler::movl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(src, dst);  // 0x89 is MR-encoded: src is ModRM.reg, dst is ModRM.rm.
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movq(Register dst, const Address& src) {
+void X86_64Assembler::movq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Address& src) {
+void X86_64Assembler::movl(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movq(const Address& dst, Register src) {
+void X86_64Assembler::movq(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 8);
+  EmitRex64(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movl(const Address& dst, Register src) {
+void X86_64Assembler::movl(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 4);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
-
 void X86_64Assembler::movl(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitUint8(0xC7);
   EmitOperand(0, dst);
   EmitImmediate(imm);
 }
 
-void X86_64Assembler::movl(const Address& dst, Label* lbl) {
+void X86_64Assembler::movzxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xC7);
-  EmitOperand(0, dst);
-  EmitLabel(lbl, dst.length_ + 5);
-}
-
-void X86_64Assembler::movzxb(Register dst, ByteRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxb(Register dst, const Address& src) {
+void X86_64Assembler::movzxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxb(Register dst, ByteRegister src) {
+void X86_64Assembler::movsxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxb(Register dst, const Address& src) {
+void X86_64Assembler::movsxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movb(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movb(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxb or movsxb instead.";
 }
 
 
-void X86_64Assembler::movb(const Address& dst, ByteRegister src) {
+void X86_64Assembler::movb(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(src, dst);
   EmitUint8(0x88);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movb(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xC6);
-  EmitOperand(RAX, dst);
+  EmitOperand(Register::RAX, dst);
   CHECK(imm.is_int8());
   EmitUint8(imm.value() & 0xFF);
 }
 
 
-void X86_64Assembler::movzxw(Register dst, Register src) {
+void X86_64Assembler::movzxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxw(Register dst, const Address& src) {
+void X86_64Assembler::movzxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxw(Register dst, Register src) {
+void X86_64Assembler::movsxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxw(Register dst, const Address& src) {
+void X86_64Assembler::movsxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movw(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movw(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxw or movsxw instead.";
 }
 
 
-void X86_64Assembler::movw(const Address& dst, Register src) {
+void X86_64Assembler::movw(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOperandSizeOverride();
+  EmitOptionalRex32(src, dst);  // REX must directly precede the opcode, after legacy prefixes.
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::leaq(Register dst, const Address& src) {
+void X86_64Assembler::leaq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8D);
-  EmitOperand(dst, src);
-}
-
-
-void X86_64Assembler::cmovl(Condition condition, Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x40 + condition);
-  EmitRegisterOperand(dst, src);
-}
-
-
-void X86_64Assembler::setb(Condition condition, Register dst) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x90 + condition);
-  EmitOperand(0, Operand(dst));
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);  // 0x11 is the MR (store) form: src is ModRM.reg, dst is ModRM.rm.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movd(XmmRegister dst, Register src) {
+void X86_64Assembler::movd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x6E);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::movd(Register dst, XmmRegister src) {
+void X86_64Assembler::movd(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x7E);
-  EmitOperand(src, Operand(dst));
+  EmitOperand(src.LowBits(), Operand(dst));
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -426,258 +436,287 @@
 void X86_64Assembler::movsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movsd(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);  // 0x11 is the MR (store) form: src is ModRM.reg, dst is ModRM.rm.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsi2ss(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtsi2sd(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2sd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtss2sd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtsd2ss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtdq2pd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xE6);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::comiss(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::comisd(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::sqrtss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::andpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x54);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -766,92 +805,102 @@
 }
 
 
-void X86_64Assembler::xchgl(Register dst, Register src) {
+void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x87);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
-void X86_64Assembler::xchgl(Register reg, const Address& address) {
+void X86_64Assembler::xchgl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x87);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Immediate& imm) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(7, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::cmpl(Register reg0, Register reg1) {
+void X86_64Assembler::cmpl(CpuRegister reg0, CpuRegister reg1) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg0, reg1);
   EmitUint8(0x3B);
-  EmitOperand(reg0, Operand(reg1));
+  EmitOperand(reg0.LowBits(), Operand(reg1));
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Address& address) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x3B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::addl(Register dst, Register src) {
+void X86_64Assembler::addl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x03);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::addl(Register reg, const Address& address) {
+void X86_64Assembler::addl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x03);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(const Address& address, Register reg) {
+void X86_64Assembler::cmpl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x39);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::cmpl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(7, address, imm);
 }
 
 
-void X86_64Assembler::testl(Register reg1, Register reg2) {
+void X86_64Assembler::testl(CpuRegister reg1, CpuRegister reg2) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(reg1, reg2, 4);
+  EmitOptionalRex32(reg1, reg2);
   EmitUint8(0x85);
-  EmitRegisterOperand(reg1, reg2);
+  EmitRegisterOperand(reg1.LowBits(), reg2.LowBits());
 }
 
 
-void X86_64Assembler::testl(Register reg, const Immediate& immediate) {
+void X86_64Assembler::testl(CpuRegister reg, const Immediate& immediate) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   // For registers that have a byte variant (RAX, RBX, RCX, and RDX)
-  // we only test the byte register to keep the encoding short.
-  if (immediate.is_uint8() && reg < 4) {
+  // we only test the byte register to keep the encoding short.
+  if (immediate.is_uint8() && reg.AsRegister() < 4) {
     // Use zero-extended 8-bit immediate.
-    if (reg == RAX) {
+    if (reg.AsRegister() == RAX) {
       EmitUint8(0xA8);
     } else {
       EmitUint8(0xF6);
-      EmitUint8(0xC0 + reg);
+      EmitUint8(0xC0 + reg.AsRegister());
     }
     EmitUint8(immediate.value() & 0xFF);
-  } else if (reg == RAX) {
+  } else if (reg.AsRegister() == RAX) {
     // Use short form if the destination is RAX.
     EmitUint8(0xA9);
     EmitImmediate(immediate);
   } else {
+    EmitOptionalRex32(reg);
     EmitUint8(0xF7);
     EmitOperand(0, Operand(reg));
     EmitImmediate(immediate);
@@ -859,136 +908,145 @@
 }
 
 
-void X86_64Assembler::andl(Register dst, Register src) {
+void X86_64Assembler::andl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x23);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::andl(Register dst, const Immediate& imm) {
+void X86_64Assembler::andl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(4, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::orl(Register dst, Register src) {
+void X86_64Assembler::orl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::orl(Register dst, const Immediate& imm) {
+void X86_64Assembler::orl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(1, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::xorl(Register dst, Register src) {
+void X86_64Assembler::xorl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(dst, src, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x33);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
-void X86_64Assembler::rex_reg(Register &dst, size_t size) {
-  Register src = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex_rm(Register &src, size_t size) {
-  Register dst = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex(Register &dst, Register &src, size_t size) {
-  uint8_t rex = 0;
+#if 0
+void X86_64Assembler::rex(bool force, bool w, Register* r, Register* x, Register* b) {
   // REX.WRXB
   // W - 64-bit operand
   // R - MODRM.reg
   // X - SIB.index
   // B - MODRM.rm/SIB.base
-  if (size == 8) {
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
     rex |= 0x48;  // REX.W000
   }
-  if (dst >= Register::R8 && dst < Register::kNumberOfCpuRegisters) {
+  if (r != nullptr && *r >= Register::R8 && *r < Register::kNumberOfCpuRegisters) {
     rex |= 0x44;  // REX.0R00
-    dst = static_cast<Register>(dst - 8);
+    *r = static_cast<Register>(*r - 8);
   }
-  if (src >= Register::R8 && src < Register::kNumberOfCpuRegisters) {
+  if (x != nullptr && *x >= Register::R8 && *x < Register::kNumberOfCpuRegisters) {
+    rex |= 0x42;  // REX.00X0
+    *x = static_cast<Register>(*x - 8);
+  }
+  if (b != nullptr && *b >= Register::R8 && *b < Register::kNumberOfCpuRegisters) {
     rex |= 0x41;  // REX.000B
-    src = static_cast<Register>(src - 8);
+    *b = static_cast<Register>(*b - 8);
   }
   if (rex != 0) {
     EmitUint8(rex);
   }
 }
 
-void X86_64Assembler::addl(Register reg, const Immediate& imm) {
+void X86_64Assembler::rex_reg_mem(bool force, bool w, Register* dst, const Address& mem) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = mem->rex();
+  if (force) {
+    rex |= 0x40;  // REX.0000
+  }
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (dst != nullptr && *dst >= Register::R8 && *dst < Register::kNumberOfCpuRegisters) {
+    rex |= 0x44;  // REX.0R00
+    *dst = static_cast<Register>(*dst - 8);
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void rex_mem_reg(bool force, bool w, Address* mem, Register* src);
+#endif
+
+void X86_64Assembler::addl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addq(Register reg, const Immediate& imm) {
+void X86_64Assembler::addq(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addl(const Address& address, Register reg) {
+void X86_64Assembler::addl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x01);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::addl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(0, address, imm);
 }
 
 
-void X86_64Assembler::adcl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(2, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::adcl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, Operand(src));
-}
-
-
-void X86_64Assembler::adcl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::subl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x2B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::subl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitOptionalRex32(reg);
   EmitComplex(5, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::subl(Register reg, const Address& address) {
+void X86_64Assembler::subl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x2B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
@@ -998,39 +1056,44 @@
 }
 
 
-void X86_64Assembler::idivl(Register reg) {
+void X86_64Assembler::idivl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xF8 | reg);
+  EmitUint8(0xF8 | reg.LowBits());
 }
 
 
-void X86_64Assembler::imull(Register dst, Register src) {
+void X86_64Assembler::imull(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::imull(Register reg, const Immediate& imm) {
+void X86_64Assembler::imull(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, reg);  // reg is both ModRM.reg and ModRM.rm: needs REX.R and REX.B.
   EmitUint8(0x69);
-  EmitOperand(reg, Operand(reg));
+  EmitOperand(reg.LowBits(), Operand(reg));
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::imull(Register reg, const Address& address) {
+void X86_64Assembler::imull(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::imull(Register reg) {
+void X86_64Assembler::imull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(5, Operand(reg));
 }
@@ -1038,13 +1101,15 @@
 
 void X86_64Assembler::imull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(5, address);
 }
 
 
-void X86_64Assembler::mull(Register reg) {
+void X86_64Assembler::mull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(4, Operand(reg));
 }
@@ -1052,106 +1117,56 @@
 
 void X86_64Assembler::mull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(4, address);
 }
 
 
-void X86_64Assembler::sbbl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, Operand(src));
-}
 
-
-void X86_64Assembler::sbbl(Register reg, const Immediate& imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(3, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::sbbl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::incl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x40 + reg);
-}
-
-
-void X86_64Assembler::incl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(0, address);
-}
-
-
-void X86_64Assembler::decl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48 + reg);
-}
-
-
-void X86_64Assembler::decl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(1, address);
-}
-
-
-void X86_64Assembler::shll(Register reg, const Immediate& imm) {
+void X86_64Assembler::shll(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(4, reg, imm);
 }
 
 
-void X86_64Assembler::shll(Register operand, Register shifter) {
+void X86_64Assembler::shll(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(4, operand, shifter);
 }
 
 
-void X86_64Assembler::shrl(Register reg, const Immediate& imm) {
+void X86_64Assembler::shrl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(5, reg, imm);
 }
 
 
-void X86_64Assembler::shrl(Register operand, Register shifter) {
+void X86_64Assembler::shrl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(5, operand, shifter);
 }
 
 
-void X86_64Assembler::sarl(Register reg, const Immediate& imm) {
+void X86_64Assembler::sarl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(7, reg, imm);
 }
 
 
-void X86_64Assembler::sarl(Register operand, Register shifter) {
+void X86_64Assembler::sarl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(7, operand, shifter);
 }
 
 
-void X86_64Assembler::shld(Register dst, Register src) {
+void X86_64Assembler::negl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0xA5);
-  EmitRegisterOperand(src, dst);
-}
-
-
-void X86_64Assembler::negl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(3, Operand(reg));
 }
 
 
-void X86_64Assembler::notl(Register reg) {
+void X86_64Assembler::notl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xD0 | reg);
+  EmitUint8(0xD0 | reg.LowBits());
 }
 
 
@@ -1228,14 +1243,16 @@
 }
 
 
-void X86_64Assembler::jmp(Register reg) {
+void X86_64Assembler::jmp(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(4, reg);
+  EmitRegisterOperand(4, reg.LowBits());
 }
 
 void X86_64Assembler::jmp(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(4, address);
 }
@@ -1268,11 +1285,11 @@
 }
 
 
-void X86_64Assembler::cmpxchgl(const Address& address, Register reg) {
+void X86_64Assembler::cmpxchgl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
   EmitUint8(0xB1);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 void X86_64Assembler::mfence() {
@@ -1289,19 +1306,12 @@
   return this;
 }
 
-void X86_64Assembler::AddImmediate(Register reg, const Immediate& imm) {
+void X86_64Assembler::AddImmediate(CpuRegister reg, const Immediate& imm) {
   int value = imm.value();
-  if (value > 0) {
-    if (value == 1) {
-      incl(reg);
-    } else if (value != 0) {
+  if (value != 0) {
+    if (value > 0) {
       addl(reg, imm);
-    }
-  } else if (value < 0) {
-    value = -value;
-    if (value == 1) {
-      decl(reg);
-    } else if (value != 0) {
+    } else {
-      subl(reg, Immediate(value));
+      subl(reg, Immediate(-value));  // value < 0 here: subtract its magnitude.
     }
   }
@@ -1313,8 +1323,8 @@
   int64_t constant = bit_cast<int64_t, double>(value);
   pushq(Immediate(High32Bits(constant)));
   pushq(Immediate(Low32Bits(constant)));
-  movsd(dst, Address(RSP, 0));
-  addq(RSP, Immediate(2 * kWordSize));
+  movsd(dst, Address(CpuRegister(RSP), 0));
+  addq(CpuRegister(RSP), Immediate(2 * kWordSize));
 }
 
 
@@ -1372,7 +1382,7 @@
 }
 
 
-void X86_64Assembler::EmitOperand(int reg_or_opcode, const Operand& operand) {
+void X86_64Assembler::EmitOperand(uint8_t reg_or_opcode, const Operand& operand) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   const int length = operand.length_;
@@ -1392,9 +1402,9 @@
 }
 
 
-void X86_64Assembler::EmitComplex(int reg_or_opcode,
-                               const Operand& operand,
-                               const Immediate& immediate) {
+void X86_64Assembler::EmitComplex(uint8_t reg_or_opcode,
+                                  const Operand& operand,
+                                  const Immediate& immediate) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   if (immediate.is_int8()) {
@@ -1402,7 +1412,7 @@
     EmitUint8(0x83);
     EmitOperand(reg_or_opcode, operand);
     EmitUint8(immediate.value() & 0xFF);
-  } else if (operand.IsRegister(RAX)) {
+  } else if (operand.IsRegister(CpuRegister(RAX))) {
     // Use short form if the destination is eax.
     EmitUint8(0x05 + (reg_or_opcode << 3));
     EmitImmediate(immediate);
@@ -1434,7 +1444,7 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register reg,
+                                    CpuRegister reg,
                                     const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   CHECK(imm.is_int8());
@@ -1450,41 +1460,146 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register operand,
-                                    Register shifter) {
+                                    CpuRegister operand,
+                                    CpuRegister shifter) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  CHECK_EQ(shifter, RCX);
+  CHECK_EQ(shifter.AsRegister(), RCX);
   EmitUint8(0xD3);
   EmitOperand(reg_or_opcode, Operand(operand));
 }
 
+void X86_64Assembler::EmitOptionalRex(bool force, bool w, bool r, bool x, bool b) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (r) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (x) {
+    rex |= 0x42;  // REX.00X0
+  }
+  if (b) {
+    rex |= 0x41;  // REX.000B
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister reg) {
+  EmitOptionalRex(false, false, false, false, reg.NeedsRex());  // reg is ModRM.rm: use REX.B.
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(const Operand& operand) {
+  uint8_t rex = operand.rex();
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, const Operand& operand) {
+  uint8_t rex = operand.rex();
+  if (dst.NeedsRex()) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, const Operand& operand) {
+  uint8_t rex = operand.rex();
+  if (dst.NeedsRex()) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister reg) {
+  EmitOptionalRex(false, true, false, false, reg.NeedsRex());  // reg is ModRM.rm: use REX.B.
+}
+void X86_64Assembler::EmitRex64(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, true, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister dst, const Operand& operand) {
+  uint8_t rex = 0x48 | operand.rex();  // REX.W000
+  if (dst.NeedsRex()) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(true, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand) {
+  uint8_t rex = 0x40 | operand.rex();  // REX.0000
+  if (dst.NeedsRex()) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+constexpr size_t kFramePointerSize = 8;
+
 void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                              const std::vector<ManagedRegister>& spill_regs,
-                              const ManagedRegisterEntrySpills& entry_spills) {
+                                 const std::vector<ManagedRegister>& spill_regs,
+                                 const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
     pushq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
   // return address then method on stack
-  addq(RSP, Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
-                      kPointerSize /*method*/ + kPointerSize /*return address*/));
+  addq(CpuRegister(RSP), Immediate(-frame_size + (spill_regs.size() * kFramePointerSize) +
+                                   kFramePointerSize /*method*/ + kFramePointerSize /*return address*/));
   pushq(method_reg.AsX86_64().AsCpuRegister());
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     ManagedRegisterSpill spill = entry_spills.at(i);
     if (spill.AsX86_64().IsCpuRegister()) {
       if (spill.getSize() == 8) {
-        movq(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()),
+             spill.AsX86_64().AsCpuRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movl(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movl(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
       }
     } else {
       if (spill.getSize() == 8) {
-        movsd(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movsd(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movss(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movss(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       }
     }
   }
@@ -1493,7 +1608,7 @@
 void X86_64Assembler::RemoveFrame(size_t frame_size,
                             const std::vector<ManagedRegister>& spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
-  addq(RSP, Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
+  addq(CpuRegister(RSP), Immediate(frame_size - (spill_regs.size() * kFramePointerSize) - kFramePointerSize));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     popq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
@@ -1502,12 +1617,12 @@
 
 void X86_64Assembler::IncreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(-adjust));
+  addq(CpuRegister(RSP), Immediate(-adjust));
 }
 
 void X86_64Assembler::DecreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(adjust));
+  addq(CpuRegister(RSP), Immediate(adjust));
 }
 
 void X86_64Assembler::Store(FrameOffset offs, ManagedRegister msrc, size_t size) {
@@ -1517,28 +1632,28 @@
   } else if (src.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(Address(RSP, offs), src.AsCpuRegister());
+      movl(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     } else {
       CHECK_EQ(8u, size);
-      movq(Address(RSP, offs), src.AsCpuRegister());
+      movq(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     }
   } else if (src.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(Address(RSP, offs), src.AsRegisterPairLow());
-    movq(Address(RSP, FrameOffset(offs.Int32Value()+4)),
+    movq(Address(CpuRegister(RSP), offs), src.AsRegisterPairLow());
+    movq(Address(CpuRegister(RSP), FrameOffset(offs.Int32Value()+4)),
          src.AsRegisterPairHigh());
   } else if (src.IsX87Register()) {
     if (size == 4) {
-      fstps(Address(RSP, offs));
+      fstps(Address(CpuRegister(RSP), offs));
     } else {
-      fstpl(Address(RSP, offs));
+      fstpl(Address(CpuRegister(RSP), offs));
     }
   } else {
     CHECK(src.IsXmmRegister());
     if (size == 4) {
-      movss(Address(RSP, offs), src.AsXmmRegister());
+      movss(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     } else {
-      movsd(Address(RSP, offs), src.AsXmmRegister());
+      movsd(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     }
   }
 }
@@ -1546,40 +1661,36 @@
 void X86_64Assembler::StoreRef(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreRawPtr(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                         ManagedRegister) {
-  movl(Address(RSP, dest), Immediate(imm));  // TODO(64) movq?
+                                            ManagedRegister) {
+  movl(Address(CpuRegister(RSP), dest), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                          ManagedRegister) {
+void X86_64Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                               ManagedRegister) {
   gs()->movl(Address::Absolute(dest, true), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                            FrameOffset fr_offs,
-                                            ManagedRegister mscratch) {
+void X86_64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                                 FrameOffset fr_offs,
+                                                 ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  leaq(scratch.AsCpuRegister(), Address(RSP, fr_offs));
+  leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), fr_offs));
   gs()->movq(Address::Absolute(thr_offs, true), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
-  gs()->movq(Address::Absolute(thr_offs, true), RSP);
-}
-
-void X86_64Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  gs()->movl(Address::Absolute(thr_offs, true), lbl);  // TODO(64) movq?
+void X86_64Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  gs()->movq(Address::Absolute(thr_offs, true), CpuRegister(RSP));
 }
 
 void X86_64Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
@@ -1594,42 +1705,41 @@
   } else if (dest.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(dest.AsCpuRegister(), Address(RSP, src));
+      movl(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     } else {
       CHECK_EQ(8u, size);
-      movq(dest.AsCpuRegister(), Address(RSP, src));
+      movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     }
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(dest.AsRegisterPairLow(), Address(RSP, src));
-    movq(dest.AsRegisterPairHigh(), Address(RSP, FrameOffset(src.Int32Value()+4)));
+    movq(dest.AsRegisterPairLow(), Address(CpuRegister(RSP), src));
+    movq(dest.AsRegisterPairHigh(), Address(CpuRegister(RSP), FrameOffset(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
-      flds(Address(RSP, src));
+      flds(Address(CpuRegister(RSP), src));
     } else {
-      fldl(Address(RSP, src));
+      fldl(Address(CpuRegister(RSP), src));
     }
   } else {
     CHECK(dest.IsXmmRegister());
     if (size == 4) {
-      movss(dest.AsXmmRegister(), Address(RSP, src));
+      movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     } else {
-      movsd(dest.AsXmmRegister(), Address(RSP, src));
+      movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     }
   }
 }
 
-void X86_64Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86_64Assembler::LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
   } else if (dest.IsCpuRegister()) {
     CHECK_EQ(4u, size);
-    gs()->movq(dest.AsCpuRegister(), Address::Absolute(src, true));
+    gs()->movl(dest.AsCpuRegister(), Address::Absolute(src, true));
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     gs()->movq(dest.AsRegisterPairLow(), Address::Absolute(src, true));
-    gs()->movq(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4), true));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       gs()->flds(Address::Absolute(src, true));
@@ -1649,7 +1759,7 @@
 void X86_64Assembler::LoadRef(ManagedRegister mdest, FrameOffset  src) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
-  movq(dest.AsCpuRegister(), Address(RSP, src));
+  movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
 }
 
 void X86_64Assembler::LoadRef(ManagedRegister mdest, ManagedRegister base,
@@ -1666,8 +1776,7 @@
   movq(dest.AsCpuRegister(), Address(base.AsX86_64().AsCpuRegister(), offs));
 }
 
-void X86_64Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86_64Assembler::LoadRawPtrFromThread64(ManagedRegister mdest, ThreadOffset<8> offs) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
   gs()->movq(dest.AsCpuRegister(), Address::Absolute(offs, true));
@@ -1678,7 +1787,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movsxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movsxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movsxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1689,7 +1798,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movzxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movzxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movzxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1703,17 +1812,17 @@
       movq(dest.AsCpuRegister(), src.AsCpuRegister());
     } else if (src.IsX87Register() && dest.IsXmmRegister()) {
       // Pass via stack and pop X87 register
-      subl(RSP, Immediate(16));
+      subl(CpuRegister(RSP), Immediate(16));
       if (size == 4) {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstps(Address(RSP, 0));
-        movss(dest.AsXmmRegister(), Address(RSP, 0));
+        fstps(Address(CpuRegister(RSP), 0));
+        movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       } else {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstpl(Address(RSP, 0));
-        movsd(dest.AsXmmRegister(), Address(RSP, 0));
+        fstpl(Address(CpuRegister(RSP), 0));
+        movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       }
-      addq(RSP, Immediate(16));
+      addq(CpuRegister(RSP), Immediate(16));
     } else {
       // TODO: x87, SSE
       UNIMPLEMENTED(FATAL) << ": Move " << dest << ", " << src;
@@ -1725,22 +1834,22 @@
                            ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  movl(scratch.AsCpuRegister(), Address(RSP, src));
-  movl(Address(RSP, dest), scratch.AsCpuRegister());
+  movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), src));
+  movl(Address(CpuRegister(RSP), dest), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
-                                        ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                             ThreadOffset<8> thr_offs,
+                                             ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   gs()->movq(scratch.AsCpuRegister(), Address::Absolute(thr_offs, true));
   Store(fr_offs, scratch, 8);
 }
 
-void X86_64Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
-                                      FrameOffset fr_offs,
-                                      ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   Load(scratch, fr_offs, 8);
@@ -1771,17 +1880,17 @@
                         ManagedRegister scratch, size_t size) {
   CHECK(scratch.IsNoRegister());
   CHECK_EQ(size, 4u);
-  pushq(Address(RSP, src));
+  pushq(Address(CpuRegister(RSP), src));
   popq(Address(dest_base.AsX86_64().AsCpuRegister(), dest_offset));
 }
 
 void X86_64Assembler::Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
-  movq(scratch, Address(RSP, src_base));
+  movq(scratch, Address(CpuRegister(RSP), src_base));
   movq(scratch, Address(scratch, src_offset));
-  movq(Address(RSP, dest), scratch);
+  movq(Address(CpuRegister(RSP), dest), scratch);
 }
 
 void X86_64Assembler::Copy(ManagedRegister dest, Offset dest_offset,
@@ -1795,10 +1904,10 @@
 
 void X86_64Assembler::Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
   CHECK_EQ(dest.Int32Value(), src.Int32Value());
-  movq(scratch, Address(RSP, src));
+  movq(scratch, Address(CpuRegister(RSP), src));
   pushq(Address(scratch, src_offset));
   popq(Address(scratch, dest_offset));
 }
@@ -1818,7 +1927,7 @@
     // Use out_reg as indicator of NULL
     in_reg = out_reg;
     // TODO: movzwl
-    movl(in_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(in_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   CHECK(in_reg.IsCpuRegister());
   CHECK(out_reg.IsCpuRegister());
@@ -1830,10 +1939,10 @@
     }
     testl(in_reg.AsCpuRegister(), in_reg.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
 }
 
@@ -1845,13 +1954,13 @@
   CHECK(scratch.IsCpuRegister());
   if (null_allowed) {
     Label null_arg;
-    movl(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     testl(scratch.AsCpuRegister(), scratch.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   Store(out_off, scratch, 8);
 }
@@ -1889,35 +1998,42 @@
 }
 
 void X86_64Assembler::Call(FrameOffset base, Offset offset, ManagedRegister mscratch) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
-  movq(scratch, Address(RSP, base));
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
+  movq(scratch, Address(CpuRegister(RSP), base));
   call(Address(scratch, offset));
 }
 
-void X86_64Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86_64Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister /*mscratch*/) {
   gs()->call(Address::Absolute(offset, true));
 }
 
 void X86_64Assembler::GetCurrentThread(ManagedRegister tr) {
-  gs()->movq(tr.AsX86_64().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset(), true));
+  gs()->movq(tr.AsX86_64().AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
 }
 
-void X86_64Assembler::GetCurrentThread(FrameOffset offset,
-                                    ManagedRegister mscratch) {
+void X86_64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
-  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset(), true));
-  movq(Address(RSP, offset), scratch.AsCpuRegister());
+  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
+  movq(Address(CpuRegister(RSP), offset), scratch.AsCpuRegister());
 }
 
+// Slowpath entered when Thread::Current()->exception_ is non-null.
+class X86_64ExceptionSlowPath FINAL : public SlowPath {
+ public:
+  explicit X86_64ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
+  virtual void Emit(Assembler *sp_asm) OVERRIDE;
+ private:
+  const size_t stack_adjust_;
+};
+
 void X86_64Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
-  X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
+  X86_64ExceptionSlowPath* slow = new X86_64ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset(), true), Immediate(0));
+  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset<8>(), true), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
-void X86ExceptionSlowPath::Emit(Assembler *sasm) {
+void X86_64ExceptionSlowPath::Emit(Assembler *sasm) {
   X86_64Assembler* sp_asm = down_cast<X86_64Assembler*>(sasm);
 #define __ sp_asm->
   __ Bind(&entry_);
@@ -1925,27 +2041,14 @@
   if (stack_adjust_ != 0) {  // Fix up the frame.
     __ DecreaseFrameSize(stack_adjust_);
   }
-  // Pass exception as argument in RAX
-  __ gs()->movq(RAX, Address::Absolute(Thread::ExceptionOffset(), true));  // TODO(64): Pass argument via RDI
-  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException), true));
+  // Pass exception as argument in RDI
+  __ gs()->movq(CpuRegister(RDI), Address::Absolute(Thread::ExceptionOffset<8>(), true));
+  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(8, pDeliverException), true));
   // this call should never return
   __ int3();
 #undef __
 }
 
-static const char* kRegisterNames[] = {
-  "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
-  "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
-};
-
-std::ostream& operator<<(std::ostream& os, const Register& rhs) {
-  if (rhs >= RAX && rhs <= R15) {
-    os << kRegisterNames[rhs];
-  } else {
-    os << "Register[" << static_cast<int>(rhs) << "]";
-  }
-  return os;
-}
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index d48ba72..4738dcb 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -68,6 +68,10 @@
     return static_cast<Register>(encoding_at(1) & 7);
   }
 
+  uint8_t rex() const {
+    return rex_;
+  }
+
   int8_t disp8() const {
     CHECK_GE(length_, 2);
     return static_cast<int8_t>(encoding_[length_ - 1]);
@@ -80,25 +84,36 @@
     return value;
   }
 
-  bool IsRegister(Register reg) const {
+  bool IsRegister(CpuRegister reg) const {
     return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
+        && ((encoding_[0] & 0x07) == reg.LowBits())  // Register codes match.
+        && (reg.NeedsRex() == ((rex_ & 1) != 0));  // REX.000B bits match.
   }
 
  protected:
   // Operand can be sub classed (e.g: Address).
-  Operand() : length_(0) { }
+  Operand() : rex_(0), length_(0) { }
 
-  void SetModRM(int mod, Register rm) {
+  void SetModRM(uint8_t mod, CpuRegister rm) {
     CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
+    if (rm.NeedsRex()) {
+      rex_ |= 0x41;  // REX.000B
+    }
+    encoding_[0] = (mod << 6) | rm.LowBits();
     length_ = 1;
   }
 
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
+  void SetSIB(ScaleFactor scale, CpuRegister index, CpuRegister base) {
     CHECK_EQ(length_, 1);
     CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
+    if (base.NeedsRex()) {
+      rex_ |= 0x41;  // REX.000B
+    }
+    if (index.NeedsRex()) {
+      rex_ |= 0x42;  // REX.00X0
+    }
+    encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.LowBits()) << 3) |
+        static_cast<uint8_t>(base.LowBits());  // Bit 3 is carried by REX.X/REX.B.
     length_ = 2;
   }
 
@@ -115,11 +130,11 @@
   }
 
  private:
-  byte length_;
-  byte encoding_[6];
-  byte padding_;
+  uint8_t rex_;
+  uint8_t length_;
+  uint8_t encoding_[6];
 
-  explicit Operand(Register reg) { SetModRM(3, reg); }
+  explicit Operand(CpuRegister reg) { SetModRM(3, reg); }
 
   // Get the operand encoding byte at the given index.
   uint8_t encoding_at(int index) const {
@@ -136,77 +151,85 @@
 
 class Address : public Operand {
  public:
-  Address(Register base, int32_t disp) {
+  Address(CpuRegister base, int32_t disp) {
     Init(base, disp);
   }
 
-  Address(Register base, Offset disp) {
+  Address(CpuRegister base, Offset disp) {
     Init(base, disp.Int32Value());
   }
 
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, RSP);
-    Init(RSP, disp.Int32Value());
+  Address(CpuRegister base, FrameOffset disp) {
+    CHECK_EQ(base.AsRegister(), RSP);
+    Init(CpuRegister(RSP), disp.Int32Value());
   }
 
-  Address(Register base, MemberOffset disp) {
+  Address(CpuRegister base, MemberOffset disp) {
     Init(base, disp.Int32Value());
   }
 
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != RBP) {
+  void Init(CpuRegister base, int32_t disp) {
+    if (disp == 0 && base.LowBits() != RBP) {  // rm=101 with mod=0 is not [base] (RBP/R13).
       SetModRM(0, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.LowBits() == RSP) {  // rm=100 selects a SIB byte (RSP/R12).
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
     } else if (disp >= -128 && disp <= 127) {
       SetModRM(1, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.LowBits() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp8(disp);
     } else {
       SetModRM(2, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.LowBits() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp32(disp);
     }
   }
 
 
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    SetModRM(0, RSP);
-    SetSIB(scale, index, RBP);
+  Address(CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    SetModRM(0, CpuRegister(RSP));
+    SetSIB(scale, index, CpuRegister(RBP));
     SetDisp32(disp);
   }
 
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    if (disp == 0 && base != RBP) {
-      SetModRM(0, RSP);
+  Address(CpuRegister base, CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    if (disp == 0 && base.LowBits() != RBP) {  // Base bits 101 (RBP/R13) need a disp.
+      SetModRM(0, CpuRegister(RSP));
       SetSIB(scale, index, base);
     } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, RSP);
+      SetModRM(1, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp8(disp);
     } else {
-      SetModRM(2, RSP);
+      SetModRM(2, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp32(disp);
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(uword addr, bool no_rip = false) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, RSP);
-      result.SetSIB(TIMES_1, RSP, RBP);
+    if (no_rip) {
+      result.SetModRM(0, CpuRegister(RSP));
+      result.SetSIB(TIMES_1, CpuRegister(RSP), CpuRegister(RBP));
       result.SetDisp32(addr);
     } else {
-      result.SetModRM(0, RBP);
+      result.SetModRM(0, CpuRegister(RBP));
       result.SetDisp32(addr);
     }
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(ThreadOffset<8> addr, bool no_rip = false) {
+    return Absolute(addr.Int32Value(), no_rip);
   }
 
  private:
@@ -216,7 +239,7 @@
 };
 
 
-class X86_64Assembler : public Assembler {
+class X86_64Assembler FINAL : public Assembler {
  public:
   X86_64Assembler() {}
   virtual ~X86_64Assembler() {}
@@ -224,56 +247,51 @@
   /*
    * Emit Machine Instructions.
    */
-  void call(Register reg);
+  void call(CpuRegister reg);
   void call(const Address& address);
   void call(Label* label);
 
-  void pushq(Register reg);
+  void pushq(CpuRegister reg);
   void pushq(const Address& address);
   void pushq(const Immediate& imm);
 
-  void popq(Register reg);
+  void popq(CpuRegister reg);
   void popq(const Address& address);
 
-  void movq(Register dst, const Immediate& src);
-  void movl(Register dst, const Immediate& src);
-  void movq(Register dst, Register src);
-  void movl(Register dst, Register src);
+  void movq(CpuRegister dst, const Immediate& src);
+  void movl(CpuRegister dst, const Immediate& src);
+  void movq(CpuRegister dst, CpuRegister src);
+  void movl(CpuRegister dst, CpuRegister src);
 
-  void movq(Register dst, const Address& src);
-  void movl(Register dst, const Address& src);
-  void movq(const Address& dst, Register src);
-  void movl(const Address& dst, Register src);
+  void movq(CpuRegister dst, const Address& src);
+  void movl(CpuRegister dst, const Address& src);
+  void movq(const Address& dst, CpuRegister src);
+  void movl(const Address& dst, CpuRegister src);
   void movl(const Address& dst, const Immediate& imm);
-  void movl(const Address& dst, Label* lbl);
 
-  void movzxb(Register dst, ByteRegister src);
-  void movzxb(Register dst, const Address& src);
-  void movsxb(Register dst, ByteRegister src);
-  void movsxb(Register dst, const Address& src);
-  void movb(Register dst, const Address& src);
-  void movb(const Address& dst, ByteRegister src);
+  void movzxb(CpuRegister dst, CpuRegister src);
+  void movzxb(CpuRegister dst, const Address& src);
+  void movsxb(CpuRegister dst, CpuRegister src);
+  void movsxb(CpuRegister dst, const Address& src);
+  void movb(CpuRegister dst, const Address& src);
+  void movb(const Address& dst, CpuRegister src);
   void movb(const Address& dst, const Immediate& imm);
 
-  void movzxw(Register dst, Register src);
-  void movzxw(Register dst, const Address& src);
-  void movsxw(Register dst, Register src);
-  void movsxw(Register dst, const Address& src);
-  void movw(Register dst, const Address& src);
-  void movw(const Address& dst, Register src);
+  void movzxw(CpuRegister dst, CpuRegister src);
+  void movzxw(CpuRegister dst, const Address& src);
+  void movsxw(CpuRegister dst, CpuRegister src);
+  void movsxw(CpuRegister dst, const Address& src);
+  void movw(CpuRegister dst, const Address& src);
+  void movw(const Address& dst, CpuRegister src);
 
-  void leaq(Register dst, const Address& src);
-
-  void cmovl(Condition condition, Register dst, Register src);
-
-  void setb(Condition condition, Register dst);
+  void leaq(CpuRegister dst, const Address& src);
 
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
 
-  void movd(XmmRegister dst, Register src);
-  void movd(Register dst, XmmRegister src);
+  void movd(XmmRegister dst, CpuRegister src);
+  void movd(CpuRegister dst, XmmRegister src);
 
   void addss(XmmRegister dst, XmmRegister src);
   void addss(XmmRegister dst, const Address& src);
@@ -297,17 +315,17 @@
   void divsd(XmmRegister dst, XmmRegister src);
   void divsd(XmmRegister dst, const Address& src);
 
-  void cvtsi2ss(XmmRegister dst, Register src);
-  void cvtsi2sd(XmmRegister dst, Register src);
+  void cvtsi2ss(XmmRegister dst, CpuRegister src);
+  void cvtsi2sd(XmmRegister dst, CpuRegister src);
 
-  void cvtss2si(Register dst, XmmRegister src);
+  void cvtss2si(CpuRegister dst, XmmRegister src);
   void cvtss2sd(XmmRegister dst, XmmRegister src);
 
-  void cvtsd2si(Register dst, XmmRegister src);
+  void cvtsd2si(CpuRegister dst, XmmRegister src);
   void cvtsd2ss(XmmRegister dst, XmmRegister src);
 
-  void cvttss2si(Register dst, XmmRegister src);
-  void cvttsd2si(Register dst, XmmRegister src);
+  void cvttss2si(CpuRegister dst, XmmRegister src);
+  void cvttsd2si(CpuRegister dst, XmmRegister src);
 
   void cvtdq2pd(XmmRegister dst, XmmRegister src);
 
@@ -344,77 +362,62 @@
   void fcos();
   void fptan();
 
-  void xchgl(Register dst, Register src);
-  void xchgl(Register reg, const Address& address);
+  void xchgl(CpuRegister dst, CpuRegister src);
+  void xchgl(CpuRegister reg, const Address& address);
 
-  void cmpl(Register reg, const Immediate& imm);
-  void cmpl(Register reg0, Register reg1);
-  void cmpl(Register reg, const Address& address);
+  void cmpl(CpuRegister reg, const Immediate& imm);
+  void cmpl(CpuRegister reg0, CpuRegister reg1);
+  void cmpl(CpuRegister reg, const Address& address);
 
-  void cmpl(const Address& address, Register reg);
+  void cmpl(const Address& address, CpuRegister reg);
   void cmpl(const Address& address, const Immediate& imm);
 
-  void testl(Register reg1, Register reg2);
-  void testl(Register reg, const Immediate& imm);
+  void testl(CpuRegister reg1, CpuRegister reg2);
+  void testl(CpuRegister reg, const Immediate& imm);
 
-  void andl(Register dst, const Immediate& imm);
-  void andl(Register dst, Register src);
+  void andl(CpuRegister dst, const Immediate& imm);
+  void andl(CpuRegister dst, CpuRegister src);
 
-  void orl(Register dst, const Immediate& imm);
-  void orl(Register dst, Register src);
+  void orl(CpuRegister dst, const Immediate& imm);
+  void orl(CpuRegister dst, CpuRegister src);
 
-  void xorl(Register dst, Register src);
+  void xorl(CpuRegister dst, CpuRegister src);
 
-  void addl(Register dst, Register src);
-  void addq(Register reg, const Immediate& imm);
-  void addl(Register reg, const Immediate& imm);
-  void addl(Register reg, const Address& address);
+  void addl(CpuRegister dst, CpuRegister src);
+  void addq(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Address& address);
 
-  void addl(const Address& address, Register reg);
+  void addl(const Address& address, CpuRegister reg);
   void addl(const Address& address, const Immediate& imm);
 
-  void adcl(Register dst, Register src);
-  void adcl(Register reg, const Immediate& imm);
-  void adcl(Register dst, const Address& address);
-
-  void subl(Register dst, Register src);
-  void subl(Register reg, const Immediate& imm);
-  void subl(Register reg, const Address& address);
+  void subl(CpuRegister dst, CpuRegister src);
+  void subl(CpuRegister reg, const Immediate& imm);
+  void subl(CpuRegister reg, const Address& address);
 
   void cdq();
 
-  void idivl(Register reg);
+  void idivl(CpuRegister reg);
 
-  void imull(Register dst, Register src);
-  void imull(Register reg, const Immediate& imm);
-  void imull(Register reg, const Address& address);
+  void imull(CpuRegister dst, CpuRegister src);
+  void imull(CpuRegister reg, const Immediate& imm);
+  void imull(CpuRegister reg, const Address& address);
 
-  void imull(Register reg);
+  void imull(CpuRegister reg);
   void imull(const Address& address);
 
-  void mull(Register reg);
+  void mull(CpuRegister reg);
   void mull(const Address& address);
 
-  void sbbl(Register dst, Register src);
-  void sbbl(Register reg, const Immediate& imm);
-  void sbbl(Register reg, const Address& address);
+  void shll(CpuRegister reg, const Immediate& imm);
+  void shll(CpuRegister operand, CpuRegister shifter);
+  void shrl(CpuRegister reg, const Immediate& imm);
+  void shrl(CpuRegister operand, CpuRegister shifter);
+  void sarl(CpuRegister reg, const Immediate& imm);
+  void sarl(CpuRegister operand, CpuRegister shifter);
 
-  void incl(Register reg);
-  void incl(const Address& address);
-
-  void decl(Register reg);
-  void decl(const Address& address);
-
-  void shll(Register reg, const Immediate& imm);
-  void shll(Register operand, Register shifter);
-  void shrl(Register reg, const Immediate& imm);
-  void shrl(Register operand, Register shifter);
-  void sarl(Register reg, const Immediate& imm);
-  void sarl(Register operand, Register shifter);
-  void shld(Register dst, Register src);
-
-  void negl(Register reg);
-  void notl(Register reg);
+  void negl(CpuRegister reg);
+  void notl(CpuRegister reg);
 
   void enter(const Immediate& imm);
   void leave();
@@ -428,12 +431,12 @@
 
   void j(Condition condition, Label* label);
 
-  void jmp(Register reg);
+  void jmp(CpuRegister reg);
   void jmp(const Address& address);
   void jmp(Label* label);
 
   X86_64Assembler* lock();
-  void cmpxchgl(const Address& address, Register reg);
+  void cmpxchgl(const Address& address, CpuRegister reg);
 
   void mfence();
 
@@ -443,7 +446,7 @@
   // Macros for High-level operations.
   //
 
-  void AddImmediate(Register reg, const Immediate& imm);
+  void AddImmediate(CpuRegister reg, const Immediate& imm);
 
   void LoadDoubleConstant(XmmRegister dst, double value);
 
@@ -452,7 +455,7 @@
 
   void DoubleAbs(XmmRegister reg);
 
-  void LockCmpxchgl(const Address& address, Register reg) {
+  void LockCmpxchgl(const Address& address, CpuRegister reg) {
     lock()->cmpxchgl(address, reg);
   }
 
@@ -468,109 +471,99 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
   virtual void LoadReferenceFromSirt(ManagedRegister dst,
@@ -578,40 +571,57 @@
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
-  inline void EmitUint8(uint8_t value);
-  inline void EmitInt32(int32_t value);
-  inline void EmitRegisterOperand(int rm, int reg);
-  inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
-  inline void EmitFixup(AssemblerFixup* fixup);
-  inline void EmitOperandSizeOverride();
+  void EmitUint8(uint8_t value);
+  void EmitInt32(int32_t value);
+  void EmitRegisterOperand(uint8_t rm, uint8_t reg);
+  void EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg);
+  void EmitFixup(AssemblerFixup* fixup);
+  void EmitOperandSizeOverride();
 
-  void EmitOperand(int rm, const Operand& operand);
+  void EmitOperand(uint8_t rm, const Operand& operand);
   void EmitImmediate(const Immediate& imm);
-  void EmitComplex(int rm, const Operand& operand, const Immediate& immediate);
+  void EmitComplex(uint8_t rm, const Operand& operand, const Immediate& immediate);
   void EmitLabel(Label* label, int instruction_size);
   void EmitLabelLink(Label* label);
   void EmitNearLabelLink(Label* label);
 
-  void EmitGenericShift(int rm, Register reg, const Immediate& imm);
-  void EmitGenericShift(int rm, Register operand, Register shifter);
-  void rex(Register &dst, Register &src, size_t size = 4);
-  void rex_reg(Register &dst, size_t size = 4);
-  void rex_rm(Register &src, size_t size = 4);
+  void EmitGenericShift(int rm, CpuRegister reg, const Immediate& imm);
+  void EmitGenericShift(int rm, CpuRegister operand, CpuRegister shifter);
+
+  // If any input is true, output the necessary REX prefix.
+  void EmitOptionalRex(bool force, bool w, bool r, bool x, bool b);
+
+  // Emit a REX prefix byte if necessary for reg, i.e. if reg is a register in the range R8 to R15.
+  void EmitOptionalRex32(CpuRegister reg);
+  void EmitOptionalRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalRex32(XmmRegister dst, XmmRegister src);
+  void EmitOptionalRex32(CpuRegister dst, XmmRegister src);
+  void EmitOptionalRex32(XmmRegister dst, CpuRegister src);
+  void EmitOptionalRex32(const Operand& operand);
+  void EmitOptionalRex32(CpuRegister dst, const Operand& operand);
+  void EmitOptionalRex32(XmmRegister dst, const Operand& operand);
+
+  // Emit a REX.W prefix plus necessary register bit encodings.
+  void EmitRex64(CpuRegister reg);
+  void EmitRex64(CpuRegister dst, CpuRegister src);
+  void EmitRex64(CpuRegister dst, const Operand& operand);
+
+  // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
   DISALLOW_COPY_AND_ASSIGN(X86_64Assembler);
 };
@@ -624,14 +634,14 @@
   buffer_.Emit<int32_t>(value);
 }
 
-inline void X86_64Assembler::EmitRegisterOperand(int rm, int reg) {
+inline void X86_64Assembler::EmitRegisterOperand(uint8_t rm, uint8_t reg) {
   CHECK_GE(rm, 0);
   CHECK_LT(rm, 8);
   buffer_.Emit<uint8_t>(0xC0 + (rm << 3) + reg);
 }
 
-inline void X86_64Assembler::EmitXmmRegisterOperand(int rm, XmmRegister reg) {
-  EmitRegisterOperand(rm, static_cast<Register>(reg));
+inline void X86_64Assembler::EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg) {
+  EmitRegisterOperand(rm, static_cast<uint8_t>(reg.AsFloatRegister()));
 }
 
 inline void X86_64Assembler::EmitFixup(AssemblerFixup* fixup) {
@@ -642,15 +652,6 @@
   EmitUint8(0x66);
 }
 
-// Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
- public:
-  explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
- private:
-  const size_t stack_adjust_;
-};
-
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 3340802..58a0379 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -27,30 +27,37 @@
 namespace art {
 namespace x86_64 {
 
-enum ByteRegister {
-  AL = 0,
-  CL = 1,
-  DL = 2,
-  BL = 3,
-  AH = 4,
-  CH = 5,
-  DH = 6,
-  BH = 7,
-  kNoByteRegister = -1  // Signals an illegal register.
+class CpuRegister {  // Typed wrapper around a general-purpose Register value.
+ public:
+  explicit CpuRegister(Register r) : reg_(r) {}
+  Register AsRegister() const {  // Returns the wrapped Register unchanged.
+    return reg_;
+  }
+  uint8_t LowBits() const {  // Low three bits of the register number.
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {  // True when the register number is above 7.
+    return reg_ > 7;
+  }
+ private:
+  const Register reg_;  // Wrapped register; fixed at construction.
 };
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg);
 
-
-enum XmmRegister {
-  _XMM0 = 0,
-  _XMM1 = 1,
-  _XMM2 = 2,
-  _XMM3 = 3,
-  _XMM4 = 4,
-  _XMM5 = 5,
-  _XMM6 = 6,
-  _XMM7 = 7,
-  kNumberOfXmmRegisters = 8,
-  kNoXmmRegister = -1  // Signals an illegal register.
+class XmmRegister {  // Typed wrapper around a FloatRegister value.
+ public:
+  explicit XmmRegister(FloatRegister r) : reg_(r) {}
+  FloatRegister AsFloatRegister() const {  // Returns the wrapped FloatRegister unchanged.
+    return reg_;
+  }
+  uint8_t LowBits() const {  // Low three bits of the register number.
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {  // True when the register number is above 7.
+    return reg_ > 7;
+  }
+ private:
+  const FloatRegister reg_;  // Wrapped register; fixed at construction.
 };
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg);
 
diff --git a/compiler/utils/x86_64/managed_register_x86_64.cc b/compiler/utils/x86_64/managed_register_x86_64.cc
index 057a894..b8c2db2 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.cc
+++ b/compiler/utils/x86_64/managed_register_x86_64.cc
@@ -60,8 +60,8 @@
   CHECK(other.IsValidManagedRegister());
   if (Equals(other)) return true;
   if (IsRegisterPair()) {
-    Register low = AsRegisterPairLow();
-    Register high = AsRegisterPairHigh();
+    Register low = AsRegisterPairLow().AsRegister();
+    Register high = AsRegisterPairHigh().AsRegister();
     return X86_64ManagedRegister::FromCpuRegister(low).Overlaps(other) ||
         X86_64ManagedRegister::FromCpuRegister(high).Overlaps(other);
   }
@@ -94,11 +94,11 @@
   if (!IsValidManagedRegister()) {
     os << "No Register";
   } else if (IsXmmRegister()) {
-    os << "XMM: " << static_cast<int>(AsXmmRegister());
+    os << "XMM: " << static_cast<int>(AsXmmRegister().AsFloatRegister());
   } else if (IsX87Register()) {
     os << "X87: " << static_cast<int>(AsX87Register());
   } else if (IsCpuRegister()) {
-    os << "CPU: " << static_cast<int>(AsCpuRegister());
+    os << "CPU: " << static_cast<int>(AsCpuRegister().AsRegister());
   } else if (IsRegisterPair()) {
     os << "Pair: " << AsRegisterPairLow() << ", " << AsRegisterPairHigh();
   } else {
diff --git a/compiler/utils/x86_64/managed_register_x86_64.h b/compiler/utils/x86_64/managed_register_x86_64.h
index d68c59d..822659f 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.h
+++ b/compiler/utils/x86_64/managed_register_x86_64.h
@@ -46,8 +46,8 @@
 const int kNumberOfCpuRegIds = kNumberOfCpuRegisters;
 const int kNumberOfCpuAllocIds = kNumberOfCpuRegisters;
 
-const int kNumberOfXmmRegIds = kNumberOfXmmRegisters;
-const int kNumberOfXmmAllocIds = kNumberOfXmmRegisters;
+const int kNumberOfXmmRegIds = kNumberOfFloatRegisters;
+const int kNumberOfXmmAllocIds = kNumberOfFloatRegisters;
 
 const int kNumberOfX87RegIds = kNumberOfX87Registers;
 const int kNumberOfX87AllocIds = kNumberOfX87Registers;
@@ -87,20 +87,14 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class X86_64ManagedRegister : public ManagedRegister {
  public:
-  ByteRegister AsByteRegister() const {
+  CpuRegister AsCpuRegister() const {
     CHECK(IsCpuRegister());
-    CHECK_LT(AsCpuRegister(), RSP);  // RSP, RBP, ESI and RDI cannot be encoded as byte registers.
-    return static_cast<ByteRegister>(id_);
-  }
-
-  Register AsCpuRegister() const {
-    CHECK(IsCpuRegister());
-    return static_cast<Register>(id_);
+    return CpuRegister(static_cast<Register>(id_));
   }
 
   XmmRegister AsXmmRegister() const {
     CHECK(IsXmmRegister());
-    return static_cast<XmmRegister>(id_ - kNumberOfCpuRegIds);
+    return XmmRegister(static_cast<FloatRegister>(id_ - kNumberOfCpuRegIds));
   }
 
   X87Register AsX87Register() const {
@@ -109,13 +103,13 @@
                                     (kNumberOfCpuRegIds + kNumberOfXmmRegIds));
   }
 
-  Register AsRegisterPairLow() const {
+  CpuRegister AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdLow().
     return FromRegId(AllocIdLow()).AsCpuRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  CpuRegister AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCpuRegister();
@@ -157,8 +151,7 @@
     return FromRegId(r);
   }
 
-  static X86_64ManagedRegister FromXmmRegister(XmmRegister r) {
-    CHECK_NE(r, kNoXmmRegister);
+  static X86_64ManagedRegister FromXmmRegister(FloatRegister r) {
     return FromRegId(r + kNumberOfCpuRegIds);
   }
 
diff --git a/dalvikvm/Android.mk b/dalvikvm/Android.mk
index a046391..231fba1 100644
--- a/dalvikvm/Android.mk
+++ b/dalvikvm/Android.mk
@@ -26,6 +26,9 @@
 LOCAL_CFLAGS := $(dalvikvm_cflags)
 LOCAL_SHARED_LIBRARIES := libdl libnativehelper
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := dalvikvm
+LOCAL_MODULE_STEM_64 := dalvikvm64
 include external/stlport/libstlport.mk
 include $(BUILD_EXECUTABLE)
 ART_TARGET_EXECUTABLES += $(TARGET_OUT_EXECUTABLES)/$(LOCAL_MODULE)
diff --git a/dex2oat/Android.mk b/dex2oat/Android.mk
index 6cd0538..038f0a7 100644
--- a/dex2oat/Android.mk
+++ b/dex2oat/Android.mk
@@ -22,10 +22,10 @@
 	dex2oat.cc
 
 ifeq ($(ART_BUILD_TARGET_NDEBUG),true)
-  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler,art/compiler,target,ndebug))
+  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler,art/compiler,target,ndebug,32))
 endif
 ifeq ($(ART_BUILD_TARGET_DEBUG),true)
-  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler,art/compiler,target,debug))
+  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler,art/compiler,target,debug,32))
 endif
 
 ifeq ($(WITH_HOST_DALVIK),true)
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 72effde..f665f5c 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -153,8 +153,8 @@
   UsageError("      Example: --compiler-backend=Portable");
   UsageError("      Default: Quick");
   UsageError("");
-  UsageError("  --compiler-filter=(interpret-only|space|balanced|speed|everything): select");
-  UsageError("      compiler filter.");
+  UsageError("  --compiler-filter=(verify-none|interpret-only|space|balanced|speed|everything):");
+  UsageError("      select compiler filter.");
   UsageError("      Example: --compiler-filter=everything");
 #if ART_SMALL_MODE
   UsageError("      Default: interpret-only");
@@ -189,7 +189,8 @@
   UsageError("");
   UsageError("  --num-dex-methods=<method-count>: threshold size for a small dex file for");
   UsageError("      compiler filter tuning. If the input has fewer than this many methods");
-  UsageError("      and the filter is not interpret-only, overrides the filter to use speed");
+  UsageError("      and the filter is not interpret-only or verify-none, overrides the");
+  UsageError("      filter to use speed");
   UsageError("      Example: --num-dex-method=%d", CompilerOptions::kDefaultNumDexMethodsThreshold);
   UsageError("      Default: %d", CompilerOptions::kDefaultNumDexMethodsThreshold);
   UsageError("");
@@ -201,8 +202,8 @@
   UsageError("      such as initial heap size, maximum heap size, and verbose output.");
   UsageError("      Use a separate --runtime-arg switch for each argument.");
   UsageError("      Example: --runtime-arg -Xms256m");
-    UsageError("");
-    UsageError("  --profile-file=<filename>: specify profiler output file to use for compilation.");
+  UsageError("");
+  UsageError("  --profile-file=<filename>: specify profiler output file to use for compilation.");
   UsageError("");
   UsageError("  --print-pass-names: print a list of pass names");
   UsageError("");
@@ -740,7 +741,7 @@
 
   // Take the default set of instruction features from the build.
   InstructionSetFeatures instruction_set_features =
-      ParseFeatureList(STRINGIFY(ART_DEFAULT_INSTRUCTION_SET_FEATURES));
+      ParseFeatureList(Runtime::GetDefaultInstructionSetFeatures());
 
 #if defined(__arm__)
   InstructionSet instruction_set = kThumb2;
@@ -1037,7 +1038,9 @@
   }
   CHECK(compiler_filter_string != nullptr);
   CompilerOptions::CompilerFilter compiler_filter = CompilerOptions::kDefaultCompilerFilter;
-  if (strcmp(compiler_filter_string, "interpret-only") == 0) {
+  if (strcmp(compiler_filter_string, "verify-none") == 0) {
+    compiler_filter = CompilerOptions::kVerifyNone;
+  } else if (strcmp(compiler_filter_string, "interpret-only") == 0) {
     compiler_filter = CompilerOptions::kInterpretOnly;
   } else if (strcmp(compiler_filter_string, "space") == 0) {
     compiler_filter = CompilerOptions::kSpace;
@@ -1208,10 +1211,10 @@
   }
 
   /*
-   * If we're not in interpret-only mode, go ahead and compile small applications. Don't
-   * bother to check if we're doing the image.
+   * If we're not in interpret-only or verify-none mode, go ahead and compile small applications.
+   * Don't bother to check if we're doing the image.
    */
-  if (!image && (compiler_options.GetCompilerFilter() != CompilerOptions::kInterpretOnly)) {
+  if (!image && compiler_options.IsCompilationEnabled()) {
     size_t num_methods = 0;
     for (size_t i = 0; i != dex_files.size(); ++i) {
       const DexFile* dex_file = dex_files[i];
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 55fd52f..899aa78 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -305,7 +305,7 @@
           }
           if (rn.r == 9) {
             args << "  ; ";
-            Thread::DumpThreadOffset(args, offset, 4);
+            Thread::DumpThreadOffset<4>(args, offset);
           }
         }
       }
@@ -1291,7 +1291,7 @@
               args << Rt << ", [" << Rn << ", #" << imm12 << "]";
               if (Rn.r == 9) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, imm12, 4);
+                Thread::DumpThreadOffset<4>(args, imm12);
               } else if (Rn.r == 15) {
                 intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
                 lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
@@ -1304,7 +1304,7 @@
               args << Rt << ", [" << Rn << ", #" << imm12 << "]";
               if (Rn.r == 9) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, imm12, 4);
+                Thread::DumpThreadOffset<4>(args, imm12);
               } else if (Rn.r == 15) {
                 intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
                 lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
@@ -1361,7 +1361,7 @@
             args << Rt << ", [" << Rn << ", #" << imm12 << "]";
             if (Rn.r == 9) {
               args << "  ; ";
-              Thread::DumpThreadOffset(args, imm12, 4);
+              Thread::DumpThreadOffset<4>(args, imm12);
             } else if (Rn.r == 15) {
               intptr_t lit_adr = reinterpret_cast<intptr_t>(instr_ptr);
               lit_adr = RoundDown(lit_adr, 4) + 4 + imm12;
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index 72ff761..5e89f6f 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -237,7 +237,7 @@
               args << StringPrintf("%+d(r%d)", offset, rs);
               if (rs == 17) {
                 args << "  ; ";
-                Thread::DumpThreadOffset(args, offset, 4);
+                Thread::DumpThreadOffset<4>(args, offset);
               }
             }
             break;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 4a03ebe..68e77d4 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -849,9 +849,13 @@
     }
     args << StringPrintf("%+d (%p)", displacement, instr + displacement);
   }
-  if (prefix[1] == kFs) {
+  if (prefix[1] == kFs && !supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset(args, address_bits, 4);
+    Thread::DumpThreadOffset<4>(args, address_bits);
+  }
+  if (prefix[1] == kGs && supports_rex_) {
+    args << "  ; ";
+    Thread::DumpThreadOffset<8>(args, address_bits);
   }
   std::stringstream hex;
   for (size_t i = 0; begin_instr + i < instr; ++i) {
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 1576905..cf7f895 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -73,6 +73,7 @@
 	hprof/hprof.cc \
 	image.cc \
 	indirect_reference_table.cc \
+	instruction_set.cc \
 	instrumentation.cc \
 	intern_table.cc \
 	interpreter/interpreter.cc \
@@ -103,7 +104,7 @@
 	native/dalvik_system_VMDebug.cc \
 	native/dalvik_system_VMRuntime.cc \
 	native/dalvik_system_VMStack.cc \
-	native/dalvik_system_Zygote.cc \
+	native/dalvik_system_ZygoteHooks.cc \
 	native/java_lang_Class.cc \
 	native/java_lang_DexCache.cc \
 	native/java_lang_Object.cc \
@@ -294,6 +295,7 @@
 
 
 LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \
+	arch/x86_64/registers_x86_64.h \
 	base/mutex.h \
 	dex_file.h \
 	dex_instruction.h \
@@ -435,6 +437,11 @@
   endif
 
   ifeq ($$(art_target_or_host),target)
+    ifneq ($$(art_ndebug_or_debug),debug)
+      # Leave the symbols in the shared library so that stack unwinders can
+      # produce meaningful name resolution.
+      LOCAL_STRIP_MODULE := keep_symbols
+    endif
     include $(BUILD_SHARED_LIBRARY)
   else # host
     include $(BUILD_HOST_SHARED_LIBRARY)
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index cfffbea..4b64076 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -23,13 +23,13 @@
 #define rSUSPEND r4
 // Register holding Thread::Current().
 #define rSELF r9
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
-// Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 60
+// Offset of field Thread::tls32_.thin_lock_thread_id verified in InitCpu
+#define THREAD_ID_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_H_
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index abce838..aaba598 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -45,12 +45,13 @@
   return instr_size;
 }
 
-void FaultManager::GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc) {
+void FaultManager::GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                             uintptr_t* out_return_pc, uintptr_t* out_sp) {
   struct ucontext *uc = (struct ucontext *)context;
   struct sigcontext *sc = reinterpret_cast<struct sigcontext*>(&uc->uc_mcontext);
-  uintptr_t* sp = reinterpret_cast<uint32_t*>(sc->arm_sp);
-  LOG(DEBUG) << "sp: " << sp;
-  if (sp == nullptr) {
+  *out_sp = static_cast<uintptr_t>(sc->arm_sp);
+  LOG(DEBUG) << "sp: " << *out_sp;
+  if (*out_sp == 0) {
     return;
   }
 
@@ -58,12 +59,12 @@
   // get the method from the top of the stack.  However it's in r0.
   uintptr_t* fault_addr = reinterpret_cast<uintptr_t*>(sc->fault_address);
   uintptr_t* overflow_addr = reinterpret_cast<uintptr_t*>(
-      reinterpret_cast<uint8_t*>(sp) - Thread::kStackOverflowReservedBytes);
+      reinterpret_cast<uint8_t*>(*out_sp) - Thread::kStackOverflowReservedBytes);
   if (overflow_addr == fault_addr) {
-    method = sc->arm_r0;
+    *out_method = reinterpret_cast<mirror::ArtMethod*>(sc->arm_r0);
   } else {
     // The method is at the top of the stack.
-    method = sp[0];
+    *out_method = reinterpret_cast<mirror::ArtMethod*>(reinterpret_cast<uintptr_t*>(*out_sp)[0]);
   }
 
   // Work out the return PC.  This will be the address of the instruction
@@ -76,7 +77,7 @@
   LOG(DEBUG) << "pc: " << std::hex << static_cast<void*>(ptr);
   uint32_t instr_size = GetInstructionSize(ptr);
 
-  return_pc = (sc->arm_pc + instr_size) | 1;
+  *out_return_pc = (sc->arm_pc + instr_size) | 1;
 }
 
 bool NullPointerHandler::Action(int sig, siginfo_t* info, void* context) {
@@ -87,7 +88,7 @@
   // register in order to find the mapping.
 
   // Need to work out the size of the instruction that caused the exception.
-  struct ucontext *uc = (struct ucontext *)context;
+  struct ucontext *uc = reinterpret_cast<struct ucontext*>(context);
   struct sigcontext *sc = reinterpret_cast<struct sigcontext*>(&uc->uc_mcontext);
   uint8_t* ptr = reinterpret_cast<uint8_t*>(sc->arm_pc);
 
@@ -109,7 +110,7 @@
 bool SuspensionHandler::Action(int sig, siginfo_t* info, void* context) {
   // These are the instructions to check for.  The first one is the ldr r0,[r9,#xxx]
   // where xxx is the offset of the suspend trigger.
-  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset().Int32Value();
+  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset<4>().Int32Value();
   uint16_t checkinst2 = 0x6800;
 
   struct ucontext *uc = (struct ucontext *)context;
diff --git a/runtime/arch/arm/thread_arm.cc b/runtime/arch/arm/thread_arm.cc
index df4a04a..2a551a8 100644
--- a/runtime/arch/arm/thread_arm.cc
+++ b/runtime/arch/arm/thread_arm.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/arm64/asm_support_arm64.h b/runtime/arch/arm64/asm_support_arm64.h
index 263a764..a7e68ed 100644
--- a/runtime/arch/arm64/asm_support_arm64.h
+++ b/runtime/arch/arm64/asm_support_arm64.h
@@ -40,10 +40,10 @@
 // Offset of field Thread::suspend_count_ verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
 // Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 16
+#define THREAD_EXCEPTION_OFFSET 120
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 112
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_ARM64_ASM_SUPPORT_ARM64_H_
diff --git a/runtime/arch/arm64/fault_handler_arm64.cc b/runtime/arch/arm64/fault_handler_arm64.cc
index 419e5af..74c3023 100644
--- a/runtime/arch/arm64/fault_handler_arm64.cc
+++ b/runtime/arch/arm64/fault_handler_arm64.cc
@@ -29,7 +29,8 @@
 
 namespace art {
 
-void FaultManager::GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc) {
+void FaultManager::GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                             uintptr_t* out_return_pc, uintptr_t* out_sp) {
 }
 
 bool NullPointerHandler::Action(int sig, siginfo_t* info, void* context) {
diff --git a/runtime/arch/arm64/thread_arm64.cc b/runtime/arch/arm64/thread_arm64.cc
index 4eebb85..564dced 100644
--- a/runtime/arch/arm64/thread_arm64.cc
+++ b/runtime/arch/arm64/thread_arm64.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 5307997..36ce1b6 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -23,11 +23,11 @@
 #define rSUSPEND $s0
 // Register holding Thread::Current().
 #define rSELF $s1
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
diff --git a/runtime/arch/mips/fault_handler_mips.cc b/runtime/arch/mips/fault_handler_mips.cc
index 8d494c1..1ecd7d9 100644
--- a/runtime/arch/mips/fault_handler_mips.cc
+++ b/runtime/arch/mips/fault_handler_mips.cc
@@ -29,7 +29,8 @@
 
 namespace art {
 
-void FaultManager::GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc) {
+void FaultManager::GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                             uintptr_t* out_return_pc, uintptr_t* out_sp) {
 }
 
 bool NullPointerHandler::Action(int sig, siginfo_t* info, void* context) {
diff --git a/runtime/arch/mips/thread_mips.cc b/runtime/arch/mips/thread_mips.cc
index f5d211f..a451496 100644
--- a/runtime/arch/mips/thread_mips.cc
+++ b/runtime/arch/mips/thread_mips.cc
@@ -22,9 +22,9 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index e817ff7..e986c41 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -20,12 +20,12 @@
 #include "asm_support.h"
 
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 40
+#define THREAD_SELF_OFFSET 148
 // Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+#define THREAD_EXCEPTION_OFFSET 116
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 60
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index 171a541..7c1980e 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -29,7 +29,8 @@
 
 namespace art {
 
-void FaultManager::GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc) {
+void FaultManager::GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                             uintptr_t* out_return_pc, uintptr_t* out_sp) {
 }
 
 bool NullPointerHandler::Action(int sig, siginfo_t* info, void* context) {
diff --git a/runtime/arch/x86/thread_x86.cc b/runtime/arch/x86/thread_x86.cc
index 235da99..26cd864 100644
--- a/runtime/arch/x86/thread_x86.cc
+++ b/runtime/arch/x86/thread_x86.cc
@@ -120,11 +120,11 @@
       :);  // clobber
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %fs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<4>().Int32Value());
   __asm__ __volatile__("movl %%fs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -132,9 +132,9 @@
   CHECK_EQ(self_check, this);
 
   // Sanity check other offsets.
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index 03d9e24..70ef3ef 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -27,12 +27,12 @@
 #define RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET 16
 
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 72
+#define THREAD_SELF_OFFSET 184
 // Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 16
+#define THREAD_EXCEPTION_OFFSET 120
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 112
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_H_
diff --git a/runtime/arch/x86_64/fault_handler_x86_64.cc b/runtime/arch/x86_64/fault_handler_x86_64.cc
index 3ef19fb..233d3c7 100644
--- a/runtime/arch/x86_64/fault_handler_x86_64.cc
+++ b/runtime/arch/x86_64/fault_handler_x86_64.cc
@@ -29,7 +29,8 @@
 
 namespace art {
 
-void FaultManager::GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc) {
+void FaultManager::GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                             uintptr_t* out_return_pc, uintptr_t* out_sp) {
 }
 
 bool NullPointerHandler::Action(int sig, siginfo_t* info, void* context) {
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0d75a89..17b8556 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -146,7 +146,6 @@
     // Outgoing argument set up
     mov %rsp, %rdx                    // pass SP
     mov %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
-    mov %rax, %rdi                    // pass arg1
     call PLT_VAR(cxx_name, 1)     // cxx_name(arg1, Thread*, SP)
     int3                          // unreached
     END_FUNCTION VAR(c_name, 0)
diff --git a/runtime/arch/x86_64/registers_x86_64.h b/runtime/arch/x86_64/registers_x86_64.h
index b9d06b5..8b0dc07 100644
--- a/runtime/arch/x86_64/registers_x86_64.h
+++ b/runtime/arch/x86_64/registers_x86_64.h
@@ -67,7 +67,7 @@
   XMM15 = 15,
   kNumberOfFloatRegisters = 16
 };
-std::ostream& operator<<(std::ostream& os, const Register& rhs);
+std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs);
 
 }  // namespace x86_64
 }  // namespace art
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index b74fc5d..de4c56a 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -36,11 +36,11 @@
   arch_prctl(ARCH_SET_GS, this);
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -54,15 +54,15 @@
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsOnly));
   CHECK_EQ(static_cast<size_t>(RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET),
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsAndArgs));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 0c1a72a..8ef407d 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -17,7 +17,7 @@
 #ifndef ART_RUNTIME_ASM_SUPPORT_H_
 #define ART_RUNTIME_ASM_SUPPORT_H_
 
-#include "brooks_pointer.h"
+#include "read_barrier.h"
 
 // Value loaded into rSUSPEND for quick. When this value is counted down to zero we do a suspend
 // check.
@@ -27,7 +27,7 @@
 #define CLASS_OFFSET 0
 #define LOCK_WORD_OFFSET 4
 
-#ifndef USE_BROOKS_POINTER
+#ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
 
 // Offsets within java.lang.Class.
 #define CLASS_COMPONENT_TYPE_OFFSET 12
diff --git a/runtime/brooks_pointer.h b/runtime/brooks_pointer.h
deleted file mode 100644
index 3dac6e9..0000000
--- a/runtime/brooks_pointer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_BROOKS_POINTER_H_
-#define ART_RUNTIME_BROOKS_POINTER_H_
-
-// This is in a separate file (from globals.h) because asm_support.h
-// (a C header, not C++) can't include globals.h.
-
-// Uncomment this and the two fields in Object.java (libcore) to
-// enable brooks pointers.
-// #define USE_BROOKS_POINTER
-
-#endif  // ART_RUNTIME_BROOKS_POINTER_H_
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 08ea123..6c5406e 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -206,8 +206,8 @@
   CHECK(java_lang_Class.get() != NULL);
   mirror::Class::SetClassClass(java_lang_Class.get());
   java_lang_Class->SetClass(java_lang_Class.get());
-  if (kUseBrooksPointer) {
-    java_lang_Class->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    java_lang_Class->AssertReadBarrierPointer();
   }
   java_lang_Class->SetClassSize(sizeof(mirror::ClassClass));
   heap->DecrementDisableMovingGC(self);
@@ -567,40 +567,16 @@
   argv.push_back("--runtime-arg");
   argv.push_back(Runtime::Current()->GetClassPathString());
 
-  argv.push_back("--runtime-arg");
-  std::string checkstr = "-implicit-checks";
+  Runtime::Current()->AddCurrentRuntimeFeaturesAsDex2OatArguments(&argv);
 
-  int nchecks = 0;
-  char checksep = ':';
-
-  if (!Runtime::Current()->ExplicitNullChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "null";
-    ++nchecks;
+  if (!Runtime::Current()->IsVerificationEnabled()) {
+    argv.push_back("--compiler-filter=verify-none");
   }
-  if (!Runtime::Current()->ExplicitSuspendChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "suspend";
-    ++nchecks;
-  }
-
-  if (!Runtime::Current()->ExplicitStackOverflowChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "stack";
-    ++nchecks;
-  }
-
-  if (nchecks == 0) {
-    checkstr += ":none";
-  }
-  argv.push_back(checkstr);
 
   if (!kIsTargetBuild) {
     argv.push_back("--host");
   }
+
   argv.push_back(boot_image_option);
   argv.push_back(dex_file_option);
   argv.push_back(oat_fd_option);
@@ -1864,8 +1840,8 @@
   CHECK(descriptor != NULL);
 
   klass->SetClass(GetClassRoot(kJavaLangClass));
-  if (kUseBrooksPointer) {
-    klass->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    klass->AssertReadBarrierPointer();
   }
   uint32_t access_flags = dex_class_def.access_flags_;
   // Make sure that none of our runtime-only flags are set.
@@ -2561,6 +2537,12 @@
     klass->SetStatus(mirror::Class::kStatusVerifyingAtRuntime, self);
   }
 
+  // Skip verification if disabled.
+  if (!Runtime::Current()->IsVerificationEnabled()) {
+    klass->SetStatus(mirror::Class::kStatusVerified, self);
+    return;
+  }
+
   // Verify super class.
   SirtRef<mirror::Class> super(self, klass->GetSuperClass());
   if (super.get() != NULL) {
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index 7eb7b01..5b72a44 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -452,9 +452,9 @@
 
     // alphabetical 32-bit
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Object, monitor_), "shadow$_monitor_"));
-#ifdef USE_BROOKS_POINTER
-    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Object, x_brooks_ptr_), "shadow$_x_brooks_ptr_"));
-    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Object, x_padding_), "shadow$_x_padding_"));
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Object, x_rb_ptr_), "shadow$_x_rb_ptr_"));
+    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Object, x_xpadding_), "shadow$_x_xpadding_"));
 #endif
   };
 };
@@ -731,7 +731,7 @@
   EXPECT_FALSE(JavaLangObject->IsSynthetic());
   EXPECT_EQ(2U, JavaLangObject->NumDirectMethods());
   EXPECT_EQ(11U, JavaLangObject->NumVirtualMethods());
-  if (!kUseBrooksPointer) {
+  if (!kUseBakerOrBrooksReadBarrier) {
     EXPECT_EQ(2U, JavaLangObject->NumInstanceFields());
   } else {
     EXPECT_EQ(4U, JavaLangObject->NumInstanceFields());
@@ -740,11 +740,11 @@
   EXPECT_STREQ(fh.GetName(), "shadow$_klass_");
   fh.ChangeField(JavaLangObject->GetInstanceField(1));
   EXPECT_STREQ(fh.GetName(), "shadow$_monitor_");
-  if (kUseBrooksPointer) {
+  if (kUseBakerOrBrooksReadBarrier) {
     fh.ChangeField(JavaLangObject->GetInstanceField(2));
-    EXPECT_STREQ(fh.GetName(), "shadow$_x_brooks_ptr_");
+    EXPECT_STREQ(fh.GetName(), "shadow$_x_rb_ptr_");
     fh.ChangeField(JavaLangObject->GetInstanceField(3));
-    EXPECT_STREQ(fh.GetName(), "shadow$_x_padding_");
+    EXPECT_STREQ(fh.GetName(), "shadow$_x_xpadding_");
   }
 
   EXPECT_EQ(0U, JavaLangObject->NumStaticFields());
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index 4b50cf4..723e32c 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -255,7 +255,11 @@
       filename += getenv("ANDROID_HOST_OUT");
       filename += "/framework/";
     } else {
+#ifdef __LP64__
+      filename += "/data/nativetest/art64/";
+#else
       filename += "/data/nativetest/art/";
+#endif
     }
     filename += "art-test-dex-";
     filename += name;
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 024f830..2872a02 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1924,7 +1924,7 @@
   if (error != JDWP::ERR_NONE) {
     return error;
   }
-  thread->Interrupt();
+  thread->Interrupt(soa.Self());
   return JDWP::ERR_NONE;
 }
 
diff --git a/runtime/dex_instruction.cc b/runtime/dex_instruction.cc
index 8fccd6d..7546245 100644
--- a/runtime/dex_instruction.cc
+++ b/runtime/dex_instruction.cc
@@ -551,6 +551,20 @@
       uint32_t arg[5];
       GetArgs(arg);
       switch (Opcode()) {
+        case FILLED_NEW_ARRAY:
+        {
+          const int32_t a = VRegA_35c();
+          os << opcode << " {";
+          for (int i = 0; i < a; ++i) {
+            if (i > 0) {
+              os << ", ";
+            }
+            os << "v" << arg[i];
+          }
+          os << "}, type@" << VRegB_35c();
+        }
+        break;
+
         case INVOKE_VIRTUAL:
         case INVOKE_SUPER:
         case INVOKE_DIRECT:
diff --git a/runtime/entrypoints/interpreter/interpreter_entrypoints.h b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
index c7df4e6..d8b2204 100644
--- a/runtime/entrypoints/interpreter/interpreter_entrypoints.h
+++ b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
@@ -21,9 +21,8 @@
 #include "dex_file.h"
 #include "offsets.h"
 
-#define INTERPRETER_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, interpreter_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(InterpreterEntryPoints, x)))
+#define INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::InterpreterEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(InterpreterEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/jni/jni_entrypoints.h b/runtime/entrypoints/jni/jni_entrypoints.h
index 0a53447..6fb0560 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.h
+++ b/runtime/entrypoints/jni/jni_entrypoints.h
@@ -20,9 +20,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define JNI_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, jni_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(JniEntryPoints, x)))
+#define JNI_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::JniEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(JniEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/portable/portable_entrypoints.h b/runtime/entrypoints/portable/portable_entrypoints.h
index dbea707..6f77e1c 100644
--- a/runtime/entrypoints/portable/portable_entrypoints.h
+++ b/runtime/entrypoints/portable/portable_entrypoints.h
@@ -27,9 +27,8 @@
 }  // namespace mirror
 class Thread;
 
-#define PORTABLE_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, portable_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(PortableEntryPoints, x)))
+#define PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::PortableEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(PortableEntryPoints, x))
 
 // Pointers to functions that are called by code generated by compiler's adhering to the portable
 // compiler ABI.
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 5c3b824..ec69e28 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -22,9 +22,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define QUICK_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, quick_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(QuickEntryPoints, x)))
+#define QUICK_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::QuickEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(QuickEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/exception_test.cc b/runtime/exception_test.cc
index 9c76a14..208eb74 100644
--- a/runtime/exception_test.cc
+++ b/runtime/exception_test.cc
@@ -199,7 +199,7 @@
     thread->PushShadowFrame(reinterpret_cast<ShadowFrame*>(&fake_stack[0]));
   }
 
-  jobject internal = thread->CreateInternalStackTrace(soa);
+  jobject internal = thread->CreateInternalStackTrace<false>(soa);
   ASSERT_TRUE(internal != NULL);
   jobjectArray ste_array = Thread::InternalStackTraceToStackTraceElementArray(soa, internal);
   ASSERT_TRUE(ste_array != NULL);
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index fcb567e..b8093bc 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -60,43 +60,50 @@
 }
 
 void FaultManager::HandleFault(int sig, siginfo_t* info, void* context) {
-  bool handled = false;
   LOG(DEBUG) << "Handling fault";
-  if (IsInGeneratedCode(context)) {
+  if (IsInGeneratedCode(context, true)) {
     LOG(DEBUG) << "in generated code, looking for handler";
-    for (auto& handler : handlers_) {
+    for (const auto& handler : generated_code_handlers_) {
       LOG(DEBUG) << "invoking Action on handler " << handler;
-      handled = handler->Action(sig, info, context);
-      if (handled) {
+      if (handler->Action(sig, info, context)) {
         return;
       }
     }
   }
-
-  if (!handled) {
-    LOG(ERROR)<< "Caught unknown SIGSEGV in ART fault handler";
-    oldaction_.sa_sigaction(sig, info, context);
-  }
-}
-
-void FaultManager::AddHandler(FaultHandler* handler) {
-  handlers_.push_back(handler);
-}
-
-void FaultManager::RemoveHandler(FaultHandler* handler) {
-  for (Handlers::iterator i = handlers_.begin(); i != handlers_.end(); ++i) {
-    FaultHandler* h = *i;
-    if (h == handler) {
-      handlers_.erase(i);
+  for (const auto& handler : other_handlers_) {
+    if (handler->Action(sig, info, context)) {
       return;
     }
   }
+  LOG(ERROR)<< "Caught unknown SIGSEGV in ART fault handler";
+  oldaction_.sa_sigaction(sig, info, context);
 }
 
+void FaultManager::AddHandler(FaultHandler* handler, bool generated_code) {
+  if (generated_code) {
+    generated_code_handlers_.push_back(handler);
+  } else {
+    other_handlers_.push_back(handler);
+  }
+}
+
+void FaultManager::RemoveHandler(FaultHandler* handler) {
+  auto it = std::find(generated_code_handlers_.begin(), generated_code_handlers_.end(), handler);
+  if (it != generated_code_handlers_.end()) {
+    generated_code_handlers_.erase(it);
+    return;
+  }
+  auto it2 = std::find(other_handlers_.begin(), other_handlers_.end(), handler);
+  if (it2 != other_handlers_.end()) {
+    other_handlers_.erase(it2);  // Not |it|: that iterator belongs to generated_code_handlers_.
+    return;
+  }
+  LOG(FATAL) << "Attempted to remove non existent handler " << handler;  // In neither list.
+}
 
 // This function is called within the signal handler.  It checks that
 // the mutator_lock is held (shared).  No annotalysis is done.
-bool FaultManager::IsInGeneratedCode(void *context) {
+bool FaultManager::IsInGeneratedCode(void* context, bool check_dex_pc) {
   // We can only be running Java code in the current thread if it
   // is in Runnable state.
   LOG(DEBUG) << "Checking for generated code";
@@ -119,27 +126,25 @@
     return false;
   }
 
-  uintptr_t potential_method = 0;
+  mirror::ArtMethod* method_obj = 0;
   uintptr_t return_pc = 0;
+  uintptr_t sp = 0;
 
   // Get the architecture specific method address and return address.  These
-  // are in architecture specific files in arch/<arch>/fault_handler_<arch>.cc
-  GetMethodAndReturnPC(context, /*out*/potential_method, /*out*/return_pc);
+  // are in architecture specific files in arch/<arch>/fault_handler_<arch>.cc.
+  GetMethodAndReturnPCAndSP(context, &method_obj, &return_pc, &sp);
 
   // If we don't have a potential method, we're outta here.
-  LOG(DEBUG) << "potential method: " << potential_method;
-  if (potential_method == 0) {
+  LOG(DEBUG) << "potential method: " << method_obj;
+  if (method_obj == 0 || !IsAligned<kObjectAlignment>(method_obj)) {
     LOG(DEBUG) << "no method";
     return false;
   }
 
   // Verify that the potential method is indeed a method.
   // TODO: check the GC maps to make sure it's an object.
-
-  mirror::Object* method_obj =
-      reinterpret_cast<mirror::Object*>(potential_method);
-
   // Check that the class pointer inside the object is not null and is aligned.
+  // TODO: Method might be not a heap address, and GetClass could fault.
   mirror::Class* cls = method_obj->GetClass<kVerifyNone>();
   if (cls == nullptr) {
     LOG(DEBUG) << "not a class";
@@ -164,41 +169,64 @@
 
   // We can be certain that this is a method now.  Check if we have a GC map
   // at the return PC address.
-  mirror::ArtMethod* method =
-      reinterpret_cast<mirror::ArtMethod*>(potential_method);
   if (true || kIsDebugBuild) {
     LOG(DEBUG) << "looking for dex pc for return pc " << std::hex << return_pc;
-    const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(method);
+    const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(method_obj);
     uint32_t sought_offset = return_pc - reinterpret_cast<uintptr_t>(code);
     LOG(DEBUG) << "pc offset: " << std::hex << sought_offset;
   }
-  uint32_t dexpc = method->ToDexPc(return_pc, false);
+  uint32_t dexpc = method_obj->ToDexPc(return_pc, false);
   LOG(DEBUG) << "dexpc: " << dexpc;
-  return dexpc != DexFile::kDexNoIndex;
+  return !check_dex_pc || dexpc != DexFile::kDexNoIndex;
+}
+
+FaultHandler::FaultHandler(FaultManager* manager) : manager_(manager) {
 }
 
 //
 // Null pointer fault handler
 //
-
-NullPointerHandler::NullPointerHandler(FaultManager* manager) {
-  manager->AddHandler(this);
+NullPointerHandler::NullPointerHandler(FaultManager* manager) : FaultHandler(manager) {
+  manager_->AddHandler(this, true);
 }
 
 //
 // Suspension fault handler
 //
-
-SuspensionHandler::SuspensionHandler(FaultManager* manager) {
-  manager->AddHandler(this);
+SuspensionHandler::SuspensionHandler(FaultManager* manager) : FaultHandler(manager) {
+  manager_->AddHandler(this, true);
 }
 
 //
 // Stack overflow fault handler
 //
-
-StackOverflowHandler::StackOverflowHandler(FaultManager* manager) {
-  manager->AddHandler(this);
+StackOverflowHandler::StackOverflowHandler(FaultManager* manager) : FaultHandler(manager) {
+  manager_->AddHandler(this, true);
 }
+
+//
+// Stack trace handler, used to help get a stack trace from SIGSEGV inside of compiled code.
+//
+JavaStackTraceHandler::JavaStackTraceHandler(FaultManager* manager) : FaultHandler(manager) {
+  manager_->AddHandler(this, false);
+}
+
+bool JavaStackTraceHandler::Action(int sig, siginfo_t* siginfo, void* context) {
+  // Make sure that we are in the generated code, but we may not have a dex pc.
+  if (manager_->IsInGeneratedCode(context, false)) {
+    LOG(ERROR) << "Dumping java stack trace for crash in generated code";
+    mirror::ArtMethod* method = nullptr;
+    uintptr_t return_pc = 0;
+    uintptr_t sp = 0;
+    manager_->GetMethodAndReturnPCAndSP(context, &method, &return_pc, &sp);
+    Thread* self = Thread::Current();
+    // Inside of generated code, sp[0] is the method, so sp is the frame.
+    mirror::ArtMethod** frame = reinterpret_cast<mirror::ArtMethod**>(sp);
+    self->SetTopOfStack(frame, 0);  // Since we don't necessarily have a dex pc, pass in 0.
+    self->DumpJavaStack(LOG(ERROR));
+  }
+  return false;  // Return false since we want to propagate the fault to the main signal handler.
+}
+
 }   // namespace art
 
diff --git a/runtime/fault_handler.h b/runtime/fault_handler.h
index 9fe6e9a..ea2f7c8 100644
--- a/runtime/fault_handler.h
+++ b/runtime/fault_handler.h
@@ -26,6 +26,11 @@
 #include "base/mutex.h"   // For annotalysis.
 
 namespace art {
+
+namespace mirror {
+class ArtMethod;
+}       // namespace mirror
+
 class FaultHandler;
 
 class FaultManager {
@@ -36,53 +41,77 @@
   void Init();
 
   void HandleFault(int sig, siginfo_t* info, void* context);
-  void AddHandler(FaultHandler* handler);
+  void AddHandler(FaultHandler* handler, bool generated_code);
   void RemoveHandler(FaultHandler* handler);
+  void GetMethodAndReturnPCAndSP(void* context, mirror::ArtMethod** out_method,
+                                 uintptr_t* out_return_pc, uintptr_t* out_sp);
+  bool IsInGeneratedCode(void *context, bool check_dex_pc) NO_THREAD_SAFETY_ANALYSIS;
 
  private:
-  bool IsInGeneratedCode(void *context) NO_THREAD_SAFETY_ANALYSIS;
-  void GetMethodAndReturnPC(void* context, uintptr_t& method, uintptr_t& return_pc);
-
-  typedef std::vector<FaultHandler*> Handlers;
-  Handlers handlers_;
+  std::vector<FaultHandler*> generated_code_handlers_;
+  std::vector<FaultHandler*> other_handlers_;
   struct sigaction oldaction_;
+  DISALLOW_COPY_AND_ASSIGN(FaultManager);
 };
 
 class FaultHandler {
  public:
-  FaultHandler() : manager_(nullptr) {}
-  explicit FaultHandler(FaultManager* manager) : manager_(manager) {}
+  explicit FaultHandler(FaultManager* manager);
   virtual ~FaultHandler() {}
+  FaultManager* GetFaultManager() {
+    return manager_;
+  }
 
   virtual bool Action(int sig, siginfo_t* siginfo, void* context) = 0;
+
  protected:
   FaultManager* const manager_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(FaultHandler);
 };
 
 class NullPointerHandler FINAL : public FaultHandler {
  public:
-  NullPointerHandler() {}
   explicit NullPointerHandler(FaultManager* manager);
 
   bool Action(int sig, siginfo_t* siginfo, void* context) OVERRIDE;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(NullPointerHandler);
 };
 
 class SuspensionHandler FINAL : public FaultHandler {
  public:
-  SuspensionHandler() {}
   explicit SuspensionHandler(FaultManager* manager);
 
   bool Action(int sig, siginfo_t* siginfo, void* context) OVERRIDE;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(SuspensionHandler);
 };
 
 class StackOverflowHandler FINAL : public FaultHandler {
  public:
-  StackOverflowHandler() {}
   explicit StackOverflowHandler(FaultManager* manager);
 
   bool Action(int sig, siginfo_t* siginfo, void* context) OVERRIDE;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(StackOverflowHandler);
 };
 
+class JavaStackTraceHandler FINAL : public FaultHandler {
+ public:
+  explicit JavaStackTraceHandler(FaultManager* manager);
+
+  bool Action(int sig, siginfo_t* siginfo, void* context) OVERRIDE NO_THREAD_SAFETY_ANALYSIS;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(JavaStackTraceHandler);
+};
+
+
 // Statically allocated so the the signal handler can get access to it.
 extern FaultManager fault_manager;
 
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index d6d1b3e..0fbd27c 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -29,10 +29,10 @@
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
-  const word mask = OffsetToMask(offset);
-  word* const address = &bitmap_begin_[index];
+  const uword mask = OffsetToMask(offset);
+  uword* const address = &bitmap_begin_[index];
   DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
-  word old_word;
+  uword old_word;
   do {
     old_word = *address;
     // Fast path: The bit is already set.
@@ -58,74 +58,79 @@
 void SpaceBitmap::VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end,
                                    const Visitor& visitor) const {
   DCHECK_LT(visit_begin, visit_end);
-#ifdef __LP64__
-  // TODO: make the optimized code below work in the 64bit case.
-  for (uintptr_t i = visit_begin; i < visit_end; i += kAlignment) {
-    mirror::Object* obj = reinterpret_cast<mirror::Object*>(i);
-    if (Test(obj)) {
-      visitor(obj);
-    }
-  }
-#else
-  const size_t bit_index_start = (visit_begin - heap_begin_) / kAlignment;
-  const size_t bit_index_end = (visit_end - heap_begin_ - 1) / kAlignment;
+  DCHECK_LE(heap_begin_, visit_begin);
+  DCHECK_LE(visit_end, HeapLimit());
 
-  size_t word_start = bit_index_start / kBitsPerWord;
-  size_t word_end = bit_index_end / kBitsPerWord;
-  DCHECK_LT(word_end * kWordSize, Size());
+  const uintptr_t offset_start = visit_begin - heap_begin_;
+  const uintptr_t offset_end = visit_end - heap_begin_ - 1;  // Last offset in [begin, end).
 
-  // Trim off left_bits of left bits.
-  size_t edge_word = bitmap_begin_[word_start];
+  const uintptr_t index_start = OffsetToIndex(offset_start);
+  const uintptr_t index_end = OffsetToIndex(offset_end);
 
-  // Handle bits on the left first as a special case
-  size_t left_bits = bit_index_start & (kBitsPerWord - 1);
-  if (left_bits != 0) {
-    edge_word &= (1 << (kBitsPerWord - left_bits)) - 1;
-  }
+  const size_t bit_start = (offset_start / kAlignment) % kBitsPerWord;
+  const size_t bit_end = (offset_end / kAlignment) % kBitsPerWord;
 
-  // If word_start == word_end then handle this case at the same place we handle the right edge.
-  if (edge_word != 0 && word_start < word_end) {
-    uintptr_t ptr_base = IndexToOffset(word_start) + heap_begin_;
-    do {
-      const size_t shift = CLZ(edge_word);
-      mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
-      visitor(obj);
-      edge_word ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-    } while (edge_word != 0);
-  }
-  word_start++;
+  // Index(begin)  ...    Index(end)
+  // [xxxxx???][........][????yyyy]
+  //      ^                   ^
+  //      |                   #---- Bit of visit_end - 1
+  //      #---- Bit of visit_begin
+  // visit_end is exclusive, so the right edge is the bit of the last address in the range.
 
-  for (size_t i = word_start; i < word_end; i++) {
-    size_t w = bitmap_begin_[i];
-    if (w != 0) {
-      uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
+  // Left edge.
+  uword left_edge = bitmap_begin_[index_start];
+  // Mask off lower bits that are not in range.
+  left_edge &= ~((static_cast<uword>(1) << bit_start) - 1);
+
+  // Right edge. Either unique, or left_edge.
+  uword right_edge;
+
+  if (index_start < index_end) {
+    // Left edge != right edge.
+
+    // Traverse left edge.
+    if (left_edge != 0) {
+      const uintptr_t ptr_base = IndexToOffset(index_start) + heap_begin_;
       do {
-        const size_t shift = CLZ(w);
+        const size_t shift = CTZ(left_edge);
         mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
         visitor(obj);
-        w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-      } while (w != 0);
+        left_edge ^= (static_cast<uword>(1)) << shift;
+      } while (left_edge != 0);
     }
+
+    // Traverse the middle, full part.
+    for (size_t i = index_start + 1; i < index_end; ++i) {
+      uword w = bitmap_begin_[i];
+      if (w != 0) {
+        const uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
+        do {
+          const size_t shift = CTZ(w);
+          mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
+          visitor(obj);
+          w ^= (static_cast<uword>(1)) << shift;
+        } while (w != 0);
+      }
+    }
+
+    // Right edge is unique.
+    right_edge = bitmap_begin_[index_end];
+  } else {
+    // Right edge = left edge.
+    right_edge = left_edge;
   }
 
-  // Handle the right edge, and also the left edge if both edges are on the same word.
-  size_t right_bits = bit_index_end & (kBitsPerWord - 1);
-
-  // If word_start == word_end then we need to use the word which we removed the left bits.
-  if (word_start <= word_end) {
-    edge_word = bitmap_begin_[word_end];
+  // Right edge handling: keep bits up to and including bit_end, the last bit in range.
+  right_edge &= ((static_cast<uword>(1) << bit_end) - 1) | (static_cast<uword>(1) << bit_end);
+  if (right_edge != 0) {
+    const uintptr_t ptr_base = IndexToOffset(index_end) + heap_begin_;
+    do {
+      const size_t shift = CTZ(right_edge);
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
+      visitor(obj);
+      right_edge ^= (static_cast<uword>(1)) << shift;
+    } while (right_edge != 0);
   }
-
-  // Bits that we trim off the right.
-  edge_word &= ~((static_cast<size_t>(kWordHighBitMask) >> right_bits) - 1);
-  uintptr_t ptr_base = IndexToOffset(word_end) + heap_begin_;
-  while (edge_word != 0) {
-    const size_t shift = CLZ(edge_word);
-    mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
-    visitor(obj);
-    edge_word ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-  }
-#endif
 }
 
 inline bool SpaceBitmap::Modify(const mirror::Object* obj, bool do_set) {
@@ -133,10 +138,10 @@
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
-  const word mask = OffsetToMask(offset);
+  const uword mask = OffsetToMask(offset);
   DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
-  word* address = &bitmap_begin_[index];
-  word old_word = *address;
+  uword* address = &bitmap_begin_[index];
+  uword old_word = *address;
   if (do_set) {
     *address = old_word | mask;
   } else {
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index ad4ff1b..1957c21 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -53,7 +53,7 @@
 SpaceBitmap* SpaceBitmap::CreateFromMemMap(const std::string& name, MemMap* mem_map,
                                            byte* heap_begin, size_t heap_capacity) {
   CHECK(mem_map != nullptr);
-  word* bitmap_begin = reinterpret_cast<word*>(mem_map->Begin());
+  uword* bitmap_begin = reinterpret_cast<uword*>(mem_map->Begin());
   size_t bitmap_size = OffsetToIndex(RoundUp(heap_capacity, kAlignment * kBitsPerWord)) * kWordSize;
   return new SpaceBitmap(name, mem_map, bitmap_begin, bitmap_size, heap_begin);
 }
@@ -107,16 +107,16 @@
   CHECK(callback != NULL);
 
   uintptr_t end = OffsetToIndex(HeapLimit() - heap_begin_ - 1);
-  word* bitmap_begin = bitmap_begin_;
+  uword* bitmap_begin = bitmap_begin_;
   for (uintptr_t i = 0; i <= end; ++i) {
-    word w = bitmap_begin[i];
+    uword w = bitmap_begin[i];
     if (w != 0) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       do {
-        const size_t shift = CLZ(w);
+        const size_t shift = CTZ(w);
         mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
         (*callback)(obj, arg);
-        w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
+        w ^= (static_cast<uword>(1)) << shift;
       } while (w != 0);
     }
   }
@@ -150,15 +150,15 @@
   size_t start = OffsetToIndex(sweep_begin - live_bitmap.heap_begin_);
   size_t end = OffsetToIndex(sweep_end - live_bitmap.heap_begin_ - 1);
   CHECK_LT(end, live_bitmap.Size() / kWordSize);
-  word* live = live_bitmap.bitmap_begin_;
-  word* mark = mark_bitmap.bitmap_begin_;
+  uword* live = live_bitmap.bitmap_begin_;
+  uword* mark = mark_bitmap.bitmap_begin_;
   for (size_t i = start; i <= end; i++) {
-    word garbage = live[i] & ~mark[i];
+    uword garbage = live[i] & ~mark[i];
     if (UNLIKELY(garbage != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + live_bitmap.heap_begin_;
       do {
-        const size_t shift = CLZ(garbage);
-        garbage ^= static_cast<size_t>(kWordHighBitMask) >> shift;
+        const size_t shift = CTZ(garbage);
+        garbage ^= (static_cast<uword>(1)) << shift;
         *pb++ = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
       } while (garbage != 0);
       // Make sure that there are always enough slots available for an
@@ -254,14 +254,15 @@
   CHECK(callback != NULL);
   uintptr_t end = Size() / kWordSize;
   for (uintptr_t i = 0; i < end; ++i) {
-    word w = bitmap_begin_[i];
+    // Use an unsigned word so the shift and XOR below are well defined.
+    uword w = bitmap_begin_[i];
     if (UNLIKELY(w != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       while (w != 0) {
-        const size_t shift = CLZ(w);
+        const size_t shift = CTZ(w);
         mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
         WalkFieldsInOrder(visited.get(), callback, obj, arg);
-        w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
+        w ^= (static_cast<uword>(1)) << shift;
       }
     }
   }
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index 5fd2bce..aa24b03 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -70,9 +70,9 @@
     return static_cast<uintptr_t>(index * kAlignment * kBitsPerWord);
   }
 
-  // Pack the bits in backwards so they come out in address order when using CLZ.
-  static word OffsetToMask(uintptr_t offset) {
-    return static_cast<uintptr_t>(kWordHighBitMask) >> ((offset / kAlignment) % kBitsPerWord);
+  // Bit i of a word marks slot i (LSB first), so CTZ-based walks visit objects in address order.
+  static uword OffsetToMask(uintptr_t offset) {
+    return (static_cast<uword>(1)) << ((offset / kAlignment) % kBitsPerWord);
   }
 
   inline bool Set(const mirror::Object* obj) {
@@ -140,7 +140,7 @@
   void CopyFrom(SpaceBitmap* source_bitmap);
 
   // Starting address of our internal storage.
-  word* Begin() {
+  uword* Begin() {
     return bitmap_begin_;
   }
 
@@ -181,7 +181,7 @@
  private:
   // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
   // however, we document that this is expected on heap_end_
-  SpaceBitmap(const std::string& name, MemMap* mem_map, word* bitmap_begin, size_t bitmap_size,
+  SpaceBitmap(const std::string& name, MemMap* mem_map, uword* bitmap_begin, size_t bitmap_size,
               const void* heap_begin)
       : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
         heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
@@ -193,7 +193,7 @@
   UniquePtr<MemMap> mem_map_;
 
   // This bitmap itself, word sized for efficiency in scanning.
-  word* const bitmap_begin_;
+  uword* const bitmap_begin_;
 
   // Size of this bitmap.
   size_t bitmap_size_;
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 19fdc63..f5f6f16 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -565,7 +565,7 @@
 
   if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) {
     // Use a thread-local run.
-    Run* thread_local_run = reinterpret_cast<Run*>(self->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
     if (UNLIKELY(thread_local_run == NULL)) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
       thread_local_run = RefillRun(self, idx);
@@ -575,7 +575,7 @@
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
       thread_local_run->is_thread_local_ = 1;
-      self->rosalloc_runs_[idx] = thread_local_run;
+      self->SetRosAllocRun(idx, thread_local_run);
       DCHECK(!thread_local_run->IsFull());
     }
 
@@ -600,7 +600,7 @@
       } else {
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
-        self->rosalloc_runs_[idx] = NULL;
+        self->SetRosAllocRun(idx, nullptr);
         thread_local_run->is_thread_local_ = 0;
         if (kIsDebugBuild) {
           full_runs_[idx].insert(thread_local_run);
@@ -619,7 +619,7 @@
         DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
         DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
         thread_local_run->is_thread_local_ = 1;
-        self->rosalloc_runs_[idx] = thread_local_run;
+        self->SetRosAllocRun(idx, thread_local_run);
         DCHECK(!thread_local_run->IsFull());
       }
 
@@ -1602,11 +1602,11 @@
   WriterMutexLock wmu(self, bulk_free_lock_);
   for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
-    Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
     if (thread_local_run != NULL) {
       DCHECK_EQ(thread_local_run->magic_num_, kMagicNum);
       DCHECK_NE(thread_local_run->is_thread_local_, 0);
-      thread->rosalloc_runs_[idx] = NULL;
+      thread->SetRosAllocRun(idx, nullptr);
       // Note the thread local run may not be full here.
       bool dont_care;
       thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
@@ -1659,7 +1659,7 @@
     WriterMutexLock wmu(self, bulk_free_lock_);
     for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
-      Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
       DCHECK(thread_local_run == nullptr);
     }
   }
@@ -1924,7 +1924,7 @@
       Thread* thread = *it;
       for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
         MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
-        Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[i]);
+        Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
         if (thread_local_run == this) {
           CHECK(!owner_found)
               << "A thread local run has more than one owner thread " << Dump();
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index ccfa9cf..5b7b8a2 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -128,7 +128,7 @@
   // Mark all reachable objects, done concurrently.
   virtual void MarkingPhase() = 0;
 
-  // Only called for concurrent GCs.
+  // Phase of the GC which is run with mutator lock exclusively held.
   virtual void PausePhase();
 
   // Called with mutators running.
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 91ccd64..ca2d0bd 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -343,9 +343,9 @@
 
 inline void MarkSweep::MarkObjectNonNull(Object* obj) {
   DCHECK(obj != nullptr);
-  if (kUseBrooksPointer) {
-    // Verify all the objects have the correct Brooks pointer installed.
-    obj->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    // Verify all the objects have the correct read barrier pointer installed.
+    obj->AssertReadBarrierPointer();
   }
   if (immune_region_.ContainsObject(obj)) {
     if (kCountMarkedObjects) {
@@ -415,9 +415,9 @@
 
 inline bool MarkSweep::MarkObjectParallel(const Object* obj) {
   DCHECK(obj != nullptr);
-  if (kUseBrooksPointer) {
-    // Verify all the objects have the correct Brooks pointer installed.
-    obj->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    // Verify all the objects have the correct read barrier pointer installed.
+    obj->AssertReadBarrierPointer();
   }
   if (immune_region_.ContainsObject(obj)) {
     DCHECK(IsMarked(obj));
diff --git a/runtime/gc/collector/semi_space-inl.h b/runtime/gc/collector/semi_space-inl.h
index d60298b6..df731ff 100644
--- a/runtime/gc/collector/semi_space-inl.h
+++ b/runtime/gc/collector/semi_space-inl.h
@@ -45,9 +45,9 @@
   if (obj == nullptr) {
     return;
   }
-  if (kUseBrooksPointer) {
+  if (kUseBakerOrBrooksReadBarrier) {
     // Verify all the objects have the correct forward pointer installed.
-    obj->AssertSelfBrooksPointer();
+    obj->AssertReadBarrierPointer();
   }
   if (!immune_region_.ContainsObject(obj)) {
     if (from_space_->HasAddress(obj)) {
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 222bd63..1366858 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -561,11 +561,13 @@
   // references.
   saved_bytes_ +=
       CopyAvoidingDirtyingPages(reinterpret_cast<void*>(forward_address), obj, object_size);
-  if (kUseBrooksPointer) {
-    obj->AssertSelfBrooksPointer();
-    DCHECK_EQ(forward_address->GetBrooksPointer(), obj);
-    forward_address->SetBrooksPointer(forward_address);
-    forward_address->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    obj->AssertReadBarrierPointer();
+    if (kUseBrooksReadBarrier) {
+      DCHECK_EQ(forward_address->GetReadBarrierPointer(), obj);
+      forward_address->SetReadBarrierPointer(forward_address);
+    }
+    forward_address->AssertReadBarrierPointer();
   }
   if (to_space_live_bitmap_ != nullptr) {
     to_space_live_bitmap_->Set(forward_address);
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 8bfe793..25f20d6 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -73,9 +73,11 @@
   DCHECK_GT(bytes_allocated, 0u);
   DCHECK_GT(usable_size, 0u);
   obj->SetClass(klass);
-  if (kUseBrooksPointer) {
-    obj->SetBrooksPointer(obj);
-    obj->AssertSelfBrooksPointer();
+  if (kUseBakerOrBrooksReadBarrier) {
+    if (kUseBrooksReadBarrier) {
+      obj->SetReadBarrierPointer(obj);
+    }
+    obj->AssertReadBarrierPointer();
   }
   if (collector::SemiSpace::kUseRememberedSet && UNLIKELY(allocator == kAllocatorTypeNonMoving)) {
     // (Note this if statement will be constant folded away for the
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 1a32a9a..915e54f 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -21,7 +21,6 @@
 
 #include <limits>
 #include <vector>
-#include <valgrind.h>
 
 #include "base/histogram-inl.h"
 #include "base/stl_util.h"
@@ -150,7 +149,7 @@
       total_allocation_time_(0),
       verify_object_mode_(kVerifyObjectModeDisabled),
       disable_moving_gc_count_(0),
-      running_on_valgrind_(RUNNING_ON_VALGRIND > 0),
+      running_on_valgrind_(Runtime::Current()->RunningOnValgrind()),
       use_tlab_(use_tlab) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
@@ -1605,11 +1604,13 @@
     }
     // Copy the object over to its new location.
     memcpy(reinterpret_cast<void*>(forward_address), obj, object_size);
-    if (kUseBrooksPointer) {
-      obj->AssertSelfBrooksPointer();
-      DCHECK_EQ(forward_address->GetBrooksPointer(), obj);
-      forward_address->SetBrooksPointer(forward_address);
-      forward_address->AssertSelfBrooksPointer();
+    if (kUseBakerOrBrooksReadBarrier) {
+      obj->AssertReadBarrierPointer();
+      if (kUseBrooksReadBarrier) {
+        DCHECK_EQ(forward_address->GetReadBarrierPointer(), obj);
+        forward_address->SetReadBarrierPointer(forward_address);
+      }
+      forward_address->AssertReadBarrierPointer();
     }
     return forward_address;
   }
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 6148894..a955cc8 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -213,7 +213,7 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_pos_ - thread->thread_local_start_;
+      total += thread->GetThreadLocalBytesAllocated();
     }
   }
   return total;
@@ -231,15 +231,15 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_objects_;
+      total += thread->GetThreadLocalObjectsAllocated();
     }
   }
   return total;
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.FetchAndAdd(thread->thread_local_objects_);
-  bytes_allocated_.FetchAndAdd(thread->thread_local_pos_ - thread->thread_local_start_);
+  objects_allocated_.FetchAndAdd(thread->GetThreadLocalObjectsAllocated());
+  bytes_allocated_.FetchAndAdd(thread->GetThreadLocalBytesAllocated());
   thread->SetTlab(nullptr, nullptr);
 }
 
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 30c2edb..60f566c 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -60,7 +60,7 @@
 
   // Everything is set so record in immutable structure and leave
   byte* begin = mem_map->Begin();
-  if (Runtime::Current()->GetHeap()->RunningOnValgrind()) {
+  if (Runtime::Current()->RunningOnValgrind()) {
     return new ValgrindMallocSpace<DlMallocSpace, void*>(
         name, mem_map, mspace, begin, end, begin + capacity, growth_limit, initial_size);
   } else {
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index bb52c66..faa539f 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -67,36 +67,6 @@
   arg_vector.push_back("--runtime-arg");
   arg_vector.push_back("-Xmx64m");
 
-  arg_vector.push_back("--runtime-arg");
-  std::string checkstr = "-implicit-checks";
-  int nchecks = 0;
-  char checksep = ':';
-
-  if (!Runtime::Current()->ExplicitNullChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "null";
-    ++nchecks;
-  }
-  if (!Runtime::Current()->ExplicitSuspendChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "suspend";
-    ++nchecks;
-  }
-
-  if (!Runtime::Current()->ExplicitStackOverflowChecks()) {
-    checkstr += checksep;
-    checksep = ',';
-    checkstr += "stack";
-    ++nchecks;
-  }
-
-  if (nchecks == 0) {
-    checkstr += ":none";
-  }
-
-  arg_vector.push_back(checkstr);
 
   for (size_t i = 0; i < boot_class_path.size(); i++) {
     arg_vector.push_back(std::string("--dex-file=") + boot_class_path[i]);
@@ -108,6 +78,8 @@
   oat_file_option_string += "oat";
   arg_vector.push_back(oat_file_option_string);
 
+  Runtime::Current()->AddCurrentRuntimeFeaturesAsDex2OatArguments(&arg_vector);
+
   arg_vector.push_back(StringPrintf("--base=0x%x", ART_BASE_ADDRESS));
 
   if (kIsTargetBuild) {
@@ -166,10 +138,8 @@
     mirror::Object* obj = reinterpret_cast<mirror::Object*>(current);
     CHECK(live_bitmap_->Test(obj));
     CHECK(obj->GetClass() != nullptr) << "Image object at address " << obj << " has null class";
-    if (kUseBrooksPointer) {
-      CHECK(obj->GetBrooksPointer() == obj)
-          << "Bad Brooks pointer: obj=" << reinterpret_cast<void*>(obj)
-          << " brooks_ptr=" << reinterpret_cast<void*>(obj->GetBrooksPointer());
+    if (kUseBakerOrBrooksReadBarrier) {
+      obj->AssertReadBarrierPointer();
     }
     current += RoundUp(obj->SizeOf(), kObjectAlignment);
   }
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 2fc67ec..0b353c7 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -97,7 +97,7 @@
       lock_("large object map space lock", kAllocSpaceLock) {}
 
 LargeObjectMapSpace* LargeObjectMapSpace::Create(const std::string& name) {
-  if (RUNNING_ON_VALGRIND > 0) {
+  if (Runtime::Current()->RunningOnValgrind()) {
     return new ValgrindLargeObjectMapSpace(name);
   } else {
     return new LargeObjectMapSpace(name);
diff --git a/runtime/gc/space/space_test.h b/runtime/gc/space/space_test.h
index 6d3602c..5c735df 100644
--- a/runtime/gc/space/space_test.h
+++ b/runtime/gc/space/space_test.h
@@ -85,8 +85,8 @@
     EXPECT_GE(size, SizeOfZeroLengthByteArray());
     EXPECT_TRUE(byte_array_class != nullptr);
     o->SetClass(byte_array_class);
-    if (kUseBrooksPointer) {
-      o->SetBrooksPointer(o);
+    if (kUseBrooksReadBarrier) {
+      o->SetReadBarrierPointer(o);
     }
     mirror::Array* arr = o->AsArray<kVerifyNone>();
     size_t header_size = SizeOfZeroLengthByteArray();
diff --git a/runtime/globals.h b/runtime/globals.h
index 9c6fa0d..f2d6862 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -19,7 +19,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include "brooks_pointer.h"
+#include "read_barrier.h"
 
 namespace art {
 
@@ -97,12 +97,20 @@
 // code, if possible.
 static constexpr bool kEmbedClassInCode = true;
 
-#ifdef USE_BROOKS_POINTER
-static constexpr bool kUseBrooksPointer = true;
+#ifdef USE_BAKER_READ_BARRIER
+static constexpr bool kUseBakerReadBarrier = true;
 #else
-static constexpr bool kUseBrooksPointer = false;
+static constexpr bool kUseBakerReadBarrier = false;
 #endif
 
+#ifdef USE_BROOKS_READ_BARRIER
+static constexpr bool kUseBrooksReadBarrier = true;
+#else
+static constexpr bool kUseBrooksReadBarrier = false;
+#endif
+
+static constexpr bool kUseBakerOrBrooksReadBarrier = kUseBakerReadBarrier || kUseBrooksReadBarrier;
+
 // If true, references within the heap are poisoned (negated).
 static constexpr bool kPoisonHeapReferences = false;
 
diff --git a/runtime/instruction_set.cc b/runtime/instruction_set.cc
new file mode 100644
index 0000000..c964629
--- /dev/null
+++ b/runtime/instruction_set.cc
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "instruction_set.h"
+
+namespace art {
+
+// Returns the enabled instruction set features as a string: "div" when kHwDiv is set in mask_,
+// or "none" when no feature bit is set.
+std::string InstructionSetFeatures::GetFeatureString() const {
+  std::string result;
+  if ((mask_ & kHwDiv) != 0) {
+    result += "div";
+  }
+  if (result.empty()) {
+    result = "none";
+  }
+  return result;
+}
+
+}  // namespace art
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index cbc9912..c5a4ec8 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -33,6 +33,7 @@
   kX86_64,
   kMips
 };
+std::ostream& operator<<(std::ostream& os, const InstructionSet& rhs);
 
 enum InstructionFeatures {
   kHwDiv = 1                  // Supports hardware divide.
@@ -44,6 +45,8 @@
   InstructionSetFeatures() : mask_(0) {}
   explicit InstructionSetFeatures(uint32_t mask) : mask_(mask) {}
 
+  static InstructionSetFeatures GuessInstructionSetFeatures();
+
   bool HasDivideInstruction() const {
       return (mask_ & kHwDiv) != 0;
   }
@@ -52,20 +55,7 @@
     mask_ = (mask_ & ~kHwDiv) | (v ? kHwDiv : 0);
   }
 
-  std::string GetFeatureString() const {
-    std::string result;
-    if ((mask_ & kHwDiv) != 0) {
-      result += "div";
-    }
-    if (result.size() == 0) {
-      result = "none";
-    }
-    return result;
-  }
-
-  uint32_t get_mask() const {
-    return mask_;
-  }
+  std::string GetFeatureString() const;
 
   // Other features in here.
 
@@ -81,8 +71,6 @@
   uint32_t mask_;
 };
 
-std::ostream& operator<<(std::ostream& os, const InstructionSet& rhs);
-
 }  // namespace art
 
 #endif  // ART_RUNTIME_INSTRUCTION_SET_H_
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index abe7fe1..1bf0078 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -68,7 +68,7 @@
     result->SetL(Array::CreateMultiArray(self, sirt_class, sirt_dimensions));
   } else if (name == "java.lang.Object java.lang.Throwable.nativeFillInStackTrace()") {
     ScopedObjectAccessUnchecked soa(self);
-    result->SetL(soa.Decode<Object*>(self->CreateInternalStackTrace(soa)));
+    result->SetL(soa.Decode<Object*>(self->CreateInternalStackTrace<true>(soa)));
   } else if (name == "int java.lang.System.identityHashCode(java.lang.Object)") {
     mirror::Object* obj = reinterpret_cast<Object*>(args[0]);
     result->SetI((obj != nullptr) ? obj->IdentityHashCode() : 0);
@@ -96,7 +96,7 @@
     result->SetI(Primitive::ComponentSize(primitive_type));
   } else {
     // Throw an exception so we can abort the transaction and undo every change.
-    ThrowLocation throw_location;
+    ThrowLocation throw_location = self->GetCurrentLocationForThrow();
     self->ThrowNewExceptionF(throw_location, "Ljava/lang/InternalError;",
                              "Attempt to invoke native method in non-started runtime: %s",
                              name.c_str());
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 13aa77f..f7aeffd 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2829,7 +2829,7 @@
       local_ref_cookie(IRT_FIRST_SEGMENT),
       locals(kLocalsInitial, kLocalsMax, kLocal),
       check_jni(false),
-      critical(false),
+      critical(0),
       monitors("monitors", kMonitorsInitial, kMonitorsMax) {
   functions = unchecked_functions = &gJniNativeInterface;
   if (vm->check_jni) {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index ddc07ff..d955b97 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -391,11 +391,8 @@
   void SetComponentType(Class* new_component_type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(GetComponentType() == NULL);
     DCHECK(new_component_type != NULL);
-    if (Runtime::Current()->IsActiveTransaction()) {
-      SetFieldObject<true>(ComponentTypeOffset(), new_component_type, false);
-    } else {
-      SetFieldObject<false>(ComponentTypeOffset(), new_component_type, false);
-    }
+    // Component type is invariant: use non-transactional mode without check.
+    SetFieldObject<false, false>(ComponentTypeOffset(), new_component_type, false);
   }
 
   size_t GetComponentSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 527b8a6..b6c140d 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -93,33 +93,41 @@
   Monitor::Wait(self, this, ms, ns, true, kTimedWaiting);
 }
 
-inline Object* Object::GetBrooksPointer() {
-#ifdef USE_BROOKS_POINTER
-  DCHECK(kUseBrooksPointer);
-  return GetFieldObject<Object, kVerifyNone>(OFFSET_OF_OBJECT_MEMBER(Object, x_brooks_ptr_), false);
+inline Object* Object::GetReadBarrierPointer() {
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+  DCHECK(kUseBakerOrBrooksReadBarrier);
+  return GetFieldObject<Object, kVerifyNone>(OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_), false);
 #else
   LOG(FATAL) << "Unreachable";
   return nullptr;
 #endif
 }
 
-inline void Object::SetBrooksPointer(Object* brooks_pointer) {
-#ifdef USE_BROOKS_POINTER
-  DCHECK(kUseBrooksPointer);
+inline void Object::SetReadBarrierPointer(Object* rb_pointer) {
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+  DCHECK(kUseBakerOrBrooksReadBarrier);
   // We don't mark the card as this occurs as part of object allocation. Not all objects have
   // backing cards, such as large objects.
   SetFieldObjectWithoutWriteBarrier<false, false, kVerifyNone>(
-      OFFSET_OF_OBJECT_MEMBER(Object, x_brooks_ptr_), brooks_pointer, false);
+      OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_), rb_pointer, false);
 #else
   LOG(FATAL) << "Unreachable";
 #endif
 }
 
-inline void Object::AssertSelfBrooksPointer() const {
-#ifdef USE_BROOKS_POINTER
-  DCHECK(kUseBrooksPointer);
+inline void Object::AssertReadBarrierPointer() const {
+#if defined(USE_BAKER_READ_BARRIER)
+  DCHECK(kUseBakerReadBarrier);
   Object* obj = const_cast<Object*>(this);
-  DCHECK_EQ(obj, obj->GetBrooksPointer());
+  DCHECK(obj->GetReadBarrierPointer() == nullptr)
+      << "Bad Baker pointer: obj=" << reinterpret_cast<void*>(obj)
+      << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
+#elif defined(USE_BROOKS_READ_BARRIER)
+  DCHECK(kUseBrooksReadBarrier);
+  Object* obj = const_cast<Object*>(this);
+  DCHECK_EQ(obj, obj->GetReadBarrierPointer())
+      << "Bad Brooks pointer: obj=" << reinterpret_cast<void*>(obj)
+      << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer());
 #else
   LOG(FATAL) << "Unreachable";
 #endif
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 0a77828..1ac23ce 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -78,9 +78,9 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   void SetClass(Class* new_klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  Object* GetBrooksPointer() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetBrooksPointer(Object* brooks_pointer) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void AssertSelfBrooksPointer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  Object* GetReadBarrierPointer() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetReadBarrierPointer(Object* rb_pointer) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void AssertReadBarrierPointer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // The verifier treats all interfaces as java.lang.Object and relies on runtime checks in
   // invoke-interface to detect incompatible interface types.
@@ -289,12 +289,12 @@
   // Monitor and hash code information.
   uint32_t monitor_;
 
-#ifdef USE_BROOKS_POINTER
-  // Note names use a 'x' prefix and the x_brooks_ptr_ is of type int
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+  // Note names use an 'x' prefix and the x_rb_ptr_ is of type int
   // instead of Object to go with the alphabetical/by-type field order
   // on the Java side.
-  uint32_t x_brooks_ptr_;  // For the Brooks pointer.
-  uint32_t x_padding_;     // For 8-byte alignment. TODO: get rid of this.
+  uint32_t x_rb_ptr_;      // For the Baker or Brooks pointer.
+  uint32_t x_xpadding_;    // For 8-byte alignment. TODO: get rid of this.
 #endif
 
   friend class art::ImageWriter;
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 7d8da14..32f30c3 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -425,8 +425,8 @@
   EXPECT_EQ(string->GetLength(), 7);
   EXPECT_EQ(string->GetUtfLength(), 7);
 
-  string->SetOffset<false>(2);
-  string->SetCount<false>(5);
+  string->SetOffset(2);
+  string->SetCount(5);
   EXPECT_TRUE(string->Equals("droid"));
   EXPECT_EQ(string->GetLength(), 5);
   EXPECT_EQ(string->GetUtfLength(), 5);
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index d4f11b2..88a8e6f 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -59,10 +59,11 @@
   return -1;
 }
 
-template<bool kTransactionActive>
 void String::SetArray(CharArray* new_array) {
+  // Array is invariant so use non-transactional mode. Also disable check as we may run inside
+  // a transaction.
   DCHECK(new_array != NULL);
-  SetFieldObject<kTransactionActive>(OFFSET_OF_OBJECT_MEMBER(String, array_), new_array, false);
+  SetFieldObject<false, false>(OFFSET_OF_OBJECT_MEMBER(String, array_), new_array, false);
 }
 
 // TODO: get global references for these
@@ -168,13 +169,8 @@
   // Hold reference in case AllocObject causes GC.
   String* string = down_cast<String*>(GetJavaLangString()->AllocObject(self));
   if (LIKELY(string != nullptr)) {
-    if (Runtime::Current()->IsActiveTransaction()) {
-      string->SetArray<true>(array.get());
-      string->SetCount<true>(array->GetLength());
-    } else {
-      string->SetArray<false>(array.get());
-      string->SetCount<false>(array->GetLength());
-    }
+    string->SetArray(array.get());
+    string->SetCount(array->GetLength());
   }
   return string;
 }
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 1340e7d..de9e4c4 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -118,17 +118,18 @@
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, hash_code_), new_hash_code, false);
   }
 
-  template<bool kTransactionActive>
   void SetCount(int32_t new_count) {
+    // Count is invariant so use non-transactional mode. Also disable check as we may run inside
+    // a transaction.
     DCHECK_LE(0, new_count);
-    SetField32<kTransactionActive>(OFFSET_OF_OBJECT_MEMBER(String, count_), new_count, false);
+    SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, count_), new_count, false);
   }
 
-  template<bool kTransactionActive>
   void SetOffset(int32_t new_offset) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // Offset is only used during testing so use non-transactional mode.
     DCHECK_LE(0, new_offset);
     DCHECK_GE(GetLength(), new_offset);
-    SetField32<kTransactionActive>(OFFSET_OF_OBJECT_MEMBER(String, offset_), new_offset, false);
+    SetField32<false>(OFFSET_OF_OBJECT_MEMBER(String, offset_), new_offset, false);
   }
 
   static String* Alloc(Thread* self, int32_t utf16_length)
@@ -137,7 +138,6 @@
   static String* Alloc(Thread* self, const SirtRef<CharArray>& array)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  template<bool kTransactionActive>
   void SetArray(CharArray* new_array) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 332aef0..bcaf8ec 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -157,7 +157,7 @@
 void Monitor::AppendToWaitSet(Thread* thread) {
   DCHECK(owner_ == Thread::Current());
   DCHECK(thread != NULL);
-  DCHECK(thread->wait_next_ == NULL) << thread->wait_next_;
+  DCHECK(thread->GetWaitNext() == nullptr) << thread->GetWaitNext();
   if (wait_set_ == NULL) {
     wait_set_ = thread;
     return;
@@ -165,10 +165,10 @@
 
   // push_back.
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    t = t->wait_next_;
+  while (t->GetWaitNext() != nullptr) {
+    t = t->GetWaitNext();
   }
-  t->wait_next_ = thread;
+  t->SetWaitNext(thread);
 }
 
 /*
@@ -182,19 +182,19 @@
     return;
   }
   if (wait_set_ == thread) {
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     return;
   }
 
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    if (t->wait_next_ == thread) {
-      t->wait_next_ = thread->wait_next_;
-      thread->wait_next_ = NULL;
+  while (t->GetWaitNext() != NULL) {
+    if (t->GetWaitNext() == thread) {
+      t->SetWaitNext(thread->GetWaitNext());
+      thread->SetWaitNext(nullptr);
       return;
     }
-    t = t->wait_next_;
+    t = t->GetWaitNext();
   }
 }
 
@@ -226,6 +226,7 @@
     monitor_lock_.Unlock(self);  // Let go of locks in order.
     {
       ScopedThreadStateChange tsc(self, kBlocked);  // Change to blocked and give up mutator_lock_.
+      self->SetMonitorEnterObject(obj_);
       MutexLock mu2(self, monitor_lock_);  // Reacquire monitor_lock_ without mutator_lock_ for Wait.
       if (owner_ != NULL) {  // Did the owner_ give the lock up?
         ++num_waiters_;
@@ -248,6 +249,7 @@
           }
         }
       }
+      self->SetMonitorEnterObject(nullptr);
     }
     monitor_lock_.Lock(self);  // Reacquire locks in order.
   }
@@ -447,33 +449,33 @@
   bool was_interrupted = false;
   {
     // Pseudo-atomically wait on self's wait_cond_ and release the monitor lock.
-    MutexLock mu(self, *self->wait_mutex_);
+    MutexLock mu(self, *self->GetWaitMutex());
 
     // Set wait_monitor_ to the monitor object we will be waiting on. When wait_monitor_ is
     // non-NULL a notifying or interrupting thread must signal the thread's wait_cond_ to wake it
     // up.
-    DCHECK(self->wait_monitor_ == NULL);
-    self->wait_monitor_ = this;
+    DCHECK(self->GetWaitMonitor() == nullptr);
+    self->SetWaitMonitor(this);
 
     // Release the monitor lock.
     monitor_contenders_.Signal(self);
     monitor_lock_.Unlock(self);
 
     // Handle the case where the thread was interrupted before we called wait().
-    if (self->interrupted_) {
+    if (self->IsInterruptedLocked()) {
       was_interrupted = true;
     } else {
       // Wait for a notification or a timeout to occur.
       if (why == kWaiting) {
-        self->wait_cond_->Wait(self);
+        self->GetWaitConditionVariable()->Wait(self);
       } else {
         DCHECK(why == kTimedWaiting || why == kSleeping) << why;
-        self->wait_cond_->TimedWait(self, ms, ns);
+        self->GetWaitConditionVariable()->TimedWait(self, ms, ns);
       }
-      if (self->interrupted_) {
+      if (self->IsInterruptedLocked()) {
         was_interrupted = true;
       }
-      self->interrupted_ = false;
+      self->SetInterruptedLocked(false);
     }
   }
 
@@ -485,15 +487,15 @@
     // that a thread in a waiting/sleeping state has a non-null wait_monitor_ for debugging
     // and diagnostic purposes. (If you reset this earlier, stack dumps will claim that threads
     // are waiting on "null".)
-    MutexLock mu(self, *self->wait_mutex_);
-    DCHECK(self->wait_monitor_ != NULL);
-    self->wait_monitor_ = NULL;
+    MutexLock mu(self, *self->GetWaitMutex());
+    DCHECK(self->GetWaitMonitor() != nullptr);
+    self->SetWaitMonitor(nullptr);
   }
 
   // Re-acquire the monitor and lock.
   Lock(self);
   monitor_lock_.Lock(self);
-  self->wait_mutex_->AssertNotHeld(self);
+  self->GetWaitMutex()->AssertNotHeld(self);
 
   /*
    * We remove our thread from wait set after restoring the count
@@ -516,8 +518,8 @@
      * cleared when this exception is thrown."
      */
     {
-      MutexLock mu(self, *self->wait_mutex_);
-      self->interrupted_ = false;
+      MutexLock mu(self, *self->GetWaitMutex());
+      self->SetInterruptedLocked(false);
     }
     if (interruptShouldThrow) {
       ThrowLocation throw_location = self->GetCurrentLocationForThrow();
@@ -538,13 +540,13 @@
   // Signal the first waiting thread in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
 
     // Check to see if the thread is still waiting.
-    MutexLock mu(self, *thread->wait_mutex_);
-    if (thread->wait_monitor_ != NULL) {
-      thread->wait_cond_->Signal(self);
+    MutexLock mu(self, *thread->GetWaitMutex());
+    if (thread->GetWaitMonitor() != nullptr) {
+      thread->GetWaitConditionVariable()->Signal(self);
       return;
     }
   }
@@ -561,8 +563,8 @@
   // Signal all threads in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     thread->Notify();
   }
 }
@@ -633,6 +635,7 @@
     ThreadList* thread_list = Runtime::Current()->GetThreadList();
     // Suspend the owner, inflate. First change to blocked and give up mutator_lock_.
     ScopedThreadStateChange tsc(self, kBlocked);
+    self->SetMonitorEnterObject(obj.get());
     if (lock_word == obj->GetLockWord()) {  // If lock word hasn't changed.
       bool timed_out;
       Thread* owner = thread_list->SuspendThreadByThreadId(owner_thread_id, false, &timed_out);
@@ -647,6 +650,7 @@
         thread_list->Resume(owner, false);
       }
     }
+    self->SetMonitorEnterObject(nullptr);
   }
 }
 
@@ -867,59 +871,64 @@
 }
 
+// Appends to |os| one line describing what |thread| is waiting on, sleeping on or blocked trying
+// to lock. Writes nothing unless the thread state is kWaiting, kTimedWaiting, kSleeping or
+// kBlocked. If the object cannot be determined, "an unknown object" is printed in its place.
 void Monitor::DescribeWait(std::ostream& os, const Thread* thread) {
-  ThreadState state = thread->GetState();
-
-  int32_t object_identity_hashcode = 0;
+  // Determine the wait message and object we're waiting or blocked upon.
+  mirror::Object* pretty_object = nullptr;
+  const char* wait_message = nullptr;
   uint32_t lock_owner = ThreadList::kInvalidThreadId;
-  std::string pretty_type;
+  ThreadState state = thread->GetState();
   if (state == kWaiting || state == kTimedWaiting || state == kSleeping) {
-    if (state == kSleeping) {
-      os << "  - sleeping on ";
-    } else {
-      os << "  - waiting on ";
-    }
-    {
-      Thread* self = Thread::Current();
-      MutexLock mu(self, *thread->wait_mutex_);
-      Monitor* monitor = thread->wait_monitor_;
-      if (monitor != NULL) {
-        mirror::Object* object = monitor->obj_;
-        object_identity_hashcode = object->IdentityHashCode();
-        pretty_type = PrettyTypeOf(object);
-      }
+    wait_message = (state == kSleeping) ? "  - sleeping on " : "  - waiting on ";
+    Thread* self = Thread::Current();
+    // Read the wait monitor under the thread's wait mutex, as Monitor::Wait does when setting it.
+    MutexLock mu(self, *thread->GetWaitMutex());
+    Monitor* monitor = thread->GetWaitMonitor();
+    if (monitor != nullptr) {
+      pretty_object = monitor->obj_;
     }
   } else if (state == kBlocked) {
-    os << "  - waiting to lock ";
-    mirror::Object* object = thread->monitor_enter_object_;
-    if (object != NULL) {
-      object_identity_hashcode = object->IdentityHashCode();
-      lock_owner = object->GetLockOwnerThreadId();
-      pretty_type = PrettyTypeOf(object);
+    wait_message = "  - waiting to lock ";
+    pretty_object = thread->GetMonitorEnterObject();
+    if (pretty_object != nullptr) {
+      lock_owner = pretty_object->GetLockOwnerThreadId();
     }
-  } else {
-    // We're not waiting on anything.
-    return;
   }
 
-  // - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
-  os << StringPrintf("<0x%08x> (a %s)", object_identity_hashcode, pretty_type.c_str());
-
-  // - waiting to lock <0x613f83d8> (a java.lang.Object) held by thread 5
-  if (lock_owner != ThreadList::kInvalidThreadId) {
-    os << " held by thread " << lock_owner;
+  if (wait_message != nullptr) {
+    if (pretty_object == nullptr) {
+      os << wait_message << "an unknown object";
+    } else {
+      if ((pretty_object->GetLockWord().GetState() == LockWord::kThinLocked) &&
+          Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
+        // Getting the identity hashcode here would result in lock inflation and suspension of the
+        // current thread, which isn't safe if this is the only runnable thread.
+        os << wait_message << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)",
+                                           reinterpret_cast<uintptr_t>(pretty_object),
+                                           PrettyTypeOf(pretty_object).c_str());
+      } else {
+        // - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
+        os << wait_message << StringPrintf("<0x%08x> (a %s)", pretty_object->IdentityHashCode(),
+                                           PrettyTypeOf(pretty_object).c_str());
+      }
+    }
+    // - waiting to lock <0x613f83d8> (a java.lang.Object) held by thread 5
+    if (lock_owner != ThreadList::kInvalidThreadId) {
+      os << " held by thread " << lock_owner;
+    }
+    os << "\n";
   }
-
-  os << "\n";
 }
 
 mirror::Object* Monitor::GetContendedMonitor(Thread* thread) {
   // This is used to implement JDWP's ThreadReference.CurrentContendedMonitor, and has a bizarre
   // definition of contended that includes a monitor a thread is trying to enter...
-  mirror::Object* result = thread->monitor_enter_object_;
+  mirror::Object* result = thread->GetMonitorEnterObject();
   if (result == NULL) {
     // ...but also a monitor that the thread is waiting on.
-    MutexLock mu(Thread::Current(), *thread->wait_mutex_);
-    Monitor* monitor = thread->wait_monitor_;
+    MutexLock mu(Thread::Current(), *thread->GetWaitMutex());
+    Monitor* monitor = thread->GetWaitMonitor();
     if (monitor != NULL) {
       result = monitor->GetObject();
     }
@@ -1118,7 +1123,7 @@
       Monitor* mon = lock_word.FatLockMonitor();
       owner_ = mon->owner_;
       entry_count_ = 1 + mon->lock_count_;
-      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->wait_next_) {
+      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->GetWaitNext()) {
         waiters_.push_back(waiter);
       }
       break;
diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc
index 9975bf7..cf31064 100644
--- a/runtime/native/dalvik_system_VMStack.cc
+++ b/runtime/native/dalvik_system_VMStack.cc
@@ -30,7 +30,7 @@
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   jobject trace = nullptr;
   if (soa.Decode<mirror::Object*>(peer) == soa.Self()->GetPeer()) {
-    trace = soa.Self()->CreateInternalStackTrace(soa);
+    trace = soa.Self()->CreateInternalStackTrace<false>(soa);
   } else {
     // Suspend thread to build stack trace.
     soa.Self()->TransitionFromRunnableToSuspended(kNative);
@@ -39,7 +39,7 @@
     if (thread != nullptr) {
       // Must be runnable to create returned array.
       CHECK_EQ(soa.Self()->TransitionFromSuspendedToRunnable(), kNative);
-      trace = thread->CreateInternalStackTrace(soa);
+      trace = thread->CreateInternalStackTrace<false>(soa);
       soa.Self()->TransitionFromRunnableToSuspended(kNative);
       // Restart suspended thread.
       Runtime::Current()->GetThreadList()->Resume(thread, false);
diff --git a/runtime/native/dalvik_system_Zygote.cc b/runtime/native/dalvik_system_Zygote.cc
deleted file mode 100644
index 0307207..0000000
--- a/runtime/native/dalvik_system_Zygote.cc
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// sys/mount.h has to come before linux/fs.h due to redefinition of MS_RDONLY, MS_BIND, etc
-#include <sys/mount.h>
-#include <linux/fs.h>
-
-#include <grp.h>
-#include <paths.h>
-#include <signal.h>
-#include <stdlib.h>
-#include <sys/resource.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "cutils/fs.h"
-#include "cutils/multiuser.h"
-#include "cutils/sched_policy.h"
-#include "debugger.h"
-#include "jni_internal.h"
-#include "JNIHelp.h"
-#include "ScopedLocalRef.h"
-#include "ScopedPrimitiveArray.h"
-#include "ScopedUtfChars.h"
-#include "thread-inl.h"
-#include "utils.h"
-
-#if defined(HAVE_PRCTL)
-#include <sys/prctl.h>
-#endif
-
-#include <selinux/android.h>
-
-#if defined(__linux__)
-#include <sys/personality.h>
-#include <sys/utsname.h>
-#if defined(HAVE_ANDROID_OS)
-#include <sys/capability.h>
-#endif
-#endif
-
-namespace art {
-
-static pid_t gSystemServerPid = 0;
-
-// Must match values in dalvik.system.Zygote.
-enum MountExternalKind {
-  MOUNT_EXTERNAL_NONE = 0,
-  MOUNT_EXTERNAL_SINGLEUSER = 1,
-  MOUNT_EXTERNAL_MULTIUSER = 2,
-  MOUNT_EXTERNAL_MULTIUSER_ALL = 3,
-};
-
-// This signal handler is for zygote mode, since the zygote must reap its children
-static void SigChldHandler(int /*signal_number*/) {
-  pid_t pid;
-  int status;
-
-  while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
-     // Log process-death status that we care about.  In general it is
-     // not safe to call LOG(...) from a signal handler because of
-     // possible reentrancy.  However, we know a priori that the
-     // current implementation of LOG() is safe to call from a SIGCHLD
-     // handler in the zygote process.  If the LOG() implementation
-     // changes its locking strategy or its use of syscalls within the
-     // lazy-init critical section, its use here may become unsafe.
-    if (WIFEXITED(status)) {
-      if (WEXITSTATUS(status)) {
-        LOG(INFO) << "Process " << pid << " exited cleanly (" << WEXITSTATUS(status) << ")";
-      } else if (false) {
-        LOG(INFO) << "Process " << pid << " exited cleanly (" << WEXITSTATUS(status) << ")";
-      }
-    } else if (WIFSIGNALED(status)) {
-      if (WTERMSIG(status) != SIGKILL) {
-        LOG(INFO) << "Process " << pid << " terminated by signal (" << WTERMSIG(status) << ")";
-      } else if (false) {
-        LOG(INFO) << "Process " << pid << " terminated by signal (" << WTERMSIG(status) << ")";
-      }
-#ifdef WCOREDUMP
-      if (WCOREDUMP(status)) {
-        LOG(INFO) << "Process " << pid << " dumped core";
-      }
-#endif /* ifdef WCOREDUMP */
-    }
-
-    // If the just-crashed process is the system_server, bring down zygote
-    // so that it is restarted by init and system server will be restarted
-    // from there.
-    if (pid == gSystemServerPid) {
-      LOG(ERROR) << "Exit zygote because system server (" << pid << ") has terminated";
-      kill(getpid(), SIGKILL);
-    }
-  }
-
-  if (pid < 0) {
-    PLOG(WARNING) << "Zygote SIGCHLD error in waitpid";
-  }
-}
-
-// Configures the SIGCHLD handler for the zygote process. This is configured
-// very late, because earlier in the runtime we may fork() and exec()
-// other processes, and we want to waitpid() for those rather than
-// have them be harvested immediately.
-//
-// This ends up being called repeatedly before each fork(), but there's
-// no real harm in that.
-static void SetSigChldHandler() {
-  struct sigaction sa;
-  memset(&sa, 0, sizeof(sa));
-  sa.sa_handler = SigChldHandler;
-
-  int err = sigaction(SIGCHLD, &sa, NULL);
-  if (err < 0) {
-    PLOG(WARNING) << "Error setting SIGCHLD handler";
-  }
-}
-
-// Sets the SIGCHLD handler back to default behavior in zygote children.
-static void UnsetSigChldHandler() {
-  struct sigaction sa;
-  memset(&sa, 0, sizeof(sa));
-  sa.sa_handler = SIG_DFL;
-
-  int err = sigaction(SIGCHLD, &sa, NULL);
-  if (err < 0) {
-    PLOG(WARNING) << "Error unsetting SIGCHLD handler";
-  }
-}
-
-// Calls POSIX setgroups() using the int[] object as an argument.
-// A NULL argument is tolerated.
-static void SetGids(JNIEnv* env, jintArray javaGids) {
-  if (javaGids == NULL) {
-    return;
-  }
-
-  COMPILE_ASSERT(sizeof(gid_t) == sizeof(jint), sizeof_gid_and_jint_are_differerent);
-  ScopedIntArrayRO gids(env, javaGids);
-  CHECK(gids.get() != NULL);
-  int rc = setgroups(gids.size(), reinterpret_cast<const gid_t*>(&gids[0]));
-  if (rc == -1) {
-    PLOG(FATAL) << "setgroups failed";
-  }
-}
-
-// Sets the resource limits via setrlimit(2) for the values in the
-// two-dimensional array of integers that's passed in. The second dimension
-// contains a tuple of length 3: (resource, rlim_cur, rlim_max). NULL is
-// treated as an empty array.
-static void SetRLimits(JNIEnv* env, jobjectArray javaRlimits) {
-  if (javaRlimits == NULL) {
-    return;
-  }
-
-  rlimit rlim;
-  memset(&rlim, 0, sizeof(rlim));
-
-  for (int i = 0; i < env->GetArrayLength(javaRlimits); ++i) {
-    ScopedLocalRef<jobject> javaRlimitObject(env, env->GetObjectArrayElement(javaRlimits, i));
-    ScopedIntArrayRO javaRlimit(env, reinterpret_cast<jintArray>(javaRlimitObject.get()));
-    if (javaRlimit.size() != 3) {
-      LOG(FATAL) << "rlimits array must have a second dimension of size 3";
-    }
-
-    rlim.rlim_cur = javaRlimit[1];
-    rlim.rlim_max = javaRlimit[2];
-
-    int rc = setrlimit(javaRlimit[0], &rlim);
-    if (rc == -1) {
-      PLOG(FATAL) << "setrlimit(" << javaRlimit[0] << ", "
-                  << "{" << rlim.rlim_cur << ", " << rlim.rlim_max << "}) failed";
-    }
-  }
-}
-
-#if defined(HAVE_ANDROID_OS)
-
-// The debug malloc library needs to know whether it's the zygote or a child.
-extern "C" int gMallocLeakZygoteChild;
-
-static void EnableDebugger() {
-  // To let a non-privileged gdbserver attach to this
-  // process, we must set our dumpable flag.
-  if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) == -1) {
-    PLOG(ERROR) << "prctl(PR_SET_DUMPABLE) failed for pid " << getpid();
-  }
-  // We don't want core dumps, though, so set the core dump size to 0.
-  rlimit rl;
-  rl.rlim_cur = 0;
-  rl.rlim_max = RLIM_INFINITY;
-  if (setrlimit(RLIMIT_CORE, &rl) == -1) {
-    PLOG(ERROR) << "setrlimit(RLIMIT_CORE) failed for pid " << getpid();
-  }
-}
-
-static void EnableKeepCapabilities() {
-  int rc = prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0);
-  if (rc == -1) {
-    PLOG(FATAL) << "prctl(PR_SET_KEEPCAPS) failed";
-  }
-}
-
-static void DropCapabilitiesBoundingSet() {
-  for (int i = 0; prctl(PR_CAPBSET_READ, i, 0, 0, 0) >= 0; i++) {
-    int rc = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
-    if (rc == -1) {
-      if (errno == EINVAL) {
-        PLOG(ERROR) << "prctl(PR_CAPBSET_DROP) failed with EINVAL. Please verify "
-                    << "your kernel is compiled with file capabilities support";
-      } else {
-        PLOG(FATAL) << "prctl(PR_CAPBSET_DROP) failed";
-      }
-    }
-  }
-}
-
-static void SetCapabilities(int64_t permitted, int64_t effective) {
-  __user_cap_header_struct capheader;
-  memset(&capheader, 0, sizeof(capheader));
-  capheader.version = _LINUX_CAPABILITY_VERSION_3;
-  capheader.pid = 0;
-
-  __user_cap_data_struct capdata[2];
-  memset(&capdata, 0, sizeof(capdata));
-  capdata[0].effective = effective;
-  capdata[1].effective = effective >> 32;
-  capdata[0].permitted = permitted;
-  capdata[1].permitted = permitted >> 32;
-
-  if (capset(&capheader, &capdata[0]) == -1) {
-    PLOG(FATAL) << "capset(" << permitted << ", " << effective << ") failed";
-  }
-}
-
-static void SetSchedulerPolicy() {
-  errno = -set_sched_policy(0, SP_DEFAULT);
-  if (errno != 0) {
-    PLOG(FATAL) << "set_sched_policy(0, SP_DEFAULT) failed";
-  }
-}
-
-#else
-
-static int gMallocLeakZygoteChild = 0;
-
-static void EnableDebugger() {}
-static void EnableKeepCapabilities() {}
-static void DropCapabilitiesBoundingSet() {}
-static void SetCapabilities(int64_t, int64_t) {}
-static void SetSchedulerPolicy() {}
-
-#endif
-
-static void EnableDebugFeatures(uint32_t debug_flags) {
-  // Must match values in dalvik.system.Zygote.
-  enum {
-    DEBUG_ENABLE_DEBUGGER           = 1,
-    DEBUG_ENABLE_CHECKJNI           = 1 << 1,
-    DEBUG_ENABLE_ASSERT             = 1 << 2,
-    DEBUG_ENABLE_SAFEMODE           = 1 << 3,
-    DEBUG_ENABLE_JNI_LOGGING        = 1 << 4,
-  };
-
-  if ((debug_flags & DEBUG_ENABLE_CHECKJNI) != 0) {
-    Runtime* runtime = Runtime::Current();
-    JavaVMExt* vm = runtime->GetJavaVM();
-    if (!vm->check_jni) {
-      LOG(DEBUG) << "Late-enabling -Xcheck:jni";
-      vm->SetCheckJniEnabled(true);
-      // There's only one thread running at this point, so only one JNIEnv to fix up.
-      Thread::Current()->GetJniEnv()->SetCheckJniEnabled(true);
-    } else {
-      LOG(DEBUG) << "Not late-enabling -Xcheck:jni (already on)";
-    }
-    debug_flags &= ~DEBUG_ENABLE_CHECKJNI;
-  }
-
-  if ((debug_flags & DEBUG_ENABLE_JNI_LOGGING) != 0) {
-    gLogVerbosity.third_party_jni = true;
-    debug_flags &= ~DEBUG_ENABLE_JNI_LOGGING;
-  }
-
-  Dbg::SetJdwpAllowed((debug_flags & DEBUG_ENABLE_DEBUGGER) != 0);
-  if ((debug_flags & DEBUG_ENABLE_DEBUGGER) != 0) {
-    EnableDebugger();
-  }
-  debug_flags &= ~DEBUG_ENABLE_DEBUGGER;
-
-  // These two are for backwards compatibility with Dalvik.
-  debug_flags &= ~DEBUG_ENABLE_ASSERT;
-  debug_flags &= ~DEBUG_ENABLE_SAFEMODE;
-
-  if (debug_flags != 0) {
-    LOG(ERROR) << StringPrintf("Unknown bits set in debug_flags: %#x", debug_flags);
-  }
-}
-
-// Create a private mount namespace and bind mount appropriate emulated
-// storage for the given user.
-static bool MountEmulatedStorage(uid_t uid, jint mount_mode) {
-  if (mount_mode == MOUNT_EXTERNAL_NONE) {
-    return true;
-  }
-
-  // See storage config details at http://source.android.com/tech/storage/
-  userid_t user_id = multiuser_get_user_id(uid);
-
-  // Create a second private mount namespace for our process
-  if (unshare(CLONE_NEWNS) == -1) {
-      PLOG(WARNING) << "Failed to unshare()";
-      return false;
-  }
-
-  // Create bind mounts to expose external storage
-  if (mount_mode == MOUNT_EXTERNAL_MULTIUSER || mount_mode == MOUNT_EXTERNAL_MULTIUSER_ALL) {
-    // These paths must already be created by init.rc
-    const char* source = getenv("EMULATED_STORAGE_SOURCE");
-    const char* target = getenv("EMULATED_STORAGE_TARGET");
-    const char* legacy = getenv("EXTERNAL_STORAGE");
-    if (source == NULL || target == NULL || legacy == NULL) {
-      LOG(WARNING) << "Storage environment undefined; unable to provide external storage";
-      return false;
-    }
-
-    // Prepare source paths
-
-    // /mnt/shell/emulated/0
-    std::string source_user(StringPrintf("%s/%d", source, user_id));
-    // /storage/emulated/0
-    std::string target_user(StringPrintf("%s/%d", target, user_id));
-
-    if (fs_prepare_dir(source_user.c_str(), 0000, 0, 0) == -1
-        || fs_prepare_dir(target_user.c_str(), 0000, 0, 0) == -1) {
-      return false;
-    }
-
-    if (mount_mode == MOUNT_EXTERNAL_MULTIUSER_ALL) {
-      // Mount entire external storage tree for all users
-      if (TEMP_FAILURE_RETRY(mount(source, target, NULL, MS_BIND, NULL)) == -1) {
-        PLOG(WARNING) << "Failed to mount " << source << " to " << target;
-        return false;
-      }
-    } else {
-      // Only mount user-specific external storage
-      if (TEMP_FAILURE_RETRY(
-              mount(source_user.c_str(), target_user.c_str(), NULL, MS_BIND, NULL)) == -1) {
-        PLOG(WARNING) << "Failed to mount " << source_user << " to " << target_user;
-        return false;
-      }
-    }
-
-    if (fs_prepare_dir(legacy, 0000, 0, 0) == -1) {
-        return false;
-    }
-
-    // Finally, mount user-specific path into place for legacy users
-    if (TEMP_FAILURE_RETRY(
-            mount(target_user.c_str(), legacy, NULL, MS_BIND | MS_REC, NULL)) == -1) {
-      PLOG(WARNING) << "Failed to mount " << target_user << " to " << legacy;
-      return false;
-    }
-  } else {
-    LOG(WARNING) << "Mount mode " << mount_mode << " unsupported";
-    return false;
-  }
-
-  return true;
-}
-
-#if defined(__linux__)
-static bool NeedsNoRandomizeWorkaround() {
-#if !defined(__arm__)
-    return false;
-#else
-    int major;
-    int minor;
-    struct utsname uts;
-    if (uname(&uts) == -1) {
-        return false;
-    }
-
-    if (sscanf(uts.release, "%d.%d", &major, &minor) != 2) {
-        return false;
-    }
-
-    // Kernels before 3.4.* need the workaround.
-    return (major < 3) || ((major == 3) && (minor < 4));
-#endif
-}
-#endif
-
-// Utility to close down the Zygote socket file descriptors while
-// the child is still running as root with Zygote's privileges.  Each
-// descriptor (if any) is closed via dup2(), replacing it with a valid
-// (open) descriptor to /dev/null.
-
-static void DetachDescriptors(JNIEnv* env, jintArray fdsToClose) {
-  if (!fdsToClose) {
-    return;
-  }
-  jsize count = env->GetArrayLength(fdsToClose);
-  jint *ar = env->GetIntArrayElements(fdsToClose, 0);
-  if (!ar) {
-      PLOG(FATAL) << "Bad fd array";
-  }
-  jsize i;
-  int devnull;
-  for (i = 0; i < count; i++) {
-    devnull = open("/dev/null", O_RDWR);
-    if (devnull < 0) {
-      PLOG(FATAL) << "Failed to open /dev/null";
-      continue;
-    }
-    PLOG(VERBOSE) << "Switching descriptor " << ar[i] << " to /dev/null";
-    if (dup2(devnull, ar[i]) < 0) {
-      PLOG(FATAL) << "Failed dup2() on descriptor " << ar[i];
-    }
-    close(devnull);
-  }
-}
-
-// Utility routine to fork zygote and specialize the child process.
-static pid_t ForkAndSpecializeCommon(JNIEnv* env, uid_t uid, gid_t gid, jintArray javaGids,
-                                     jint debug_flags, jobjectArray javaRlimits,
-                                     jlong permittedCapabilities, jlong effectiveCapabilities,
-                                     jint mount_external,
-                                     jstring java_se_info, jstring java_se_name,
-                                     bool is_system_server, jintArray fdsToClose) {
-  Runtime* runtime = Runtime::Current();
-  CHECK(runtime->IsZygote()) << "runtime instance not started with -Xzygote";
-  if (!runtime->PreZygoteFork()) {
-    LOG(FATAL) << "pre-fork heap failed";
-  }
-
-  SetSigChldHandler();
-
-  // Grab thread before fork potentially makes Thread::pthread_key_self_ unusable.
-  Thread* self = Thread::Current();
-
-  // dvmDumpLoaderStats("zygote");  // TODO: ?
-  pid_t pid = fork();
-
-  if (pid == 0) {
-    // The child process.
-    gMallocLeakZygoteChild = 1;
-
-    // Clean up any descriptors which must be closed immediately
-    DetachDescriptors(env, fdsToClose);
-
-    // Keep capabilities across UID change, unless we're staying root.
-    if (uid != 0) {
-      EnableKeepCapabilities();
-    }
-
-    DropCapabilitiesBoundingSet();
-
-    if (!MountEmulatedStorage(uid, mount_external)) {
-      PLOG(WARNING) << "Failed to mount emulated storage";
-      if (errno == ENOTCONN || errno == EROFS) {
-        // When device is actively encrypting, we get ENOTCONN here
-        // since FUSE was mounted before the framework restarted.
-        // When encrypted device is booting, we get EROFS since
-        // FUSE hasn't been created yet by init.
-        // In either case, continue without external storage.
-      } else {
-        LOG(FATAL) << "Cannot continue without emulated storage";
-      }
-    }
-
-    SetGids(env, javaGids);
-
-    SetRLimits(env, javaRlimits);
-
-    int rc = setresgid(gid, gid, gid);
-    if (rc == -1) {
-      PLOG(FATAL) << "setresgid(" << gid << ") failed";
-    }
-
-    rc = setresuid(uid, uid, uid);
-    if (rc == -1) {
-      PLOG(FATAL) << "setresuid(" << uid << ") failed";
-    }
-
-#if defined(__linux__)
-    if (NeedsNoRandomizeWorkaround()) {
-        // Work around ARM kernel ASLR lossage (http://b/5817320).
-        int old_personality = personality(0xffffffff);
-        int new_personality = personality(old_personality | ADDR_NO_RANDOMIZE);
-        if (new_personality == -1) {
-            PLOG(WARNING) << "personality(" << new_personality << ") failed";
-        }
-    }
-#endif
-
-    SetCapabilities(permittedCapabilities, effectiveCapabilities);
-
-    SetSchedulerPolicy();
-
-#if defined(HAVE_ANDROID_OS)
-    {  // NOLINT(whitespace/braces)
-      const char* se_info_c_str = NULL;
-      UniquePtr<ScopedUtfChars> se_info;
-      if (java_se_info != NULL) {
-          se_info.reset(new ScopedUtfChars(env, java_se_info));
-          se_info_c_str = se_info->c_str();
-          CHECK(se_info_c_str != NULL);
-      }
-      const char* se_name_c_str = NULL;
-      UniquePtr<ScopedUtfChars> se_name;
-      if (java_se_name != NULL) {
-          se_name.reset(new ScopedUtfChars(env, java_se_name));
-          se_name_c_str = se_name->c_str();
-          CHECK(se_name_c_str != NULL);
-      }
-      rc = selinux_android_setcontext(uid, is_system_server, se_info_c_str, se_name_c_str);
-      if (rc == -1) {
-        PLOG(FATAL) << "selinux_android_setcontext(" << uid << ", "
-                    << (is_system_server ? "true" : "false") << ", "
-                    << "\"" << se_info_c_str << "\", \"" << se_name_c_str << "\") failed";
-      }
-
-      // Make it easier to debug audit logs by setting the main thread's name to the
-      // nice name rather than "app_process".
-      if (se_info_c_str == NULL && is_system_server) {
-        se_name_c_str = "system_server";
-      }
-      if (se_info_c_str != NULL) {
-        SetThreadName(se_name_c_str);
-      }
-    }
-#else
-    UNUSED(is_system_server);
-    UNUSED(java_se_info);
-    UNUSED(java_se_name);
-#endif
-
-    // Our system thread ID, etc, has changed so reset Thread state.
-    self->InitAfterFork();
-
-    EnableDebugFeatures(debug_flags);
-
-    UnsetSigChldHandler();
-    runtime->DidForkFromZygote();
-  } else if (pid > 0) {
-    // the parent process
-  }
-  return pid;
-}
-
-static jint Zygote_nativeForkAndSpecialize(JNIEnv* env, jclass, jint uid, jint gid, jintArray gids,
-                                           jint debug_flags, jobjectArray rlimits,
-                                           jint mount_external, jstring se_info, jstring se_name,
-                                           jintArray fdsToClose) {
-  return ForkAndSpecializeCommon(env, uid, gid, gids, debug_flags, rlimits, 0, 0, mount_external,
-                                 se_info, se_name, false, fdsToClose);
-}
-
-static jint Zygote_nativeForkSystemServer(JNIEnv* env, jclass, uid_t uid, gid_t gid, jintArray gids,
-                                          jint debug_flags, jobjectArray rlimits,
-                                          jlong permittedCapabilities,
-                                          jlong effectiveCapabilities) {
-  pid_t pid = ForkAndSpecializeCommon(env, uid, gid, gids,
-                                      debug_flags, rlimits,
-                                      permittedCapabilities, effectiveCapabilities,
-                                      MOUNT_EXTERNAL_NONE, NULL, NULL, true, NULL);
-  if (pid > 0) {
-      // The zygote process checks whether the child process has died or not.
-      LOG(INFO) << "System server process " << pid << " has been created";
-      gSystemServerPid = pid;
-      // There is a slight window that the system server process has crashed
-      // but it went unnoticed because we haven't published its pid yet. So
-      // we recheck here just to make sure that all is well.
-      int status;
-      if (waitpid(pid, &status, WNOHANG) == pid) {
-          LOG(FATAL) << "System server process " << pid << " has died. Restarting Zygote!";
-      }
-  }
-  return pid;
-}
-
-static JNINativeMethod gMethods[] = {
-  NATIVE_METHOD(Zygote, nativeForkAndSpecialize, "(II[II[[IILjava/lang/String;Ljava/lang/String;[I)I"),
-  NATIVE_METHOD(Zygote, nativeForkSystemServer, "(II[II[[IJJ)I"),
-};
-
-void register_dalvik_system_Zygote(JNIEnv* env) {
-  REGISTER_NATIVE_METHODS("dalvik/system/Zygote");
-}
-
-}  // namespace art
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
new file mode 100644
index 0000000..2af5324
--- /dev/null
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+
+#include "debugger.h"
+#include "jni_internal.h"
+#include "JNIHelp.h"
+#include "thread-inl.h"
+
+#if defined(HAVE_PRCTL)
+#include <sys/prctl.h>
+#endif
+
+#include <sys/resource.h>
+
+namespace art {
+
+// Lets a non-privileged debugger attach to this process and disables core dumps.
+// Both steps are best-effort: a failure is logged and execution continues.
+static void EnableDebugger() {
+#if defined(HAVE_PRCTL)
+  // To let a non-privileged gdbserver attach to this
+  // process, we must set our dumpable flag.
+  // Guarded like the <sys/prctl.h> include above so builds without HAVE_PRCTL still compile.
+  if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) == -1) {
+    PLOG(ERROR) << "prctl(PR_SET_DUMPABLE) failed for pid " << getpid();
+  }
+#endif
+  // We don't want core dumps, though, so set the core dump size to 0.
+  rlimit rl;
+  rl.rlim_cur = 0;
+  rl.rlim_max = RLIM_INFINITY;
+  if (setrlimit(RLIMIT_CORE, &rl) == -1) {
+    PLOG(ERROR) << "setrlimit(RLIMIT_CORE) failed for pid " << getpid();
+  }
+}
+
+// Applies the zygote debug_flags bit mask to the current process: late-enables CheckJNI,
+// turns on third-party JNI logging, and allows JDWP (calling EnableDebugger) when requested.
+// The assert and safe-mode bits are accepted but ignored; any other set bit is logged.
+static void EnableDebugFeatures(uint32_t debug_flags) {
+  // Must match values in dalvik.system.Zygote.
+  enum {
+    DEBUG_ENABLE_DEBUGGER           = 1,
+    DEBUG_ENABLE_CHECKJNI           = 1 << 1,
+    DEBUG_ENABLE_ASSERT             = 1 << 2,
+    DEBUG_ENABLE_SAFEMODE           = 1 << 3,
+    DEBUG_ENABLE_JNI_LOGGING        = 1 << 4,
+  };
+
+  if ((debug_flags & DEBUG_ENABLE_CHECKJNI) != 0) {
+    Runtime* runtime = Runtime::Current();
+    JavaVMExt* vm = runtime->GetJavaVM();
+    if (!vm->check_jni) {
+      LOG(DEBUG) << "Late-enabling -Xcheck:jni";
+      vm->SetCheckJniEnabled(true);
+      // There's only one thread running at this point, so only one JNIEnv to fix up.
+      Thread::Current()->GetJniEnv()->SetCheckJniEnabled(true);
+    } else {
+      LOG(DEBUG) << "Not late-enabling -Xcheck:jni (already on)";
+    }
+    debug_flags &= ~DEBUG_ENABLE_CHECKJNI;
+  }
+
+  if ((debug_flags & DEBUG_ENABLE_JNI_LOGGING) != 0) {
+    gLogVerbosity.third_party_jni = true;
+    debug_flags &= ~DEBUG_ENABLE_JNI_LOGGING;
+  }
+
+  // JDWP is allowed only when the debugger bit is set.
+  Dbg::SetJdwpAllowed((debug_flags & DEBUG_ENABLE_DEBUGGER) != 0);
+  if ((debug_flags & DEBUG_ENABLE_DEBUGGER) != 0) {
+    EnableDebugger();
+  }
+  debug_flags &= ~DEBUG_ENABLE_DEBUGGER;
+
+  // These two are for backwards compatibility with Dalvik.
+  debug_flags &= ~DEBUG_ENABLE_ASSERT;
+  debug_flags &= ~DEBUG_ENABLE_SAFEMODE;
+
+  // Every handled bit has been cleared above, so anything left is unrecognized.
+  if (debug_flags != 0) {
+    LOG(ERROR) << StringPrintf("Unknown bits set in debug_flags: %#x", debug_flags);
+  }
+}
+
+// Native half of ZygoteHooks.nativePreFork(): checks that this runtime is a zygote, runs
+// Runtime::PreZygoteFork() (LOG(FATAL) on failure), and returns the current Thread* as a jlong.
+// NOTE(review): the returned value is presumably the token handed to nativePostForkChild.
+static jlong ZygoteHooks_nativePreFork(JNIEnv* env, jclass) {
+  Runtime* runtime = Runtime::Current();
+  CHECK(runtime->IsZygote()) << "runtime instance not started with -Xzygote";
+  if (!runtime->PreZygoteFork()) {
+    LOG(FATAL) << "pre-fork heap failed";
+  }
+
+  // Grab thread before fork potentially makes Thread::pthread_key_self_ unusable.
+  Thread* self = Thread::Current();
+  return reinterpret_cast<jlong>(self);
+}
+
+// Native half of ZygoteHooks.nativePostForkChild(): re-initializes the Thread encoded in
+// token, applies debug_flags, then notifies the runtime via DidForkFromZygote().
+static void ZygoteHooks_nativePostForkChild(JNIEnv* env, jclass, jlong token, jint debug_flags) {
+  Thread* thread = reinterpret_cast<Thread*>(token);
+  // Our system thread ID, etc, has changed so reset Thread state.
+  thread->InitAfterFork();
+  EnableDebugFeatures(debug_flags);
+  Runtime::Current()->DidForkFromZygote();
+}
+
+// JNI registration table: native method names with their JNI signatures.
+static JNINativeMethod gMethods[] = {
+  NATIVE_METHOD(ZygoteHooks, nativePreFork, "()J"),
+  NATIVE_METHOD(ZygoteHooks, nativePostForkChild, "(JI)V"),
+};
+
+// Registers this file's native methods for the dalvik/system/ZygoteHooks class.
+void register_dalvik_system_ZygoteHooks(JNIEnv* env) {
+  REGISTER_NATIVE_METHODS("dalvik/system/ZygoteHooks");
+}
+
+}  // namespace art
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index de1b593..0b84005 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -104,11 +104,11 @@
 }
 
 static void Thread_nativeInterrupt(JNIEnv* env, jobject java_thread) {
-  ScopedObjectAccess soa(env);
+  ScopedFastNativeObjectAccess soa(env);
   MutexLock mu(soa.Self(), *Locks::thread_list_lock_);
   Thread* thread = Thread::FromManagedThread(soa, java_thread);
   if (thread != NULL) {
-    thread->Interrupt();
+    thread->Interrupt(soa.Self());
   }
 }
 
@@ -175,7 +175,7 @@
   NATIVE_METHOD(Thread, nativeCreate, "(Ljava/lang/Thread;JZ)V"),
   NATIVE_METHOD(Thread, nativeGetStatus, "(Z)I"),
   NATIVE_METHOD(Thread, nativeHoldsLock, "(Ljava/lang/Object;)Z"),
-  NATIVE_METHOD(Thread, nativeInterrupt, "()V"),
+  NATIVE_METHOD(Thread, nativeInterrupt, "!()V"),
   NATIVE_METHOD(Thread, nativeSetName, "(Ljava/lang/String;)V"),
   NATIVE_METHOD(Thread, nativeSetPriority, "(I)V"),
   NATIVE_METHOD(Thread, sleep, "!(Ljava/lang/Object;JI)V"),
diff --git a/runtime/native/java_lang_Throwable.cc b/runtime/native/java_lang_Throwable.cc
index d1a1105..3ed4cfe 100644
--- a/runtime/native/java_lang_Throwable.cc
+++ b/runtime/native/java_lang_Throwable.cc
@@ -22,7 +22,7 @@
 
 static jobject Throwable_nativeFillInStackTrace(JNIEnv* env, jclass) {
   ScopedFastNativeObjectAccess soa(env);
-  return soa.Self()->CreateInternalStackTrace(soa);
+  return soa.Self()->CreateInternalStackTrace<false>(soa);
 }
 
 static jobjectArray Throwable_nativeGetStackTrace(JNIEnv* env, jclass, jobject javaStackState) {
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
index a7ca0b8..5d90f1a 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
@@ -49,7 +49,7 @@
   if (static_cast<uint32_t>(thin_lock_id) == self->GetThreadId()) {
     // No need to suspend ourself to build stacktrace.
     ScopedObjectAccess soa(env);
-    jobject internal_trace = self->CreateInternalStackTrace(soa);
+    jobject internal_trace = self->CreateInternalStackTrace<false>(soa);
     trace = Thread::InternalStackTraceToStackTraceElementArray(soa, internal_trace);
   } else {
     // Suspend thread to build stack trace.
@@ -59,7 +59,7 @@
     if (thread != nullptr) {
       {
         ScopedObjectAccess soa(env);
-        jobject internal_trace = thread->CreateInternalStackTrace(soa);
+        jobject internal_trace = thread->CreateInternalStackTrace<false>(soa);
         trace = Thread::InternalStackTraceToStackTraceElementArray(soa, internal_trace);
       }
       // Restart suspended thread.
diff --git a/runtime/oat.cc b/runtime/oat.cc
index f970789..246e090 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '1', '9', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '2', '0', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/object_utils.h b/runtime/object_utils.h
index 63801d3..072f074 100644
--- a/runtime/object_utils.h
+++ b/runtime/object_utils.h
@@ -25,6 +25,7 @@
 #include "mirror/class.h"
 #include "mirror/dex_cache.h"
 #include "mirror/iftable.h"
+#include "mirror/proxy.h"
 #include "mirror/string.h"
 
 #include "runtime.h"
@@ -133,7 +134,9 @@
     } else if (klass_->IsArrayClass()) {
       return 2;
     } else if (klass_->IsProxyClass()) {
-      return klass_->GetIfTable()->Count();
+      mirror::SynthesizedProxyClass* proxyClass = reinterpret_cast<mirror::SynthesizedProxyClass*>(klass_);
+      mirror::ObjectArray<mirror::Class>* interfaces = proxyClass->GetInterfaces();
+      return interfaces != nullptr ? interfaces->GetLength() : 0;
     } else {
       const DexFile::TypeList* interfaces = GetInterfaceTypeList();
       if (interfaces == nullptr) {
@@ -164,7 +167,10 @@
         return GetClassLinker()->FindSystemClass(Thread::Current(), "Ljava/io/Serializable;");
       }
     } else if (klass_->IsProxyClass()) {
-      return klass_->GetIfTable()->GetInterface(idx);
+      mirror::SynthesizedProxyClass* proxyClass = reinterpret_cast<mirror::SynthesizedProxyClass*>(klass_);
+      mirror::ObjectArray<mirror::Class>* interfaces = proxyClass->GetInterfaces();
+      DCHECK(interfaces != nullptr);
+      return interfaces->Get(idx);
     } else {
       uint16_t type_idx = GetDirectInterfaceTypeIdx(idx);
       mirror::Class* interface = GetDexCache()->GetResolvedType(type_idx);
diff --git a/runtime/offsets.h b/runtime/offsets.h
index ed4e49e..72a6b0f 100644
--- a/runtime/offsets.h
+++ b/runtime/offsets.h
@@ -50,6 +50,7 @@
 };
 
 // Offsets relative to the current running thread.
+template<size_t pointer_size>
 class ThreadOffset : public Offset {
  public:
   explicit ThreadOffset(size_t val) : Offset(val) {}
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index e2086f1..08a674f 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -196,6 +196,8 @@
   profile_backoff_coefficient_ = 2.0;
   profile_clock_source_ = kDefaultProfilerClockSource;
 
+  verify_ = true;
+
   // Default to explicit checks.  Switch off with -implicit-checks:.
   // or setprop dalvik.vm.implicit_checks check1,check2,...
 #ifdef HAVE_ANDROID_OS
@@ -569,6 +571,16 @@
         return false;
       }
       image_compiler_options_.push_back(options[i].first);
+    } else if (StartsWith(option, "-Xverify:")) {
+      std::string verify_mode = option.substr(strlen("-Xverify:"));
+      if (verify_mode == "none") {
+        verify_ = false;
+      } else if (verify_mode == "remote" || verify_mode == "all") {
+        verify_ = true;
+      } else {
+        Usage("Unknown -Xverify option %s", verify_mode.c_str());
+        return false;
+      }
     } else if (StartsWith(option, "-ea:") ||
                StartsWith(option, "-da:") ||
                StartsWith(option, "-enableassertions:") ||
@@ -578,7 +590,6 @@
                (option == "-dsa") ||
                (option == "-enablesystemassertions") ||
                (option == "-disablesystemassertions") ||
-               StartsWith(option, "-Xverify:") ||
                (option == "-Xrs") ||
                StartsWith(option, "-Xint:") ||
                StartsWith(option, "-Xdexopt:") ||
diff --git a/runtime/parsed_options.h b/runtime/parsed_options.h
index d6516a8..416bc78 100644
--- a/runtime/parsed_options.h
+++ b/runtime/parsed_options.h
@@ -80,6 +80,7 @@
   uint32_t profile_interval_us_;
   double profile_backoff_coefficient_;
   ProfilerClockSource profile_clock_source_;
+  bool verify_;
 
   static constexpr uint32_t kExplicitNullCheck = 1;
   static constexpr uint32_t kExplicitSuspendCheck = 2;
diff --git a/runtime/proxy_test.cc b/runtime/proxy_test.cc
index 6453cb4..eebfba8 100644
--- a/runtime/proxy_test.cc
+++ b/runtime/proxy_test.cc
@@ -107,7 +107,8 @@
 TEST_F(ProxyTest, ProxyClassHelper) {
   ScopedObjectAccess soa(Thread::Current());
   jobject jclass_loader = LoadDex("Interfaces");
-  SirtRef<mirror::ClassLoader> class_loader(soa.Self(), soa.Decode<mirror::ClassLoader*>(jclass_loader));
+  SirtRef<mirror::ClassLoader> class_loader(soa.Self(),
+                                            soa.Decode<mirror::ClassLoader*>(jclass_loader));
 
   mirror::Class* I = class_linker_->FindClass(soa.Self(), "LInterfaces$I;", class_loader);
   mirror::Class* J = class_linker_->FindClass(soa.Self(), "LInterfaces$J;", class_loader);
@@ -120,20 +121,66 @@
   mirror::Class* proxyClass = GenerateProxyClass(soa, jclass_loader, "$Proxy1234", interfaces);
   ASSERT_TRUE(proxyClass != nullptr);
   ASSERT_TRUE(proxyClass->IsProxyClass());
-
-  mirror::Class* javaIoSerializable = class_linker_->FindSystemClass(soa.Self(), "Ljava/io/Serializable;");
-  ASSERT_TRUE(javaIoSerializable != nullptr);
+  ASSERT_TRUE(proxyClass->IsInitialized());
 
   // Check ClassHelper for proxy.
   ClassHelper kh(proxyClass);
-  EXPECT_EQ(kh.NumDirectInterfaces(), 3U);  // java.io.Serializable, Interfaces$I and Interfaces$J.
-  EXPECT_EQ(javaIoSerializable, kh.GetDirectInterface(0));
-  EXPECT_EQ(I, kh.GetDirectInterface(1));
-  EXPECT_EQ(J, kh.GetDirectInterface(2));
+  EXPECT_EQ(kh.NumDirectInterfaces(), 2U);  // Interfaces$I and Interfaces$J.
+  EXPECT_EQ(I, kh.GetDirectInterface(0));
+  EXPECT_EQ(J, kh.GetDirectInterface(1));
   std::string proxyClassDescriptor(kh.GetDescriptor());
   EXPECT_EQ("L$Proxy1234;", proxyClassDescriptor);
-//  EXPECT_EQ(nullptr, kh.GetSourceFile());
 }
 
+// Creates a proxy class and checks that FieldHelper works correctly.
+TEST_F(ProxyTest, ProxyFieldHelper) {
+  ScopedObjectAccess soa(Thread::Current());
+  jobject jclass_loader = LoadDex("Interfaces");
+  SirtRef<mirror::ClassLoader> class_loader(soa.Self(),
+                                            soa.Decode<mirror::ClassLoader*>(jclass_loader));
+
+  mirror::Class* I = class_linker_->FindClass(soa.Self(), "LInterfaces$I;", class_loader);
+  mirror::Class* J = class_linker_->FindClass(soa.Self(), "LInterfaces$J;", class_loader);
+  ASSERT_TRUE(I != nullptr);
+  ASSERT_TRUE(J != nullptr);
+  std::vector<mirror::Class*> interfaces;
+  interfaces.push_back(I);
+  interfaces.push_back(J);
+
+  mirror::Class* proxyClass = GenerateProxyClass(soa, jclass_loader, "$Proxy1234", interfaces);
+  ASSERT_TRUE(proxyClass != nullptr);
+  ASSERT_TRUE(proxyClass->IsProxyClass());
+  ASSERT_TRUE(proxyClass->IsInitialized());
+
+  mirror::ObjectArray<mirror::ArtField>* instance_fields = proxyClass->GetIFields();
+  EXPECT_TRUE(instance_fields == nullptr);  // No instance fields are expected on the proxy.
+
+  mirror::ObjectArray<mirror::ArtField>* static_fields = proxyClass->GetSFields();
+  ASSERT_TRUE(static_fields != nullptr);
+  ASSERT_EQ(2, static_fields->GetLength());  // "interfaces" and "throws", checked below.
+
+  mirror::Class* interfacesFieldClass = class_linker_->FindSystemClass(soa.Self(),
+                                                                       "[Ljava/lang/Class;");
+  ASSERT_TRUE(interfacesFieldClass != nullptr);
+  mirror::Class* throwsFieldClass = class_linker_->FindSystemClass(soa.Self(),
+                                                                   "[[Ljava/lang/Class;");
+  ASSERT_TRUE(throwsFieldClass != nullptr);
+
+  // Test "Class[] interfaces" field.
+  FieldHelper fh(static_fields->Get(0));
+  EXPECT_EQ("interfaces", std::string(fh.GetName()));
+  EXPECT_EQ("[Ljava/lang/Class;", std::string(fh.GetTypeDescriptor()));
+  EXPECT_EQ(interfacesFieldClass, fh.GetType());
+  EXPECT_EQ("L$Proxy1234;", std::string(fh.GetDeclaringClassDescriptor()));
+  EXPECT_FALSE(fh.IsPrimitiveType());
+
+  // Test "Class[][] throws" field.
+  fh.ChangeField(static_fields->Get(1));
+  EXPECT_EQ("throws", std::string(fh.GetName()));
+  EXPECT_EQ("[[Ljava/lang/Class;", std::string(fh.GetTypeDescriptor()));
+  EXPECT_EQ(throwsFieldClass, fh.GetType());
+  EXPECT_EQ("L$Proxy1234;", std::string(fh.GetDeclaringClassDescriptor()));
+  EXPECT_FALSE(fh.IsPrimitiveType());
+}
 
 }  // namespace art
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
new file mode 100644
index 0000000..ba0d830
--- /dev/null
+++ b/runtime/read_barrier.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_READ_BARRIER_H_
+#define ART_RUNTIME_READ_BARRIER_H_
+
+// This is in a separate file (from globals.h) because asm_support.h
+// (a C header, not C++) can't include globals.h.
+
+// Uncomment exactly one of the following two defines, together with the two fields in
+// Object.java (libcore), to enable Baker or Brooks pointers.
+
+// #define USE_BAKER_READ_BARRIER
+// #define USE_BROOKS_READ_BARRIER
+
+#if defined(USE_BAKER_READ_BARRIER) || defined(USE_BROOKS_READ_BARRIER)
+#define USE_BAKER_OR_BROOKS_READ_BARRIER
+#endif
+
+#if defined(USE_BAKER_READ_BARRIER) && defined(USE_BROOKS_READ_BARRIER)
+#error "Only one of Baker or Brooks can be enabled at a time."
+#endif
+
+#endif  // ART_RUNTIME_READ_BARRIER_H_
diff --git a/runtime/reflection.cc b/runtime/reflection.cc
index 6ed61f6..f0ba003 100644
--- a/runtime/reflection.cc
+++ b/runtime/reflection.cc
@@ -799,8 +799,7 @@
   visitor.WalkStack();
   mirror::Class* caller_class = visitor.caller->GetDeclaringClass();
 
-  if ((((access_flags & kAccPublic) != 0) && declaring_class->IsPublic()) ||
-      caller_class == declaring_class) {
+  if (((access_flags & kAccPublic) != 0) || (caller_class == declaring_class)) {
     return true;
   }
   if ((access_flags & kAccPrivate) != 0) {
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a8da2f8..1b3c996 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -22,6 +22,7 @@
 
 #include <signal.h>
 #include <sys/syscall.h>
+#include <valgrind.h>
 
 #include <cstdio>
 #include <cstdlib>
@@ -77,6 +78,9 @@
 
 namespace art {
 
+static constexpr bool kEnableJavaStackTraceHandler = true;
+const char* Runtime::kDefaultInstructionSetFeatures =
+    STRINGIFY(ART_DEFAULT_INSTRUCTION_SET_FEATURES);
 Runtime* Runtime::instance_ = NULL;
 
 Runtime::Runtime()
@@ -111,6 +115,7 @@
       exit_(nullptr),
       abort_(nullptr),
       stats_enabled_(false),
+      running_on_valgrind_(RUNNING_ON_VALGRIND > 0),
       profile_(false),
       profile_period_s_(0),
       profile_duration_s_(0),
@@ -124,10 +129,11 @@
       system_thread_group_(nullptr),
       system_class_loader_(nullptr),
       dump_gc_performance_on_shutdown_(false),
-      preinitialization_transaction(nullptr),
+      preinitialization_transaction_(nullptr),
       null_pointer_handler_(nullptr),
       suspend_handler_(nullptr),
-      stack_overflow_handler_(nullptr) {
+      stack_overflow_handler_(nullptr),
+      verify_(false) {
   for (int i = 0; i < Runtime::kLastCalleeSaveType; i++) {
     callee_save_methods_[i] = nullptr;
   }
@@ -516,6 +522,7 @@
   thread_list_ = new ThreadList;
   intern_table_ = new InternTable;
 
+  verify_ = options->verify_;
 
   if (options->interpreter_only_) {
     GetInstrumentation()->ForceInterpretOnly();
@@ -523,12 +530,11 @@
 
   if (options->explicit_checks_ != (ParsedOptions::kExplicitSuspendCheck |
         ParsedOptions::kExplicitNullCheck |
-        ParsedOptions::kExplicitStackOverflowCheck)) {
-    // Initialize the fault manager.
+        ParsedOptions::kExplicitStackOverflowCheck) || kEnableJavaStackTraceHandler) {
     fault_manager.Init();
 
-    // These need to be in a specific order.  The null point check must be
-    // the last in the list.
+    // These need to be in a specific order.  The null pointer check handler must be
+    // after the suspend check and stack overflow check handlers.
     if ((options->explicit_checks_ & ParsedOptions::kExplicitSuspendCheck) == 0) {
       suspend_handler_ = new SuspensionHandler(&fault_manager);
     }
@@ -540,6 +546,10 @@
     if ((options->explicit_checks_ & ParsedOptions::kExplicitNullCheck) == 0) {
       null_pointer_handler_ = new NullPointerHandler(&fault_manager);
     }
+
+    if (kEnableJavaStackTraceHandler) {
+      new JavaStackTraceHandler(&fault_manager);
+    }
   }
 
   heap_ = new gc::Heap(options->heap_initial_size_,
@@ -576,7 +586,7 @@
   // objects. We can't supply a thread group yet; it will be fixed later. Since we are the main
   // thread, we do not get a java peer.
   Thread* self = Thread::Attach("main", false, NULL, false);
-  CHECK_EQ(self->thin_lock_thread_id_, ThreadList::kMainThreadId);
+  CHECK_EQ(self->GetThreadId(), ThreadList::kMainThreadId);
   CHECK(self != NULL);
 
   // Set us to runnable so tools using a runtime can allocate and GC by default
@@ -702,7 +712,7 @@
   REGISTER(register_dalvik_system_VMDebug);
   REGISTER(register_dalvik_system_VMRuntime);
   REGISTER(register_dalvik_system_VMStack);
-  REGISTER(register_dalvik_system_Zygote);
+  REGISTER(register_dalvik_system_ZygoteHooks);
   REGISTER(register_java_lang_Class);
   REGISTER(register_java_lang_DexCache);
   REGISTER(register_java_lang_Object);
@@ -894,8 +904,8 @@
       verifier->VisitRoots(callback, arg);
     }
   }
-  if (preinitialization_transaction != nullptr) {
-    preinitialization_transaction->VisitRoots(callback, arg);
+  if (preinitialization_transaction_ != nullptr) {
+    preinitialization_transaction_->VisitRoots(callback, arg);
   }
   instrumentation_.VisitRoots(callback, arg);
 }
@@ -1141,73 +1151,68 @@
 }
 
 // Transaction support.
-// TODO move them to header file for inlining.
-bool Runtime::IsActiveTransaction() const {
-  return preinitialization_transaction != nullptr;
-}
-
 void Runtime::EnterTransactionMode(Transaction* transaction) {
   DCHECK(IsCompiler());
   DCHECK(transaction != nullptr);
   DCHECK(!IsActiveTransaction());
-  preinitialization_transaction = transaction;
+  preinitialization_transaction_ = transaction;
 }
 
 void Runtime::ExitTransactionMode() {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction = nullptr;
+  preinitialization_transaction_ = nullptr;
 }
 
 void Runtime::RecordWriteField32(mirror::Object* obj, MemberOffset field_offset,
                                  uint32_t value, bool is_volatile) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWriteField32(obj, field_offset, value, is_volatile);
+  preinitialization_transaction_->RecordWriteField32(obj, field_offset, value, is_volatile);
 }
 
 void Runtime::RecordWriteField64(mirror::Object* obj, MemberOffset field_offset,
                                  uint64_t value, bool is_volatile) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWriteField64(obj, field_offset, value, is_volatile);
+  preinitialization_transaction_->RecordWriteField64(obj, field_offset, value, is_volatile);
 }
 
 void Runtime::RecordWriteFieldReference(mirror::Object* obj, MemberOffset field_offset,
                                         mirror::Object* value, bool is_volatile) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWriteFieldReference(obj, field_offset, value, is_volatile);
+  preinitialization_transaction_->RecordWriteFieldReference(obj, field_offset, value, is_volatile);
 }
 
 void Runtime::RecordWriteArray(mirror::Array* array, size_t index, uint64_t value) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWriteArray(array, index, value);
+  preinitialization_transaction_->RecordWriteArray(array, index, value);
 }
 
 void Runtime::RecordStrongStringInsertion(mirror::String* s, uint32_t hash_code) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordStrongStringInsertion(s, hash_code);
+  preinitialization_transaction_->RecordStrongStringInsertion(s, hash_code);
 }
 
 void Runtime::RecordWeakStringInsertion(mirror::String* s, uint32_t hash_code) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWeakStringInsertion(s, hash_code);
+  preinitialization_transaction_->RecordWeakStringInsertion(s, hash_code);
 }
 
 void Runtime::RecordStrongStringRemoval(mirror::String* s, uint32_t hash_code) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordStrongStringRemoval(s, hash_code);
+  preinitialization_transaction_->RecordStrongStringRemoval(s, hash_code);
 }
 
 void Runtime::RecordWeakStringRemoval(mirror::String* s, uint32_t hash_code) const {
   DCHECK(IsCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction->RecordWeakStringRemoval(s, hash_code);
+  preinitialization_transaction_->RecordWeakStringRemoval(s, hash_code);
 }
 
 void Runtime::SetFaultMessage(const std::string& message) {
@@ -1215,6 +1220,59 @@
   fault_message_ = message;
 }
 
+void Runtime::AddCurrentRuntimeFeaturesAsDex2OatArguments(std::vector<std::string>* argv)
+    const {  // Appends dex2oat arguments describing this runtime's checks and target ISA.
+  argv->push_back("--runtime-arg");  // Precedes the -implicit-checks option pushed below.
+  std::string checkstr = "-implicit-checks";
+
+  int nchecks = 0;  // Number of check names appended to checkstr.
+  char checksep = ':';  // ':' before the first check name, ',' before later ones.
+
+  if (!ExplicitNullChecks()) {
+    checkstr += checksep;
+    checksep = ',';
+    checkstr += "null";
+    ++nchecks;
+  }
+  if (!ExplicitSuspendChecks()) {
+    checkstr += checksep;
+    checksep = ',';
+    checkstr += "suspend";
+    ++nchecks;
+  }
+
+  if (!ExplicitStackOverflowChecks()) {
+    checkstr += checksep;
+    checksep = ',';
+    checkstr += "stack";
+    ++nchecks;
+  }
+
+  if (nchecks == 0) {  // No implicit checks, i.e. all three checks are explicit.
+    checkstr += ":none";
+  }
+  argv->push_back(checkstr);
+
+  // Make the dex2oat instruction set match that of the launching runtime. If we have multiple
+  // architecture support, dex2oat may be compiled as a different instruction-set than that
+  // currently being executed.
+#if defined(__arm__)
+  argv->push_back("--instruction-set=arm");
+#elif defined(__aarch64__)
+  argv->push_back("--instruction-set=arm64");
+#elif defined(__i386__)
+  argv->push_back("--instruction-set=x86");
+#elif defined(__x86_64__)
+  argv->push_back("--instruction-set=x86_64");
+#elif defined(__mips__)
+  argv->push_back("--instruction-set=mips");
+#endif
+
+  std::string features("--instruction-set-features=");  // Emitted for every architecture.
+  features += GetDefaultInstructionSetFeatures();
+  argv->push_back(features);
+}
+
 void Runtime::UpdateProfilerState(int state) {
   LOG(DEBUG) << "Profiler state updated to " << state;
 }
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 50c88d3..7b3e04c 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -378,7 +378,9 @@
   void UpdateProfilerState(int state);
 
   // Transaction support.
-  bool IsActiveTransaction() const;
+  bool IsActiveTransaction() const {
+    return preinitialization_transaction_ != nullptr;
+  }
   void EnterTransactionMode(Transaction* transaction);
   void ExitTransactionMode();
   void RecordWriteField32(mirror::Object* obj, MemberOffset field_offset, uint32_t value,
@@ -405,6 +407,8 @@
     return fault_message_;
   }
 
+  void AddCurrentRuntimeFeaturesAsDex2OatArguments(std::vector<std::string>* arg_vector) const;
+
   bool ExplicitNullChecks() const {
     return null_pointer_handler_ == nullptr;
   }
@@ -417,6 +421,18 @@
     return stack_overflow_handler_ == nullptr;
   }
 
+  bool IsVerificationEnabled() const {
+    return verify_;
+  }
+
+  bool RunningOnValgrind() const {
+    return running_on_valgrind_;
+  }
+
+  static const char* GetDefaultInstructionSetFeatures() {
+    return kDefaultInstructionSetFeatures;
+  }
+
  private:
   static void InitPlatformSignalHandlers();
 
@@ -433,15 +449,15 @@
   void StartDaemonThreads();
   void StartSignalCatcher();
 
-  // NOTE: these must match the gc::ProcessState values as they come directly
-  // from the framework.
-  static constexpr int kProfileForground = 0;
-  static constexpr int kProfileBackgrouud = 1;
-
-
   // A pointer to the active runtime or NULL.
   static Runtime* instance_;
 
+  static const char* kDefaultInstructionSetFeatures;
+
+  // NOTE: these must match the gc::ProcessState values as they come directly from the framework.
+  static constexpr int kProfileForground = 0;
+  static constexpr int kProfileBackgrouud = 1;
+
   mirror::ArtMethod* callee_save_methods_[kLastCalleeSaveType];
   mirror::Throwable* pre_allocated_OutOfMemoryError_;
   mirror::ArtMethod* resolution_method_;
@@ -517,6 +533,8 @@
   bool stats_enabled_;
   RuntimeStats stats_;
 
+  const bool running_on_valgrind_;
+
   // Runtime profile support.
   bool profile_;
   std::string profile_output_filename_;
@@ -544,11 +562,14 @@
   bool dump_gc_performance_on_shutdown_;
 
   // Transaction used for pre-initializing classes at compilation time.
-  Transaction* preinitialization_transaction;
+  Transaction* preinitialization_transaction_;
   NullPointerHandler* null_pointer_handler_;
   SuspensionHandler* suspend_handler_;
   StackOverflowHandler* stack_overflow_handler_;
 
+  // If false, verification is disabled. Copied from ParsedOptions::verify_ (true by default).
+  bool verify_;
+
   DISALLOW_COPY_AND_ASSIGN(Runtime);
 };
 
diff --git a/runtime/runtime_stats.h b/runtime/runtime_stats.h
index 05d3fbb..6ed7fd5 100644
--- a/runtime/runtime_stats.h
+++ b/runtime/runtime_stats.h
@@ -89,20 +89,20 @@
   }
 
   // Number of objects allocated.
-  int allocated_objects;
+  uint64_t allocated_objects;
   // Cumulative size of all objects allocated.
-  int allocated_bytes;
+  uint64_t allocated_bytes;
 
   // Number of objects freed.
-  int freed_objects;
+  uint64_t freed_objects;
   // Cumulative size of all freed objects.
-  int freed_bytes;
+  uint64_t freed_bytes;
 
   // Number of times an allocation triggered a GC.
-  int gc_for_alloc_count;
+  uint64_t gc_for_alloc_count;
 
   // Number of initialized classes.
-  int class_init_count;
+  uint64_t class_init_count;
   // Cumulative time spent in class initialization.
   uint64_t class_init_time_ns;
 
diff --git a/runtime/stack_indirect_reference_table.h b/runtime/stack_indirect_reference_table.h
index e6dda85..daef3ff 100644
--- a/runtime/stack_indirect_reference_table.h
+++ b/runtime/stack_indirect_reference_table.h
@@ -98,14 +98,19 @@
             && (sirt_entry <= (&references_[number_of_references_ - 1])));
   }
 
+  // Offset of link within SIRT, used by generated code
+  static size_t LinkOffset() {
+    return OFFSETOF_MEMBER(StackIndirectReferenceTable, link_);
+  }
+
   // Offset of length within SIRT, used by generated code
   static uint32_t NumberOfReferencesOffset() {
     return OFFSETOF_MEMBER(StackIndirectReferenceTable, number_of_references_);
   }
 
   // Offset of link within SIRT, used by generated code
-  static size_t LinkOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, link_);
+  static size_t ReferencesOffset() {
+    return OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
   }
 
  private:
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 66077f9..fc886d5 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -51,8 +51,8 @@
   DCHECK_NE(new_state, kRunnable);
   DCHECK_EQ(this, Thread::Current());
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
-  state_and_flags_.as_struct.state = new_state;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
+  tls32_.state_and_flags.as_struct.state = new_state;
   return static_cast<ThreadState>(old_state_and_flags.as_struct.state);
 }
 
@@ -60,7 +60,7 @@
 #ifdef NDEBUG
   UNUSED(check_locks);  // Keep GCC happy about unused parameters.
 #else
-  CHECK_EQ(0u, no_thread_suspension_) << last_no_thread_suspension_cause_;
+  CHECK_EQ(0u, tls32_.no_thread_suspension) << tlsPtr_.last_no_thread_suspension_cause;
   if (check_locks) {
     bool bad_mutexes_held = false;
     for (int i = kLockLevelCount - 1; i >= 0; --i) {
@@ -88,7 +88,7 @@
   union StateAndFlags old_state_and_flags;
   union StateAndFlags new_state_and_flags;
   while (true) {
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0)) {
       RunCheckpointFunction();
       continue;
@@ -98,7 +98,7 @@
     new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
     int status = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                       &state_and_flags_.as_int);
+                                       &tls32_.state_and_flags.as_int);
     if (LIKELY(status == 0)) {
       break;
     }
@@ -110,22 +110,22 @@
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
   bool done = false;
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   int16_t old_state = old_state_and_flags.as_struct.state;
   DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
   do {
     Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0)) {
       // Wait while our suspend count is non-zero.
       MutexLock mu(this, *Locks::thread_suspend_count_lock_);
-      old_state_and_flags.as_int = state_and_flags_.as_int;
+      old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
       DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
         // Re-check when Thread::resume_cond_ is notified.
         Thread::resume_cond_->Wait(this);
-        old_state_and_flags.as_int = state_and_flags_.as_int;
+        old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
         DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       }
       DCHECK_EQ(GetSuspendCount(), 0);
@@ -133,7 +133,7 @@
     // Re-acquire shared mutator_lock_ access.
     Locks::mutator_lock_->SharedLock(this);
     // Atomically change from suspended to runnable if no suspend request pending.
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (LIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) == 0)) {
       union StateAndFlags new_state_and_flags;
@@ -141,7 +141,7 @@
       new_state_and_flags.as_struct.state = kRunnable;
       // CAS the value without a memory barrier, that occurred in the lock above.
       done = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                &state_and_flags_.as_int) == 0;
+                                &tls32_.state_and_flags.as_int) == 0;
     }
     if (UNLIKELY(!done)) {
       // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
@@ -161,26 +161,27 @@
 }
 
 inline size_t Thread::TlabSize() const {
-  return thread_local_end_ - thread_local_pos_;
+  return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
 }
 
 inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   DCHECK_GE(TlabSize(), bytes);
-  ++thread_local_objects_;
-  mirror::Object* ret = reinterpret_cast<mirror::Object*>(thread_local_pos_);
-  thread_local_pos_ += bytes;
+  ++tlsPtr_.thread_local_objects;
+  mirror::Object* ret = reinterpret_cast<mirror::Object*>(tlsPtr_.thread_local_pos);
+  tlsPtr_.thread_local_pos += bytes;
   return ret;
 }
 
 inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
-  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
-  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+  DCHECK_LE(tlsPtr_.thread_local_alloc_stack_top, tlsPtr_.thread_local_alloc_stack_end);
+  if (tlsPtr_.thread_local_alloc_stack_top < tlsPtr_.thread_local_alloc_stack_end) {
     // There's room.
-    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
-              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
-    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
-    *thread_local_alloc_stack_top_ = obj;
-    ++thread_local_alloc_stack_top_;
+    DCHECK_LE(reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_top) +
+                  sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_end));
+    DCHECK(*tlsPtr_.thread_local_alloc_stack_top == nullptr);
+    *tlsPtr_.thread_local_alloc_stack_top = obj;
+    ++tlsPtr_.thread_local_alloc_stack_top;
     return true;
   }
   return false;
@@ -193,8 +194,8 @@
   DCHECK_ALIGNED(start, sizeof(mirror::Object*));
   DCHECK_ALIGNED(end, sizeof(mirror::Object*));
   DCHECK_LT(start, end);
-  thread_local_alloc_stack_end_ = end;
-  thread_local_alloc_stack_top_ = start;
+  tlsPtr_.thread_local_alloc_stack_end = end;
+  tlsPtr_.thread_local_alloc_stack_top = start;
 }
 
 inline void Thread::RevokeThreadLocalAllocationStack() {
@@ -204,8 +205,8 @@
     DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
         << GetState() << " thread " << this << " self " << self;
   }
-  thread_local_alloc_stack_end_ = nullptr;
-  thread_local_alloc_stack_top_ = nullptr;
+  tlsPtr_.thread_local_alloc_stack_end = nullptr;
+  tlsPtr_.thread_local_alloc_stack_top = nullptr;
 }
 
 }  // namespace art
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3692b9f..29d011c 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -79,57 +79,49 @@
 static const char* kThreadNameDuringStartup = "<native thread without managed peer>";
 
 void Thread::InitCardTable() {
-  card_table_ = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
+  tlsPtr_.card_table = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
 }
 
-#if !defined(__APPLE__)
 static void UnimplementedEntryPoint() {
   UNIMPLEMENTED(FATAL);
 }
-#endif
 
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      PortableEntryPoints* ppoints, QuickEntryPoints* qpoints);
 
 void Thread::InitTlsEntryPoints() {
-#if !defined(__APPLE__)  // The Mac GCC is too old to accept this code.
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
-  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(quick_entrypoints_));
+  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.interpreter_entrypoints);
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) +
+                                                sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
-  begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(portable_entrypoints_));
-  for (uintptr_t* it = begin; it != end; ++it) {
-    *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
-  }
-#endif
-  InitEntryPoints(&interpreter_entrypoints_, &jni_entrypoints_, &portable_entrypoints_,
-                  &quick_entrypoints_);
+  InitEntryPoints(&tlsPtr_.interpreter_entrypoints, &tlsPtr_.jni_entrypoints,
+                  &tlsPtr_.portable_entrypoints, &tlsPtr_.quick_entrypoints);
 }
 
 void Thread::ResetQuickAllocEntryPointsForThread() {
-  ResetQuickAllocEntryPoints(&quick_entrypoints_);
+  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
 void Thread::SetDeoptimizationShadowFrame(ShadowFrame* sf) {
-  deoptimization_shadow_frame_ = sf;
+  tlsPtr_.deoptimization_shadow_frame = sf;
 }
 
 void Thread::SetDeoptimizationReturnValue(const JValue& ret_val) {
-  deoptimization_return_value_.SetJ(ret_val.GetJ());
+  tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
 }
 
 ShadowFrame* Thread::GetAndClearDeoptimizationShadowFrame(JValue* ret_val) {
-  ShadowFrame* sf = deoptimization_shadow_frame_;
-  deoptimization_shadow_frame_ = nullptr;
-  ret_val->SetJ(deoptimization_return_value_.GetJ());
+  ShadowFrame* sf = tlsPtr_.deoptimization_shadow_frame;
+  tlsPtr_.deoptimization_shadow_frame = nullptr;
+  ret_val->SetJ(tls64_.deoptimization_return_value.GetJ());
   return sf;
 }
 
 void Thread::InitTid() {
-  tid_ = ::art::GetTid();
+  tls32_.tid = ::art::GetTid();
 }
 
 void Thread::InitAfterFork() {
@@ -159,10 +151,10 @@
     ScopedObjectAccess soa(self);
 
     // Copy peer into self, deleting global reference when done.
-    CHECK(self->jpeer_ != nullptr);
-    self->opeer_ = soa.Decode<mirror::Object*>(self->jpeer_);
-    self->GetJniEnv()->DeleteGlobalRef(self->jpeer_);
-    self->jpeer_ = nullptr;
+    CHECK(self->tlsPtr_.jpeer != nullptr);
+    self->tlsPtr_.opeer = soa.Decode<mirror::Object*>(self->tlsPtr_.jpeer);
+    self->GetJniEnv()->DeleteGlobalRef(self->tlsPtr_.jpeer);
+    self->tlsPtr_.jpeer = nullptr;
 
     {
       SirtRef<mirror::String> thread_name(self, self->GetThreadName(soa));
@@ -171,7 +163,7 @@
     Dbg::PostThreadStart(self);
 
     // Invoke the 'run' method of our java.lang.Thread.
-    mirror::Object* receiver = self->opeer_;
+    mirror::Object* receiver = self->tlsPtr_.opeer;
     jmethodID mid = WellKnownClasses::java_lang_Thread_run;
     InvokeVirtualOrInterfaceWithJValues(soa, receiver, mid, nullptr);
   }
@@ -237,7 +229,7 @@
 // is the StackOverflow reserved region used when creating the StackOverflow
 // exception.
 void Thread::InstallImplicitProtection(bool is_main_stack) {
-  byte* pregion = stack_end_;
+  byte* pregion = tlsPtr_.stack_end;
 
   constexpr uint32_t kMarker = 0xdadadada;
   uintptr_t *marker = reinterpret_cast<uintptr_t*>(pregion);
@@ -288,7 +280,7 @@
 
   Thread* child_thread = new Thread(is_daemon);
   // Use global JNI ref to hold peer live while child thread starts.
-  child_thread->jpeer_ = env->NewGlobalRef(java_peer);
+  child_thread->tlsPtr_.jpeer = env->NewGlobalRef(java_peer);
   stack_size = FixStackSize(stack_size);
 
   // Thread.start is synchronized, so we know that nativePeer is 0, and know that we're not racing to
@@ -311,8 +303,8 @@
       runtime->EndThreadBirth();
     }
     // Manually delete the global reference since Thread::Init will not have been run.
-    env->DeleteGlobalRef(child_thread->jpeer_);
-    child_thread->jpeer_ = nullptr;
+    env->DeleteGlobalRef(child_thread->tlsPtr_.jpeer);
+    child_thread->tlsPtr_.jpeer = nullptr;
     delete child_thread;
     child_thread = nullptr;
     // TODO: remove from thread group?
@@ -340,15 +332,15 @@
   InitTid();
   // Set pthread_self_ ahead of pthread_setspecific, that makes Thread::Current function, this
   // avoids pthread_self_ ever being invalid when discovered from Thread::Current().
-  pthread_self_ = pthread_self();
+  tlsPtr_.pthread_self = pthread_self();
   CHECK(is_started_);
   CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, this), "attach self");
   DCHECK_EQ(Thread::Current(), this);
 
-  thin_lock_thread_id_ = thread_list->AllocThreadId(this);
+  tls32_.thin_lock_thread_id = thread_list->AllocThreadId(this);
   InitStackHwm();
 
-  jni_env_ = new JNIEnvExt(this, java_vm);
+  tlsPtr_.jni_env = new JNIEnvExt(this, java_vm);
   thread_list->Register(this);
 }
 
@@ -385,7 +377,7 @@
   } else {
     // These aren't necessary, but they improve diagnostics for unit tests & command-line tools.
     if (thread_name != nullptr) {
-      self->name_->assign(thread_name);
+      self->tlsPtr_.name->assign(thread_name);
       ::art::SetThreadName(thread_name);
     }
   }
@@ -396,7 +388,7 @@
 void Thread::CreatePeer(const char* name, bool as_daemon, jobject thread_group) {
   Runtime* runtime = Runtime::Current();
   CHECK(runtime->IsStarted());
-  JNIEnv* env = jni_env_;
+  JNIEnv* env = tlsPtr_.jni_env;
 
   if (thread_group == nullptr) {
     thread_group = runtime->GetMainThreadGroup();
@@ -412,7 +404,7 @@
   }
   {
     ScopedObjectAccess soa(this);
-    opeer_ = soa.Decode<mirror::Object*>(peer.get());
+    tlsPtr_.opeer = soa.Decode<mirror::Object*>(peer.get());
   }
   env->CallNonvirtualVoidMethod(peer.get(),
                                 WellKnownClasses::java_lang_Thread,
@@ -422,8 +414,8 @@
 
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
-  jni_env_->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
-                         reinterpret_cast<jlong>(self));
+  env->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
+                    reinterpret_cast<jlong>(self));
 
   ScopedObjectAccess soa(self);
   SirtRef<mirror::String> peer_thread_name(soa.Self(), GetThreadName(soa));
@@ -449,34 +441,36 @@
 void Thread::InitPeer(ScopedObjectAccess& soa, jboolean thread_is_daemon, jobject thread_group,
                       jobject thread_name, jint thread_priority) {
   soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->
-      SetBoolean<kTransactionActive>(opeer_, thread_is_daemon);
+      SetBoolean<kTransactionActive>(tlsPtr_.opeer, thread_is_daemon);
   soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_group));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_group));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_name)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_name));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_name));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->
-      SetInt<kTransactionActive>(opeer_, thread_priority);
+      SetInt<kTransactionActive>(tlsPtr_.opeer, thread_priority);
 }
 
 void Thread::SetThreadName(const char* name) {
-  name_->assign(name);
+  tlsPtr_.name->assign(name);
   ::art::SetThreadName(name);
   Dbg::DdmSendThreadNotification(this, CHUNK_TYPE("THNM"));
 }
 
 void Thread::InitStackHwm() {
-  void* stack_base;
-  size_t stack_size;
-  GetThreadStack(pthread_self_, &stack_base, &stack_size);
+  void* read_stack_base;
+  size_t read_stack_size;
+  GetThreadStack(tlsPtr_.pthread_self, &read_stack_base, &read_stack_size);
 
   // TODO: include this in the thread dumps; potentially useful in SIGQUIT output?
-  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", stack_base, PrettySize(stack_size).c_str());
+  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", read_stack_base,
+                                PrettySize(read_stack_size).c_str());
 
-  stack_begin_ = reinterpret_cast<byte*>(stack_base);
-  stack_size_ = stack_size;
+  tlsPtr_.stack_begin = reinterpret_cast<byte*>(read_stack_base);
+  tlsPtr_.stack_size = read_stack_size;
 
-  if (stack_size_ <= kStackOverflowReservedBytes) {
-    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << stack_size_ << " bytes)";
+  if (read_stack_size <= kStackOverflowReservedBytes) {
+    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << read_stack_size
+        << " bytes)";
   }
 
   // TODO: move this into the Linux GetThreadStack implementation.
@@ -500,12 +494,12 @@
       CHECK_PTHREAD_CALL(pthread_attr_destroy, (&default_attributes), "default stack size query");
 
       // ...and use that as our limit.
-      size_t old_stack_size = stack_size_;
-      stack_size_ = default_stack_size;
-      stack_begin_ += (old_stack_size - stack_size_);
+      size_t old_stack_size = read_stack_size;
+      tlsPtr_.stack_size = default_stack_size;
+      tlsPtr_.stack_begin += (old_stack_size - default_stack_size);
       VLOG(threads) << "Limiting unlimited stack (reported as " << PrettySize(old_stack_size) << ")"
-                    << " to " << PrettySize(stack_size_)
-                    << " with base " << reinterpret_cast<void*>(stack_begin_);
+                    << " to " << PrettySize(default_stack_size)
+                    << " with base " << reinterpret_cast<void*>(tlsPtr_.stack_begin);
     }
   }
 #endif
@@ -521,16 +515,16 @@
       // to install our own region so we need to move the limits
       // of the stack to make room for it.
       constexpr uint32_t kDelta = 16 * KB;
-      stack_begin_ += kDelta;
-      stack_end_ += kDelta;
-      stack_size_ -= kDelta;
+      tlsPtr_.stack_begin += kDelta;
+      tlsPtr_.stack_end += kDelta;
+      tlsPtr_.stack_size -= kDelta;
     }
     InstallImplicitProtection(is_main_thread);
   }
 
   // Sanity check.
   int stack_variable;
-  CHECK_GT(&stack_variable, reinterpret_cast<void*>(stack_end_));
+  CHECK_GT(&stack_variable, reinterpret_cast<void*>(tlsPtr_.stack_end));
 }
 
 void Thread::ShortDump(std::ostream& os) const {
@@ -542,8 +536,8 @@
   }
   os << GetState()
            << ",Thread*=" << this
-           << ",peer=" << opeer_
-           << ",\"" << *name_ << "\""
+           << ",peer=" << tlsPtr_.opeer
+           << ",\"" << *tlsPtr_.name << "\""
            << "]";
 }
 
@@ -554,17 +548,17 @@
 
 mirror::String* Thread::GetThreadName(const ScopedObjectAccessUnchecked& soa) const {
   mirror::ArtField* f = soa.DecodeField(WellKnownClasses::java_lang_Thread_name);
-  return (opeer_ != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(opeer_)) : nullptr;
+  return (tlsPtr_.opeer != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(tlsPtr_.opeer)) : nullptr;
 }
 
 void Thread::GetThreadName(std::string& name) const {
-  name.assign(*name_);
+  name.assign(*tlsPtr_.name);
 }
 
 uint64_t Thread::GetCpuMicroTime() const {
 #if defined(HAVE_POSIX_CLOCKS)
   clockid_t cpu_clock_id;
-  pthread_getcpuclockid(pthread_self_, &cpu_clock_id);
+  pthread_getcpuclockid(tlsPtr_.pthread_self, &cpu_clock_id);
   timespec now;
   clock_gettime(cpu_clock_id, &now);
   return static_cast<uint64_t>(now.tv_sec) * UINT64_C(1000000) + now.tv_nsec / UINT64_C(1000);
@@ -575,11 +569,11 @@
 }
 
 void Thread::AtomicSetFlag(ThreadFlag flag) {
-  android_atomic_or(flag, &state_and_flags_.as_int);
+  android_atomic_or(flag, &tls32_.state_and_flags.as_int);
 }
 
 void Thread::AtomicClearFlag(ThreadFlag flag) {
-  android_atomic_and(-1 ^ flag, &state_and_flags_.as_int);
+  android_atomic_and(-1 ^ flag, &tls32_.state_and_flags.as_int);
 }
 
 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
@@ -604,24 +598,26 @@
 }
 
 void Thread::ModifySuspendCount(Thread* self, int delta, bool for_debugger) {
-  DCHECK(delta == -1 || delta == +1 || delta == -debug_suspend_count_)
-      << delta << " " << debug_suspend_count_ << " " << this;
-  DCHECK_GE(suspend_count_, debug_suspend_count_) << this;
-  Locks::thread_suspend_count_lock_->AssertHeld(self);
-  if (this != self && !IsSuspended()) {
-    Locks::thread_list_lock_->AssertHeld(self);
+  if (kIsDebugBuild) {
+    DCHECK(delta == -1 || delta == +1 || delta == -tls32_.debug_suspend_count)
+          << delta << " " << tls32_.debug_suspend_count << " " << this;
+    DCHECK_GE(tls32_.suspend_count, tls32_.debug_suspend_count) << this;
+    Locks::thread_suspend_count_lock_->AssertHeld(self);
+    if (this != self && !IsSuspended()) {
+      Locks::thread_list_lock_->AssertHeld(self);
+    }
   }
-  if (UNLIKELY(delta < 0 && suspend_count_ <= 0)) {
+  if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {
     UnsafeLogFatalForSuspendCount(self, this);
     return;
   }
 
-  suspend_count_ += delta;
+  tls32_.suspend_count += delta;
   if (for_debugger) {
-    debug_suspend_count_ += delta;
+    tls32_.debug_suspend_count += delta;
   }
 
-  if (suspend_count_ == 0) {
+  if (tls32_.suspend_count == 0) {
     AtomicClearFlag(kSuspendRequest);
   } else {
     AtomicSetFlag(kSuspendRequest);
@@ -639,8 +635,8 @@
   {
     MutexLock mu(this, *Locks::thread_suspend_count_lock_);
     for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-      checkpoints[i] = checkpoint_functions_[i];
-      checkpoint_functions_[i] = nullptr;
+      checkpoints[i] = tlsPtr_.checkpoint_functions[i];
+      tlsPtr_.checkpoint_functions[i] = nullptr;
     }
     AtomicClearFlag(kCheckpointRequest);
   }
@@ -661,14 +657,14 @@
 
 bool Thread::RequestCheckpoint(Closure* function) {
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   if (old_state_and_flags.as_struct.state != kRunnable) {
     return false;  // Fail, thread is suspended and so can't run a checkpoint.
   }
 
   uint32_t available_checkpoint = kMaxCheckpoints;
   for (uint32_t i = 0 ; i < kMaxCheckpoints; ++i) {
-    if (checkpoint_functions_[i] == nullptr) {
+    if (tlsPtr_.checkpoint_functions[i] == nullptr) {
       available_checkpoint = i;
       break;
     }
@@ -677,7 +673,7 @@
     // No checkpoint functions available, we can't run a checkpoint
     return false;
   }
-  checkpoint_functions_[available_checkpoint] = function;
+  tlsPtr_.checkpoint_functions[available_checkpoint] = function;
 
   // Checkpoint function installed now install flag bit.
   // We must be runnable to request a checkpoint.
@@ -686,11 +682,11 @@
   new_state_and_flags.as_int = old_state_and_flags.as_int;
   new_state_and_flags.as_struct.flags |= kCheckpointRequest;
   int succeeded = android_atomic_acquire_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                         &state_and_flags_.as_int);
+                                             &tls32_.state_and_flags.as_int);
   if (UNLIKELY(succeeded != 0)) {
     // The thread changed state before the checkpoint was installed.
-    CHECK_EQ(checkpoint_functions_[available_checkpoint], function);
-    checkpoint_functions_[available_checkpoint] = nullptr;
+    CHECK_EQ(tlsPtr_.checkpoint_functions[available_checkpoint], function);
+    tlsPtr_.checkpoint_functions[available_checkpoint] = nullptr;
   } else {
     CHECK_EQ(ReadFlag(kCheckpointRequest), true);
     TriggerSuspend();
@@ -715,13 +711,15 @@
   bool is_daemon = false;
   Thread* self = Thread::Current();
 
-  if (self != nullptr && thread != nullptr && thread->opeer_ != nullptr) {
+  if (self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
     ScopedObjectAccessUnchecked soa(self);
-    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->GetInt(thread->opeer_);
-    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->GetBoolean(thread->opeer_);
+    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)
+        ->GetInt(thread->tlsPtr_.opeer);
+    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)
+        ->GetBoolean(thread->tlsPtr_.opeer);
 
     mirror::Object* thread_group =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->tlsPtr_.opeer);
 
     if (thread_group != nullptr) {
       mirror::ArtField* group_name_field =
@@ -740,7 +738,7 @@
   }
 
   if (thread != nullptr) {
-    os << '"' << *thread->name_ << '"';
+    os << '"' << *thread->tlsPtr_.name << '"';
     if (is_daemon) {
       os << " daemon";
     }
@@ -760,9 +758,9 @@
   if (thread != nullptr) {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     os << "  | group=\"" << group_name << "\""
-       << " sCount=" << thread->suspend_count_
-       << " dsCount=" << thread->debug_suspend_count_
-       << " obj=" << reinterpret_cast<void*>(thread->opeer_)
+       << " sCount=" << thread->tls32_.suspend_count
+       << " dsCount=" << thread->tls32_.debug_suspend_count
+       << " obj=" << reinterpret_cast<void*>(thread->tlsPtr_.opeer)
        << " self=" << reinterpret_cast<const void*>(thread) << "\n";
   }
 
@@ -772,9 +770,10 @@
   if (thread != nullptr) {
     int policy;
     sched_param sp;
-    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->pthread_self_, &policy, &sp), __FUNCTION__);
+    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->tlsPtr_.pthread_self, &policy, &sp),
+                       __FUNCTION__);
     os << " sched=" << policy << "/" << sp.sched_priority
-       << " handle=" << reinterpret_cast<void*>(thread->pthread_self_);
+       << " handle=" << reinterpret_cast<void*>(thread->tlsPtr_.pthread_self);
   }
   os << "\n";
 
@@ -799,8 +798,9 @@
      << " core=" << task_cpu
      << " HZ=" << sysconf(_SC_CLK_TCK) << "\n";
   if (thread != nullptr) {
-    os << "  | stack=" << reinterpret_cast<void*>(thread->stack_begin_) << "-" << reinterpret_cast<void*>(thread->stack_end_)
-       << " stackSize=" << PrettySize(thread->stack_size_) << "\n";
+    os << "  | stack=" << reinterpret_cast<void*>(thread->tlsPtr_.stack_begin) << "-"
+        << reinterpret_cast<void*>(thread->tlsPtr_.stack_end) << " stackSize="
+        << PrettySize(thread->tlsPtr_.stack_size) << "\n";
   }
 }
 
@@ -832,7 +832,7 @@
     int line_number = -1;
     if (dex_cache != nullptr) {  // be tolerant of bad input
       const DexFile& dex_file = *dex_cache->GetDexFile();
-      line_number = dex_file.GetLineNumFromPC(m, GetDexPc());
+      line_number = dex_file.GetLineNumFromPC(m, GetDexPc(false));
     }
     if (line_number == last_line_number && last_method == m) {
       ++repetition_count;
@@ -870,7 +870,21 @@
   static void DumpLockedObject(mirror::Object* o, void* context)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     std::ostream& os = *reinterpret_cast<std::ostream*>(context);
-    os << "  - locked <" << o << "> (a " << PrettyTypeOf(o) << ")\n";
+    os << "  - locked ";
+    if (o == nullptr) {
+      os << "an unknown object";
+    } else {
+      if ((o->GetLockWord().GetState() == LockWord::kThinLocked) &&
+          Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
+        // Getting the identity hashcode here would result in lock inflation and suspension of the
+        // current thread, which isn't safe if this is the only runnable thread.
+        os << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)", reinterpret_cast<uintptr_t>(o),
+                           PrettyTypeOf(o).c_str());
+      } else {
+        os << StringPrintf("<0x%08x> (a %s)", o->IdentityHashCode(), PrettyTypeOf(o).c_str());
+      }
+    }
+    os << "\n";
   }
 
   std::ostream& os;
@@ -905,6 +919,13 @@
   return current_method != nullptr && current_method->IsNative();
 }
 
+void Thread::DumpJavaStack(std::ostream& os) const {
+  UniquePtr<Context> context(Context::Create());
+  StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
+                          !tls32_.throwing_OutOfMemoryError);
+  dumper.WalkStack();
+}
+
 void Thread::DumpStack(std::ostream& os) const {
   // TODO: we call this code when dying but may not have suspended the thread ourself. The
   //       IsSuspended check is therefore racy with the use for dumping (normally we inhibit
@@ -918,9 +939,7 @@
       SirtRef<mirror::ArtMethod> method_ref(Thread::Current(), GetCurrentMethod(nullptr));
       DumpNativeStack(os, GetTid(), "  native: ", false, method_ref.get());
     }
-    UniquePtr<Context> context(Context::Create());
-    StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(), !throwing_OutOfMemoryError_);
-    dumper.WalkStack();
+    DumpJavaStack(os);
   } else {
     os << "Not able to dump stack of thread that isn't suspended";
   }
@@ -928,11 +947,12 @@
 
 void Thread::ThreadExitCallback(void* arg) {
   Thread* self = reinterpret_cast<Thread*>(arg);
-  if (self->thread_exit_check_count_ == 0) {
-    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's going to use a pthread_key_create destructor?): " << *self;
+  if (self->tls32_.thread_exit_check_count == 0) {
+    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's "
+        "going to use a pthread_key_create destructor?): " << *self;
     CHECK(is_started_);
     CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, self), "reattach self");
-    self->thread_exit_check_count_ = 1;
+    self->tls32_.thread_exit_check_count = 1;
   } else {
     LOG(FATAL) << "Native thread exited without calling DetachCurrentThread: " << *self;
   }
@@ -984,58 +1004,21 @@
   }
 }
 
-Thread::Thread(bool daemon)
-    : suspend_count_(0),
-      card_table_(nullptr),
-      exception_(nullptr),
-      stack_end_(nullptr),
-      managed_stack_(),
-      jni_env_(nullptr),
-      self_(nullptr),
-      opeer_(nullptr),
-      jpeer_(nullptr),
-      stack_begin_(nullptr),
-      stack_size_(0),
-      thin_lock_thread_id_(0),
-      stack_trace_sample_(nullptr),
-      trace_clock_base_(0),
-      tid_(0),
-      wait_mutex_(new Mutex("a thread wait mutex")),
-      wait_cond_(new ConditionVariable("a thread wait condition variable", *wait_mutex_)),
-      wait_monitor_(nullptr),
-      interrupted_(false),
-      wait_next_(nullptr),
-      monitor_enter_object_(nullptr),
-      top_sirt_(nullptr),
-      runtime_(nullptr),
-      class_loader_override_(nullptr),
-      long_jump_context_(nullptr),
-      throwing_OutOfMemoryError_(false),
-      debug_suspend_count_(0),
-      debug_invoke_req_(new DebugInvokeReq),
-      single_step_control_(new SingleStepControl),
-      deoptimization_shadow_frame_(nullptr),
-      instrumentation_stack_(new std::deque<instrumentation::InstrumentationStackFrame>),
-      name_(new std::string(kThreadNameDuringStartup)),
-      daemon_(daemon),
-      pthread_self_(0),
-      no_thread_suspension_(0),
-      last_no_thread_suspension_cause_(nullptr),
-      suspend_trigger_(reinterpret_cast<uintptr_t*>(&suspend_trigger_)),
-      thread_exit_check_count_(0),
-      thread_local_start_(nullptr),
-      thread_local_pos_(nullptr),
-      thread_local_end_(nullptr),
-      thread_local_objects_(0),
-      thread_local_alloc_stack_top_(nullptr),
-      thread_local_alloc_stack_end_(nullptr) {
+Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), interrupted_(false) {
+  wait_mutex_ = new Mutex("a thread wait mutex");
+  wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
+  tlsPtr_.debug_invoke_req = new DebugInvokeReq;
+  tlsPtr_.single_step_control = new SingleStepControl;
+  tlsPtr_.instrumentation_stack = new std::deque<instrumentation::InstrumentationStackFrame>;
+  tlsPtr_.name = new std::string(kThreadNameDuringStartup);
+
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
-  state_and_flags_.as_struct.flags = 0;
-  state_and_flags_.as_struct.state = kNative;
-  memset(&held_mutexes_[0], 0, sizeof(held_mutexes_));
-  memset(rosalloc_runs_, 0, sizeof(rosalloc_runs_));
+  tls32_.state_and_flags.as_struct.flags = 0;
+  tls32_.state_and_flags.as_struct.state = kNative;
+  memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
+  memset(tlsPtr_.rosalloc_runs, 0, sizeof(tlsPtr_.rosalloc_runs));
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-    checkpoint_functions_[i] = nullptr;
+    tlsPtr_.checkpoint_functions[i] = nullptr;
   }
 }
 
@@ -1046,7 +1029,8 @@
   // assigned fairly early on, and needs to be.
   // It turns out that the last thing to change is the thread name; that's a good proxy for "has
   // this thread _ever_ entered kRunnable".
-  return (jpeer_ == nullptr && opeer_ == nullptr) || (*name_ == kThreadNameDuringStartup);
+  return (tlsPtr_.jpeer == nullptr && tlsPtr_.opeer == nullptr) ||
+      (*tlsPtr_.name == kThreadNameDuringStartup);
 }
 
 void Thread::AssertNoPendingException() const {
@@ -1084,7 +1068,7 @@
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
 
-  if (opeer_ != nullptr) {
+  if (tlsPtr_.opeer != nullptr) {
     ScopedObjectAccess soa(self);
     // We may need to call user-supplied managed code, do this before final clean-up.
     HandleUncaughtExceptions(soa);
@@ -1092,16 +1076,18 @@
 
     // this.nativePeer = 0;
     if (Runtime::Current()->IsActiveTransaction()) {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<true>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<true>(tlsPtr_.opeer, 0);
     } else {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<false>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<false>(tlsPtr_.opeer, 0);
     }
     Dbg::PostThreadDeath(self);
 
     // Thread.join() is implemented as an Object.wait() on the Thread.lock object. Signal anyone
     // who is waiting.
     mirror::Object* lock =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(tlsPtr_.opeer);
     // (This conditional is only needed for tests, where Thread.lock won't have been set.)
     if (lock != nullptr) {
       SirtRef<mirror::Object> sirt_obj(self, lock);
@@ -1111,29 +1097,29 @@
   }
 
   // On thread detach, all monitors entered with JNI MonitorEnter are automatically exited.
-  if (jni_env_ != nullptr) {
-    jni_env_->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
+  if (tlsPtr_.jni_env != nullptr) {
+    tlsPtr_.jni_env->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
   }
 }
 
 Thread::~Thread() {
-  if (jni_env_ != nullptr && jpeer_ != nullptr) {
+  if (tlsPtr_.jni_env != nullptr && tlsPtr_.jpeer != nullptr) {
     // If pthread_create fails we don't have a jni env here.
-    jni_env_->DeleteGlobalRef(jpeer_);
-    jpeer_ = nullptr;
+    tlsPtr_.jni_env->DeleteGlobalRef(tlsPtr_.jpeer);
+    tlsPtr_.jpeer = nullptr;
   }
-  opeer_ = nullptr;
+  tlsPtr_.opeer = nullptr;
 
-  bool initialized = (jni_env_ != nullptr);  // Did Thread::Init run?
+  bool initialized = (tlsPtr_.jni_env != nullptr);  // Did Thread::Init run?
   if (initialized) {
-    delete jni_env_;
-    jni_env_ = nullptr;
+    delete tlsPtr_.jni_env;
+    tlsPtr_.jni_env = nullptr;
   }
   CHECK_NE(GetState(), kRunnable);
   CHECK_NE(ReadFlag(kCheckpointRequest), true);
-  CHECK(checkpoint_functions_[0] == nullptr);
-  CHECK(checkpoint_functions_[1] == nullptr);
-  CHECK(checkpoint_functions_[2] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[0] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[1] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[2] == nullptr);
 
   // We may be deleting a still born thread.
   SetStateUnsafe(kTerminated);
@@ -1141,19 +1127,19 @@
   delete wait_cond_;
   delete wait_mutex_;
 
-  if (long_jump_context_ != nullptr) {
-    delete long_jump_context_;
+  if (tlsPtr_.long_jump_context != nullptr) {
+    delete tlsPtr_.long_jump_context;
   }
 
   if (initialized) {
     CleanupCpu();
   }
 
-  delete debug_invoke_req_;
-  delete single_step_control_;
-  delete instrumentation_stack_;
-  delete name_;
-  delete stack_trace_sample_;
+  delete tlsPtr_.debug_invoke_req;
+  delete tlsPtr_.single_step_control;
+  delete tlsPtr_.instrumentation_stack;
+  delete tlsPtr_.name;
+  delete tlsPtr_.stack_trace_sample;
 
   Runtime::Current()->GetHeap()->RevokeThreadLocalBuffers(this);
 
@@ -1164,47 +1150,50 @@
   if (!IsExceptionPending()) {
     return;
   }
-  ScopedLocalRef<jobject> peer(jni_env_, soa.AddLocalReference<jobject>(opeer_));
+  ScopedLocalRef<jobject> peer(tlsPtr_.jni_env, soa.AddLocalReference<jobject>(tlsPtr_.opeer));
   ScopedThreadStateChange tsc(this, kNative);
 
   // Get and clear the exception.
-  ScopedLocalRef<jthrowable> exception(jni_env_, jni_env_->ExceptionOccurred());
-  jni_env_->ExceptionClear();
+  ScopedLocalRef<jthrowable> exception(tlsPtr_.jni_env, tlsPtr_.jni_env->ExceptionOccurred());
+  tlsPtr_.jni_env->ExceptionClear();
 
   // If the thread has its own handler, use that.
-  ScopedLocalRef<jobject> handler(jni_env_,
-                                  jni_env_->GetObjectField(peer.get(),
-                                                           WellKnownClasses::java_lang_Thread_uncaughtHandler));
+  ScopedLocalRef<jobject> handler(tlsPtr_.jni_env,
+                                  tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                      WellKnownClasses::java_lang_Thread_uncaughtHandler));
   if (handler.get() == nullptr) {
     // Otherwise use the thread group's default handler.
-    handler.reset(jni_env_->GetObjectField(peer.get(), WellKnownClasses::java_lang_Thread_group));
+    handler.reset(tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                                  WellKnownClasses::java_lang_Thread_group));
   }
 
   // Call the handler.
-  jni_env_->CallVoidMethod(handler.get(),
-                           WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
-                           peer.get(), exception.get());
+  tlsPtr_.jni_env->CallVoidMethod(handler.get(),
+      WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
+      peer.get(), exception.get());
 
   // If the handler threw, clear that exception too.
-  jni_env_->ExceptionClear();
+  tlsPtr_.jni_env->ExceptionClear();
 }
 
 void Thread::RemoveFromThreadGroup(ScopedObjectAccess& soa) {
   // this.group.removeThread(this);
   // group can be null if we're in the compiler or a test.
-  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(opeer_);
+  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)
+      ->GetObject(tlsPtr_.opeer);
   if (ogroup != nullptr) {
     ScopedLocalRef<jobject> group(soa.Env(), soa.AddLocalReference<jobject>(ogroup));
-    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(opeer_));
+    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(tlsPtr_.opeer));
     ScopedThreadStateChange tsc(soa.Self(), kNative);
-    jni_env_->CallVoidMethod(group.get(), WellKnownClasses::java_lang_ThreadGroup_removeThread,
-                             peer.get());
+    tlsPtr_.jni_env->CallVoidMethod(group.get(),
+                                    WellKnownClasses::java_lang_ThreadGroup_removeThread,
+                                    peer.get());
   }
 }
 
 size_t Thread::NumSirtReferences() {
   size_t count = 0;
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     count += cur->NumberOfReferences();
   }
   return count;
@@ -1213,17 +1202,17 @@
 bool Thread::SirtContains(jobject obj) const {
   StackReference<mirror::Object>* sirt_entry =
       reinterpret_cast<StackReference<mirror::Object>*>(obj);
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     if (cur->Contains(sirt_entry)) {
       return true;
     }
   }
   // JNI code invoked from portable code uses shadow frames rather than the SIRT.
-  return managed_stack_.ShadowFramesContain(sirt_entry);
+  return tlsPtr_.managed_stack.ShadowFramesContain(sirt_entry);
 }
 
 void Thread::SirtVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id) {
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     size_t num_refs = cur->NumberOfReferences();
     for (size_t j = 0; j < num_refs; ++j) {
       mirror::Object* object = cur->GetReference(j);
@@ -1248,7 +1237,7 @@
   mirror::Object* result;
   // The "kinds" below are sorted by the frequency we expect to encounter them.
   if (kind == kLocal) {
-    IndirectReferenceTable& locals = jni_env_->locals;
+    IndirectReferenceTable& locals = tlsPtr_.jni_env->locals;
     result = locals.Get(ref);
   } else if (kind == kSirtOrInvalid) {
     // TODO: make stack indirect reference table lookup more efficient.
@@ -1287,19 +1276,18 @@
 // Implements java.lang.Thread.interrupted.
 bool Thread::Interrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  bool interrupted = interrupted_;
-  interrupted_ = false;
+  bool interrupted = IsInterruptedLocked();
+  SetInterruptedLocked(false);
   return interrupted;
 }
 
 // Implements java.lang.Thread.isInterrupted.
 bool Thread::IsInterrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  return interrupted_;
+  return IsInterruptedLocked();
 }
 
-void Thread::Interrupt() {
-  Thread* self = Thread::Current();
+void Thread::Interrupt(Thread* self) {
   MutexLock mu(self, *wait_mutex_);
   if (interrupted_) {
     return;
@@ -1360,6 +1348,7 @@
   bool skipping_;
 };
 
+template<bool kTransactionActive>
 class BuildInternalStackTraceVisitor : public StackVisitor {
  public:
   explicit BuildInternalStackTraceVisitor(Thread* self, Thread* thread, int skip_depth)
@@ -1383,7 +1372,7 @@
     // Save PC trace in last element of method trace, also places it into the
     // object graph.
     // We are called from native: use non-transactional mode.
-    method_trace->Set<false>(depth, dex_pc_trace);
+    method_trace->Set<kTransactionActive>(depth, dex_pc_trace);
     // Set the Object*s and assert that no thread suspension is now possible.
     const char* last_no_suspend_cause =
         self_->StartAssertNoThreadSuspension("Building internal stack trace");
@@ -1411,14 +1400,8 @@
     if (m->IsRuntimeMethod()) {
       return true;  // Ignore runtime frames (in particular callee save).
     }
-    // TODO dedup this code.
-    if (Runtime::Current()->IsActiveTransaction()) {
-      method_trace_->Set<true>(count_, m);
-      dex_pc_trace_->Set<true>(count_, m->IsProxyMethod() ? DexFile::kDexNoIndex : GetDexPc());
-    } else {
-      method_trace_->Set<false>(count_, m);
-      dex_pc_trace_->Set<false>(count_, m->IsProxyMethod() ? DexFile::kDexNoIndex : GetDexPc());
-    }
+    method_trace_->Set<kTransactionActive>(count_, m);
+    dex_pc_trace_->Set<kTransactionActive>(count_, m->IsProxyMethod() ? DexFile::kDexNoIndex : GetDexPc());
     ++count_;
     return true;
   }
@@ -1439,6 +1422,7 @@
   mirror::ObjectArray<mirror::Object>* method_trace_;
 };
 
+template<bool kTransactionActive>
 jobject Thread::CreateInternalStackTrace(const ScopedObjectAccessUnchecked& soa) const {
   // Compute depth of stack
   CountStackDepthVisitor count_visitor(const_cast<Thread*>(this));
@@ -1447,8 +1431,9 @@
   int32_t skip_depth = count_visitor.GetSkipDepth();
 
   // Build internal stack trace.
-  BuildInternalStackTraceVisitor build_trace_visitor(soa.Self(), const_cast<Thread*>(this),
-                                                     skip_depth);
+  BuildInternalStackTraceVisitor<kTransactionActive> build_trace_visitor(soa.Self(),
+                                                                         const_cast<Thread*>(this),
+                                                                         skip_depth);
   if (!build_trace_visitor.Init(depth)) {
     return nullptr;  // Allocation failed.
   }
@@ -1461,6 +1446,8 @@
   }
   return soa.AddLocalReference<jobjectArray>(trace);
 }
+template jobject Thread::CreateInternalStackTrace<false>(const ScopedObjectAccessUnchecked& soa) const;
+template jobject Thread::CreateInternalStackTrace<true>(const ScopedObjectAccessUnchecked& soa) const;
 
 jobjectArray Thread::InternalStackTraceToStackTraceElementArray(const ScopedObjectAccess& soa,
     jobject internal, jobjectArray output_array, int* stack_depth) {
@@ -1677,12 +1664,12 @@
 
 void Thread::ThrowOutOfMemoryError(const char* msg) {
   LOG(ERROR) << StringPrintf("Throwing OutOfMemoryError \"%s\"%s",
-      msg, (throwing_OutOfMemoryError_ ? " (recursive case)" : ""));
+      msg, (tls32_.throwing_OutOfMemoryError ? " (recursive case)" : ""));
   ThrowLocation throw_location = GetCurrentLocationForThrow();
-  if (!throwing_OutOfMemoryError_) {
-    throwing_OutOfMemoryError_ = true;
+  if (!tls32_.throwing_OutOfMemoryError) {
+    tls32_.throwing_OutOfMemoryError = true;
     ThrowNewException(throw_location, "Ljava/lang/OutOfMemoryError;", msg);
-    throwing_OutOfMemoryError_ = false;
+    tls32_.throwing_OutOfMemoryError = false;
   } else {
     Dump(LOG(ERROR));  // The pre-allocated OOME has no stack, so help out and log one.
     SetException(throw_location, Runtime::Current()->GetPreAllocatedOutOfMemoryError());
@@ -1705,140 +1692,146 @@
 #endif
 }
 
-struct EntryPointInfo {
-  uint32_t offset;
-  const char* name;
-};
-#define INTERPRETER_ENTRY_POINT_INFO(x) { INTERPRETER_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define JNI_ENTRY_POINT_INFO(x)         { JNI_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define PORTABLE_ENTRY_POINT_INFO(x)    { PORTABLE_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define QUICK_ENTRY_POINT_INFO(x)       { QUICK_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-static const EntryPointInfo gThreadEntryPointInfo[] = {
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge),
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge),
-  JNI_ENTRY_POINT_INFO(pDlsymLookup),
-  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pAllocArray),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pAllocObject),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial),
-  QUICK_ENTRY_POINT_INFO(pCheckCast),
-  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage),
-  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess),
-  QUICK_ENTRY_POINT_INFO(pInitializeType),
-  QUICK_ENTRY_POINT_INFO(pResolveString),
-  QUICK_ENTRY_POINT_INFO(pSet32Instance),
-  QUICK_ENTRY_POINT_INFO(pSet32Static),
-  QUICK_ENTRY_POINT_INFO(pSet64Instance),
-  QUICK_ENTRY_POINT_INFO(pSet64Static),
-  QUICK_ENTRY_POINT_INFO(pSetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pSetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pGet32Instance),
-  QUICK_ENTRY_POINT_INFO(pGet32Static),
-  QUICK_ENTRY_POINT_INFO(pGet64Instance),
-  QUICK_ENTRY_POINT_INFO(pGet64Static),
-  QUICK_ENTRY_POINT_INFO(pGetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pGetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObject),
-  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStart),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEnd),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized),
-  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline),
-  QUICK_ENTRY_POINT_INFO(pLockObject),
-  QUICK_ENTRY_POINT_INFO(pUnlockObject),
-  QUICK_ENTRY_POINT_INFO(pCmpgDouble),
-  QUICK_ENTRY_POINT_INFO(pCmpgFloat),
-  QUICK_ENTRY_POINT_INFO(pCmplDouble),
-  QUICK_ENTRY_POINT_INFO(pCmplFloat),
-  QUICK_ENTRY_POINT_INFO(pFmod),
-  QUICK_ENTRY_POINT_INFO(pSqrt),
-  QUICK_ENTRY_POINT_INFO(pL2d),
-  QUICK_ENTRY_POINT_INFO(pFmodf),
-  QUICK_ENTRY_POINT_INFO(pL2f),
-  QUICK_ENTRY_POINT_INFO(pD2iz),
-  QUICK_ENTRY_POINT_INFO(pF2iz),
-  QUICK_ENTRY_POINT_INFO(pIdivmod),
-  QUICK_ENTRY_POINT_INFO(pD2l),
-  QUICK_ENTRY_POINT_INFO(pF2l),
-  QUICK_ENTRY_POINT_INFO(pLdiv),
-  QUICK_ENTRY_POINT_INFO(pLmod),
-  QUICK_ENTRY_POINT_INFO(pLmul),
-  QUICK_ENTRY_POINT_INFO(pShlLong),
-  QUICK_ENTRY_POINT_INFO(pShrLong),
-  QUICK_ENTRY_POINT_INFO(pUshrLong),
-  QUICK_ENTRY_POINT_INFO(pIndexOf),
-  QUICK_ENTRY_POINT_INFO(pMemcmp16),
-  QUICK_ENTRY_POINT_INFO(pStringCompareTo),
-  QUICK_ENTRY_POINT_INFO(pMemcpy),
-  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckSuspend),
-  QUICK_ENTRY_POINT_INFO(pTestSuspend),
-  QUICK_ENTRY_POINT_INFO(pDeliverException),
-  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds),
-  QUICK_ENTRY_POINT_INFO(pThrowDivZero),
-  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod),
-  QUICK_ENTRY_POINT_INFO(pThrowNullPointer),
-  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow),
-};
-#undef QUICK_ENTRY_POINT_INFO
+// Explicitly instantiate 32 and 64bit thread offset dumping support.
+template void Thread::DumpThreadOffset<4>(std::ostream& os, uint32_t offset);
+template void Thread::DumpThreadOffset<8>(std::ostream& os, uint32_t offset);
 
-void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers) {
-  CHECK_EQ(size_of_pointers, 4U);  // TODO: support 64-bit targets.
-
-#define DO_THREAD_OFFSET(x) \
-    if (offset == static_cast<uint32_t>(OFFSETOF_VOLATILE_MEMBER(Thread, x))) { \
-      os << # x; \
+template<size_t ptr_size>
+void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset) {
+#define DO_THREAD_OFFSET(x, y) \
+    if (offset == x.Uint32Value()) { \
+      os << y; \
       return; \
     }
-  DO_THREAD_OFFSET(state_and_flags_);
-  DO_THREAD_OFFSET(card_table_);
-  DO_THREAD_OFFSET(exception_);
-  DO_THREAD_OFFSET(opeer_);
-  DO_THREAD_OFFSET(jni_env_);
-  DO_THREAD_OFFSET(self_);
-  DO_THREAD_OFFSET(stack_end_);
-  DO_THREAD_OFFSET(suspend_count_);
-  DO_THREAD_OFFSET(thin_lock_thread_id_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_pc_);
-  DO_THREAD_OFFSET(top_sirt_);
-  DO_THREAD_OFFSET(suspend_trigger_);
+  DO_THREAD_OFFSET(ThreadFlagsOffset<ptr_size>(), "state_and_flags")
+  DO_THREAD_OFFSET(CardTableOffset<ptr_size>(), "card_table")
+  DO_THREAD_OFFSET(ExceptionOffset<ptr_size>(), "exception")
+  DO_THREAD_OFFSET(PeerOffset<ptr_size>(), "peer");
+  DO_THREAD_OFFSET(JniEnvOffset<ptr_size>(), "jni_env")
+  DO_THREAD_OFFSET(SelfOffset<ptr_size>(), "self")
+  DO_THREAD_OFFSET(StackEndOffset<ptr_size>(), "stack_end")
+  DO_THREAD_OFFSET(ThinLockIdOffset<ptr_size>(), "thin_lock_thread_id")
+  DO_THREAD_OFFSET(TopOfManagedStackOffset<ptr_size>(), "top_quick_frame_method")
+  DO_THREAD_OFFSET(TopOfManagedStackPcOffset<ptr_size>(), "top_quick_frame_pc")
+  DO_THREAD_OFFSET(TopShadowFrameOffset<ptr_size>(), "top_shadow_frame")
+  DO_THREAD_OFFSET(TopSirtOffset<ptr_size>(), "top_sirt")
+  DO_THREAD_OFFSET(ThreadSuspendTriggerOffset<ptr_size>(), "suspend_trigger")
 #undef DO_THREAD_OFFSET
 
-  size_t entry_point_count = arraysize(gThreadEntryPointInfo);
-  CHECK_EQ(entry_point_count * size_of_pointers,
-           sizeof(InterpreterEntryPoints) + sizeof(JniEntryPoints) + sizeof(PortableEntryPoints) +
-           sizeof(QuickEntryPoints));
-  uint32_t expected_offset = OFFSETOF_MEMBER(Thread, interpreter_entrypoints_);
-  for (size_t i = 0; i < entry_point_count; ++i) {
-    CHECK_EQ(gThreadEntryPointInfo[i].offset, expected_offset) << gThreadEntryPointInfo[i].name;
-    expected_offset += size_of_pointers;
-    if (gThreadEntryPointInfo[i].offset == offset) {
-      os << gThreadEntryPointInfo[i].name;
-      return;
+#define INTERPRETER_ENTRY_POINT_INFO(x) \
+    if (INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
     }
-  }
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge)
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge)
+#undef INTERPRETER_ENTRY_POINT_INFO
+
+#define JNI_ENTRY_POINT_INFO(x) \
+    if (JNI_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  JNI_ENTRY_POINT_INFO(pDlsymLookup)
+#undef JNI_ENTRY_POINT_INFO
+
+#define PORTABLE_ENTRY_POINT_INFO(x) \
+    if (PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge)
+#undef PORTABLE_ENTRY_POINT_INFO
+
+#define QUICK_ENTRY_POINT_INFO(x) \
+    if (QUICK_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  QUICK_ENTRY_POINT_INFO(pAllocArray)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pAllocObject)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial)
+  QUICK_ENTRY_POINT_INFO(pCheckCast)
+  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage)
+  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess)
+  QUICK_ENTRY_POINT_INFO(pInitializeType)
+  QUICK_ENTRY_POINT_INFO(pResolveString)
+  QUICK_ENTRY_POINT_INFO(pSet32Instance)
+  QUICK_ENTRY_POINT_INFO(pSet32Static)
+  QUICK_ENTRY_POINT_INFO(pSet64Instance)
+  QUICK_ENTRY_POINT_INFO(pSet64Static)
+  QUICK_ENTRY_POINT_INFO(pSetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pSetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pGet32Instance)
+  QUICK_ENTRY_POINT_INFO(pGet32Static)
+  QUICK_ENTRY_POINT_INFO(pGet64Instance)
+  QUICK_ENTRY_POINT_INFO(pGet64Static)
+  QUICK_ENTRY_POINT_INFO(pGetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pGetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObject)
+  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStart)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEnd)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized)
+  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline)
+  QUICK_ENTRY_POINT_INFO(pLockObject)
+  QUICK_ENTRY_POINT_INFO(pUnlockObject)
+  QUICK_ENTRY_POINT_INFO(pCmpgDouble)
+  QUICK_ENTRY_POINT_INFO(pCmpgFloat)
+  QUICK_ENTRY_POINT_INFO(pCmplDouble)
+  QUICK_ENTRY_POINT_INFO(pCmplFloat)
+  QUICK_ENTRY_POINT_INFO(pFmod)
+  QUICK_ENTRY_POINT_INFO(pSqrt)
+  QUICK_ENTRY_POINT_INFO(pL2d)
+  QUICK_ENTRY_POINT_INFO(pFmodf)
+  QUICK_ENTRY_POINT_INFO(pL2f)
+  QUICK_ENTRY_POINT_INFO(pD2iz)
+  QUICK_ENTRY_POINT_INFO(pF2iz)
+  QUICK_ENTRY_POINT_INFO(pIdivmod)
+  QUICK_ENTRY_POINT_INFO(pD2l)
+  QUICK_ENTRY_POINT_INFO(pF2l)
+  QUICK_ENTRY_POINT_INFO(pLdiv)
+  QUICK_ENTRY_POINT_INFO(pLmod)
+  QUICK_ENTRY_POINT_INFO(pLmul)
+  QUICK_ENTRY_POINT_INFO(pShlLong)
+  QUICK_ENTRY_POINT_INFO(pShrLong)
+  QUICK_ENTRY_POINT_INFO(pUshrLong)
+  QUICK_ENTRY_POINT_INFO(pIndexOf)
+  QUICK_ENTRY_POINT_INFO(pMemcmp16)
+  QUICK_ENTRY_POINT_INFO(pStringCompareTo)
+  QUICK_ENTRY_POINT_INFO(pMemcpy)
+  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge)
+  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckSuspend)
+  QUICK_ENTRY_POINT_INFO(pTestSuspend)
+  QUICK_ENTRY_POINT_INFO(pDeliverException)
+  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds)
+  QUICK_ENTRY_POINT_INFO(pThrowDivZero)
+  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod)
+  QUICK_ENTRY_POINT_INFO(pThrowNullPointer)
+  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow)
+#undef QUICK_ENTRY_POINT_INFO
+
   os << offset;
 }
 
@@ -1869,11 +1862,11 @@
 }
 
 Context* Thread::GetLongJumpContext() {
-  Context* result = long_jump_context_;
+  Context* result = tlsPtr_.long_jump_context;
   if (result == nullptr) {
     result = Context::Create();
   } else {
-    long_jump_context_ = nullptr;  // Avoid context being shared.
+    tlsPtr_.long_jump_context = nullptr;  // Avoid context being shared.
     result->Reset();
   }
   return result;
@@ -1918,11 +1911,11 @@
   return ThrowLocation(visitor.this_object_, visitor.method_, visitor.dex_pc_);
 }
 
-bool Thread::HoldsLock(mirror::Object* object) {
+bool Thread::HoldsLock(mirror::Object* object) const {
   if (object == nullptr) {
     return false;
   }
-  return object->GetLockOwnerThreadId() == thin_lock_thread_id_;
+  return object->GetLockOwnerThreadId() == GetThreadId();
 }
 
 // RootVisitor parameters are: (const Object* obj, size_t vreg, const StackVisitor* visitor).
@@ -2061,30 +2054,30 @@
 
 void Thread::SetClassLoaderOverride(mirror::ClassLoader* class_loader_override) {
   VerifyObject(class_loader_override);
-  class_loader_override_ = class_loader_override;
+  tlsPtr_.class_loader_override = class_loader_override;
 }
 
 void Thread::VisitRoots(RootCallback* visitor, void* arg) {
   uint32_t thread_id = GetThreadId();
-  if (opeer_ != nullptr) {
-    visitor(&opeer_, arg, thread_id, kRootThreadObject);
+  if (tlsPtr_.opeer != nullptr) {
+    visitor(&tlsPtr_.opeer, arg, thread_id, kRootThreadObject);
   }
-  if (exception_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&exception_), arg, thread_id, kRootNativeStack);
+  if (tlsPtr_.exception != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception), arg, thread_id, kRootNativeStack);
   }
-  throw_location_.VisitRoots(visitor, arg);
-  if (class_loader_override_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&class_loader_override_), arg, thread_id,
+  tlsPtr_.throw_location.VisitRoots(visitor, arg);
+  if (tlsPtr_.class_loader_override != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.class_loader_override), arg, thread_id,
             kRootNativeStack);
   }
-  jni_env_->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
-  jni_env_->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
+  tlsPtr_.jni_env->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
+  tlsPtr_.jni_env->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
   SirtVisitRoots(visitor, arg, thread_id);
-  if (debug_invoke_req_ != nullptr) {
-    debug_invoke_req_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.debug_invoke_req != nullptr) {
+    tlsPtr_.debug_invoke_req->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
-  if (single_step_control_ != nullptr) {
-    single_step_control_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.single_step_control != nullptr) {
+    tlsPtr_.single_step_control->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
@@ -2116,7 +2109,7 @@
 // Set the stack end to that to be used during a stack overflow
 void Thread::SetStackEndForStackOverflow() {
   // During stack overflow we allow use of the full stack.
-  if (stack_end_ == stack_begin_) {
+  if (tlsPtr_.stack_end == tlsPtr_.stack_begin) {
     // However, we seem to have already extended to use the full stack.
     LOG(ERROR) << "Need to increase kStackOverflowReservedBytes (currently "
                << kStackOverflowReservedBytes << ")?";
@@ -2124,23 +2117,23 @@
     LOG(FATAL) << "Recursive stack overflow.";
   }
 
-  stack_end_ = stack_begin_;
+  tlsPtr_.stack_end = tlsPtr_.stack_begin;
 }
 
 void Thread::SetTlab(byte* start, byte* end) {
   DCHECK_LE(start, end);
-  thread_local_start_ = start;
-  thread_local_pos_  = thread_local_start_;
-  thread_local_end_ = end;
-  thread_local_objects_ = 0;
+  tlsPtr_.thread_local_start = start;
+  tlsPtr_.thread_local_pos  = tlsPtr_.thread_local_start;
+  tlsPtr_.thread_local_end = end;
+  tlsPtr_.thread_local_objects = 0;
 }
 
 bool Thread::HasTlab() const {
-  bool has_tlab = thread_local_pos_ != nullptr;
+  bool has_tlab = tlsPtr_.thread_local_pos != nullptr;
   if (has_tlab) {
-    DCHECK(thread_local_start_ != nullptr && thread_local_end_ != nullptr);
+    DCHECK(tlsPtr_.thread_local_start != nullptr && tlsPtr_.thread_local_end != nullptr);
   } else {
-    DCHECK(thread_local_start_ == nullptr && thread_local_end_ == nullptr);
+    DCHECK(tlsPtr_.thread_local_start == nullptr && tlsPtr_.thread_local_end == nullptr);
   }
   return has_tlab;
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index 63d22c5..d25bbe9 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -92,7 +92,7 @@
   kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
 };
 
-class PACKED(4) Thread {
+class Thread {
  public:
   // Space to throw a StackOverflowError in.
   // TODO: shrink reserved space, in particular for 64bit.
@@ -145,7 +145,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Translates 172 to pAllocArrayFromCode and so on.
-  static void DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers);
+  template<size_t size_of_pointers>
+  static void DumpThreadOffset(std::ostream& os, uint32_t offset);
 
   // Dumps a one-line summary of thread state (used for operator<<).
   void ShortDump(std::ostream& os) const;
@@ -155,6 +156,10 @@
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void DumpJavaStack(std::ostream& os) const
+      LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Dumps the SIGQUIT per-thread header. 'thread' can be NULL for a non-attached thread, in which
   // case we use 'tid' to identify the thread, and we'll include as much information as we can.
   static void DumpState(std::ostream& os, const Thread* thread, pid_t tid)
@@ -162,32 +167,24 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ThreadState GetState() const {
-    DCHECK(state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended);
-    return static_cast<ThreadState>(state_and_flags_.as_struct.state);
-  }
-
-  // This function can be used to make sure a thread's state is valid.
-  void CheckState(int id) const {
-    if (state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended) {
-      return;
-    }
-    LOG(INFO) << "Thread " << this << " state is invalid: " << state_and_flags_.as_struct.state << " id=" << id;
-    CHECK(false);
+    DCHECK_GE(tls32_.state_and_flags.as_struct.state, kTerminated);
+    DCHECK_LE(tls32_.state_and_flags.as_struct.state, kSuspended);
+    return static_cast<ThreadState>(tls32_.state_and_flags.as_struct.state);
   }
 
   ThreadState SetState(ThreadState new_state);
 
   int GetSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return suspend_count_;
+    return tls32_.suspend_count;
   }
 
   int GetDebugSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return debug_suspend_count_;
+    return tls32_.debug_suspend_count;
   }
 
   bool IsSuspended() const {
     union StateAndFlags state_and_flags;
-    state_and_flags.as_int = state_and_flags_.as_int;
+    state_and_flags.as_int = tls32_.state_and_flags.as_int;
     return state_and_flags.as_struct.state != kRunnable &&
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
@@ -221,9 +218,9 @@
   const char* StartAssertNoThreadSuspension(const char* cause) {
     if (kIsDebugBuild) {
       CHECK(cause != NULL);
-      const char* previous_cause = last_no_thread_suspension_cause_;
-      no_thread_suspension_++;
-      last_no_thread_suspension_cause_ = cause;
+      const char* previous_cause = tlsPtr_.last_no_thread_suspension_cause;
+      tls32_.no_thread_suspension++;
+      tlsPtr_.last_no_thread_suspension_cause = cause;
       return previous_cause;
     } else {
       return nullptr;
@@ -233,20 +230,20 @@
   // End region where no thread suspension is expected.
   void EndAssertNoThreadSuspension(const char* old_cause) {
     if (kIsDebugBuild) {
-      CHECK(old_cause != NULL || no_thread_suspension_ == 1);
-      CHECK_GT(no_thread_suspension_, 0U);
-      no_thread_suspension_--;
-      last_no_thread_suspension_cause_ = old_cause;
+      CHECK(old_cause != nullptr || tls32_.no_thread_suspension == 1);
+      CHECK_GT(tls32_.no_thread_suspension, 0U);
+      tls32_.no_thread_suspension--;
+      tlsPtr_.last_no_thread_suspension_cause = old_cause;
     }
   }
 
   void AssertThreadSuspensionIsAllowable(bool check_locks = true) const;
 
   bool IsDaemon() const {
-    return daemon_;
+    return tls32_.daemon;
   }
 
-  bool HoldsLock(mirror::Object*) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool HoldsLock(mirror::Object*) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
    * Changes the priority of this thread to match that of the java.lang.Thread object.
@@ -265,11 +262,11 @@
   static int GetNativePriority();
 
   uint32_t GetThreadId() const {
-    return thin_lock_thread_id_;
+    return tls32_.thin_lock_thread_id;
   }
 
   pid_t GetTid() const {
-    return tid_;
+    return tls32_.tid;
   }
 
   // Returns the java.lang.Thread's name, or NULL if this Thread* doesn't have a peer.
@@ -287,30 +284,30 @@
   uint64_t GetCpuMicroTime() const;
 
   mirror::Object* GetPeer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CHECK(jpeer_ == NULL);
-    return opeer_;
+    CHECK(tlsPtr_.jpeer == nullptr);
+    return tlsPtr_.opeer;
   }
 
   bool HasPeer() const {
-    return jpeer_ != NULL || opeer_ != NULL;
+    return tlsPtr_.jpeer != nullptr || tlsPtr_.opeer != nullptr;
   }
 
   RuntimeStats* GetStats() {
-    return &stats_;
+    return &tls64_.stats;
   }
 
   bool IsStillStarting() const;
 
   bool IsExceptionPending() const {
-    return exception_ != NULL;
+    return tlsPtr_.exception != nullptr;
   }
 
   mirror::Throwable* GetException(ThrowLocation* throw_location) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (throw_location != NULL) {
-      *throw_location = throw_location_;
+    if (throw_location != nullptr) {
+      *throw_location = tlsPtr_.throw_location;
     }
-    return exception_;
+    return tlsPtr_.exception;
   }
 
   void AssertNoPendingException() const;
@@ -320,13 +317,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(new_exception != NULL);
     // TODO: DCHECK(!IsExceptionPending());
-    exception_ = new_exception;
-    throw_location_ = throw_location;
+    tlsPtr_.exception = new_exception;
+    tlsPtr_.throw_location = throw_location;
   }
 
   void ClearException() {
-    exception_ = NULL;
-    throw_location_.Clear();
+    tlsPtr_.exception = nullptr;
+    tlsPtr_.throw_location.Clear();
   }
 
   // Find catch block and perform long jump to appropriate exception handle
@@ -334,8 +331,8 @@
 
   Context* GetLongJumpContext();
   void ReleaseLongJumpContext(Context* context) {
-    DCHECK(long_jump_context_ == NULL);
-    long_jump_context_ = context;
+    DCHECK(tlsPtr_.long_jump_context == nullptr);
+    tlsPtr_.long_jump_context = context;
   }
 
   mirror::ArtMethod* GetCurrentMethod(uint32_t* dex_pc) const
@@ -344,16 +341,17 @@
   ThrowLocation GetCurrentLocationForThrow() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void SetTopOfStack(mirror::ArtMethod** top_method, uintptr_t pc) {
-    managed_stack_.SetTopQuickFrame(top_method);
-    managed_stack_.SetTopQuickFramePc(pc);
+    tlsPtr_.managed_stack.SetTopQuickFrame(top_method);
+    tlsPtr_.managed_stack.SetTopQuickFramePc(pc);
   }
 
   void SetTopOfShadowStack(ShadowFrame* top) {
-    managed_stack_.SetTopShadowFrame(top);
+    tlsPtr_.managed_stack.SetTopShadowFrame(top);
   }
 
   bool HasManagedStack() const {
-    return managed_stack_.GetTopQuickFrame() != NULL || managed_stack_.GetTopShadowFrame() != NULL;
+    return (tlsPtr_.managed_stack.GetTopQuickFrame() != nullptr) ||
+        (tlsPtr_.managed_stack.GetTopShadowFrame() != nullptr);
   }
 
   // If 'msg' is NULL, no detail message is set.
@@ -387,28 +385,73 @@
 
   // JNI methods
   JNIEnvExt* GetJniEnv() const {
-    return jni_env_;
+    return tlsPtr_.jni_env;
   }
 
   // Convert a jobject into a Object*
   mirror::Object* DecodeJObject(jobject obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  mirror::Object* GetMonitorEnterObject() const {
+    return tlsPtr_.monitor_enter_object;
+  }
+
+  void SetMonitorEnterObject(mirror::Object* obj) {
+    tlsPtr_.monitor_enter_object = obj;
+  }
+
   // Implements java.lang.Thread.interrupted.
-  bool Interrupted();
+  bool Interrupted() LOCKS_EXCLUDED(wait_mutex_);
   // Implements java.lang.Thread.isInterrupted.
-  bool IsInterrupted();
-  void Interrupt();
-  void Notify();
+  bool IsInterrupted() LOCKS_EXCLUDED(wait_mutex_);
+  bool IsInterruptedLocked() EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return interrupted_;
+  }
+  void Interrupt(Thread* self) LOCKS_EXCLUDED(wait_mutex_);
+  void SetInterruptedLocked(bool i) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    interrupted_ = i;
+  }
+  void Notify() LOCKS_EXCLUDED(wait_mutex_);
+
+ private:
+  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
+
+ public:
+  Mutex* GetWaitMutex() const LOCK_RETURNED(wait_mutex_) {
+    return wait_mutex_;
+  }
+
+  ConditionVariable* GetWaitConditionVariable() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_cond_;
+  }
+
+  Monitor* GetWaitMonitor() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_monitor_;
+  }
+
+  void SetWaitMonitor(Monitor* mon) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    wait_monitor_ = mon;
+  }
+
+
+  // Waiter linked-list support.
+  Thread* GetWaitNext() const {
+    return tlsPtr_.wait_next;
+  }
+
+  void SetWaitNext(Thread* next) {
+    tlsPtr_.wait_next = next;
+  }
 
   mirror::ClassLoader* GetClassLoaderOverride() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return class_loader_override_;
+    return tlsPtr_.class_loader_override;
   }
 
   void SetClassLoaderOverride(mirror::ClassLoader* class_loader_override)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Create the internal representation of a stack trace, that is more time
-  // and space efficient to compute than the StackTraceElement[]
+  // and space efficient to compute than the StackTraceElement[].
+  template<bool kTransactionActive>
   jobject CreateInternalStackTrace(const ScopedObjectAccessUnchecked& soa) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -428,41 +471,99 @@
   // Offsets of various members of native Thread class, used by compiled code.
   //
 
-  static ThreadOffset SelfOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, self_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThinLockIdOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, thin_lock_thread_id));
   }
 
-  static ThreadOffset ExceptionOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, exception_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadFlagsOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, state_and_flags));
   }
 
-  static ThreadOffset PeerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, opeer_));
+ private:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadOffsetFromTlsPtr(size_t tls_ptr_offset) {
+    size_t base = OFFSETOF_MEMBER(Thread, tlsPtr_);
+    size_t scale;
+    size_t shrink;
+    if (pointer_size == sizeof(void*)) {
+      scale = 1;
+      shrink = 1;
+    } else if (pointer_size > sizeof(void*)) {
+      scale = pointer_size / sizeof(void*);
+      shrink = 1;
+    } else {
+      DCHECK_GT(sizeof(void*), pointer_size);
+      scale = 1;
+      shrink = sizeof(void*) / pointer_size;
+    }
+    return ThreadOffset<pointer_size>(base + ((tls_ptr_offset * scale) / shrink));
   }
 
-  static ThreadOffset ThinLockIdOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+ public:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> QuickEntryPointOffset(size_t quick_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, quick_entrypoints) + quick_entrypoint_offset);
   }
 
-  static ThreadOffset CardTableOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, card_table_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> InterpreterEntryPointOffset(size_t interp_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, interpreter_entrypoints) + interp_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadFlagsOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, state_and_flags_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEntryPointOffset(size_t jni_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadSuspendTriggerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, suspend_trigger_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PortableEntryPointOffset(size_t port_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, portable_entrypoints) + port_entrypoint_offset);
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> SelfOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ExceptionOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, exception));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PeerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, opeer));
+  }
+
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> CardTableOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, card_table));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadSuspendTriggerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, suspend_trigger));
   }
 
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
-    return stack_size_ - (stack_end_ - stack_begin_);
+    return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
   }
 
   byte* GetStackEnd() const {
-    return stack_end_;
+    return tlsPtr_.stack_end;
   }
 
   // Set the stack end to that to be used during a stack overflow
@@ -475,9 +576,9 @@
     if (implicit_overflow_check) {
       // For implicit checks we also need to add in the protected region above the
       // overflow region.
-      stack_end_ = stack_begin_ + kStackOverflowImplicitCheckSize;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowImplicitCheckSize;
     } else {
-      stack_end_ = stack_begin_ + kStackOverflowReservedBytes;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowReservedBytes;
     }
   }
 
@@ -485,55 +586,65 @@
   void InstallImplicitProtection(bool is_main_stack);
 
   bool IsHandlingStackOverflow() const {
-    return stack_end_ == stack_begin_;
+    return tlsPtr_.stack_end == tlsPtr_.stack_begin;
   }
 
-  static ThreadOffset StackEndOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, stack_end_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> StackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, stack_end));
   }
 
-  static ThreadOffset JniEnvOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, jni_env_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEnvOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_env));
   }
 
-  static ThreadOffset TopOfManagedStackOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFrameOffset());
   }
 
-  static ThreadOffset TopOfManagedStackPcOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFramePcOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackPcOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFramePcOffset());
   }
 
   const ManagedStack* GetManagedStack() const {
-    return &managed_stack_;
+    return &tlsPtr_.managed_stack;
   }
 
   // Linked list recording fragments of managed stack.
   void PushManagedStackFragment(ManagedStack* fragment) {
-    managed_stack_.PushManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PushManagedStackFragment(fragment);
   }
   void PopManagedStackFragment(const ManagedStack& fragment) {
-    managed_stack_.PopManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PopManagedStackFragment(fragment);
   }
 
   ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    return managed_stack_.PushShadowFrame(new_top_frame);
+    return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
   }
 
   ShadowFrame* PopShadowFrame() {
-    return managed_stack_.PopShadowFrame();
+    return tlsPtr_.managed_stack.PopShadowFrame();
   }
 
-  static ThreadOffset TopShadowFrameOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopShadowFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopShadowFrameOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopShadowFrameOffset());
   }
 
   // Number of references allocated in JNI ShadowFrames on this thread.
   size_t NumJniShadowFrameReferences() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return managed_stack_.NumJniShadowFrameReferences();
+    return tlsPtr_.managed_stack.NumJniShadowFrameReferences();
   }
 
   // Number of references in SIRTs on this thread.
@@ -551,27 +662,28 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void PushSirt(StackIndirectReferenceTable* sirt) {
-    sirt->SetLink(top_sirt_);
-    top_sirt_ = sirt;
+    sirt->SetLink(tlsPtr_.top_sirt);
+    tlsPtr_.top_sirt = sirt;
   }
 
   StackIndirectReferenceTable* PopSirt() {
-    StackIndirectReferenceTable* sirt = top_sirt_;
+    StackIndirectReferenceTable* sirt = tlsPtr_.top_sirt;
     DCHECK(sirt != NULL);
-    top_sirt_ = top_sirt_->GetLink();
+    tlsPtr_.top_sirt = tlsPtr_.top_sirt->GetLink();
     return sirt;
   }
 
-  static ThreadOffset TopSirtOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, top_sirt_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopSirtOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, top_sirt));
   }
 
-  DebugInvokeReq* GetInvokeReq() {
-    return debug_invoke_req_;
+  DebugInvokeReq* GetInvokeReq() const {
+    return tlsPtr_.debug_invoke_req;
   }
 
   SingleStepControl* GetSingleStepControl() const {
-    return single_step_control_;
+    return tlsPtr_.single_step_control;
   }
 
   void SetDeoptimizationShadowFrame(ShadowFrame* sf);
@@ -580,41 +692,41 @@
   ShadowFrame* GetAndClearDeoptimizationShadowFrame(JValue* ret_val);
 
   std::deque<instrumentation::InstrumentationStackFrame>* GetInstrumentationStack() {
-    return instrumentation_stack_;
+    return tlsPtr_.instrumentation_stack;
   }
 
   std::vector<mirror::ArtMethod*>* GetStackTraceSample() const {
-    return stack_trace_sample_;
+    return tlsPtr_.stack_trace_sample;
   }
 
   void SetStackTraceSample(std::vector<mirror::ArtMethod*>* sample) {
-    stack_trace_sample_ = sample;
+    tlsPtr_.stack_trace_sample = sample;
   }
 
   uint64_t GetTraceClockBase() const {
-    return trace_clock_base_;
+    return tls64_.trace_clock_base;
   }
 
   void SetTraceClockBase(uint64_t clock_base) {
-    trace_clock_base_ = clock_base;
+    tls64_.trace_clock_base = clock_base;
   }
 
   BaseMutex* GetHeldMutex(LockLevel level) const {
-    return held_mutexes_[level];
+    return tlsPtr_.held_mutexes[level];
   }
 
   void SetHeldMutex(LockLevel level, BaseMutex* mutex) {
-    held_mutexes_[level] = mutex;
+    tlsPtr_.held_mutexes[level] = mutex;
   }
 
   void RunCheckpointFunction();
 
   bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
+    return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
   }
 
   bool TestAllFlags() const {
-    return (state_and_flags_.as_struct.flags != 0);
+    return (tls32_.state_and_flags.as_struct.flags != 0);
   }
 
   void AtomicSetFlag(ThreadFlag flag);
@@ -623,11 +735,57 @@
 
   void ResetQuickAllocEntryPointsForThread();
 
- private:
-  // We have no control over the size of 'bool', but want our boolean fields
-  // to be 4-byte quantities.
-  typedef uint32_t bool32_t;
+  // Returns the remaining space in the TLAB.
+  size_t TlabSize() const;
+  // Doesn't check that there is room.
+  mirror::Object* AllocTlab(size_t bytes);
+  void SetTlab(byte* start, byte* end);
+  bool HasTlab() const;
 
+  // Remove the suspend trigger for this thread by making the suspend_trigger TLS value
+  // point at itself, i.e. equal to a valid pointer.
+  // TODO: does this need to be atomic?  I don't think so.
+  void RemoveSuspendTrigger() {
+    tlsPtr_.suspend_trigger = reinterpret_cast<uintptr_t*>(&tlsPtr_.suspend_trigger);
+  }
+
+  // Trigger a suspend check by making the suspend_trigger TLS value an invalid pointer (nullptr).
+  // The next time a suspend check is done, it will load from the value at this address
+  // and trigger a SIGSEGV.
+  void TriggerSuspend() {
+    tlsPtr_.suspend_trigger = nullptr;
+  }
+
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
+  size_t GetThreadLocalBytesAllocated() const {
+    return tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start;
+  }
+
+  size_t GetThreadLocalObjectsAllocated() const {
+    return tlsPtr_.thread_local_objects;
+  }
+
+  // ROS alloc TLS.
+  static constexpr size_t kRosAllocNumOfSizeBrackets = 34;
+
+  void* GetRosAllocRun(size_t index) const {
+    return tlsPtr_.rosalloc_runs[index];
+  }
+
+  void SetRosAllocRun(size_t index, void* run) {
+    tlsPtr_.rosalloc_runs[index] = run;
+  }
+
+ private:
   explicit Thread(bool daemon);
   ~Thread() LOCKS_EXCLUDED(Locks::mutator_lock_,
                            Locks::thread_suspend_count_lock_);
@@ -644,7 +802,7 @@
   // Dbg::Disconnected.
   ThreadState SetStateUnsafe(ThreadState new_state) {
     ThreadState old_state = GetState();
-    state_and_flags_.as_struct.state = new_state;
+    tls32_.state_and_flags.as_struct.state = new_state;
     return old_state;
   }
 
@@ -678,22 +836,6 @@
   void SetUpAlternateSignalStack();
   void TearDownAlternateSignalStack();
 
-  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
-
-  static void ThreadExitCallback(void* arg);
-
-  // Has Thread::Startup been called?
-  static bool is_started_;
-
-  // TLS key used to retrieve the Thread*.
-  static pthread_key_t pthread_key_self_;
-
-  // Used to notify threads that they should attempt to resume, they will suspend again if
-  // their suspend count is > 0.
-  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // --- Frequently accessed fields first for short offsets ---
-
   // 32 bits of atomically changed state and flags. Keeping as 32 bits allows and atomic CAS to
   // change from being Suspended to Runnable without a suspend request occurring.
   union PACKED(4) StateAndFlags {
@@ -715,206 +857,225 @@
     // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47409
     DISALLOW_COPY_AND_ASSIGN(StateAndFlags);
   };
-  union StateAndFlags state_and_flags_;
-  COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
-                 sizeof_state_and_flags_and_int32_are_different);
 
-  // A non-zero value is used to tell the current thread to enter a safe point
-  // at the next poll.
-  int suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // The biased card table, see CardTable for details
-  byte* card_table_;
-
-  // The pending exception or NULL.
-  mirror::Throwable* exception_;
-
-  // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
-  // We leave extra space so there's room for the code that throws StackOverflowError.
-  byte* stack_end_;
-
-  // The top of the managed stack often manipulated directly by compiler generated code.
-  ManagedStack managed_stack_;
-
-  // Every thread may have an associated JNI environment
-  JNIEnvExt* jni_env_;
-
-  // Initialized to "this". On certain architectures (such as x86) reading
-  // off of Thread::Current is easy but getting the address of Thread::Current
-  // is hard. This field can be read off of Thread::Current to give the address.
-  Thread* self_;
-
-  // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
-  // start up, until the thread is registered and the local opeer_ is used.
-  mirror::Object* opeer_;
-  jobject jpeer_;
-
-  // The "lowest addressable byte" of the stack
-  byte* stack_begin_;
-
-  // Size of the stack
-  size_t stack_size_;
-
-  // Thin lock thread id. This is a small integer used by the thin lock implementation.
-  // This is not to be confused with the native thread's tid, nor is it the value returned
-  // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
-  // important difference between this id and the ids visible to managed code is that these
-  // ones get reused (to ensure that they fit in the number of bits available).
-  uint32_t thin_lock_thread_id_;
-
-  // Pointer to previous stack trace captured by sampling profiler.
-  std::vector<mirror::ArtMethod*>* stack_trace_sample_;
-
-  // The clock base used for tracing.
-  uint64_t trace_clock_base_;
-
-  // System thread id.
-  pid_t tid_;
-
-  ThrowLocation throw_location_;
-
-  // Guards the 'interrupted_' and 'wait_monitor_' members.
-  mutable Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // Condition variable waited upon during a wait.
-  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
-  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
-  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
-  // Thread "interrupted" status; stays raised until queried or thrown.
-  bool32_t interrupted_ GUARDED_BY(wait_mutex_);
-  // The next thread in the wait set this thread is part of or NULL if not waiting.
-  Thread* wait_next_;
-
-
-  // If we're blocked in MonitorEnter, this is the object we're trying to lock.
-  mirror::Object* monitor_enter_object_;
-
-  // Top of linked list of stack indirect reference tables or NULL for none
-  StackIndirectReferenceTable* top_sirt_;
-
-  Runtime* runtime_;
-
-  RuntimeStats stats_;
-
-  // Needed to get the right ClassLoader in JNI_OnLoad, but also
-  // useful for testing.
-  mirror::ClassLoader* class_loader_override_;
-
-  // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
-  Context* long_jump_context_;
-
-  // A boolean telling us whether we're recursively throwing OOME.
-  bool32_t throwing_OutOfMemoryError_;
-
-  // How much of 'suspend_count_' is by request of the debugger, used to set things right
-  // when the debugger detaches. Must be <= suspend_count_.
-  int debug_suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // JDWP invoke-during-breakpoint support.
-  DebugInvokeReq* debug_invoke_req_;
-
-  // JDWP single-stepping support.
-  SingleStepControl* single_step_control_;
-
-  // Shadow frame that is used temporarily during the deoptimization of a method.
-  ShadowFrame* deoptimization_shadow_frame_;
-  JValue deoptimization_return_value_;
-
-  // Additional stack used by method instrumentation to store method and return pc values.
-  // Stored as a pointer since std::deque is not PACKED.
-  std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack_;
-
-  // A cached copy of the java.lang.Thread's name.
-  std::string* name_;
-
-  // Is the thread a daemon?
-  const bool32_t daemon_;
-
-  // A cached pthread_t for the pthread underlying this Thread*.
-  pthread_t pthread_self_;
-
-  // Support for Mutex lock hierarchy bug detection.
-  BaseMutex* held_mutexes_[kLockLevelCount];
-
-  // A positive value implies we're in a region where thread suspension isn't expected.
-  uint32_t no_thread_suspension_;
-
-  // If no_thread_suspension_ is > 0, what is causing that assertion.
-  const char* last_no_thread_suspension_cause_;
+  static void ThreadExitCallback(void* arg);
 
   // Maximum number of checkpoint functions.
   static constexpr uint32_t kMaxCheckpoints = 3;
 
-  // Pending checkpoint function or NULL if non-pending. Installation guarding by
-  // Locks::thread_suspend_count_lock_.
-  Closure* checkpoint_functions_[kMaxCheckpoints];
+  // Has Thread::Startup been called?
+  static bool is_started_;
 
- public:
-  // Entrypoint function pointers
-  // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
-  InterpreterEntryPoints interpreter_entrypoints_;
-  JniEntryPoints jni_entrypoints_;
-  PortableEntryPoints portable_entrypoints_;
-  QuickEntryPoints quick_entrypoints_;
+  // TLS key used to retrieve the Thread*.
+  static pthread_key_t pthread_key_self_;
 
-  // Setting this to 0 will trigger a SEGV and thus a suspend check.  It is normally
-  // set to the address of itself.
-  uintptr_t* suspend_trigger_;
+  // Used to notify threads that they should attempt to resume, they will suspend again if
+  // their suspend count is > 0.
+  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // How many times has our pthread key's destructor been called?
-  uint32_t thread_exit_check_count_;
+  /***********************************************************************************************/
+  // Thread local storage. Fields are grouped by size so that member offsets can be translated
+  // between 32-bit and 64-bit layouts despite pointer size differences. To encourage shorter
+  // encodings, more frequently used values appear first where possible.
+  /***********************************************************************************************/
 
-  // Thread-local allocation pointer.
-  byte* thread_local_start_;
-  byte* thread_local_pos_;
-  byte* thread_local_end_;
-  size_t thread_local_objects_;
-  // Returns the remaining space in the TLAB.
-  size_t TlabSize() const;
-  // Doesn't check that there is room.
-  mirror::Object* AllocTlab(size_t bytes);
-  void SetTlab(byte* start, byte* end);
-  bool HasTlab() const;
+  struct PACKED(4)  tls_32bit_sized_values {
+    // We have no control over the size of 'bool', but want our boolean fields
+    // to be 4-byte quantities.
+    typedef uint32_t bool32_t;
 
-  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
-  // equal to a valid pointer.
-  // TODO: does this need to atomic?  I don't think so.
-  void RemoveSuspendTrigger() {
-    suspend_trigger_ = reinterpret_cast<uintptr_t*>(&suspend_trigger_);
-  }
+    explicit tls_32bit_sized_values(bool is_daemon) :
+      suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
+      daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
+      thread_exit_check_count(0) {
+    }
 
-  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
-  // The next time a suspend check is done, it will load from the value at this address
-  // and trigger a SIGSEGV.
-  void TriggerSuspend() {
-    suspend_trigger_ = nullptr;
-  }
+    union StateAndFlags state_and_flags;
+    COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
+                   sizeof_state_and_flags_and_int32_are_different);
 
-  // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
-  // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
-  // RosAlloc class due to a header file circular dependency issue.
-  // To compensate, we check that the two values match at RosAlloc
-  // initialization time.
-  static const size_t kRosAllocNumOfSizeBrackets = 34;
-  void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+    // A non-zero value is used to tell the current thread to enter a safe point
+    // at the next poll.
+    int suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Thread-local allocation stack data/routines.
-  mirror::Object** thread_local_alloc_stack_top_;
-  mirror::Object** thread_local_alloc_stack_end_;
+    // How much of 'suspend_count' is by request of the debugger, used to set things right
+    // when the debugger detaches. Must be <= suspend_count.
+    int debug_suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Push an object onto the allocation stack.
-  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+    // Thin lock thread id. This is a small integer used by the thin lock implementation.
+    // This is not to be confused with the native thread's tid, nor is it the value returned
+    // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
+    // important difference between this id and the ids visible to managed code is that these
+    // ones get reused (to ensure that they fit in the number of bits available).
+    uint32_t thin_lock_thread_id;
 
-  // Set the thread local allocation pointers to the given pointers.
-  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+    // System thread id.
+    uint32_t tid;
 
-  // Resets the thread local allocation pointers.
-  void RevokeThreadLocalAllocationStack();
+    // Is the thread a daemon?
+    const bool32_t daemon;
 
- private:
+    // A boolean telling us whether we're recursively throwing OOME.
+    bool32_t throwing_OutOfMemoryError;
+
+    // A positive value implies we're in a region where thread suspension isn't expected.
+    uint32_t no_thread_suspension;
+
+    // How many times has our pthread key's destructor been called?
+    uint32_t thread_exit_check_count;
+  } tls32_;
+
+  struct PACKED(8) tls_64bit_sized_values {
+    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    }
+
+    // The clock base used for tracing.
+    uint64_t trace_clock_base;
+
+    // Return value used by deoptimization.
+    JValue deoptimization_return_value;
+
+    RuntimeStats stats;
+  } tls64_;
+
+  struct PACKED(4) tls_ptr_sized_values {
+      tls_ptr_sized_values() : card_table(nullptr), exception(nullptr), stack_end(nullptr),
+      managed_stack(), suspend_trigger(nullptr), jni_env(nullptr), self(nullptr), opeer(nullptr),
+      jpeer(nullptr), stack_begin(nullptr), stack_size(0), throw_location(),
+      stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
+      top_sirt(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
+      instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
+      deoptimization_shadow_frame(nullptr), name(nullptr), pthread_self(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
+      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr) {
+    }
+
+    // The biased card table, see CardTable for details.
+    byte* card_table;
+
+    // The pending exception or NULL.
+    mirror::Throwable* exception;
+
+    // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
+    // We leave extra space so there's room for the code that throws StackOverflowError.
+    byte* stack_end;
+
+    // The top of the managed stack often manipulated directly by compiler generated code.
+    ManagedStack managed_stack;
+
+    // In certain modes, setting this to 0 will trigger a SEGV and thus a suspend check.  It is
+    // normally set to the address of itself.
+    uintptr_t* suspend_trigger;
+
+    // Every thread may have an associated JNI environment
+    JNIEnvExt* jni_env;
+
+    // Initialized to "this". On certain architectures (such as x86) reading off of Thread::Current
+    // is easy but getting the address of Thread::Current is hard. This field can be read off of
+    // Thread::Current to give the address.
+    Thread* self;
+
+    // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
+    // start up, until the thread is registered and the local opeer is used.
+    mirror::Object* opeer;
+    jobject jpeer;
+
+    // The "lowest addressable byte" of the stack.
+    byte* stack_begin;
+
+    // Size of the stack.
+    size_t stack_size;
+
+    // The location the current exception was thrown from.
+    ThrowLocation throw_location;
+
+    // Pointer to previous stack trace captured by sampling profiler.
+    std::vector<mirror::ArtMethod*>* stack_trace_sample;
+
+    // The next thread in the wait set this thread is part of or NULL if not waiting.
+    Thread* wait_next;
+
+    // If we're blocked in MonitorEnter, this is the object we're trying to lock.
+    mirror::Object* monitor_enter_object;
+
+    // Top of linked list of stack indirect reference tables or NULL for none.
+    StackIndirectReferenceTable* top_sirt;
+
+    // Needed to get the right ClassLoader in JNI_OnLoad, but also
+    // useful for testing.
+    mirror::ClassLoader* class_loader_override;
+
+    // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
+    Context* long_jump_context;
+
+    // Additional stack used by method instrumentation to store method and return pc values.
+    // Stored as a pointer since std::deque is not PACKED.
+    std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack;
+
+    // JDWP invoke-during-breakpoint support.
+    DebugInvokeReq* debug_invoke_req;
+
+    // JDWP single-stepping support.
+    SingleStepControl* single_step_control;
+
+    // Shadow frame stack that is used temporarily during the deoptimization of a method.
+    ShadowFrame* deoptimization_shadow_frame;
+
+    // A cached copy of the java.lang.Thread's name.
+    std::string* name;
+
+    // A cached pthread_t for the pthread underlying this Thread*.
+    pthread_t pthread_self;
+
+    // Support for Mutex lock hierarchy bug detection.
+    BaseMutex* held_mutexes[kLockLevelCount];
+
+    // If no_thread_suspension is > 0, what is causing that assertion.
+    const char* last_no_thread_suspension_cause;
+
+    // Pending checkpoint function or NULL if non-pending. Installation guarded by
+    // Locks::thread_suspend_count_lock_.
+    Closure* checkpoint_functions[kMaxCheckpoints];
+
+    // Entrypoint function pointers.
+    // TODO: move this to more of a global offset table model to avoid per-thread duplication.
+    InterpreterEntryPoints interpreter_entrypoints;
+    JniEntryPoints jni_entrypoints;
+    PortableEntryPoints portable_entrypoints;
+    QuickEntryPoints quick_entrypoints;
+
+    // Thread-local allocation pointer.
+    byte* thread_local_start;
+    byte* thread_local_pos;
+    byte* thread_local_end;
+    size_t thread_local_objects;
+
+    // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
+    // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
+    // RosAlloc class due to a header file circular dependency issue.
+    // To compensate, we check that the two values match at RosAlloc
+    // initialization time.
+    void* rosalloc_runs[kRosAllocNumOfSizeBrackets];
+
+    // Thread-local allocation stack data/routines.
+    mirror::Object** thread_local_alloc_stack_top;
+    mirror::Object** thread_local_alloc_stack_end;
+  } tlsPtr_;
+
+  // Guards the 'interrupted_' and 'wait_monitor_' members.
+  Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // Condition variable waited upon during a wait.
+  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
+  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
+  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
+
+  // Thread "interrupted" status; stays raised until queried or thrown.
+  bool interrupted_ GUARDED_BY(wait_mutex_);
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
-  friend class Monitor;
-  friend class MonitorInfo;
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index ec610e1..7de9433 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -63,7 +63,7 @@
 
 bool ThreadList::Contains(pid_t tid) {
   for (const auto& thread : list_) {
-    if (thread->tid_ == tid) {
+    if (thread->GetTid() == tid) {
       return true;
     }
   }
@@ -77,8 +77,8 @@
 void ThreadList::DumpNativeStacks(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
-    os << "DUMPING THREAD " << thread->tid_ << "\n";
-    DumpNativeStack(os, thread->tid_, "\t", true);
+    os << "DUMPING THREAD " << thread->GetTid() << "\n";
+    DumpNativeStack(os, thread->GetTid(), "\t", true);
     os << "\n";
   }
 }
@@ -607,7 +607,7 @@
     // though.
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     self->ModifySuspendCount(self, +1, true);
-    CHECK_GT(self->suspend_count_, 0);
+    CHECK_GT(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-suspending (debugger)";
@@ -631,18 +631,18 @@
 
   {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
-    while (self->suspend_count_ != 0) {
+    while (self->GetSuspendCount() != 0) {
       Thread::resume_cond_->Wait(self);
-      if (self->suspend_count_ != 0) {
+      if (self->GetSuspendCount() != 0) {
         // The condition was signaled but we're still suspended. This
         // can happen if the debugger lets go while a SIGQUIT thread
         // dump event is pending (assuming SignalCatcher was resumed for
         // just long enough to try to grab the thread-suspend lock).
         LOG(DEBUG) << *self << " still suspended after undo "
-                   << "(suspend count=" << self->suspend_count_ << ")";
+                   << "(suspend count=" << self->GetSuspendCount() << ")";
       }
     }
-    CHECK_EQ(self->suspend_count_, 0);
+    CHECK_EQ(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-reviving (debugger)";
@@ -661,10 +661,10 @@
     debug_suspend_all_count_ = 0;
     // Update running threads.
     for (const auto& thread : list_) {
-      if (thread == self || thread->debug_suspend_count_ == 0) {
+      if (thread == self || thread->GetDebugSuspendCount() == 0) {
         continue;
       }
-      thread->ModifySuspendCount(self, -thread->debug_suspend_count_, true);
+      thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), true);
     }
   }
 
@@ -749,11 +749,14 @@
   // SuspendAll requests.
   MutexLock mu(self, *Locks::thread_list_lock_);
   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
-  self->suspend_count_ = suspend_all_count_;
-  self->debug_suspend_count_ = debug_suspend_all_count_;
-  if (self->suspend_count_ > 0) {
-    self->AtomicSetFlag(kSuspendRequest);
-    self->TriggerSuspend();
+  CHECK_GE(suspend_all_count_, debug_suspend_all_count_);
+  // Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While
+  // this isn't particularly efficient, the suspend counts are most commonly 0 or 1.
+  for (int delta = debug_suspend_all_count_; delta > 0; delta--) {
+    self->ModifySuspendCount(self, +1, true);
+  }
+  for (int delta = suspend_all_count_ - debug_suspend_all_count_; delta > 0; delta--) {
+    self->ModifySuspendCount(self, +1, false);
   }
   CHECK(!Contains(self));
   list_.push_back(self);
@@ -768,7 +771,7 @@
   // suspend and so on, must happen at this point, and not in ~Thread.
   self->Destroy();
 
-  uint32_t thin_lock_id = self->thin_lock_thread_id_;
+  uint32_t thin_lock_id = self->GetThreadId();
   while (self != nullptr) {
     // Remove and delete the Thread* while holding the thread_list_lock_ and
     // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
diff --git a/runtime/throw_location.h b/runtime/throw_location.h
index c171b07..b36eb67 100644
--- a/runtime/throw_location.h
+++ b/runtime/throw_location.h
@@ -41,7 +41,16 @@
                 uint32_t throw_dex_pc) :
       this_object_(throw_this_object),
       method_(throw_method),
-      dex_pc_(throw_dex_pc) {}
+      dex_pc_(throw_dex_pc)
+#ifdef __LP64__
+      , pad_(0)
+#endif
+
+  {
+#ifdef __LP64__
+    UNUSED(pad_);
+#endif
+  }
 
   mirror::Object* GetThis() const {
     return this_object_;
@@ -72,6 +81,10 @@
   mirror::ArtMethod* method_;
   // The instruction within the throwing method.
   uint32_t dex_pc_;
+  // Ensure 8-byte alignment on 64-bit.
+#ifdef __LP64__
+  uint32_t pad_;
+#endif
 };
 
 }  // namespace art
diff --git a/runtime/transaction.cc b/runtime/transaction.cc
index fcda6c9..e18cf04 100644
--- a/runtime/transaction.cc
+++ b/runtime/transaction.cc
@@ -84,19 +84,18 @@
 void Transaction::RecordWriteArray(mirror::Array* array, size_t index, uint64_t value) {
   DCHECK(array != nullptr);
   DCHECK(array->IsArrayInstance());
+  DCHECK(!array->IsObjectArray());
   MutexLock mu(Thread::Current(), log_lock_);
   ArrayLog& array_log = array_logs_[array];
   array_log.LogValue(index, value);
 }
 
 void Transaction::RecordStrongStringInsertion(mirror::String* s, uint32_t hash_code) {
-  DCHECK(s != nullptr);
   InternStringLog log(s, hash_code, InternStringLog::kStrongString, InternStringLog::kInsert);
   LogInternedString(log);
 }
 
 void Transaction::RecordWeakStringInsertion(mirror::String* s, uint32_t hash_code) {
-  DCHECK(s != nullptr);
   InternStringLog log(s, hash_code, InternStringLog::kWeakString, InternStringLog::kInsert);
   LogInternedString(log);
 }
@@ -198,9 +197,7 @@
 
   for (auto it : array_logs_) {
     mirror::Array* old_root = it.first;
-    if (old_root->IsObjectArray()) {
-      it.second.VisitRoots(callback, arg);
-    }
+    CHECK(!old_root->IsObjectArray());
     mirror::Array* new_root = old_root;
     callback(reinterpret_cast<mirror::Object**>(&new_root), arg, 0, kRootUnknown);
     if (new_root != old_root) {
@@ -403,23 +400,12 @@
     case Primitive::kPrimDouble:
       array->AsDoubleArray()->SetWithoutChecks<false>(index, static_cast<double>(value));
       break;
-    case Primitive::kPrimNot: {
-      mirror::ObjectArray<mirror::Object>* obj_array = array->AsObjectArray<mirror::Object>();
-      obj_array->SetWithoutChecks<false>(index, reinterpret_cast<mirror::Object*>(
-          static_cast<uintptr_t>(value)));
+    case Primitive::kPrimNot:
+      LOG(FATAL) << "ObjectArray should be treated as Object";
       break;
-    }
     default:
       LOG(FATAL) << "Unsupported type " << array_type;
   }
 }
 
-void Transaction::ArrayLog::VisitRoots(RootCallback* callback, void* arg) {
-  for (auto& it : array_values_) {
-    mirror::Object* obj = reinterpret_cast<mirror::Object*>(static_cast<uintptr_t>(it.second));
-    callback(&obj, arg, 0, kRootUnknown);
-    it.second = reinterpret_cast<uintptr_t>(obj);
-  }
-}
-
 }  // namespace art
diff --git a/runtime/transaction.h b/runtime/transaction.h
index cf696de..6fd86c8 100644
--- a/runtime/transaction.h
+++ b/runtime/transaction.h
@@ -118,7 +118,6 @@
     void LogValue(size_t index, uint64_t value);
 
     void Undo(mirror::Array* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-    void VisitRoots(RootCallback* callback, void* arg);
 
     size_t Size() const {
       return array_values_.size();
@@ -145,6 +144,7 @@
     };
     InternStringLog(mirror::String* s, uint32_t hash_code, StringKind kind, StringOp op)
       : str_(s), hash_code_(hash_code), string_kind_(kind), string_op_(op) {
+      DCHECK(s != nullptr);
     }
 
     void Undo(InternTable* intern_table) EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
diff --git a/test/064-field-access/src/Main.java b/test/064-field-access/src/Main.java
index c9b93ba..8dd22ba 100644
--- a/test/064-field-access/src/Main.java
+++ b/test/064-field-access/src/Main.java
@@ -554,15 +554,17 @@
         }
         // Check access or lack of to field.
         Class<?> subClassAccessExceptionClass = null;
-        if (f.getName().contains("Private") ||
+        if ((f.getName().contains("Private") ||
             (!same_package && f.getName().contains("Package")) ||
-            (!same_package && f.getName().contains("Protected"))) {
+            (!same_package && f.getName().contains("Protected"))) &&
+            !(protected_class && f.getName().contains("Public"))) {
           subClassAccessExceptionClass = IllegalAccessException.class;
         }
         Class<?> mainClassAccessExceptionClass = null;
-        if (f.getName().contains("Private") ||
+        if ((f.getName().contains("Private") ||
             (!same_package && f.getName().contains("Package")) ||
-            (!same_package && f.getName().contains("Protected"))) {
+            (!same_package && f.getName().contains("Protected"))) &&
+            !(protected_class && f.getName().contains("Public"))) {
           mainClassAccessExceptionClass = IllegalAccessException.class;
         }
 
@@ -605,7 +607,7 @@
 
       for (Method m : methods) {
         Class<?> subClassAccessExceptionClass = null;
-        if (protected_class || m.getName().contains("Private") ||
+        if (m.getName().contains("Private") ||
             (!same_package && m.getName().contains("Package")) ||
             (!same_package && m.getName().contains("Protected"))) {
           subClassAccessExceptionClass = IllegalAccessException.class;
diff --git a/test/Android.mk b/test/Android.mk
index bb6c437..5879039 100644
--- a/test/Android.mk
+++ b/test/Android.mk
@@ -77,6 +77,12 @@
     LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk
     include $(BUILD_JAVA_LIBRARY)
     ART_TEST_TARGET_DEX_FILES += $$(LOCAL_INSTALLED_MODULE)
+
+    ifdef TARGET_2ND_ARCH
+      # TODO: make this a simple copy
+$(4)/$(1)-$(2).jar: $(3)/$(1)-$(2).jar
+	cp $$< $(4)/
+    endif
   endif
 
   ifeq ($(ART_BUILD_HOST),true)
@@ -93,8 +99,8 @@
     ART_TEST_HOST_DEX_FILES += $$(LOCAL_INSTALLED_MODULE)
   endif
 endef
-$(foreach dir,$(TEST_DEX_DIRECTORIES), $(eval $(call build-art-test-dex,art-test-dex,$(dir),$(ART_NATIVETEST_OUT))))
-$(foreach dir,$(TEST_OAT_DIRECTORIES), $(eval $(call build-art-test-dex,oat-test-dex,$(dir),$(ART_TEST_OUT))))
+$(foreach dir,$(TEST_DEX_DIRECTORIES), $(eval $(call build-art-test-dex,art-test-dex,$(dir),$(ART_NATIVETEST_OUT),$(2ND_ART_NATIVETEST_OUT))))
+$(foreach dir,$(TEST_OAT_DIRECTORIES), $(eval $(call build-art-test-dex,oat-test-dex,$(dir),$(ART_TEST_OUT),$(2ND_ART_TEST_OUT))))
 
 ########################################################################
 
@@ -102,19 +108,31 @@
 ART_TEST_HOST_OAT_DEFAULT_TARGETS :=
 ART_TEST_HOST_OAT_INTERPRETER_TARGETS :=
 
+define declare-test-art-oat-targets-impl
+.PHONY: test-art-target-oat-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX)
+test-art-target-oat-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX): $($(2)ART_TEST_OUT)/oat-test-dex-$(1).jar test-art-target-sync
+	adb shell touch $($(2)ART_TEST_DIR)/test-art-target-oat-$(1)
+	adb shell rm $($(2)ART_TEST_DIR)/test-art-target-oat-$(1)
+	adb shell sh -c "/system/bin/dalvikvm$($(2)ART_TARGET_BINARY_SUFFIX) $(DALVIKVM_FLAGS) -XXlib:libartd.so -Ximage:$($(2)ART_TEST_DIR)/core.art -classpath $($(2)ART_TEST_DIR)/oat-test-dex-$(1).jar -Djava.library.path=$($(2)ART_TEST_DIR) $(1) && touch $($(2)ART_TEST_DIR)/test-art-target-oat-$(1)"
+	$(hide) (adb pull $($(2)ART_TEST_DIR)/test-art-target-oat-$(1) /tmp/ && echo test-art-target-oat-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX) PASSED) || (echo test-art-target-oat-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX) FAILED && exit 1)
+	$(hide) rm /tmp/test-art-target-oat-$(1)
+endef
+
 # $(1): directory
 # $(2): arguments
 define declare-test-art-oat-targets
-.PHONY: test-art-target-oat-$(1)
-test-art-target-oat-$(1): $(ART_TEST_OUT)/oat-test-dex-$(1).jar test-art-target-sync
-	adb shell touch $(ART_TEST_DIR)/test-art-target-oat-$(1)
-	adb shell rm $(ART_TEST_DIR)/test-art-target-oat-$(1)
-	adb shell sh -c "/system/bin/dalvikvm $(DALVIKVM_FLAGS) -XXlib:libartd.so -Ximage:$(ART_TEST_DIR)/core.art -classpath $(ART_TEST_DIR)/oat-test-dex-$(1).jar -Djava.library.path=$(ART_TEST_DIR) $(1) $(2) && touch $(ART_TEST_DIR)/test-art-target-oat-$(1)"
-	$(hide) (adb pull $(ART_TEST_DIR)/test-art-target-oat-$(1) /tmp/ && echo test-art-target-oat-$(1) PASSED) || (echo test-art-target-oat-$(1) FAILED && exit 1)
-	$(hide) rm /tmp/test-art-target-oat-$(1)
+  ifdef TARGET_2ND_ARCH
+    $(call declare-test-art-oat-targets-impl,$(1),2ND_)
 
-$(HOST_OUT_JAVA_LIBRARIES)/oat-test-dex-$(1).odex: $(HOST_OUT_JAVA_LIBRARIES)/oat-test-dex-$(1).jar $(HOST_CORE_IMG_OUT) | $(DEX2OAT)
-	$(DEX2OAT) $(DEX2OAT_FLAGS) --runtime-arg -Xms16m --runtime-arg -Xmx16m --boot-image=$(HOST_CORE_IMG_OUT) --dex-file=$(PWD)/$$< --oat-file=$(PWD)/$$@ --instruction-set=$(ART_HOST_ARCH) --host --android-root=$(HOST_OUT)
+    # Bind the primary to the non-suffix rule
+    ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+test-art-target-oat-$(1): test-art-target-oat-$(1)$(ART_PHONY_TEST_TARGET_SUFFIX)
+    endif
+  endif
+  $(call declare-test-art-oat-targets-impl,$(1),)
+
+$(HOST_OUT_JAVA_LIBRARIES)/oat-test-dex-$(1).odex: $(HOST_OUT_JAVA_LIBRARIES)/oat-test-dex-$(1).jar $(HOST_CORE_IMG_OUT) | $(DEX2OATD)
+	$(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg -Xms16m --runtime-arg -Xmx16m --boot-image=$(HOST_CORE_IMG_OUT) --dex-file=$(PWD)/$$< --oat-file=$(PWD)/$$@ --instruction-set=$(ART_HOST_ARCH) --host --android-root=$(HOST_OUT)
 
 .PHONY: test-art-host-oat-default-$(1)
 test-art-host-oat-default-$(1): $(HOST_OUT_JAVA_LIBRARIES)/oat-test-dex-$(1).odex test-art-host-dependencies
@@ -172,7 +190,9 @@
 TEST_ART_RUN_TESTS := $(subst $(LOCAL_PATH)/,, $(TEST_ART_RUN_TESTS))
 TEST_ART_TIMING_SENSITIVE_RUN_TESTS := 055-enum-performance
 ifdef dist_goal # disable timing sensitive tests on "dist" builds.
-  $(foreach test, $(TEST_ART_TIMING_SENSITIVE_RUN_TESTS), $(eval TEST_ART_RUN_TESTS := $(filter-out $(test), $(TEST_ART_RUN_TESTS))))
+  $(foreach test, $(TEST_ART_TIMING_SENSITIVE_RUN_TESTS), \
+    $(info Skipping $(test)) \
+    $(eval TEST_ART_RUN_TESTS := $(filter-out $(test), $(TEST_ART_RUN_TESTS))))
 endif
 $(foreach test, $(TEST_ART_RUN_TESTS), $(eval $(call declare-make-art-run-test,$(test))))
 
diff --git a/test/etc/push-and-run-test-jar b/test/etc/push-and-run-test-jar
index 9e30f65..93d7e79 100755
--- a/test/etc/push-and-run-test-jar
+++ b/test/etc/push-and-run-test-jar
@@ -19,6 +19,7 @@
 DEV_MODE="n"
 INVOKE_WITH=""
 FLAGS=""
+TARGET_SUFFIX=""
 
 while true; do
     if [ "x$1" = "x--quiet" ]; then
@@ -79,6 +80,9 @@
     elif [ "x$1" = "x--" ]; then
         shift
         break
+    elif [ "x$1" = "x--64" ]; then
+        TARGET_SUFFIX="64"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         exit 1
@@ -136,7 +140,7 @@
 fi
 
 if [ "$GDB" = "y" ]; then
-    gdb="gdbserver :5039"
+    gdb="/data/gdbserver$TARGET_SUFFIX :5039"
     gdbargs="$exe"
 fi
 
@@ -147,7 +151,7 @@
 JNI_OPTS="-Xjnigreflimit:512 -Xcheck:jni"
 
 cmdline="cd $DEX_LOCATION && mkdir dalvik-cache && export ANDROID_DATA=$DEX_LOCATION && export DEX_LOCATION=$DEX_LOCATION && \
-    $INVOKE_WITH $gdb /system/bin/dalvikvm $FLAGS $gdbargs -XXlib:$LIB $ZYGOTE $JNI_OPTS $INT_OPTS $DEBUGGER_OPTS $BOOT_OPT -cp $DEX_LOCATION/$TEST_NAME.jar Main"
+    $INVOKE_WITH $gdb /system/bin/dalvikvm$TARGET_SUFFIX $FLAGS $gdbargs -XXlib:$LIB $ZYGOTE $JNI_OPTS $INT_OPTS $DEBUGGER_OPTS $BOOT_OPT -cp $DEX_LOCATION/$TEST_NAME.jar Main"
 if [ "$DEV_MODE" = "y" ]; then
   echo $cmdline "$@"
 fi
diff --git a/test/run-test b/test/run-test
index 8ff0915..58de980 100755
--- a/test/run-test
+++ b/test/run-test
@@ -152,6 +152,9 @@
     elif [ "x$1" = "x--help" ]; then
         usage="yes"
         shift
+    elif [ "x$1" = "x--64" ]; then
+        run_args="${run_args} --64"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         usage="yes"
@@ -244,6 +247,7 @@
         echo "    --jvm                Use a host-local RI virtual machine."
         echo "    --output-path [path] Location where to store the build" \
              "files."
+        echo "    --64                 Run the test in 64-bit mode"
     ) 1>&2
     exit 1
 fi