Merge "Fix crash when debugging exception"
diff --git a/Android.mk b/Android.mk
index 6139cb9..4351be9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -99,6 +99,8 @@
 include $(art_path)/tools/Android.mk
 include $(art_build_path)/Android.oat.mk
 
+
+
 # ART_HOST_DEPENDENCIES depends on Android.executable.mk above for ART_HOST_EXECUTABLES
 ART_HOST_DEPENDENCIES := $(ART_HOST_EXECUTABLES) $(HOST_OUT_JAVA_LIBRARIES)/core-libart-hostdex.jar
 ART_HOST_DEPENDENCIES += $(HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION)
@@ -110,11 +112,18 @@
 include $(art_path)/test/Android.mk
 include $(art_build_path)/Android.gtest.mk
 
+$(eval $(call combine-art-multi-target-var,ART_TARGET_GTEST_TARGETS))
+$(eval $(call combine-art-multi-target-var,ART_TARGET_GTEST_EXECUTABLES))
+
 # The ART_*_TEST_DEPENDENCIES definitions:
 # - depend on Android.oattest.mk above for ART_TEST_*_DEX_FILES
 # - depend on Android.gtest.mk above for ART_*_GTEST_EXECUTABLES
 ART_HOST_TEST_DEPENDENCIES   := $(ART_HOST_DEPENDENCIES)   $(ART_HOST_GTEST_EXECUTABLES)   $(ART_TEST_HOST_DEX_FILES)   $(HOST_CORE_IMG_OUT)
-ART_TARGET_TEST_DEPENDENCIES := $(ART_TARGET_DEPENDENCIES) $(ART_TARGET_GTEST_EXECUTABLES) $(ART_TEST_TARGET_DEX_FILES) $(TARGET_CORE_IMG_OUT)
+
+define declare-art-target-test-dependencies-var
+ART_TARGET_TEST_DEPENDENCIES$(1) := $(ART_TARGET_DEPENDENCIES) $(ART_TARGET_GTEST_EXECUTABLES$(1)) $(ART_TEST_TARGET_DEX_FILES$(1)) $(TARGET_CORE_IMG_OUT$(1))
+endef
+$(eval $(call call-art-multi-target-var,declare-art-target-test-dependencies-var,ART_TARGET_TEST_DEPENDENCIES))
 
 include $(art_build_path)/Android.libarttest.mk
 
@@ -209,46 +218,70 @@
 # target test targets
 
 # "mm test-art-target" to build and run all target tests
-.PHONY: test-art-target
-test-art-target: test-art-target-gtest test-art-target-oat test-art-target-run-test
-	@echo test-art-target PASSED
+define declare-test-art-target
+.PHONY: test-art-target$(1)
+test-art-target$(1): test-art-target-gtest$(1) test-art-target-oat$(1) test-art-target-run-test$(1)
+	@echo test-art-target$(1) PASSED
+endef
+$(eval $(call call-art-multi-target-rule,declare-test-art-target,test-art-target))
 
-.PHONY: test-art-target-dependencies
-test-art-target-dependencies: $(ART_TARGET_TEST_DEPENDENCIES) $(ART_TEST_OUT)/libarttest.so
+
+define declare-test-art-target-dependencies
+.PHONY: test-art-target-dependencies$(1)
+test-art-target-dependencies$(1): $(ART_TARGET_TEST_DEPENDENCIES$(1)) $(ART_TEST_OUT)/libarttest.so
+endef
+$(eval $(call call-art-multi-target-rule,declare-test-art-target-dependencies,test-art-target-dependencies))
+
 
 .PHONY: test-art-target-sync
-test-art-target-sync: test-art-target-dependencies
+test-art-target-sync: test-art-target-dependencies$(ART_PHONY_TEST_TARGET_SUFFIX) test-art-target-dependencies$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)
 	adb remount
 	adb sync
 	adb shell mkdir -p $(ART_TEST_DIR)
 
-.PHONY: test-art-target-gtest
-test-art-target-gtest: $(ART_TARGET_GTEST_TARGETS)
 
-.PHONY: test-art-target-oat
-test-art-target-oat: $(ART_TEST_TARGET_OAT_TARGETS)
-	@echo test-art-target-oat PASSED
+define declare-test-art-target-gtest
+.PHONY: test-art-target-gtest$(1)
+test-art-target-gtest$(1): $(ART_TARGET_GTEST_TARGETS$(1))
+	@echo test-art-target-gtest$(1) PASSED
+endef
+$(eval $(call call-art-multi-target-rule,declare-test-art-target-gtest,test-art-target-gtest))
+
+
+define declare-test-art-target-oat
+.PHONY: test-art-target-oat$(1)
+test-art-target-oat$(1): $(ART_TEST_TARGET_OAT_TARGETS$(1))
+	@echo test-art-target-oat$(1) PASSED
+endef
+$(eval $(call call-art-multi-target-rule,declare-test-art-target-oat,test-art-target-oat))
+
 
 define declare-test-art-target-run-test-impl
+$(2)run_test_$(1) :=
+ifeq ($($(2)ART_PHONY_TEST_TARGET_SUFFIX),64)
+ $(2)run_test_$(1) := --64
+endif
 .PHONY: test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX)
 test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX): test-art-target-sync $(DX) $(HOST_OUT_EXECUTABLES)/jasmin
-	DX=$(abspath $(DX)) JASMIN=$(abspath $(HOST_OUT_EXECUTABLES)/jasmin) art/test/run-test $(DALVIKVM_FLAGS) $(1) $(3)
+	DX=$(abspath $(DX)) JASMIN=$(abspath $(HOST_OUT_EXECUTABLES)/jasmin) art/test/run-test $(DALVIKVM_FLAGS) $$($(2)run_test_$(1)) $(1)
 	@echo test-art-target-run-test-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX) PASSED
 endef
 
 define declare-test-art-target-run-test
 
   ifdef TARGET_2ND_ARCH
-    $(call declare-test-art-target-run-test-impl,$(1),2ND_,)
+    $(call declare-test-art-target-run-test-impl,$(1),2ND_)
+    
+    TEST_ART_TARGET_RUN_TEST_TARGETS$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) += test-art-target-run-test-$(1)$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)
 
     ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
       # Link primary to non-suffix
 test-art-target-run-test-$(1): test-art-target-run-test-$(1)$(ART_PHONY_TEST_TARGET_SUFFIX)
     endif
   endif
-  $(call declare-test-art-target-run-test-impl,$(1),,--$(ART_TARGET_BINARY_SUFFIX))
+  $(call declare-test-art-target-run-test-impl,$(1),)
 
-  TEST_ART_TARGET_RUN_TEST_TARGETS += test-art-target-run-test-$(1)
+  TEST_ART_TARGET_RUN_TEST_TARGETS$(ART_PHONY_TEST_TARGET_SUFFIX) += test-art-target-run-test-$(1)$(ART_PHONY_TEST_TARGET_SUFFIX)
 
 test-art-run-test-$(1): test-art-host-run-test-$(1) test-art-target-run-test-$(1)
 
@@ -256,9 +289,14 @@
 
 $(foreach test, $(TEST_ART_RUN_TESTS), $(eval $(call declare-test-art-target-run-test,$(test))))
 
-.PHONY: test-art-target-run-test
-test-art-target-run-test: $(TEST_ART_TARGET_RUN_TEST_TARGETS)
-	@echo test-art-target-run-test PASSED
+
+define declare-test-art-target-run-test
+.PHONY: test-art-target-run-test$(1)
+test-art-target-run-test$(1): $(TEST_ART_TARGET_RUN_TEST_TARGETS$(1))
+	@echo test-art-target-run-test$(1) PASSED
+endef
+$(eval $(call call-art-multi-target-rule,declare-test-art-target-run-test,test-art-target-run-test))
+
 
 ########################################################################
 # oat-target and oat-target-sync targets
@@ -286,7 +324,12 @@
 
 $$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
 	@mkdir -p $$(dir $$@)
-	$(DEX2OATD) --runtime-arg -Xms64m --runtime-arg -Xmx64m --boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) --dex-location=/$(1) --oat-file=$$@ --instruction-set=$(TARGET_ARCH) --instruction-set-features=$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$(PRODUCT_OUT)/system
+	$(DEX2OATD) --runtime-arg -Xms64m --runtime-arg -Xmx64m \
+		--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
+		--dex-location=/$(1) --oat-file=$$@ \
+		--instruction-set=$(DEX2OAT_TARGET_ARCH) \
+		--instruction-set-features=$(DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES) \
+		--android-root=$(PRODUCT_OUT)/system
 
 endif
 
diff --git a/build/Android.common.mk b/build/Android.common.mk
index b9a297b..c95b1c6 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -291,4 +291,71 @@
   ART_BUILD_DEBUG := true
 endif
 
+# Helper function to call a function twice with a target suffix
+# $(1): The generator function for the rules
+#         Has one argument, the suffix
+define call-art-multi-target
+  $(call $(1),$(ART_PHONY_TEST_TARGET_SUFFIX))
+  
+  ifdef TARGET_2ND_ARCH
+    $(call $(1),$(2ND_ART_PHONY_TEST_TARGET_SUFFIX))
+  endif
+endef
+
+# Helper function to combine two suffixed variables into one.
+# $(1): The base name.
+define combine-art-multi-target-var
+  ifdef TARGET_2ND_ARCH
+    ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+      ifneq ($(2ND_ART_PHONY_TEST_TARGET_SUFFIX),)
+$(1) := $($(1)$(ART_PHONY_TEST_TARGET_SUFFIX)) $($(1)$(2ND_ART_PHONY_TEST_TARGET_SUFFIX))
+      endif
+    endif
+  endif
+endef
+
+
+# Helper function to define a variable twice with a target suffix. Assumes the generated name is
+# derived from $(2) so we can create a combined var.
+# $(1): The generator function for the rules
+#         Has one argument, the suffix
+define call-art-multi-target-var
+  $(call $(1),$(ART_PHONY_TEST_TARGET_SUFFIX))
+  
+  ifdef TARGET_2ND_ARCH
+    $(call $(1),$(2ND_ART_PHONY_TEST_TARGET_SUFFIX))
+    
+    # Link both together, if it makes sense
+    ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+      ifneq ($(2ND_ART_PHONY_TEST_TARGET_SUFFIX),)
+$(2) := $(2)$(ART_PHONY_TEST_TARGET_SUFFIX) $(2)$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)
+      endif
+    endif
+
+  endif
+endef
+
+# Helper function to call a function twice with a target suffix. Assumes it generates make rules
+# with the given name, and links them together.
+# $(1): The generator function for the rules
+#         Has one argument, the suffix
+# $(2): The base rule name, necessary for the link
+#       We assume we can link the names together easily...
+define call-art-multi-target-rule
+  $(call $(1),$(ART_PHONY_TEST_TARGET_SUFFIX))
+  
+  ifdef TARGET_2ND_ARCH
+    $(call $(1),$(2ND_ART_PHONY_TEST_TARGET_SUFFIX))
+  
+    # Link both together, if it makes sense
+    ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
+      ifneq ($(2ND_ART_PHONY_TEST_TARGET_SUFFIX),)
+.PHONY: $(2)
+$(2): $(2)$(ART_PHONY_TEST_TARGET_SUFFIX) $(2)$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)
+      endif
+    endif
+  endif
+endef
+
+
 endif # ANDROID_COMMON_MK
diff --git a/build/Android.executable.mk b/build/Android.executable.mk
index 27d687c..6aa1c18 100644
--- a/build/Android.executable.mk
+++ b/build/Android.executable.mk
@@ -52,10 +52,6 @@
   art_multilib := $(7)
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-    include external/stlport/libstlport.mk
-  endif
-
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   LOCAL_MODULE_TAGS := optional
   LOCAL_SRC_FILES := $$(art_source)
@@ -104,9 +100,12 @@
   endif
 
   ifeq ($$(art_target_or_host),target)
+    include art/build/Android.libcxx.mk
     include $(BUILD_EXECUTABLE)
     ART_TARGET_EXECUTABLES := $(ART_TARGET_EXECUTABLES) $(TARGET_OUT_EXECUTABLES)/$$(LOCAL_MODULE)
   else # host
+    LOCAL_IS_HOST_MODULE := true
+    include art/build/Android.libcxx.mk
     include $(BUILD_HOST_EXECUTABLE)
     ART_HOST_EXECUTABLES := $(ART_HOST_EXECUTABLES) $(HOST_OUT_EXECUTABLES)/$$(LOCAL_MODULE)
   endif
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 5b83056..4b655b5 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -18,6 +18,7 @@
 
 RUNTIME_GTEST_COMMON_SRC_FILES := \
 	runtime/barrier_test.cc \
+	runtime/base/bit_field_test.cc \
 	runtime/base/bit_vector_test.cc \
 	runtime/base/hex_dump_test.cc \
 	runtime/base/histogram_test.cc \
@@ -59,6 +60,7 @@
 	runtime/verifier/method_verifier_test.cc \
 	runtime/verifier/reg_type_test.cc \
 	runtime/zip_archive_test.cc \
+	runtime/stack_indirect_reference_table_test.cc
 
 COMPILER_GTEST_COMMON_SRC_FILES := \
 	runtime/jni_internal_test.cc \
@@ -103,10 +105,12 @@
 	compiler/utils/x86/assembler_x86_test.cc
 
 ART_HOST_GTEST_EXECUTABLES :=
-ART_TARGET_GTEST_EXECUTABLES :=
+ART_TARGET_GTEST_EXECUTABLES$(ART_PHONY_TEST_TARGET_SUFFIX) :=
+ART_TARGET_GTEST_EXECUTABLES$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) :=
 ART_HOST_GTEST_TARGETS :=
 ART_HOST_VALGRIND_GTEST_TARGETS :=
-ART_TARGET_GTEST_TARGETS :=
+ART_TARGET_GTEST_TARGETS$(ART_PHONY_TEST_TARGET_SUFFIX) :=
+ART_TARGET_GTEST_TARGETS$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) :=
 
 ART_TEST_CFLAGS :=
 ifeq ($(ART_USE_PORTABLE_COMPILER),true)
@@ -125,7 +129,7 @@
 	$(hide) (adb pull $($(1)ART_TEST_DIR)/$$@ /tmp/ && echo $$@ PASSED) || (echo $$@ FAILED && exit 1)
 	$(hide) rm /tmp/$$@
 
-    ART_TARGET_GTEST_TARGETS += $$(art_gtest_target)$($(1)ART_PHONY_TEST_TARGET_SUFFIX)
+  ART_TARGET_GTEST_TARGETS$($(1)ART_PHONY_TEST_TARGET_SUFFIX) += $$(art_gtest_target)$($(1)ART_PHONY_TEST_TARGET_SUFFIX)
 endef
 
 
@@ -148,10 +152,6 @@
   art_gtest_name := $$(notdir $$(basename $$(art_gtest_filename)))
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-    include external/stlport/libstlport.mk
-  endif
-
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   LOCAL_MODULE := $$(art_gtest_name)
   ifeq ($$(art_target_or_host),target)
@@ -177,18 +177,22 @@
     LOCAL_CLANG := $(ART_TARGET_CLANG)
     LOCAL_CFLAGS += $(ART_TARGET_CFLAGS) $(ART_TARGET_DEBUG_CFLAGS)
     LOCAL_CFLAGS_x86 := $(ART_TARGET_CFLAGS_x86)
-    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils
+    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixl
     LOCAL_STATIC_LIBRARIES += libgtest
     LOCAL_MODULE_PATH_32 := $(ART_BASE_NATIVETEST_OUT)
     LOCAL_MODULE_PATH_64 := $(ART_BASE_NATIVETEST_OUT)64
     LOCAL_MULTILIB := both
+    include art/build/Android.libcxx.mk
     include $(BUILD_EXECUTABLE)
-    ART_TARGET_GTEST_EXECUTABLES += $$(art_gtest_exe)
+    
+    ART_TARGET_GTEST_EXECUTABLES$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_NATIVETEST_OUT)/$$(LOCAL_MODULE)
     art_gtest_target := test-art-$$(art_target_or_host)-gtest-$$(art_gtest_name)
 
     ifdef TARGET_2ND_ARCH
       $(call build-art-test-make-target,2ND_)
 
+      ART_TARGET_GTEST_EXECUTABLES$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) += $(2ND_ART_NATIVETEST_OUT)/$$(LOCAL_MODULE)
+
       # Bind the primary to the non-suffix rule
       ifneq ($(ART_PHONY_TEST_TARGET_SUFFIX),)
 $$(art_gtest_target): $$(art_gtest_target)$(ART_PHONY_TEST_TARGET_SUFFIX)
@@ -200,12 +204,14 @@
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_CFLAGS += $(ART_HOST_CFLAGS) $(ART_HOST_DEBUG_CFLAGS)
     LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libz-host
-    LOCAL_STATIC_LIBRARIES += libcutils
+    LOCAL_STATIC_LIBRARIES += libcutils libvixl
     ifneq ($(WITHOUT_HOST_CLANG),true)
         # GCC host compiled tests fail with this linked, presumably due to destructors that run.
         LOCAL_STATIC_LIBRARIES += libgtest_host
     endif
     LOCAL_LDLIBS += -lpthread -ldl
+    LOCAL_IS_HOST_MODULE := true
+    include art/build/Android.libcxx.mk
     include $(BUILD_HOST_EXECUTABLE)
     art_gtest_exe := $(HOST_OUT_EXECUTABLES)/$$(LOCAL_MODULE)
     ART_HOST_GTEST_EXECUTABLES += $$(art_gtest_exe)
diff --git a/build/Android.libarttest.mk b/build/Android.libarttest.mk
index 14d16ac..18d321a 100644
--- a/build/Android.libarttest.mk
+++ b/build/Android.libarttest.mk
@@ -31,10 +31,6 @@
   art_target_or_host := $(1)
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-   include external/stlport/libstlport.mk
-  endif
-
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   LOCAL_MODULE := libarttest
   ifeq ($$(art_target_or_host),target)
@@ -55,6 +51,7 @@
     LOCAL_MODULE_PATH_32 := $(ART_BASE_TEST_OUT)
     LOCAL_MODULE_PATH_64 := $(ART_BASE_TEST_OUT)64
     LOCAL_MODULE_TARGET_ARCH := $(ART_SUPPORTED_ARCH)
+    include art/build/Android.libcxx.mk
     include $(BUILD_SHARED_LIBRARY)
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
@@ -64,6 +61,8 @@
     ifeq ($(HOST_OS),linux)
       LOCAL_LDLIBS += -lrt
     endif
+    LOCAL_IS_HOST_MODULE := true
+    include art/build/Android.libcxx.mk
     include $(BUILD_HOST_SHARED_LIBRARY)
   endif
 endef
diff --git a/build/Android.libcxx.mk b/build/Android.libcxx.mk
new file mode 100644
index 0000000..3dd1eb7
--- /dev/null
+++ b/build/Android.libcxx.mk
@@ -0,0 +1,20 @@
+#
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ifneq ($(LOCAL_IS_HOST_MODULE),true)
+  include external/stlport/libstlport.mk
+  # include external/libcxx/libcxx.mk
+endif
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index cb45a85..9d7579d 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -63,6 +63,9 @@
 		--oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) --base=$$(LIBART_IMG_TARGET_BASE_ADDRESS) \
 		--instruction-set=$$($(1)TARGET_ARCH) --instruction-set-features=$$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$$(PRODUCT_OUT)/system
 
+# This "renaming" eases declaration in art/Android.mk
+TARGET_CORE_IMG_OUT$($(1)ART_PHONY_TEST_TARGET_SUFFIX) := $($(1)TARGET_CORE_IMG_OUT)
+
 $$($(1)TARGET_CORE_OAT_OUT): $$($(1)TARGET_CORE_IMG_OUT)
 endef
 
diff --git a/compiler/Android.mk b/compiler/Android.mk
index b17cd52..6d656e6 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -158,11 +158,10 @@
   art_ndebug_or_debug := $(2)
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-    include external/stlport/libstlport.mk
-  else
+  ifeq ($$(art_target_or_host),host)
     LOCAL_IS_HOST_MODULE := true
   endif
+  include art/build/Android.libcxx.mk
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   ifeq ($$(art_ndebug_or_debug),ndebug)
     LOCAL_MODULE := libart-compiler
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 9a21da0..fdf09a5 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -299,7 +299,7 @@
 
       // for ARM, do a runtime check to make sure that the features we are passed from
       // the build match the features we actually determine at runtime.
-      ASSERT_EQ(instruction_set_features, runtime_features);
+      ASSERT_LE(instruction_set_features, runtime_features);
 #elif defined(__aarch64__)
       instruction_set = kArm64;
       // TODO: arm64 compilation support.
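
The switch from ASSERT_EQ to ASSERT_LE is deliberate: the features detected at
runtime may be a superset of what the build declared, and that is acceptable.
A hedged sketch of the subset-style comparison this relies on (the Features
type below is hypothetical, not ART's InstructionSetFeatures):

    #include <cassert>
    #include <cstdint>

    // Hypothetical feature bitmask: "build <= runtime" reads as "every
    // feature assumed at build time is present at runtime".
    struct Features {
      uint32_t mask;
      bool operator<=(const Features& other) const {
        return (mask & ~other.mask) == 0;  // no bit set that other lacks
      }
    };

    int main() {
      Features build{0b0011};    // features assumed by the build
      Features runtime{0b0111};  // runtime detected one extra feature
      assert(build <= runtime);  // a runtime superset is fine
      return 0;
    }
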
diff --git a/compiler/compiled_method.cc b/compiler/compiled_method.cc
index 8e013c1..59ed827 100644
--- a/compiler/compiled_method.cc
+++ b/compiler/compiled_method.cc
@@ -82,21 +82,7 @@
 }
 
 uint32_t CompiledCode::AlignCode(uint32_t offset, InstructionSet instruction_set) {
-  switch (instruction_set) {
-    case kArm:
-    case kThumb2:
-      return RoundUp(offset, kArmAlignment);
-    case kArm64:
-      return RoundUp(offset, kArm64Alignment);
-    case kMips:
-      return RoundUp(offset, kMipsAlignment);
-    case kX86:  // Fall-through.
-    case kX86_64:
-      return RoundUp(offset, kX86Alignment);
-    default:
-      LOG(FATAL) << "Unknown InstructionSet: " << instruction_set;
-      return 0;
-  }
+  return RoundUp(offset, GetInstructionSetAlignment(instruction_set));
 }
 
 size_t CompiledCode::CodeDelta() const {
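
Collapsing the switch into GetInstructionSetAlignment() leaves AlignCode as a
single power-of-two round-up. A minimal sketch of that bit trick (this RoundUp
is an assumption modeled on the common idiom, not necessarily ART's helper):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    // Round x up to the next multiple of n, where n is a power of two.
    uint32_t RoundUp(uint32_t x, uint32_t n) {
      assert((n & (n - 1)) == 0);     // n must be a power of two
      return (x + n - 1) & ~(n - 1);  // add n-1, then clear the low bits
    }

    int main() {
      std::cout << RoundUp(100, 16) << "\n";  // 112
      std::cout << RoundUp(112, 16) << "\n";  // 112, already aligned
      return 0;
    }
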
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 6c8c85d..6f4fa3a 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -323,10 +323,6 @@
 std::ostream& operator<<(std::ostream& os, const X86ConditionCode& kind);
 
 enum ThrowKind {
-  kThrowNullPointer,
-  kThrowDivZero,
-  kThrowArrayBounds,
-  kThrowConstantArrayBounds,
   kThrowNoSuchMethod,
 };
 
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index e48e5bf..ed2ecac 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -155,14 +155,16 @@
   cu.compiler_driver = &driver;
   cu.class_linker = class_linker;
   cu.instruction_set = driver.GetInstructionSet();
-  cu.target64 = (cu.instruction_set == kX86_64) || (cu.instruction_set == kArm64);
+  if (cu.instruction_set == kArm) {
+    cu.instruction_set = kThumb2;
+  }
+  cu.target64 = Is64BitInstructionSet(cu.instruction_set);
   cu.compiler = compiler;
   // TODO: x86_64 & arm64 are not yet implemented.
-  DCHECK((cu.instruction_set == kThumb2) ||
-         (cu.instruction_set == kX86) ||
-         (cu.instruction_set == kX86_64) ||
-         (cu.instruction_set == kMips));
-
+  CHECK((cu.instruction_set == kThumb2) ||
+        (cu.instruction_set == kX86) ||
+        (cu.instruction_set == kX86_64) ||
+        (cu.instruction_set == kMips));
 
   /* Adjust this value accordingly once inlining is performed */
   cu.num_dalvik_registers = code_item->registers_size_;
@@ -179,6 +181,17 @@
         (cu.enable_debug & (1 << kDebugVerbose));
   }
 
+  if (gVerboseMethods.size() != 0) {
+    cu.verbose = false;
+    for (size_t i = 0; i < gVerboseMethods.size(); ++i) {
+      if (PrettyMethod(method_idx, dex_file).find(gVerboseMethods[i])
+          != std::string::npos) {
+        cu.verbose = true;
+        break;
+      }
+    }
+  }
+
   /*
    * TODO: rework handling of optimization and debug flags.  Should we split out
    * MIR and backend flags?  Need command-line setting as well.
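
The gVerboseMethods block enables per-method verbose output by substring
matching against the pretty-printed method signature. A standalone sketch of
that filter (function and variable names here are illustrative, not ART's):

    #include <iostream>
    #include <string>
    #include <vector>

    // True if the method name contains any configured pattern, mirroring
    // the PrettyMethod().find(...) loop in the hunk above.
    bool IsVerboseMethod(const std::string& method_name,
                         const std::vector<std::string>& patterns) {
      for (const std::string& pattern : patterns) {
        if (method_name.find(pattern) != std::string::npos) {
          return true;
        }
      }
      return false;
    }

    int main() {
      std::vector<std::string> patterns = {"String.indexOf"};
      std::cout << IsVerboseMethod("java.lang.String.indexOf(int)", patterns)
                << "\n";  // 1
      std::cout << IsVerboseMethod("java.lang.Math.min(int, int)", patterns)
                << "\n";  // 0
      return 0;
    }
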
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index 8dbc2bb..c0068b2 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -215,17 +215,13 @@
     case Instruction::CONST_STRING_JUMBO:
     case Instruction::CONST_CLASS:
     case Instruction::NEW_ARRAY:
-      if ((mir->optimization_flags & MIR_INLINED) == 0) {
-        // 1 result, treat as unique each time, use result s_reg - will be unique.
-        res = MarkNonAliasingNonNull(mir);
-      }
+      // 1 result, treat as unique each time, use result s_reg - will be unique.
+      res = MarkNonAliasingNonNull(mir);
       break;
     case Instruction::MOVE_RESULT_WIDE:
-      if ((mir->optimization_flags & MIR_INLINED) == 0) {
-        // 1 wide result, treat as unique each time, use result s_reg - will be unique.
-        res = GetOperandValueWide(mir->ssa_rep->defs[0]);
-        SetOperandValueWide(mir->ssa_rep->defs[0], res);
-      }
+      // 1 wide result, treat as unique each time, use result s_reg - will be unique.
+      res = GetOperandValueWide(mir->ssa_rep->defs[0]);
+      SetOperandValueWide(mir->ssa_rep->defs[0], res);
       break;
 
     case kMirOpPhi:
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index 1784af3..c9acd66 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -454,8 +454,6 @@
   kThumb2Vcmps,      // vcmp [111011101] D [11010] rd[15-12] [1011] E [1] M [0] rm[3-0].
   kThumb2LdrPcRel12,  // ldr rd,[pc,#imm12] [1111100011011111] rt[15-12] imm12[11-0].
   kThumb2BCond,      // b<c> [1110] S cond[25-22] imm6[21-16] [10] J1 [0] J2 imm11[10..0].
-  kThumb2Vmovd_RR,   // vmov [111011101] D [110000] vd[15-12 [101101] M [0] vm[3-0].
-  kThumb2Vmovs_RR,   // vmov [111011101] D [110000] vd[15-12 [101001] M [0] vm[3-0].
   kThumb2Fmrs,       // vmov [111011100000] vn[19-16] rt[15-12] [1010] N [0010000].
   kThumb2Fmsr,       // vmov [111011100001] vn[19-16] rt[15-12] [1010] N [0010000].
   kThumb2Fmrrd,      // vmov [111011000100] rt2[19-16] rt[15-12] [101100] M [1] vm[3-0].
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index 1c35018..f77b0a6 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -848,14 +848,6 @@
                  kFmtUnused, -1, -1,
                  IS_BINARY_OP | IS_BRANCH | USES_CCODES | NEEDS_FIXUP,
                  "b!1c", "!0t", 4, kFixupCondBranch),
-    ENCODING_MAP(kThumb2Vmovd_RR,       0xeeb00b40,
-                 kFmtDfp, 22, 12, kFmtDfp, 5, 0, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
-                 "vmov.f64", "!0S, !1S", 4, kFixupNone),
-    ENCODING_MAP(kThumb2Vmovs_RR,       0xeeb00a40,
-                 kFmtSfp, 22, 12, kFmtSfp, 5, 0, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
-                 "vmov.f32", "!0s, !1s", 4, kFixupNone),
     ENCODING_MAP(kThumb2Fmrs,       0xee100a10,
                  kFmtBitBlt, 15, 12, kFmtSfp, 7, 16, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0_USE1,
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 13fa635..b0bc11d 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -131,7 +131,7 @@
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
-    void GenDivZeroCheck(RegStorage reg);
+    void GenDivZeroCheckWide(RegStorage reg);
     void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method);
     void GenExitSequence();
     void GenSpecialExitSequence();
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 1c563bb..d5b34a5 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -90,8 +90,7 @@
  *     neg   rX
  * done:
  */
-void ArmMir2Lir::GenCmpLong(RegLocation rl_dest, RegLocation rl_src1,
-                            RegLocation rl_src2) {
+void ArmMir2Lir::GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) {
   LIR* target1;
   LIR* target2;
   rl_src1 = LoadValueWide(rl_src1, kCoreReg);
@@ -101,7 +100,7 @@
   OpRegReg(kOpCmp, rl_src1.reg.GetHigh(), rl_src2.reg.GetHigh());
   LIR* branch1 = OpCondBranch(kCondLt, NULL);
   LIR* branch2 = OpCondBranch(kCondGt, NULL);
-  OpRegRegReg(kOpSub, t_reg, rl_src1.reg, rl_src2.reg);
+  OpRegRegReg(kOpSub, t_reg, rl_src1.reg.GetLow(), rl_src2.reg.GetLow());
   LIR* branch3 = OpCondBranch(kCondEq, NULL);
 
   LIR* it = OpIT(kCondHi, "E");
@@ -434,10 +433,6 @@
   if (pattern == DivideNone) {
     return false;
   }
-  // Tuning: add rem patterns
-  if (!is_div) {
-    return false;
-  }
 
   RegStorage r_magic = AllocTemp();
   LoadConstant(r_magic, magic_table[lit].magic);
@@ -445,25 +440,45 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage r_hi = AllocTemp();
   RegStorage r_lo = AllocTemp();
+
+  // rl_dest and rl_src might overlap.
+  // Reuse r_hi to save the div result for the remainder case.
+  RegStorage r_div_result = is_div ? rl_result.reg : r_hi;
+
   NewLIR4(kThumb2Smull, r_lo.GetReg(), r_hi.GetReg(), r_magic.GetReg(), rl_src.reg.GetReg());
   switch (pattern) {
     case Divide3:
-      OpRegRegRegShift(kOpSub, rl_result.reg, r_hi, rl_src.reg, EncodeShift(kArmAsr, 31));
+      OpRegRegRegShift(kOpSub, r_div_result, r_hi, rl_src.reg, EncodeShift(kArmAsr, 31));
       break;
     case Divide5:
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+      OpRegRegRegShift(kOpRsub, r_div_result, r_lo, r_hi,
                        EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     case Divide7:
       OpRegReg(kOpAdd, r_hi, rl_src.reg);
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi,
+      OpRegRegRegShift(kOpRsub, r_div_result, r_lo, r_hi,
                        EncodeShift(kArmAsr, magic_table[lit].shift));
       break;
     default:
       LOG(FATAL) << "Unexpected pattern: " << pattern;
   }
+
+  if (!is_div) {
+    // div_result = src / lit
+    // tmp1 = div_result * lit
+    // dest = src - tmp1
+    RegStorage tmp1 = r_lo;
+    EasyMultiplyOp ops[2];
+
+    bool canEasyMultiply = GetEasyMultiplyTwoOps(lit, ops);
+    DCHECK(canEasyMultiply);
+
+    GenEasyMultiplyTwoOps(tmp1, r_div_result, ops);
+    OpRegRegReg(kOpSub, rl_result.reg, rl_src.reg, tmp1);
+  }
+
   StoreValue(rl_dest, rl_result);
   return true;
 }
@@ -489,6 +504,7 @@
   }
 
   op->op = kOpInvalid;
+  op->shift = 0;
   return false;
 }
 
@@ -497,6 +513,7 @@
   GetEasyMultiplyOp(lit, &ops[0]);
   if (GetEasyMultiplyOp(lit, &ops[0])) {
     ops[1].op = kOpInvalid;
+    ops[1].shift = 0;
     return true;
   }
 
@@ -527,31 +544,52 @@
   return false;
 }
 
+// Generate instructions to do multiply.
+// An additional temporary register is required
+// if two instructions are needed and src/dest overlap.
 void ArmMir2Lir::GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops) {
-  // dest = ( src << shift1) + [ src | -src | 0 ]
-  // dest = (dest << shift2) + [ src | -src | 0 ]
-  for (int i = 0; i < 2; i++) {
-    RegStorage r_src2;
-    if (i == 0) {
-      r_src2 = r_src;
-    } else {
-      r_src2 = r_dest;
-    }
-    switch (ops[i].op) {
+  // tmp1 = ( src << shift1) + [ src | -src | 0 ]
+  // dest = (tmp1 << shift2) + [ src | -src | 0 ]
+
+  RegStorage r_tmp1;
+  if (ops[1].op == kOpInvalid) {
+    r_tmp1 = r_dest;
+  } else if (r_dest.GetReg() != r_src.GetReg()) {
+    r_tmp1 = r_dest;
+  } else {
+    r_tmp1 = AllocTemp();
+  }
+
+  switch (ops[0].op) {
     case kOpLsl:
-      OpRegRegImm(kOpLsl, r_dest, r_src2, ops[i].shift);
+      OpRegRegImm(kOpLsl, r_tmp1, r_src, ops[0].shift);
       break;
     case kOpAdd:
-      OpRegRegRegShift(kOpAdd, r_dest, r_src, r_src2, EncodeShift(kArmLsl, ops[i].shift));
+      OpRegRegRegShift(kOpAdd, r_tmp1, r_src, r_src, EncodeShift(kArmLsl, ops[0].shift));
       break;
     case kOpRsub:
-      OpRegRegRegShift(kOpRsub, r_dest, r_src, r_src2, EncodeShift(kArmLsl, ops[i].shift));
+      OpRegRegRegShift(kOpRsub, r_tmp1, r_src, r_src, EncodeShift(kArmLsl, ops[0].shift));
       break;
     default:
-      DCHECK_NE(i, 0);
-      DCHECK_EQ(ops[i].op, kOpInvalid);
+      DCHECK_EQ(ops[0].op, kOpInvalid);
       break;
-    }
+  }
+
+  switch (ops[1].op) {
+    case kOpInvalid:
+      return;
+    case kOpLsl:
+      OpRegRegImm(kOpLsl, r_dest, r_tmp1, ops[1].shift);
+      break;
+    case kOpAdd:
+      OpRegRegRegShift(kOpAdd, r_dest, r_src, r_tmp1, EncodeShift(kArmLsl, ops[1].shift));
+      break;
+    case kOpRsub:
+      OpRegRegRegShift(kOpRsub, r_dest, r_src, r_tmp1, EncodeShift(kArmLsl, ops[1].shift));
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode passed to GenEasyMultiplyTwoOps";
+      break;
   }
 }
 
@@ -873,12 +911,12 @@
   }
 }
 
-void ArmMir2Lir::GenDivZeroCheck(RegStorage reg) {
+void ArmMir2Lir::GenDivZeroCheckWide(RegStorage reg) {
   DCHECK(reg.IsPair());   // TODO: support k64BitSolo.
   RegStorage t_reg = AllocTemp();
   NewLIR4(kThumb2OrrRRRs, t_reg.GetReg(), reg.GetLowReg(), reg.GetHighReg(), 0);
   FreeTemp(t_reg);
-  GenCheck(kCondEq, kThrowDivZero);
+  GenDivZeroCheck(kCondEq);
 }
 
 // Test suspend flag, return target of taken suspend branch
@@ -1129,9 +1167,9 @@
 
     if (needs_range_check) {
       if (constant_index) {
-        GenImmedCheck(kCondLs, reg_len, mir_graph_->ConstantValue(rl_index), kThrowConstantArrayBounds);
+        GenArrayBoundsCheck(mir_graph_->ConstantValue(rl_index), reg_len);
       } else {
-        GenRegRegCheck(kCondLs, reg_len, rl_index.reg, kThrowArrayBounds);
+        GenArrayBoundsCheck(rl_index.reg, reg_len);
       }
       FreeTemp(reg_len);
     }
@@ -1158,7 +1196,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
     LoadBaseIndexed(reg_ptr, rl_index.reg, rl_result.reg, scale, size);
@@ -1233,9 +1271,9 @@
     }
     if (needs_range_check) {
       if (constant_index) {
-        GenImmedCheck(kCondLs, reg_len, mir_graph_->ConstantValue(rl_index), kThrowConstantArrayBounds);
+        GenArrayBoundsCheck(mir_graph_->ConstantValue(rl_index), reg_len);
       } else {
-        GenRegRegCheck(kCondLs, reg_len, rl_index.reg, kThrowArrayBounds);
+        GenArrayBoundsCheck(rl_index.reg, reg_len);
       }
       FreeTemp(reg_len);
     }
@@ -1251,7 +1289,7 @@
     OpRegRegImm(kOpAdd, reg_ptr, rl_array.reg, data_offset);
     rl_src = LoadValue(rl_src, reg_class);
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
     StoreBaseIndexed(reg_ptr, rl_index.reg, rl_src.reg, scale, size);
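
The new remainder path reuses the magic-number quotient instead of bailing
out: rem = src - (src / lit) * lit, with the multiply expanded by
GenEasyMultiplyTwoOps into shifts and adds. A plain C++ sketch of the identity
the generated LIR encodes (no magic numbers here, just the arithmetic):

    #include <cassert>

    // Remainder by a constant, derived from an already-computed quotient.
    int RemByLit(int src, int lit) {
      int div_result = src / lit;   // quotient (magic-number divide in LIR)
      int tmp1 = div_result * lit;  // shift/add multiply in the real code
      return src - tmp1;            // remainder
    }

    int main() {
      assert(RemByLit(17, 5) == 17 % 5);
      assert(RemByLit(-17, 5) == -17 % 5);  // truncated division, like C/Java
      return 0;
    }
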
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 5e9a8b0..1053a8f 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -562,7 +562,8 @@
 
   // Keep special registers from being allocated
   // Don't reserve the r4 if we are doing implicit suspend checks.
-  bool no_suspend = NO_SUSPEND || !Runtime::Current()->ExplicitSuspendChecks();
+  // TODO: re-enable this when we can safely save r4 over the suspension code path.
+  bool no_suspend = NO_SUSPEND;  // || !Runtime::Current()->ExplicitSuspendChecks();
   for (int i = 0; i < num_reserved; i++) {
     if (no_suspend && (ReservedRegs[i] == rARM_SUSPEND)) {
       // Don't reserve the suspend register.
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index fa6de96..8806e68 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -47,6 +47,22 @@
   return insn;
 }
 
+uint32_t GetInvokeReg(MIR* invoke, uint32_t arg) {
+  DCHECK_LT(arg, invoke->dalvikInsn.vA);
+  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
+    return invoke->dalvikInsn.vC + arg;  // Range invoke.
+  } else {
+    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
+    return invoke->dalvikInsn.arg[arg];  // Non-range invoke.
+  }
+}
+
+bool WideArgIsInConsecutiveDalvikRegs(MIR* invoke, uint32_t arg) {
+  DCHECK_LT(arg + 1, invoke->dalvikInsn.vA);
+  return Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc ||
+      invoke->dalvikInsn.arg[arg + 1u] == invoke->dalvikInsn.arg[arg] + 1u;
+}
+
 }  // anonymous namespace
 
 const uint32_t DexFileMethodInliner::kIndexUnresolved;
@@ -396,7 +412,8 @@
       result = GenInlineIGet(mir_graph, bb, invoke, move_result, method, method_idx);
       break;
     case kInlineOpIPut:
-      result = GenInlineIPut(mir_graph, bb, invoke, method, method_idx);
+      move_result = mir_graph->FindMoveResult(bb, invoke);
+      result = GenInlineIPut(mir_graph, bb, invoke, move_result, method, method_idx);
       break;
     default:
       LOG(FATAL) << "Unexpected inline op: " << method.opcode;
@@ -578,25 +595,24 @@
   // Select opcode and argument.
   const InlineReturnArgData& data = method.d.return_data;
   Instruction::Code opcode = Instruction::MOVE_FROM16;
+  uint32_t arg = GetInvokeReg(invoke, data.arg);
   if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT) {
     DCHECK_EQ(data.is_object, 1u);
+    DCHECK_EQ(data.is_wide, 0u);
     opcode = Instruction::MOVE_OBJECT_FROM16;
   } else if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE) {
     DCHECK_EQ(data.is_wide, 1u);
+    DCHECK_EQ(data.is_object, 0u);
     opcode = Instruction::MOVE_WIDE_FROM16;
+    if (!WideArgIsInConsecutiveDalvikRegs(invoke, data.arg)) {
+      // The two halves of the source value are not in consecutive dalvik registers in INVOKE.
+      return false;
+    }
   } else {
     DCHECK(move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT);
     DCHECK_EQ(data.is_wide, 0u);
     DCHECK_EQ(data.is_object, 0u);
   }
-  DCHECK_LT(data.is_wide ? data.arg + 1u : data.arg, invoke->dalvikInsn.vA);
-  int arg;
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k35c) {
-    arg = invoke->dalvikInsn.arg[data.arg];  // Non-range invoke.
-  } else {
-    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k3rc);
-    arg = invoke->dalvikInsn.vC + data.arg;  // Range invoke.
-  }
 
   // Insert the move instruction
   MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
@@ -616,33 +632,39 @@
   }
 
   const InlineIGetIPutData& data = method.d.ifield_data;
-  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
-      data.object_arg != 0) {
-    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
-    return false;
-  }
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IGET + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IGetVariant(opcode), data.op_variant);
+  uint32_t object_reg = GetInvokeReg(invoke, data.object_arg);
 
   if (move_result == nullptr) {
     // Result is unused. If volatile, we still need to emit the IGET but we have no destination.
     return !data.is_volatile;
   }
 
-  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IGET + data.op_variant);
-  DCHECK_EQ(InlineMethodAnalyser::IGetVariant(opcode), data.op_variant);
+  DCHECK_EQ(data.method_is_static != 0u,
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
+  if (!object_is_this) {
+    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!InlineMethodAnalyser::IsSyntheticAccessor(
+        mir_graph->GetMethodLoweringInfo(invoke).GetTargetMethod())) {
+      return false;
+    }
+  }
+
+  if (object_is_this) {
+    // Mark invoke as NOP, null-check is done on IGET. No aborts after this.
+    invoke->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+  }
 
   MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
   insn->width += insn->offset - invoke->offset;
   insn->offset = invoke->offset;
   insn->dalvikInsn.opcode = opcode;
   insn->dalvikInsn.vA = move_result->dalvikInsn.vA;
-  DCHECK_LT(data.object_arg, invoke->dalvikInsn.vA);
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
-    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
-  } else {
-    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
-    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
-  }
+  insn->dalvikInsn.vB = object_reg;
   mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
 
   DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
@@ -655,32 +677,55 @@
 }
 
 bool DexFileMethodInliner::GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
-                                         const InlineMethod& method, uint32_t method_idx) {
+                                         MIR* move_result, const InlineMethod& method,
+                                         uint32_t method_idx) {
   CompilationUnit* cu = mir_graph->GetCurrentDexCompilationUnit()->GetCompilationUnit();
   if (cu->enable_debug & (1 << kDebugSlowFieldPath)) {
     return false;
   }
 
   const InlineIGetIPutData& data = method.d.ifield_data;
-  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
-      data.object_arg != 0) {
-    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IPUT + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IPutVariant(opcode), data.op_variant);
+  uint32_t object_reg = GetInvokeReg(invoke, data.object_arg);
+  uint32_t src_reg = GetInvokeReg(invoke, data.src_arg);
+  uint32_t return_reg =
+      data.return_arg_plus1 != 0u ? GetInvokeReg(invoke, data.return_arg_plus1 - 1u) : 0u;
+
+  if (opcode == Instruction::IPUT_WIDE && !WideArgIsInConsecutiveDalvikRegs(invoke, data.src_arg)) {
+    // The two halves of the source value are not in consecutive dalvik registers in INVOKE.
     return false;
   }
 
-  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IPUT + data.op_variant);
-  DCHECK_EQ(InlineMethodAnalyser::IPutVariant(opcode), data.op_variant);
-
-  MIR* insn = AllocReplacementMIR(mir_graph, invoke, nullptr);
-  insn->dalvikInsn.opcode = opcode;
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
-    insn->dalvikInsn.vA = invoke->dalvikInsn.vC + data.src_arg;
-    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
-  } else {
-    insn->dalvikInsn.vA = invoke->dalvikInsn.arg[data.src_arg];
-    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
+  DCHECK(move_result == nullptr || data.return_arg_plus1 != 0u);
+  if (move_result != nullptr && move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE &&
+      !WideArgIsInConsecutiveDalvikRegs(invoke, data.return_arg_plus1 - 1u)) {
+    // The two halves of the return value are not in consecutive dalvik registers in INVOKE.
+    return false;
   }
+
+  DCHECK_EQ(data.method_is_static != 0u,
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
+  if (!object_is_this) {
+    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!InlineMethodAnalyser::IsSyntheticAccessor(
+        mir_graph->GetMethodLoweringInfo(invoke).GetTargetMethod())) {
+      return false;
+    }
+  }
+
+  if (object_is_this) {
+    // Mark invoke as NOP, null-check is done on IPUT. No aborts after this.
+    invoke->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+  }
+
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
+  insn->dalvikInsn.opcode = opcode;
+  insn->dalvikInsn.vA = src_reg;
+  insn->dalvikInsn.vB = object_reg;
   mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
 
   DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
@@ -689,6 +734,24 @@
   DCHECK_EQ(data.is_volatile, mir_graph->GetIFieldLoweringInfo(insn).IsVolatile() ? 1u : 0u);
 
   bb->InsertMIRAfter(invoke, insn);
+
+  if (move_result != nullptr) {
+    MIR* move = AllocReplacementMIR(mir_graph, invoke, move_result);
+    insn->width = invoke->width;
+    move->offset = move_result->offset;
+    move->width = move_result->width;
+    if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT) {
+      move->dalvikInsn.opcode = Instruction::MOVE_FROM16;
+    } else if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT) {
+      move->dalvikInsn.opcode = Instruction::MOVE_OBJECT_FROM16;
+    } else {
+      DCHECK_EQ(move_result->dalvikInsn.opcode, Instruction::MOVE_RESULT_WIDE);
+      move->dalvikInsn.opcode = Instruction::MOVE_WIDE_FROM16;
+    }
+    move->dalvikInsn.vA = move_result->dalvikInsn.vA;
+    move->dalvikInsn.vB = return_reg;
+    bb->InsertMIRAfter(insn, move);
+  }
   return true;
 }
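
The two helpers introduced at the top of this file capture the invoke formats:
range invokes (format 3rc) pass a contiguous register block starting at vC,
while non-range invokes (format 35c) list registers explicitly, so the two
halves of a wide argument may not be adjacent. A simplified standalone model
(the Invoke struct is hypothetical, standing in for ART's MIR):

    #include <cassert>
    #include <cstdint>

    struct Invoke {
      bool is_range;     // format 3rc (range) vs 35c (explicit list)
      uint32_t vC;       // first register of a range invoke
      uint32_t args[5];  // explicit registers of a non-range invoke
    };

    uint32_t GetInvokeReg(const Invoke& invoke, uint32_t arg) {
      return invoke.is_range ? invoke.vC + arg : invoke.args[arg];
    }

    // A wide (64-bit) argument is only inlinable if its halves are adjacent.
    bool WideArgIsInConsecutiveRegs(const Invoke& invoke, uint32_t arg) {
      return invoke.is_range || invoke.args[arg + 1] == invoke.args[arg] + 1;
    }

    int main() {
      Invoke range{true, 10, {}};
      assert(GetInvokeReg(range, 2) == 12);          // v12
      Invoke list{false, 0, {4, 7, 8, 0, 0}};
      assert(!WideArgIsInConsecutiveRegs(list, 0));  // v4/v7: reject
      assert(WideArgIsInConsecutiveRegs(list, 1));   // v7/v8: ok
      return 0;
    }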
 
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index b4e190a..c03f89c 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -302,7 +302,7 @@
     static bool GenInlineIGet(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
                               MIR* move_result, const InlineMethod& method, uint32_t method_idx);
     static bool GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
-                              const InlineMethod& method, uint32_t method_idx);
+                              MIR* move_result, const InlineMethod& method, uint32_t method_idx);
 
     ReaderWriterMutex lock_;
     /*
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index bfa22da..6781a9b 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -42,17 +42,6 @@
   barrier->u.m.def_mask = ENCODE_ALL;
 }
 
-// TODO: need to do some work to split out targets with
-// condition codes and those without
-LIR* Mir2Lir::GenCheck(ConditionCode c_code, ThrowKind kind) {
-  DCHECK_NE(cu_->instruction_set, kMips);
-  LIR* tgt = RawLIR(0, kPseudoThrowTarget, kind, current_dalvik_offset_);
-  LIR* branch = OpCondBranch(c_code, tgt);
-  // Remember branch target - will process later
-  throw_launchpads_.Insert(tgt);
-  return branch;
-}
-
 LIR* Mir2Lir::GenImmedCheck(ConditionCode c_code, RegStorage reg, int imm_val, ThrowKind kind) {
   LIR* tgt;
   LIR* branch;
@@ -69,6 +58,111 @@
   return branch;
 }
 
+void Mir2Lir::GenDivZeroException() {
+  LIR* branch = OpUnconditionalBranch(nullptr);
+  AddDivZeroCheckSlowPath(branch);
+}
+
+void Mir2Lir::GenDivZeroCheck(ConditionCode c_code) {
+  LIR* branch = OpCondBranch(c_code, nullptr);
+  AddDivZeroCheckSlowPath(branch);
+}
+
+void Mir2Lir::GenDivZeroCheck(RegStorage reg) {
+  LIR* branch = OpCmpImmBranch(kCondEq, reg, 0, nullptr);
+  AddDivZeroCheckSlowPath(branch);
+}
+
+void Mir2Lir::AddDivZeroCheckSlowPath(LIR* branch) {
+  class DivZeroCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    DivZeroCheckSlowPath(Mir2Lir* m2l, LIR* branch)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero), true);
+    }
+  };
+
+  AddSlowPath(new (arena_) DivZeroCheckSlowPath(this, branch));
+}
+
+void Mir2Lir::GenArrayBoundsCheck(RegStorage index, RegStorage length) {
+  class ArrayBoundsCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    ArrayBoundsCheckSlowPath(Mir2Lir* m2l, LIR* branch, RegStorage index, RegStorage length)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch),
+          index_(index), length_(length) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                    index_, length_, true);
+    }
+
+   private:
+    const RegStorage index_;
+    const RegStorage length_;
+  };
+
+  LIR* branch = OpCmpBranch(kCondUge, index, length, nullptr);
+  AddSlowPath(new (arena_) ArrayBoundsCheckSlowPath(this, branch, index, length));
+}
+
+void Mir2Lir::GenArrayBoundsCheck(int index, RegStorage length) {
+  class ArrayBoundsCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    ArrayBoundsCheckSlowPath(Mir2Lir* m2l, LIR* branch, int index, RegStorage length)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch),
+          index_(index), length_(length) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+
+      m2l_->OpRegCopy(m2l_->TargetReg(kArg1), length_);
+      m2l_->LoadConstant(m2l_->TargetReg(kArg0), index_);
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                    m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true);
+    }
+
+   private:
+    const int32_t index_;
+    const RegStorage length_;
+  };
+
+  LIR* branch = OpCmpImmBranch(kCondLs, length, index, nullptr);
+  AddSlowPath(new (arena_) ArrayBoundsCheckSlowPath(this, branch, index, length));
+}
+
+LIR* Mir2Lir::GenNullCheck(RegStorage reg) {
+  class NullCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    NullCheckSlowPath(Mir2Lir* m2l, LIR* branch)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+      m2l_->CallRuntimeHelper(QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer), true);
+    }
+  };
+
+  LIR* branch = OpCmpImmBranch(kCondEq, reg, 0, nullptr);
+  AddSlowPath(new (arena_) NullCheckSlowPath(this, branch));
+  return branch;
+}
 
 /* Perform null-check on a register.  */
 LIR* Mir2Lir::GenNullCheck(RegStorage m_reg, int opt_flags) {
@@ -83,7 +177,7 @@
   if (!(cu_->disable_opt & (1 << kNullCheckElimination)) && (opt_flags & MIR_IGNORE_NULL_CHECK)) {
     return NULL;
   }
-  return GenImmedCheck(kCondEq, m_reg, 0, kThrowNullPointer);
+  return GenNullCheck(m_reg);
 }
 
 void Mir2Lir::MarkPossibleNullPointerException(int opt_flags) {
@@ -479,7 +573,12 @@
     }
     // rBase now holds static storage base
     if (is_long_or_double) {
-      rl_src = LoadValueWide(rl_src, kAnyReg);
+      RegisterClass register_kind = kAnyReg;
+      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
+        // Force long/double volatile stores into SSE registers to avoid tearing.
+        register_kind = kFPReg;
+      }
+      rl_src = LoadValueWide(rl_src, register_kind);
     } else {
       rl_src = LoadValue(rl_src, kAnyReg);
     }
@@ -560,7 +659,12 @@
       FreeTemp(r_method);
     }
     // r_base now holds static storage base
-    RegLocation rl_result = EvalLoc(rl_dest, kAnyReg, true);
+    RegisterClass result_reg_kind = kAnyReg;
+    if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
+      // Force long/double volatile loads into SSE registers to avoid tearing.
+      result_reg_kind = kFPReg;
+    }
+    RegLocation rl_result = EvalLoc(rl_dest, result_reg_kind, true);
 
     if (is_long_or_double) {
       LoadBaseDispWide(r_base, field_info.FieldOffset().Int32Value(), rl_result.reg, INVALID_SREG);
@@ -634,64 +738,7 @@
     AppendLIR(lab);
     ThreadOffset<4> func_offset(-1);
     int v1 = lab->operands[2];
-    int v2 = lab->operands[3];
-    const bool target_x86 = cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64;
     switch (lab->operands[0]) {
-      case kThrowNullPointer:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowNullPointer);
-        break;
-      case kThrowConstantArrayBounds:  // v1 is length reg (for Arm/Mips), v2 constant index
-        // v1 holds the constant array index.  Mips/Arm uses v2 for length, x86 reloads.
-        if (target_x86) {
-          OpRegMem(kOpMov, TargetReg(kArg1), RegStorage::Solo32(v1),
-                   mirror::Array::LengthOffset().Int32Value());
-        } else {
-          OpRegCopy(TargetReg(kArg1), RegStorage::Solo32(v1));
-        }
-        // Make sure the following LoadConstant doesn't mess with kArg1.
-        LockTemp(TargetReg(kArg1));
-        LoadConstant(TargetReg(kArg0), v2);
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
-        break;
-      case kThrowArrayBounds:
-        // Move v1 (array index) to kArg0 and v2 (array length) to kArg1
-        if (v2 != TargetReg(kArg0).GetReg()) {
-          OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
-          if (target_x86) {
-            // x86 leaves the array pointer in v2, so load the array length that the handler expects
-            OpRegMem(kOpMov, TargetReg(kArg1), RegStorage::Solo32(v2),
-                     mirror::Array::LengthOffset().Int32Value());
-          } else {
-            OpRegCopy(TargetReg(kArg1), RegStorage::Solo32(v2));
-          }
-        } else {
-          if (v1 == TargetReg(kArg1).GetReg()) {
-            // Swap v1 and v2, using kArg2 as a temp
-            OpRegCopy(TargetReg(kArg2), RegStorage::Solo32(v1));
-            if (target_x86) {
-              // x86 leaves the array pointer in v2; load the array length that the handler expects
-              OpRegMem(kOpMov, TargetReg(kArg1), RegStorage::Solo32(v2),
-                       mirror::Array::LengthOffset().Int32Value());
-            } else {
-              OpRegCopy(TargetReg(kArg1), RegStorage::Solo32(v2));
-            }
-            OpRegCopy(TargetReg(kArg0), TargetReg(kArg2));
-          } else {
-            if (target_x86) {
-              // x86 leaves the array pointer in v2; load the array length that the handler expects
-              OpRegMem(kOpMov, TargetReg(kArg1), RegStorage::Solo32(v2),
-                       mirror::Array::LengthOffset().Int32Value());
-            } else {
-              OpRegCopy(TargetReg(kArg1), RegStorage::Solo32(v2));
-            }
-            OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
-          }
-        }
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds);
-        break;
-      case kThrowDivZero:
-        func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowDivZero);
-        break;
       case kThrowNoSuchMethod:
         OpRegCopy(TargetReg(kArg0), RegStorage::Solo32(v1));
         func_offset =
@@ -720,9 +767,12 @@
       DCHECK(rl_dest.wide);
       GenNullCheck(rl_obj.reg, opt_flags);
       if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        rl_result = EvalLoc(rl_dest, reg_class, true);
-        // FIXME?  duplicate null check?
-        GenNullCheck(rl_obj.reg, opt_flags);
+        RegisterClass result_reg_kind = kAnyReg;
+        if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
+          // Force long/double volatile loads into SSE registers to avoid tearing.
+          result_reg_kind = kFPReg;
+        }
+        rl_result = EvalLoc(rl_dest, result_reg_kind, true);
         LoadBaseDispWide(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_result.reg,
                          rl_obj.s_reg_low);
         MarkPossibleNullPointerException(opt_flags);
@@ -787,7 +837,12 @@
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     rl_obj = LoadValue(rl_obj, kCoreReg);
     if (is_long_or_double) {
-      rl_src = LoadValueWide(rl_src, kAnyReg);
+      RegisterClass src_reg_kind = kAnyReg;
+      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
+        // Force long/double volatile stores into SSE registers to avoid tearing.
+        src_reg_kind = kFPReg;
+      }
+      rl_src = LoadValueWide(rl_src, src_reg_kind);
       GenNullCheck(rl_obj.reg, opt_flags);
       RegStorage reg_ptr = AllocTemp();
       OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, field_info.FieldOffset().Int32Value());
@@ -1533,7 +1588,7 @@
       rl_src1 = LoadValue(rl_src1, kCoreReg);
       rl_src2 = LoadValue(rl_src2, kCoreReg);
       if (check_zero) {
-          GenImmedCheck(kCondEq, rl_src2.reg, 0, kThrowDivZero);
+        GenDivZeroCheck(rl_src2.reg);
       }
       rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, op == kOpDiv);
       done = true;
@@ -1544,7 +1599,7 @@
         rl_src1 = LoadValue(rl_src1, kCoreReg);
         rl_src2 = LoadValue(rl_src2, kCoreReg);
         if (check_zero) {
-            GenImmedCheck(kCondEq, rl_src2.reg, 0, kThrowDivZero);
+          GenDivZeroCheck(rl_src2.reg);
         }
         rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, op == kOpDiv);
         done = true;
@@ -1559,7 +1614,7 @@
       RegStorage r_tgt = CallHelperSetup(func_offset);
       LoadValueDirectFixed(rl_src1, TargetReg(kArg0));
       if (check_zero) {
-        GenImmedCheck(kCondEq, TargetReg(kArg1), 0, kThrowDivZero);
+        GenDivZeroCheck(TargetReg(kArg1));
       }
       // NOTE: callout here is not a safepoint.
       CallHelper(r_tgt, func_offset, false /* not a safepoint */);
@@ -1654,9 +1709,8 @@
     StoreValue(rl_dest, rl_result);
     return true;
   }
-  // There is RegRegRegShift on Arm, so check for more special cases.
-  // TODO: disabled, need to handle case of "dest == src" properly.
-  if (false && cu_->instruction_set == kThumb2) {
+  // There is RegRegRegShift on Arm, so check for more special cases.
+  if (cu_->instruction_set == kThumb2) {
     return EasyMultiply(rl_src, rl_dest, lit);
   }
   // Can we simplify this multiplication?
@@ -1785,7 +1839,7 @@
     case Instruction::REM_INT_LIT8:
     case Instruction::REM_INT_LIT16: {
       if (lit == 0) {
-        GenImmedCheck(kCondAl, RegStorage::InvalidReg(), 0, kThrowDivZero);
+        GenDivZeroException();
         return;
       }
       if ((opcode == Instruction::DIV_INT) ||
@@ -1959,7 +2013,7 @@
       RegStorage r_tmp2 = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3));
       LoadValueDirectWideFixed(rl_src2, r_tmp2);
       RegStorage r_tgt = CallHelperSetup(func_offset);
-      GenDivZeroCheck(RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)));
+      GenDivZeroCheckWide(RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)));
       LoadValueDirectWideFixed(rl_src1, r_tmp1);
       // NOTE: callout here is not a safepoint
       CallHelper(r_tgt, func_offset, false /* not safepoint */);
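
The launchpad-table entries for null pointer, div-zero, and array bounds are
replaced by per-site LIRSlowPath objects: the fast path emits one conditional
branch to a label whose body is compiled later, out of line. A runtime
analogue of that deferral structure (hypothetical types; the real slow paths
emit LIR rather than run closures):

    #include <functional>
    #include <iostream>
    #include <vector>

    struct CodeGen {
      std::vector<std::function<void()>> slow_paths;

      // Fast path: fall through when the divisor is nonzero; otherwise
      // defer the throwing code, as AddSlowPath() does with a branch.
      void DivZeroCheck(int divisor) {
        if (divisor != 0) return;
        slow_paths.push_back([] {
          std::cout << "slow path: throw divide-by-zero\n";
        });
      }

      // Emitted after the method body, keeping the hot path compact.
      void FlushSlowPaths() {
        for (auto& sp : slow_paths) sp();
        slow_paths.clear();
      }
    };

    int main() {
      CodeGen cg;
      cg.DivZeroCheck(4);  // nothing deferred
      cg.DivZeroCheck(0);  // registers the throw path
      cg.FlushSlowPaths();
      return 0;
    }
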
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 396a709..758096b 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -87,6 +87,12 @@
   return call_inst;
 }
 
+void Mir2Lir::CallRuntimeHelper(ThreadOffset<4> helper_offset, bool safepoint_pc) {
+  RegStorage r_tgt = CallHelperSetup(helper_offset);
+  ClobberCallerSave();
+  CallHelper(r_tgt, helper_offset, safepoint_pc);
+}
+
 void Mir2Lir::CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
   LoadConstant(TargetReg(kArg0), arg0);
@@ -249,12 +255,27 @@
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
 
+void Mir2Lir::CopyToArgumentRegs(RegStorage arg0, RegStorage arg1) {
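+  // The copies can conflict: writing arg0 into kArg0 must not destroy arg1 if arg1
+  // currently lives in kArg0 (and vice versa), so order the moves or go through kArg2.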
+  if (arg1.GetReg() == TargetReg(kArg0).GetReg()) {
+    if (arg0.GetReg() == TargetReg(kArg1).GetReg()) {
+      // Swap kArg0 and kArg1 with kArg2 as temp.
+      OpRegCopy(TargetReg(kArg2), arg1);
+      OpRegCopy(TargetReg(kArg0), arg0);
+      OpRegCopy(TargetReg(kArg1), TargetReg(kArg2));
+    } else {
+      OpRegCopy(TargetReg(kArg1), arg1);
+      OpRegCopy(TargetReg(kArg0), arg0);
+    }
+  } else {
+    OpRegCopy(TargetReg(kArg0), arg0);
+    OpRegCopy(TargetReg(kArg1), arg1);
+  }
+}
+
 void Mir2Lir::CallRuntimeHelperRegReg(ThreadOffset<4> helper_offset, RegStorage arg0,
                                       RegStorage arg1, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
-  DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
-  OpRegCopy(TargetReg(kArg0), arg0);
-  OpRegCopy(TargetReg(kArg1), arg1);
+  CopyToArgumentRegs(arg0, arg1);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
 }
@@ -262,9 +283,7 @@
 void Mir2Lir::CallRuntimeHelperRegRegImm(ThreadOffset<4> helper_offset, RegStorage arg0,
                                          RegStorage arg1, int arg2, bool safepoint_pc) {
   RegStorage r_tgt = CallHelperSetup(helper_offset);
-  DCHECK_NE(TargetReg(kArg0).GetReg(), arg1.GetReg());  // check copy into arg0 won't clobber arg1
-  OpRegCopy(TargetReg(kArg0), arg0);
-  OpRegCopy(TargetReg(kArg1), arg1);
+  CopyToArgumentRegs(arg0, arg1);
   LoadConstant(TargetReg(kArg2), arg2);
   ClobberCallerSave();
   CallHelper(r_tgt, helper_offset, safepoint_pc);
@@ -1490,7 +1509,7 @@
         ((cu_->disable_opt & (1 << kNullCheckElimination)) != 0 ||
          (info->opt_flags & MIR_IGNORE_NULL_CHECK) == 0))  {
       RegLocation rl_obj = LoadValue(info->args[0], kCoreReg);
-      GenImmedCheck(kCondEq, rl_obj.reg, 0, kThrowNullPointer);
+      GenNullCheck(rl_obj.reg);
     }
     return;
   }
diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc
index 897d86d..208eadd 100644
--- a/compiler/dex/quick/gen_loadstore.cc
+++ b/compiler/dex/quick/gen_loadstore.cc
@@ -211,7 +211,12 @@
     LoadValueDirectWide(rl_src, rl_src.reg);
     rl_src.location = kLocPhysReg;
     MarkLive(rl_src.reg.GetLow(), rl_src.s_reg_low);
-    MarkLive(rl_src.reg.GetHigh(), GetSRegHi(rl_src.s_reg_low));
+    if (rl_src.reg.GetLowReg() != rl_src.reg.GetHighReg()) {
+      MarkLive(rl_src.reg.GetHigh(), GetSRegHi(rl_src.s_reg_low));
+    } else {
+      // This must be an x86 vector register value.
+      DCHECK(IsFpReg(rl_src.reg) && (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64));
+    }
   }
   return rl_src;
 }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 5089111..40641d6 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -131,7 +131,7 @@
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
-    void GenDivZeroCheck(RegStorage reg);
+    void GenDivZeroCheckWide(RegStorage reg);
     void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method);
     void GenExitSequence();
     void GenSpecialExitSequence();
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index 5fe96d2..2375720 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -342,11 +342,11 @@
   }
 }
 
-void MipsMir2Lir::GenDivZeroCheck(RegStorage reg) {
+void MipsMir2Lir::GenDivZeroCheckWide(RegStorage reg) {
   DCHECK(reg.IsPair());   // TODO: support k64BitSolo.
   RegStorage t_reg = AllocTemp();
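+  // The OR of the two halves is zero only if the full 64-bit divisor is zero.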
   OpRegRegReg(kOpOr, t_reg, reg.GetLow(), reg.GetHigh());
-  GenImmedCheck(kCondEq, t_reg, 0, kThrowDivZero);
+  GenDivZeroCheck(t_reg);
   FreeTemp(t_reg);
 }
 
@@ -513,7 +513,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
     LoadBaseDispWide(reg_ptr, 0, rl_result.reg, INVALID_SREG);
@@ -524,7 +524,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
     LoadBaseIndexed(reg_ptr, rl_index.reg, rl_result.reg, scale, size);
@@ -590,7 +590,7 @@
     rl_src = LoadValueWide(rl_src, reg_class);
 
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
 
@@ -598,7 +598,7 @@
   } else {
     rl_src = LoadValue(rl_src, reg_class);
     if (needs_range_check) {
-      GenRegRegCheck(kCondUge, rl_index.reg, reg_len, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, reg_len);
       FreeTemp(reg_len);
     }
     StoreBaseIndexed(reg_ptr, rl_index.reg, rl_src.reg, scale, size);
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 73fdc82..6fcdf70 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -120,7 +120,7 @@
 bool Mir2Lir::GenSpecialIGet(MIR* mir, const InlineMethod& special) {
   // FastInstance() already checked by DexFileMethodInliner.
   const InlineIGetIPutData& data = special.d.ifield_data;
-  if (data.method_is_static || data.object_arg != 0) {
+  if (data.method_is_static != 0u || data.object_arg != 0u) {
     // The object is not "this" and has to be null-checked.
     return false;
   }
@@ -151,10 +151,14 @@
 bool Mir2Lir::GenSpecialIPut(MIR* mir, const InlineMethod& special) {
   // FastInstance() already checked by DexFileMethodInliner.
   const InlineIGetIPutData& data = special.d.ifield_data;
-  if (data.method_is_static || data.object_arg != 0) {
+  if (data.method_is_static != 0u || data.object_arg != 0u) {
     // The object is not "this" and has to be null-checked.
     return false;
   }
+  if (data.return_arg_plus1 != 0u) {
+    // The setter returns one of the method's arguments, which we don't support here.
+    return false;
+  }
 
   bool wide = (data.op_variant == InlineMethodAnalyser::IPutVariant(Instruction::IPUT_WIDE));
 
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 35f948e..1f69eb5 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -562,7 +562,14 @@
     void HandleThrowLaunchPads();
     void HandleSlowPaths();
     void GenBarrier();
-    LIR* GenCheck(ConditionCode c_code, ThrowKind kind);
+    void GenDivZeroException();
+    // c_code holds the condition code produced by testing the divisor against 0.
+    void GenDivZeroCheck(ConditionCode c_code);
+    // reg holds the divisor.
+    void GenDivZeroCheck(RegStorage reg);
+    void GenArrayBoundsCheck(RegStorage index, RegStorage length);
+    void GenArrayBoundsCheck(int32_t index, RegStorage length);
+    LIR* GenNullCheck(RegStorage reg);
     void MarkPossibleNullPointerException(int opt_flags);
     void MarkPossibleStackOverflowException();
     void ForceImplicitNullCheck(RegStorage reg, int opt_flags);
@@ -619,6 +626,7 @@
     LIR* CallHelper(RegStorage r_tgt, ThreadOffset<4> helper_offset, bool safepoint_pc,
                     bool use_link = true);
     RegStorage CallHelperSetup(ThreadOffset<4> helper_offset);
+    void CallRuntimeHelper(ThreadOffset<4> helper_offset, bool safepoint_pc);
     void CallRuntimeHelperImm(ThreadOffset<4> helper_offset, int arg0, bool safepoint_pc);
     void CallRuntimeHelperReg(ThreadOffset<4> helper_offset, RegStorage arg0, bool safepoint_pc);
     void CallRuntimeHelperRegLocation(ThreadOffset<4> helper_offset, RegLocation arg0,
@@ -958,10 +966,9 @@
      * @brief Used for generating code that throws ArithmeticException if both registers are zero.
      * @details This is used for generating DivideByZero checks when divisor is held in two
      *  separate registers.
-     * @param reg_lo The register holding the lower 32-bits.
-     * @param reg_hi The register holding the upper 32-bits.
+     * @param reg The register holding the pair of 32-bit values.
      */
-    virtual void GenDivZeroCheck(RegStorage reg) = 0;
+    virtual void GenDivZeroCheckWide(RegStorage reg) = 0;
 
     virtual void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method) = 0;
     virtual void GenExitSequence() = 0;
@@ -1220,6 +1227,11 @@
      */
     bool GenSpecialIdentity(MIR* mir, const InlineMethod& special);
 
+    void AddDivZeroCheckSlowPath(LIR* branch);
+
+    // Copy arg0 and arg1 to kArg0 and kArg1 safely, possibly using
+    // kArg2 as temp.
+    void CopyToArgumentRegs(RegStorage arg0, RegStorage arg1);
 
   public:
     // TODO: add accessors for these.
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index af2a140..b802591 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -135,7 +135,9 @@
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
-    void GenDivZeroCheck(RegStorage reg);
+    void GenDivZeroCheckWide(RegStorage reg);
+    void GenArrayBoundsCheck(RegStorage index, RegStorage array_base, int32_t len_offset);
+    void GenArrayBoundsCheck(int32_t index, RegStorage array_base, int32_t len_offset);
     void GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method);
     void GenExitSequence();
     void GenSpecialExitSequence();
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index c1d1e01..a23a3bf 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -629,7 +629,7 @@
 
   if (check_zero) {
     // Handle division by zero case.
-    GenImmedCheck(kCondEq, rs_r1, 0, kThrowDivZero);
+    GenDivZeroCheck(rs_r1);
   }
 
   // Have to catch 0x80000000/-1 case, or we will get an exception!
@@ -876,7 +876,7 @@
   }
 }
 
-void X86Mir2Lir::GenDivZeroCheck(RegStorage reg) {
+void X86Mir2Lir::GenDivZeroCheckWide(RegStorage reg) {
   DCHECK(reg.IsPair());  // TODO: allow 64BitSolo.
   // We are not supposed to clobber the incoming storage, so allocate a temporary.
   RegStorage t_reg = AllocTemp();
@@ -885,12 +885,92 @@
   OpRegRegReg(kOpOr, t_reg, reg.GetLow(), reg.GetHigh());
 
   // In case of zero, throw ArithmeticException.
-  GenCheck(kCondEq, kThrowDivZero);
+  GenDivZeroCheck(kCondEq);
 
   // The temp is no longer needed so free it at this time.
   FreeTemp(t_reg);
 }
 
+void X86Mir2Lir::GenArrayBoundsCheck(RegStorage index,
+                                     RegStorage array_base,
+                                     int len_offset) {
+  class ArrayBoundsCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    ArrayBoundsCheckSlowPath(Mir2Lir* m2l, LIR* branch,
+                             RegStorage index, RegStorage array_base, int32_t len_offset)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch),
+          index_(index), array_base_(array_base), len_offset_(len_offset) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+
+      RegStorage new_index = index_;
+      // Move index out of kArg1, either directly to kArg0, or to kArg2.
+      if (index_.GetReg() == m2l_->TargetReg(kArg1).GetReg()) {
+        if (array_base_.GetReg() == m2l_->TargetReg(kArg0).GetReg()) {
+          m2l_->OpRegCopy(m2l_->TargetReg(kArg2), index_);
+          new_index = m2l_->TargetReg(kArg2);
+        } else {
+          m2l_->OpRegCopy(m2l_->TargetReg(kArg0), index_);
+          new_index = m2l_->TargetReg(kArg0);
+        }
+      }
+      // Load array length to kArg1.
+      m2l_->OpRegMem(kOpMov, m2l_->TargetReg(kArg1), array_base_, len_offset_);
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                    new_index, m2l_->TargetReg(kArg1), true);
+    }
+
+   private:
+    const RegStorage index_;
+    const RegStorage array_base_;
+    const int32_t len_offset_;
+  };
+
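+  // A single unsigned compare covers both bounds: a negative index, viewed as
+  // unsigned, is a huge value, so index >= length (unsigned) also catches index < 0.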
+  OpRegMem(kOpCmp, index, array_base, len_offset);
+  LIR* branch = OpCondBranch(kCondUge, nullptr);
+  AddSlowPath(new (arena_) ArrayBoundsCheckSlowPath(this, branch,
+                                                    index, array_base, len_offset));
+}
+
+void X86Mir2Lir::GenArrayBoundsCheck(int32_t index,
+                                     RegStorage array_base,
+                                     int32_t len_offset) {
+  class ArrayBoundsCheckSlowPath : public Mir2Lir::LIRSlowPath {
+   public:
+    ArrayBoundsCheckSlowPath(Mir2Lir* m2l, LIR* branch,
+                             int32_t index, RegStorage array_base, int32_t len_offset)
+        : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch),
+          index_(index), array_base_(array_base), len_offset_(len_offset) {
+    }
+
+    void Compile() OVERRIDE {
+      m2l_->ResetRegPool();
+      m2l_->ResetDefTracking();
+      GenerateTargetLabel();
+
+      // Load array length to kArg1.
+      m2l_->OpRegMem(kOpMov, m2l_->TargetReg(kArg1), array_base_, len_offset_);
+      m2l_->LoadConstant(m2l_->TargetReg(kArg0), index_);
+      m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds),
+                                    m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true);
+    }
+
+   private:
+    const int32_t index_;
+    const RegStorage array_base_;
+    const int32_t len_offset_;
+  };
+
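+  // Compare the length field in memory against the constant index; take the slow path
+  // when length <= index (unsigned below-or-same).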
+  NewLIR3(IS_SIMM8(index) ? kX86Cmp32MI8 : kX86Cmp32MI, array_base.GetReg(), len_offset, index);
+  LIR* branch = OpCondBranch(kCondLs, nullptr);
+  AddSlowPath(new (arena_) ArrayBoundsCheckSlowPath(this, branch,
+                                                    index, array_base, len_offset));
+}
+
 // Test suspend flag, return target of taken suspend branch
 LIR* X86Mir2Lir::OpTestSuspend(LIR* target) {
   OpTlsCmp(Thread::ThreadFlagsOffset<4>(), 0);
@@ -1348,10 +1428,9 @@
 
   if (!(opt_flags & MIR_IGNORE_RANGE_CHECK)) {
     if (constant_index) {
-      GenMemImmedCheck(kCondLs, rl_array.reg, len_offset,
-                       constant_index_value, kThrowConstantArrayBounds);
+      GenArrayBoundsCheck(constant_index_value, rl_array.reg, len_offset);
     } else {
-      GenRegMemCheck(kCondUge, rl_index.reg, rl_array.reg, len_offset, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, rl_array.reg, len_offset);
     }
   }
   rl_result = EvalLoc(rl_dest, reg_class, true);
@@ -1400,10 +1479,9 @@
 
   if (!(opt_flags & MIR_IGNORE_RANGE_CHECK)) {
     if (constant_index) {
-      GenMemImmedCheck(kCondLs, rl_array.reg, len_offset,
-                       constant_index_value, kThrowConstantArrayBounds);
+      GenArrayBoundsCheck(constant_index_value, rl_array.reg, len_offset);
     } else {
-      GenRegMemCheck(kCondUge, rl_index.reg, rl_array.reg, len_offset, kThrowArrayBounds);
+      GenArrayBoundsCheck(rl_index.reg, rl_array.reg, len_offset);
     }
   }
   if ((size == kLong) || (size == kDouble)) {
@@ -2056,6 +2134,8 @@
         // Can we do this directly into memory?
         rl_result = UpdateLoc(rl_dest);
         if (rl_result.location == kLocPhysReg) {
+          // Ensure the result is in a core register.
+          rl_result = EvalLoc(rl_dest, kCoreReg, true);
           // Can we do this from memory directly?
           rl_rhs = UpdateLoc(rl_rhs);
           if (rl_rhs.location != kLocPhysReg) {
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index dcc5d9b..5a8ad7a 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -1064,6 +1064,7 @@
   LoadWordDisp(rs_rDX, count_offset, rs_rCX);
   LIR *length_compare = nullptr;
   int start_value = 0;
+  bool is_index_on_stack = false;
   if (zero_based) {
     // We have to handle an empty string.  Use special instruction JECXZ.
     length_compare = NewLIR0(kX86Jecxz8);
@@ -1084,14 +1085,32 @@
       // Runtime start index.
       rl_start = UpdateLoc(rl_start);
       if (rl_start.location == kLocPhysReg) {
+        // Handle "start index < 0" case.
+        OpRegReg(kOpXor, rs_rBX, rs_rBX);
+        OpRegReg(kOpCmp, rl_start.reg, rs_rBX);
+        OpCondRegReg(kOpCmov, kCondLt, rl_start.reg, rs_rBX);
+
+        // The length of the string should be greater than the start index.
         length_compare = OpCmpBranch(kCondLe, rs_rCX, rl_start.reg, nullptr);
         OpRegReg(kOpSub, rs_rCX, rl_start.reg);
+        if (rl_start.reg == rs_rDI) {
+          // Special case: we will use EDI below, so save the start index on the stack.
+          NewLIR1(kX86Push32R, rDI);
+          is_index_on_stack = true;
+        }
       } else {
-        // Compare to memory to avoid a register load.  Handle pushed EDI.
+        // Load the start index from stack, remembering that we pushed EDI.
         int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
-        OpRegMem(kOpCmp, rs_rCX, rs_rX86_SP, displacement);
-        length_compare = NewLIR2(kX86Jcc8, 0, kX86CondLe);
-        OpRegMem(kOpSub, rs_rCX, rs_rX86_SP, displacement);
+        LoadWordDisp(rs_rX86_SP, displacement, rs_rBX);
+        OpRegReg(kOpXor, rs_rDI, rs_rDI);
+        OpRegReg(kOpCmp, rs_rBX, rs_rDI);
+        OpCondRegReg(kOpCmov, kCondLt, rs_rBX, rs_rDI);
+
+        length_compare = OpCmpBranch(kCondLe, rs_rCX, rs_rBX, nullptr);
+        OpRegReg(kOpSub, rs_rCX, rs_rBX);
+        // Save the start index on the stack.
+        NewLIR1(kX86Push32R, rBX);
+        is_index_on_stack = true;
       }
     }
   }
@@ -1113,21 +1132,12 @@
       NewLIR3(kX86Lea32RM, rDI, rBX, 2 * start_value);
     }
   } else {
-    if (rl_start.location == kLocPhysReg) {
-      if (rl_start.reg.GetReg() == rDI) {
-        // We have a slight problem here.  We are already using RDI!
-        // Grab the value from the stack.
-        LoadWordDisp(rs_rX86_SP, 0, rs_rDX);
-        OpLea(rs_rDI, rs_rBX, rs_rDX, 1, 0);
-      } else {
-        OpLea(rs_rDI, rs_rBX, rl_start.reg, 1, 0);
-      }
-    } else {
-      OpRegCopy(rs_rDI, rs_rBX);
-      // Load the start index from stack, remembering that we pushed EDI.
-      int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
-      LoadWordDisp(rs_rX86_SP, displacement, rs_rDX);
+    if (is_index_on_stack) {
+      // Load the start index from stack.
+      NewLIR1(kX86Pop32R, rDX);
       OpLea(rs_rDI, rs_rBX, rs_rDX, 1, 0);
+    } else {
+      OpLea(rs_rDI, rs_rBX, rl_start.reg, 1, 0);
     }
   }
 
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index b12b6a7..0ad30be 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -19,10 +19,8 @@
 #define ATRACE_TAG ATRACE_TAG_DALVIK
 #include <utils/Trace.h>
 
-#include <fstream>
 #include <vector>
 #include <unistd.h>
-#include <utility>
 
 #include "base/stl_util.h"
 #include "base/timing_logger.h"
@@ -341,7 +339,6 @@
       compiler_(Compiler::Create(compiler_kind)),
       instruction_set_(instruction_set),
       instruction_set_features_(instruction_set_features),
-      instruction_set_is_64_bit_(instruction_set == kX86_64 || instruction_set == kArm64),
       freezing_constructor_lock_("freezing constructor lock"),
       compiled_classes_lock_("compiled classes lock"),
       compiled_methods_lock_("compiled method lock"),
@@ -372,7 +369,7 @@
 
   // Read the profile file if one is provided.
   if (profile_file != "") {
-    profile_ok_ = ReadProfile(profile_file);
+    profile_ok_ = ProfileHelper::LoadProfileMap(profile_map_, profile_file);
   }
 
   dex_to_dex_compiler_ = reinterpret_cast<DexToDexCompilerFn>(ArtCompileDEX);
@@ -450,7 +447,7 @@
 }
 
 #define CREATE_TRAMPOLINE(type, abi, offset) \
-    if (instruction_set_is_64_bit_) { \
+    if (Is64BitInstructionSet(instruction_set_)) { \
       return CreateTrampoline64(instruction_set_, abi, \
                                 type ## _ENTRYPOINT_OFFSET(8, offset)); \
     } else { \
@@ -1898,8 +1895,9 @@
       compiled_method = compiler_->Compile(
           *this, code_item, access_flags, invoke_type, class_def_idx,
           method_idx, class_loader, dex_file);
-    } else if (dex_to_dex_compilation_level != kDontDexToDexCompile) {
-      // TODO: add a mode to disable DEX-to-DEX compilation ?
+    }
+    if (compiled_method == nullptr && dex_to_dex_compilation_level != kDontDexToDexCompile) {
+      // TODO: add a command-line option to disable DEX-to-DEX compilation?
       (*dex_to_dex_compiler_)(*this, code_item, access_flags,
                               invoke_type, class_def_idx,
                               method_idx, class_loader, dex_file,
@@ -2035,86 +2033,9 @@
     }
   }
 
-bool CompilerDriver::ReadProfile(const std::string& filename) {
-  VLOG(compiler) << "reading profile file " << filename;
-  struct stat st;
-  int err = stat(filename.c_str(), &st);
-  if (err == -1) {
-    VLOG(compiler) << "not found";
-    return false;
-  }
-  std::ifstream in(filename.c_str());
-  if (!in) {
-    VLOG(compiler) << "profile file " << filename << " exists but can't be opened";
-    VLOG(compiler) << "file owner: " << st.st_uid << ":" << st.st_gid;
-    VLOG(compiler) << "me: " << getuid() << ":" << getgid();
-    VLOG(compiler) << "file permissions: " << std::oct << st.st_mode;
-    VLOG(compiler) << "errno: " << errno;
-    return false;
-  }
-  // The first line contains summary information.
-  std::string line;
-  std::getline(in, line);
-  if (in.eof()) {
-    return false;
-  }
-  std::vector<std::string> summary_info;
-  Split(line, '/', summary_info);
-  if (summary_info.size() != 3) {
-    // Bad summary info.  It should be count/total/bootpath.
-    return false;
-  }
-  // This is the number of hits in all methods.
-  uint32_t total_count = 0;
-  for (int i = 0 ; i < 3; ++i) {
-    total_count += atoi(summary_info[i].c_str());
-  }
-
-  // Now read each line until the end of file.  Each line consists of 3 fields separated by '/'.
-  // Store the info in descending order given by the most used methods.
-  typedef std::set<std::pair<int, std::vector<std::string>>> ProfileSet;
-  ProfileSet countSet;
-  while (!in.eof()) {
-    std::getline(in, line);
-    if (in.eof()) {
-      break;
-    }
-    std::vector<std::string> info;
-    Split(line, '/', info);
-    if (info.size() != 3) {
-      // Malformed.
-      break;
-    }
-    int count = atoi(info[1].c_str());
-    countSet.insert(std::make_pair(-count, info));
-  }
-
-  uint32_t curTotalCount = 0;
-  ProfileSet::iterator end = countSet.end();
-  const ProfileData* prevData = nullptr;
-  for (ProfileSet::iterator it = countSet.begin(); it != end ; it++) {
-    const std::string& methodname = it->second[0];
-    uint32_t count = -it->first;
-    uint32_t size = atoi(it->second[2].c_str());
-    double usedPercent = (count * 100.0) / total_count;
-
-    curTotalCount += count;
-    // Methods with the same count should be part of the same top K percentage bucket.
-    double topKPercentage = (prevData != nullptr) && (prevData->GetCount() == count)
-      ? prevData->GetTopKUsedPercentage()
-      : 100 * static_cast<double>(curTotalCount) / static_cast<double>(total_count);
-
-    // Add it to the profile map.
-    ProfileData curData = ProfileData(methodname, count, size, usedPercent, topKPercentage);
-    profile_map_[methodname] = curData;
-    prevData = &curData;
-  }
-  return true;
-}
-
 bool CompilerDriver::SkipCompilation(const std::string& method_name) {
   if (!profile_ok_) {
-    return true;
+    return false;
   }
   // Methods that comprise topKPercentThreshold % of the total samples will be compiled.
   double topKPercentThreshold = 90.0;
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 802f859..d7d40d5 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -32,6 +32,7 @@
 #include "invoke_type.h"
 #include "method_reference.h"
 #include "os.h"
+#include "profiler.h"
 #include "runtime.h"
 #include "safe_map.h"
 #include "thread_pool.h"
@@ -594,43 +595,9 @@
     return cfi_info_.get();
   }
 
-  // Profile data.  This is generated from previous runs of the program and stored
-  // in a file.  It is used to determine whether to compile a particular method or not.
-  class ProfileData {
-   public:
-    ProfileData() : count_(0), method_size_(0), usedPercent_(0) {}
-    ProfileData(const std::string& method_name, uint32_t count, uint32_t method_size,
-      double usedPercent, double topKUsedPercentage) :
-      method_name_(method_name), count_(count), method_size_(method_size),
-      usedPercent_(usedPercent), topKUsedPercentage_(topKUsedPercentage) {
-      // TODO: currently method_size_ and count_ are unused.
-      UNUSED(method_size_);
-      UNUSED(count_);
-    }
-
-    bool IsAbove(double v) const { return usedPercent_ >= v; }
-    double GetUsedPercent() const { return usedPercent_; }
-    uint32_t GetCount() const { return count_; }
-    double GetTopKUsedPercentage() const { return topKUsedPercentage_; }
-
-   private:
-    std::string method_name_;    // Method name.
-    uint32_t count_;             // Number of times it has been called.
-    uint32_t method_size_;       // Size of the method on dex instructions.
-    double usedPercent_;         // Percentage of how many times this method was called.
-    double topKUsedPercentage_;  // The percentage of the group that comprise K% of the total used
-                                 // methods this methods belongs to.
-  };
-
-  // Profile data is stored in a map, indexed by the full method name.
-  typedef std::map<const std::string, ProfileData> ProfileMap;
   ProfileMap profile_map_;
   bool profile_ok_;
 
-  // Read the profile data from the given file.  Calculates the percentage for each method.
-  // Returns false if there was no profile file or it was malformed.
-  bool ReadProfile(const std::string& filename);
-
   // Should the compiler run on this method given profile information?
   bool SkipCompilation(const std::string& method_name);
 
@@ -725,7 +692,6 @@
 
   const InstructionSet instruction_set_;
   const InstructionSetFeatures instruction_set_features_;
-  const bool instruction_set_is_64_bit_;
 
   // All class references that require
   mutable ReaderWriterMutex freezing_constructor_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index f6a324f..e88ed42 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -367,6 +367,8 @@
   elf_header.e_ident[EI_ABIVERSION] = 0;
   elf_header.e_type = ET_DYN;
   switch (compiler_driver_->GetInstructionSet()) {
+    case kArm:
+      // Fall through.
     case kThumb2: {
       elf_header.e_machine = EM_ARM;
       elf_header.e_flags = EF_ARM_EABI_VER5;
@@ -396,7 +398,6 @@
                             EF_MIPS_ARCH_32R2);
       break;
     }
-    case kArm:
     default: {
       LOG(FATAL) << "Unknown instruction set: " << compiler_driver_->GetInstructionSet();
       break;
diff --git a/compiler/elf_writer_test.cc b/compiler/elf_writer_test.cc
index 8175c35..864dadc 100644
--- a/compiler/elf_writer_test.cc
+++ b/compiler/elf_writer_test.cc
@@ -50,7 +50,11 @@
     CHECK(host_dir != NULL);
     elf_filename = StringPrintf("%s/framework/core.oat", host_dir);
   } else {
+#ifdef __LP64__
+    elf_filename = "/data/art-test64/core.oat";
+#else
     elf_filename = "/data/art-test/core.oat";
+#endif
   }
   LOG(INFO) << "elf_filename=" << elf_filename;
 
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 05d6693..7c5741b 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -164,7 +164,7 @@
       EXPECT_TRUE(reinterpret_cast<byte*>(klass) >= image_end ||
                   reinterpret_cast<byte*>(klass) < image_begin) << descriptor;
     }
-    EXPECT_TRUE(Monitor::IsValidLockWord(klass->GetLockWord()));
+    EXPECT_TRUE(Monitor::IsValidLockWord(klass->GetLockWord(false)));
   }
 }
 
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 0405198..3400b01 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -179,7 +179,7 @@
   image_bitmap_->Set(obj);
   // Before we stomp over the lock word, save the hash code for later.
   Monitor::Deflate(Thread::Current(), object);
-  LockWord lw(object->GetLockWord());
+  LockWord lw(object->GetLockWord(false));
   switch (lw.GetState()) {
     case LockWord::kFatLocked: {
       LOG(FATAL) << "Fat locked object " << obj << " found during object copy";
@@ -199,7 +199,7 @@
       LOG(FATAL) << "Unreachable.";
       break;
   }
-  object->SetLockWord(LockWord::FromForwardingAddress(offset));
+  object->SetLockWord(LockWord::FromForwardingAddress(offset), false);
   DCHECK(IsImageOffsetAssigned(object));
 }
 
@@ -212,13 +212,13 @@
 
 bool ImageWriter::IsImageOffsetAssigned(mirror::Object* object) const {
   DCHECK(object != nullptr);
-  return object->GetLockWord().GetState() == LockWord::kForwardingAddress;
+  return object->GetLockWord(false).GetState() == LockWord::kForwardingAddress;
 }
 
 size_t ImageWriter::GetImageOffset(mirror::Object* object) const {
   DCHECK(object != nullptr);
   DCHECK(IsImageOffsetAssigned(object));
-  LockWord lock_word = object->GetLockWord();
+  LockWord lock_word = object->GetLockWord(false);
   size_t offset = lock_word.ForwardingAddress();
   DCHECK_LT(offset, image_end_);
   return offset;
@@ -235,8 +235,8 @@
   }
 
   // Create the image bitmap.
-  image_bitmap_.reset(gc::accounting::SpaceBitmap::Create("image bitmap", image_->Begin(),
-                                                          length));
+  image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create("image bitmap", image_->Begin(),
+                                                                    length));
   if (image_bitmap_.get() == nullptr) {
     LOG(ERROR) << "Failed to allocate memory for image bitmap";
     return false;
@@ -525,7 +525,7 @@
 
   // Return to write header at start of image with future location of image_roots. At this point,
   // image_end_ is the size of the image (excluding bitmaps).
-  const size_t heap_bytes_per_bitmap_byte = kBitsPerByte * gc::accounting::SpaceBitmap::kAlignment;
+  const size_t heap_bytes_per_bitmap_byte = kBitsPerByte * kObjectAlignment;
   const size_t bitmap_bytes = RoundUp(image_end_, heap_bytes_per_bitmap_byte) /
       heap_bytes_per_bitmap_byte;
   ImageHeader image_header(PointerToLowMemUInt32(image_begin_),
@@ -555,15 +555,15 @@
   heap->VisitObjects(CopyAndFixupObjectsCallback, this);
   // Fix up the objects which previously had hash codes.
   for (const std::pair<mirror::Object*, uint32_t>& hash_pair : saved_hashes_) {
-    hash_pair.first->SetLockWord(LockWord::FromHashCode(hash_pair.second));
+    hash_pair.first->SetLockWord(LockWord::FromHashCode(hash_pair.second), false);
   }
   saved_hashes_.clear();
   self->EndAssertNoThreadSuspension(old_cause);
 }
 
 void ImageWriter::CopyAndFixupObjectsCallback(Object* obj, void* arg) {
-  DCHECK(obj != NULL);
-  DCHECK(arg != NULL);
+  DCHECK(obj != nullptr);
+  DCHECK(arg != nullptr);
   ImageWriter* image_writer = reinterpret_cast<ImageWriter*>(arg);
   // see GetLocalAddress for similar computation
   size_t offset = image_writer->GetImageOffset(obj);
@@ -575,7 +575,7 @@
   Object* copy = reinterpret_cast<Object*>(dst);
   // Write in a hash code of objects which have inflated monitors or a hash code in their monitor
   // word.
-  copy->SetLockWord(LockWord());
+  copy->SetLockWord(LockWord(), false);
   image_writer->FixupObject(obj, copy);
 }
 
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 92b24f6..ee241cb 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -173,7 +173,7 @@
   const byte* oat_data_begin_;
 
   // Image bitmap which lets us know where the objects inside of the image reside.
-  UniquePtr<gc::accounting::SpaceBitmap> image_bitmap_;
+  UniquePtr<gc::accounting::ContinuousSpaceBitmap> image_bitmap_;
 
   // Offset from oat_data_begin_ to the stubs.
   uint32_t interpreter_to_interpreter_bridge_offset_;
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 3204282..6b5e55e 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -1268,4 +1268,227 @@
   env_->CallNonvirtualVoidMethodA(jobj_, jklass_, jmethod_, args);
 }
 
+TEST_F(JniCompilerTest, WithoutImplementation) {
+  TEST_DISABLED_FOR_PORTABLE();
+  SetUpForTest(false, "withoutImplementation", "()V", nullptr);
+
+  env_->CallVoidMethod(jobj_, jmethod_);
+
+  EXPECT_TRUE(Thread::Current()->IsExceptionPending());
+  EXPECT_TRUE(env_->ExceptionCheck() == JNI_TRUE);
+}
+
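+// Reinterpret the bits of one value as another type of at least the same size. The tests
+// below use it to give the float arguments recognizable integer bit patterns (11..20).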
+template <typename U, typename V> V convert(U in) {
+  DCHECK_LE(sizeof(U), sizeof(V));
+  union { U u; V v; } tmp;
+  tmp.u = in;
+  return tmp.v;
+}
+
+void Java_MyClassNatives_stackArgsIntsFirst(JNIEnv* env, jclass klass, jint i1, jint i2, jint i3,
+                                            jint i4, jint i5, jint i6, jint i7, jint i8, jint i9,
+                                            jint i10, jfloat f1, jfloat f2, jfloat f3, jfloat f4,
+                                            jfloat f5, jfloat f6, jfloat f7, jfloat f8, jfloat f9,
+                                            jfloat f10) {
+  EXPECT_EQ(i1, 1);
+  EXPECT_EQ(i2, 2);
+  EXPECT_EQ(i3, 3);
+  EXPECT_EQ(i4, 4);
+  EXPECT_EQ(i5, 5);
+  EXPECT_EQ(i6, 6);
+  EXPECT_EQ(i7, 7);
+  EXPECT_EQ(i8, 8);
+  EXPECT_EQ(i9, 9);
+  EXPECT_EQ(i10, 10);
+
+  jint i11 = convert<jfloat, jint>(f1);
+  EXPECT_EQ(i11, 11);
+  jint i12 = convert<jfloat, jint>(f2);
+  EXPECT_EQ(i12, 12);
+  jint i13 = convert<jfloat, jint>(f3);
+  EXPECT_EQ(i13, 13);
+  jint i14 = convert<jfloat, jint>(f4);
+  EXPECT_EQ(i14, 14);
+  jint i15 = convert<jfloat, jint>(f5);
+  EXPECT_EQ(i15, 15);
+  jint i16 = convert<jfloat, jint>(f6);
+  EXPECT_EQ(i16, 16);
+  jint i17 = convert<jfloat, jint>(f7);
+  EXPECT_EQ(i17, 17);
+  jint i18 = convert<jfloat, jint>(f8);
+  EXPECT_EQ(i18, 18);
+  jint i19 = convert<jfloat, jint>(f9);
+  EXPECT_EQ(i19, 19);
+  jint i20 = convert<jfloat, jint>(f10);
+  EXPECT_EQ(i20, 20);
+}
+
+TEST_F(JniCompilerTest, StackArgsIntsFirst) {
+  TEST_DISABLED_FOR_PORTABLE();
+  SetUpForTest(true, "stackArgsIntsFirst", "(IIIIIIIIIIFFFFFFFFFF)V",
+               reinterpret_cast<void*>(&Java_MyClassNatives_stackArgsIntsFirst));
+
+  jint i1 = 1;
+  jint i2 = 2;
+  jint i3 = 3;
+  jint i4 = 4;
+  jint i5 = 5;
+  jint i6 = 6;
+  jint i7 = 7;
+  jint i8 = 8;
+  jint i9 = 9;
+  jint i10 = 10;
+
+  jfloat f1 = convert<jint, jfloat>(11);
+  jfloat f2 = convert<jint, jfloat>(12);
+  jfloat f3 = convert<jint, jfloat>(13);
+  jfloat f4 = convert<jint, jfloat>(14);
+  jfloat f5 = convert<jint, jfloat>(15);
+  jfloat f6 = convert<jint, jfloat>(16);
+  jfloat f7 = convert<jint, jfloat>(17);
+  jfloat f8 = convert<jint, jfloat>(18);
+  jfloat f9 = convert<jint, jfloat>(19);
+  jfloat f10 = convert<jint, jfloat>(20);
+
+  env_->CallStaticVoidMethod(jklass_, jmethod_, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, f1, f2,
+                             f3, f4, f5, f6, f7, f8, f9, f10);
+}
+
+void Java_MyClassNatives_stackArgsFloatsFirst(JNIEnv* env, jclass klass, jfloat f1, jfloat f2,
+                                              jfloat f3, jfloat f4, jfloat f5, jfloat f6, jfloat f7,
+                                              jfloat f8, jfloat f9, jfloat f10, jint i1, jint i2,
+                                              jint i3, jint i4, jint i5, jint i6, jint i7, jint i8,
+                                              jint i9, jint i10) {
+  EXPECT_EQ(i1, 1);
+  EXPECT_EQ(i2, 2);
+  EXPECT_EQ(i3, 3);
+  EXPECT_EQ(i4, 4);
+  EXPECT_EQ(i5, 5);
+  EXPECT_EQ(i6, 6);
+  EXPECT_EQ(i7, 7);
+  EXPECT_EQ(i8, 8);
+  EXPECT_EQ(i9, 9);
+  EXPECT_EQ(i10, 10);
+
+  jint i11 = convert<jfloat, jint>(f1);
+  EXPECT_EQ(i11, 11);
+  jint i12 = convert<jfloat, jint>(f2);
+  EXPECT_EQ(i12, 12);
+  jint i13 = convert<jfloat, jint>(f3);
+  EXPECT_EQ(i13, 13);
+  jint i14 = convert<jfloat, jint>(f4);
+  EXPECT_EQ(i14, 14);
+  jint i15 = convert<jfloat, jint>(f5);
+  EXPECT_EQ(i15, 15);
+  jint i16 = convert<jfloat, jint>(f6);
+  EXPECT_EQ(i16, 16);
+  jint i17 = convert<jfloat, jint>(f7);
+  EXPECT_EQ(i17, 17);
+  jint i18 = convert<jfloat, jint>(f8);
+  EXPECT_EQ(i18, 18);
+  jint i19 = convert<jfloat, jint>(f9);
+  EXPECT_EQ(i19, 19);
+  jint i20 = convert<jfloat, jint>(f10);
+  EXPECT_EQ(i20, 20);
+}
+
+TEST_F(JniCompilerTest, StackArgsFloatsFirst) {
+  TEST_DISABLED_FOR_PORTABLE();
+  SetUpForTest(true, "stackArgsFloatsFirst", "(FFFFFFFFFFIIIIIIIIII)V",
+               reinterpret_cast<void*>(&Java_MyClassNatives_stackArgsFloatsFirst));
+
+  jint i1 = 1;
+  jint i2 = 2;
+  jint i3 = 3;
+  jint i4 = 4;
+  jint i5 = 5;
+  jint i6 = 6;
+  jint i7 = 7;
+  jint i8 = 8;
+  jint i9 = 9;
+  jint i10 = 10;
+
+  jfloat f1 = convert<jint, jfloat>(11);
+  jfloat f2 = convert<jint, jfloat>(12);
+  jfloat f3 = convert<jint, jfloat>(13);
+  jfloat f4 = convert<jint, jfloat>(14);
+  jfloat f5 = convert<jint, jfloat>(15);
+  jfloat f6 = convert<jint, jfloat>(16);
+  jfloat f7 = convert<jint, jfloat>(17);
+  jfloat f8 = convert<jint, jfloat>(18);
+  jfloat f9 = convert<jint, jfloat>(19);
+  jfloat f10 = convert<jint, jfloat>(20);
+
+  env_->CallStaticVoidMethod(jklass_, jmethod_, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, i1, i2, i3,
+                             i4, i5, i6, i7, i8, i9, i10);
+}
+
+void Java_MyClassNatives_stackArgsMixed(JNIEnv* env, jclass klass, jint i1, jfloat f1, jint i2,
+                                        jfloat f2, jint i3, jfloat f3, jint i4, jfloat f4, jint i5,
+                                        jfloat f5, jint i6, jfloat f6, jint i7, jfloat f7, jint i8,
+                                        jfloat f8, jint i9, jfloat f9, jint i10, jfloat f10) {
+  EXPECT_EQ(i1, 1);
+  EXPECT_EQ(i2, 2);
+  EXPECT_EQ(i3, 3);
+  EXPECT_EQ(i4, 4);
+  EXPECT_EQ(i5, 5);
+  EXPECT_EQ(i6, 6);
+  EXPECT_EQ(i7, 7);
+  EXPECT_EQ(i8, 8);
+  EXPECT_EQ(i9, 9);
+  EXPECT_EQ(i10, 10);
+
+  jint i11 = convert<jfloat, jint>(f1);
+  EXPECT_EQ(i11, 11);
+  jint i12 = convert<jfloat, jint>(f2);
+  EXPECT_EQ(i12, 12);
+  jint i13 = convert<jfloat, jint>(f3);
+  EXPECT_EQ(i13, 13);
+  jint i14 = convert<jfloat, jint>(f4);
+  EXPECT_EQ(i14, 14);
+  jint i15 = convert<jfloat, jint>(f5);
+  EXPECT_EQ(i15, 15);
+  jint i16 = convert<jfloat, jint>(f6);
+  EXPECT_EQ(i16, 16);
+  jint i17 = convert<jfloat, jint>(f7);
+  EXPECT_EQ(i17, 17);
+  jint i18 = convert<jfloat, jint>(f8);
+  EXPECT_EQ(i18, 18);
+  jint i19 = convert<jfloat, jint>(f9);
+  EXPECT_EQ(i19, 19);
+  jint i20 = convert<jfloat, jint>(f10);
+  EXPECT_EQ(i20, 20);
+}
+
+TEST_F(JniCompilerTest, StackArgsMixed) {
+  TEST_DISABLED_FOR_PORTABLE();
+  SetUpForTest(true, "stackArgsMixed", "(IFIFIFIFIFIFIFIFIFIF)V",
+               reinterpret_cast<void*>(&Java_MyClassNatives_stackArgsMixed));
+
+  jint i1 = 1;
+  jint i2 = 2;
+  jint i3 = 3;
+  jint i4 = 4;
+  jint i5 = 5;
+  jint i6 = 6;
+  jint i7 = 7;
+  jint i8 = 8;
+  jint i9 = 9;
+  jint i10 = 10;
+
+  jfloat f1 = convert<jint, jfloat>(11);
+  jfloat f2 = convert<jint, jfloat>(12);
+  jfloat f3 = convert<jint, jfloat>(13);
+  jfloat f4 = convert<jint, jfloat>(14);
+  jfloat f5 = convert<jint, jfloat>(15);
+  jfloat f6 = convert<jint, jfloat>(16);
+  jfloat f7 = convert<jint, jfloat>(17);
+  jfloat f8 = convert<jint, jfloat>(18);
+  jfloat f9 = convert<jint, jfloat>(19);
+  jfloat f10 = convert<jint, jfloat>(20);
+
+  env_->CallStaticVoidMethod(jklass_, jmethod_, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, i6, f6, i7,
+                             f7, i8, f8, i9, f9, i10, f10);
+}
+
 }  // namespace art
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index ab39d6b..ae18d2e 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -145,7 +145,7 @@
   // Method*, LR and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index c408fa9..6212a23 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -21,14 +21,29 @@
 namespace art {
 namespace arm64 {
 
-// Calling convention
+static const Register kCoreArgumentRegisters[] = {
+  X0, X1, X2, X3, X4, X5, X6, X7
+};
 
+static const WRegister kWArgumentRegisters[] = {
+  W0, W1, W2, W3, W4, W5, W6, W7
+};
+
+static const DRegister kDArgumentRegisters[] = {
+  D0, D1, D2, D3, D4, D5, D6, D7
+};
+
+static const SRegister kSArgumentRegisters[] = {
+  S0, S1, S2, S3, S4, S5, S6, S7
+};
+
+// Calling convention
 ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // Saved on entry, restored on exit.
 }
 
 ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // Saved on entry, restored on exit.
 }
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
@@ -79,64 +94,64 @@
 FrameOffset Arm64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +         // displacement
+      FrameOffset(displacement_.Int32Value() +   // displacement
                   kFramePointerSize +                 // Method*
-                  (itr_slots_ * kFramePointerSize));  // offset into in args
+                  (itr_slots_ * sizeof(uint32_t)));  // offset into in args
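+  // Managed-code stack slots are 32-bit, hence sizeof(uint32_t) rather than kFramePointerSize.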
   return result;
 }
 
 const ManagedRegisterEntrySpills& Arm64ManagedRuntimeCallingConvention::EntrySpills() {
   // We spill the argument registers on ARM64 to free them up for scratch use, we then assume
   // all arguments are on the stack.
-  if (entry_spills_.size() == 0) {
-    // TODO Need fp regs spilled too.
-    //
-    size_t num_spills = NumArgs();
+  if ((entry_spills_.size() == 0) && (NumArgs() > 0)) {
+    int gp_reg_index = 1;   // we start from X1/W1, X0 holds ArtMethod*.
+    int fp_reg_index = 0;   // D0/S0.
 
-    // TODO Floating point need spilling too.
-    if (num_spills > 0) {
-      entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X1));
-      if (num_spills > 1) {
-        entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X2));
-        if (num_spills > 2) {
-          entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X3));
-          if (num_spills > 3) {
-            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X5));
-            if (num_spills > 4) {
-              entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X6));
-              if (num_spills > 5) {
-                entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X7));
-              }
+    // We need to choose the correct register (D/S or X/W) since the managed
+    // stack uses 32-bit stack slots.
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      if (IsCurrentParamAFloatOrDouble()) {  // FP regs.
+          if (fp_reg_index < 8) {
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[fp_reg_index]));
+            } else {
+              entry_spills_.push_back(Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[fp_reg_index]));
             }
+            fp_reg_index++;
+          } else {  // just increase the stack offset.
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+            } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+            }
+          }
+      } else {  // GP regs.
+        if (gp_reg_index < 8) {
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg_index]));
+          } else {
+            entry_spills_.push_back(Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg_index]));
+          }
+          gp_reg_index++;
+        } else {  // just increase the stack offset.
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+          } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           }
         }
       }
+      Next();
     }
   }
-
   return entry_spills_;
 }
-// JNI calling convention
 
+// JNI calling convention
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
                                                      const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  // TODO This needs to be converted to 64bit.
-  // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
-  // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
-//  size_t padding = 0;
-//  for (size_t cur_arg = IsStatic() ? 0 : 1, cur_reg = 2; cur_arg < NumArgs(); cur_arg++) {
-//    if (IsParamALongOrDouble(cur_arg)) {
-//      if ((cur_reg & 1) != 0) {
-//        padding += 4;
-//        cur_reg++;  // additional bump to ensure alignment
-//      }
-//      cur_reg++;  // additional bump to skip extra long word
-//    }
-//    cur_reg++;  // bump the iterator for every argument
-//  }
-  padding_ =0;
-
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X19));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X20));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X21));
@@ -162,83 +177,87 @@
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
   uint32_t result = 0;
-  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 | 1 << X25
-      | 1 << X26 | 1 << X27 | 1 << X28 | 1<< X29 | 1 << LR;
+  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 |
+            1 << X25 | 1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
+  return result;
+}
+
+uint32_t Arm64JniCallingConvention::FpSpillMask() const {
+  // Compute spill mask to agree with callee saves initialized in the constructor
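+  // D8-D15 are the callee-saved FP registers in the AArch64 procedure call standard.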
+  uint32_t result = 0;
+  result = 1 << D8 | 1 << D9 | 1 << D10 | 1 << D11 | 1 << D12 | 1 << D13 |
+           1 << D14 | 1 << D15;
   return result;
 }
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
-  return Arm64ManagedRegister::FromCoreRegister(X9);
+  return ManagedRegister::NoRegister();
 }
 
 size_t Arm64JniCallingConvention::FrameSize() {
-  // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
+  // Method*, callee save area size, local reference segment state
+  size_t frame_data_size = ((1 + CalleeSaveRegisters().size()) * kFramePointerSize) + sizeof(uint32_t);
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t Arm64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
-                 kStackAlignment);
-}
-
-// JniCallingConvention ABI follows AAPCS where longs and doubles must occur
-// in even register numbers and stack slots
-void Arm64JniCallingConvention::Next() {
-  JniCallingConvention::Next();
-  size_t arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  if ((itr_args_ >= 2) &&
-      (arg_pos < NumArgs()) &&
-      IsParamALongOrDouble(arg_pos)) {
-    // itr_slots_ needs to be an even number, according to AAPCS.
-    if ((itr_slots_ & 0x1u) != 0) {
-      itr_slots_++;
-    }
-  }
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
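+  // AAPCS64 passes the first 8 FP args in D/S registers and the first 8 integer args in X/W.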
-  return itr_slots_ < 4;
+  if (IsCurrentParamAFloatOrDouble()) {
+    return (itr_float_and_doubles_ < 8);
+  } else {
+    return ((itr_args_ - itr_float_and_doubles_) < 8);
+  }
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamOnStack() {
   return !IsCurrentParamInRegister();
 }
 
-// TODO and floating point?
-
-static const Register kJniArgumentRegisters[] = {
-  X0, X1, X2, X3, X4, X5, X6, X7
-};
 ManagedRegister Arm64JniCallingConvention::CurrentParamRegister() {
-  CHECK_LT(itr_slots_, 4u);
-  int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  // TODO Floating point & 64bit registers.
-  if ((itr_args_ >= 2) && IsParamALongOrDouble(arg_pos)) {
-    CHECK_EQ(itr_slots_, 2u);
-    return Arm64ManagedRegister::FromCoreRegister(X1);
+  CHECK(IsCurrentParamInRegister());
+  if (IsCurrentParamAFloatOrDouble()) {
+    CHECK_LT(itr_float_and_doubles_, 8u);
+    if (IsCurrentParamADouble()) {
+      return Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[itr_float_and_doubles_]);
+    } else {
+      return Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[itr_float_and_doubles_]);
+    }
   } else {
-    return
-      Arm64ManagedRegister::FromCoreRegister(kJniArgumentRegisters[itr_slots_]);
+    int gp_reg = itr_args_ - itr_float_and_doubles_;
+    CHECK_LT(static_cast<unsigned int>(gp_reg), 8u);
+    if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv()) {
+      return Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg]);
+    } else {
+      return Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg]);
+    }
   }
 }
 
 FrameOffset Arm64JniCallingConvention::CurrentParamStackOffset() {
-  CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
+  CHECK(IsCurrentParamOnStack());
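+  // Stack arguments are those beyond the first 8 FP and the first 8 GP register arguments.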
+  size_t args_on_stack = itr_args_
+                  - std::min(8u, itr_float_and_doubles_)
+                  - std::min(8u, (itr_args_ - itr_float_and_doubles_));
+  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
 
 size_t Arm64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* less arguments in registers
-  return static_args + param_args + 1 - 4;
+  // All arguments, including the extra JNI arguments (JNIEnv* and, for static methods, jclass).
+  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
+
+  size_t all_stack_args = all_args -
+            std::min(8u, static_cast<unsigned int>(NumFloatOrDoubleArgs())) -
+            std::min(8u, static_cast<unsigned int>((all_args - NumFloatOrDoubleArgs())));
+
+  return all_stack_args;
 }
 
 }  // namespace arm64
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index c18cd2b..92f547c 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -55,7 +55,6 @@
   ManagedRegister IntReturnRegister() OVERRIDE;
   ManagedRegister InterproceduralScratchRegister() OVERRIDE;
   // JNI calling convention
-  void Next() OVERRIDE;  // Override default behavior for AAPCS
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
   const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
@@ -63,9 +62,7 @@
   }
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;  // Floats aren't spilled in JNI down call
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -78,9 +75,6 @@
   // TODO: these values aren't unique and can be shared amongst instances
   std::vector<ManagedRegister> callee_save_regs_;
 
-  // Padding to ensure longs and doubles are not split in AAPCS
-  size_t padding_;
-
   DISALLOW_COPY_AND_ASSIGN(Arm64JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index 8efdcda..a99a4c2 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -90,6 +90,14 @@
   return IsParamAFloatOrDouble(itr_args_);
 }
 
+bool ManagedRuntimeCallingConvention::IsCurrentParamADouble() {
+  return IsParamADouble(itr_args_);
+}
+
+bool ManagedRuntimeCallingConvention::IsCurrentParamALong() {
+  return IsParamALong(itr_args_);
+}
+
 // JNI calling convention
 
 JniCallingConvention* JniCallingConvention::Create(bool is_static, bool is_synchronized,
@@ -168,6 +176,10 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamJniEnv() {
+  return (itr_args_ == kJniEnv);
+}
+
 bool JniCallingConvention::IsCurrentParamAFloatOrDouble() {
   switch (itr_args_) {
     case kJniEnv:
@@ -181,6 +193,32 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamADouble() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;  // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamADouble(arg_pos);
+    }
+  }
+}
+
+bool JniCallingConvention::IsCurrentParamALong() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;  // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamALong(arg_pos);
+    }
+  }
+}
+
 // Return position of SIRT entry holding reference at the current iterator
 // position
 FrameOffset JniCallingConvention::CurrentParamSirtEntryOffset() {
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 7e1cf63..4d25d1c 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -126,6 +126,24 @@
     char ch = shorty_[param];
     return (ch == 'F' || ch == 'D');
   }
+  bool IsParamADouble(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return false;  // this argument
+    }
+    return shorty_[param] == 'D';
+  }
+  bool IsParamALong(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return true;  // this argument
+    }
+    return shorty_[param] == 'J';
+  }
   bool IsParamAReference(unsigned int param) const {
     DCHECK_LT(param, NumArgs());
     if (IsStatic()) {
@@ -214,6 +232,8 @@
   void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
   bool IsCurrentArgExplicit();  // i.e. a non-implicit argument such as this
   bool IsCurrentArgPossiblyNull();
   size_t CurrentParamSize();
@@ -283,6 +303,9 @@
   virtual void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
+  bool IsCurrentParamJniEnv();
   size_t CurrentParamSize();
   virtual bool IsCurrentParamInRegister() = 0;
   virtual bool IsCurrentParamOnStack() = 0;
@@ -299,17 +322,17 @@
 
   FrameOffset SirtLinkOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::LinkOffset());
+                       StackIndirectReferenceTable::LinkOffset(frame_pointer_size_));
   }
 
   FrameOffset SirtNumRefsOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::NumberOfReferencesOffset());
+                       StackIndirectReferenceTable::NumberOfReferencesOffset(frame_pointer_size_));
   }
 
   FrameOffset SirtReferencesOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::ReferencesOffset());
+                       StackIndirectReferenceTable::ReferencesOffset(frame_pointer_size_));
   }
 
   virtual ~JniCallingConvention() {}
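Both new helpers share the shorty-indexing convention visible above:
shorty_[0] is the return type, so a static method shifts the parameter index
by one, while an instance method treats index 0 as the implicit receiver. A
small worked example (shorty hypothetical):

    // Shorty "DIJ", i.e. double f(int, long):
    //   static:   IsParamADouble(0) -> shorty_[1] == 'I' -> false
    //             IsParamALong(1)   -> shorty_[2] == 'J' -> true
    //   instance: IsParamADouble(0) -> implicit this, early return
    //             IsParamALong(2)   -> shorty_[2] == 'J' -> true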
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index dcdcdd1..93b1b5a 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -29,6 +29,7 @@
 #include "utils/assembler.h"
 #include "utils/managed_register.h"
 #include "utils/arm/managed_register_arm.h"
+#include "utils/arm64/managed_register_arm64.h"
 #include "utils/mips/managed_register_mips.h"
 #include "utils/x86/managed_register_x86.h"
 #include "thread.h"
@@ -63,6 +64,7 @@
   if (instruction_set == kThumb2) {
     instruction_set = kArm;
   }
+  const bool is_64_bit_target = Is64BitInstructionSet(instruction_set);
   // Calling conventions used to iterate over parameters to method
   UniquePtr<JniCallingConvention> main_jni_conv(
       JniCallingConvention::Create(is_static, is_synchronized, shorty, instruction_set));
@@ -73,11 +75,17 @@
 
   // Calling conventions to call into JNI method "end" possibly passing a returned reference, the
   //     method and the current thread.
-  size_t jni_end_arg_count = 0;
-  if (reference_return) { jni_end_arg_count++; }
-  if (is_synchronized) { jni_end_arg_count++; }
-  const char* jni_end_shorty = jni_end_arg_count == 0 ? "I"
-                                                        : (jni_end_arg_count == 1 ? "II" : "III");
+  const char* jni_end_shorty;
+  if (reference_return && is_synchronized) {
+    jni_end_shorty = "ILL";
+  } else if (reference_return) {
+    jni_end_shorty = "IL";
+  } else if (is_synchronized) {
+    jni_end_shorty = "VL";
+  } else {
+    jni_end_shorty = "V";
+  }
+
   UniquePtr<JniCallingConvention> end_jni_conv(
       JniCallingConvention::Create(is_static, is_synchronized, jni_end_shorty, instruction_set));
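The rewritten selection encodes the JniMethodEnd signature instead of merely
counting arguments: the first shorty character is the return type and each
'L' marks a reference argument, which lets a 64-bit calling convention tell
references apart from integers when assigning registers. Roughly (signatures
paraphrased from the entrypoint names; each variant also receives the saved
reference-state cookie and Thread*, passed by the surrounding code):

    // "ILL": JniMethodEndWithReferenceSynchronized -- result + lock object
    // "IL":  JniMethodEndWithReference             -- result only
    // "VL":  JniMethodEndSynchronized              -- void, lock object only
    // "V":   JniMethodEnd                          -- void, no references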
 
@@ -101,12 +109,22 @@
   __ StoreImmediateToFrame(main_jni_conv->SirtNumRefsOffset(),
                            main_jni_conv->ReferenceCount(),
                            mr_conv->InterproceduralScratchRegister());
-  __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
-                          Thread::TopSirtOffset<4>(),
-                          mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
-                              main_jni_conv->SirtOffset(),
-                              mr_conv->InterproceduralScratchRegister());
+
+  if (is_64_bit_target) {
+    __ CopyRawPtrFromThread64(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<8>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread64(Thread::TopSirtOffset<8>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<4>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  }
 
   // 3. Place incoming reference arguments into SIRT
   main_jni_conv->Next();  // Skip JNIEnv*
@@ -154,9 +172,15 @@
   }
 
   // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
-  __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
-                            mr_conv->InterproceduralScratchRegister());
+  if (is_64_bit_target) {
+    __ StoreStackPointerToThread64(Thread::TopOfManagedStackOffset<8>());
+    __ StoreImmediateToThread64(Thread::TopOfManagedStackPcOffset<8>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
+    __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  }
 
   // 5. Move frame down to allow space for out going args.
   const size_t main_out_arg_size = main_jni_conv->OutArgSize();
@@ -164,13 +188,14 @@
   const size_t max_out_arg_size = std::max(main_out_arg_size, end_out_arg_size);
   __ IncreaseFrameSize(max_out_arg_size);
 
-
   // 6. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, which is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset<4> jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
-                                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<4> jni_start32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<8> jni_start64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStart);
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_sirt_offset(0);
   if (is_synchronized) {
@@ -192,12 +217,21 @@
   }
   if (main_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-    __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start),
-            main_jni_conv->InterproceduralScratchRegister());
+    if (is_64_bit_target) {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start64),
+             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start32),
+             main_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
                         main_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(jni_start, main_jni_conv->InterproceduralScratchRegister());
+    if (is_64_bit_target) {
+      __ CallFromThread64(jni_start64, main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(jni_start32, main_jni_conv->InterproceduralScratchRegister());
+    }
   }
   if (is_synchronized) {  // Check for exceptions from monitor enter.
     __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
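This is_64_bit_target split is the patch's recurring pattern: every
Thread-relative offset is templated on pointer width, so both widths are
computed up front and the branch only selects which one reaches the
assembler. A condensed sketch of the shape (the helper is hypothetical, not
part of ART):

    // Sketch only; ART does not define this helper.
    template <typename Emit32, typename Emit64>
    void EmitForTarget(bool is_64_bit_target, Emit32 emit32, Emit64 emit64) {
      if (is_64_bit_target) {
        emit64();  // ThreadOffset<8>, Thread::...Offset<8>() variants
      } else {
        emit32();  // ThreadOffset<4>, Thread::...Offset<4>() variants
      }
    }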
@@ -259,11 +293,20 @@
   if (main_jni_conv->IsCurrentParamInRegister()) {
     ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
     DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    if (is_64_bit_target) {
+      __ LoadRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>());
+    } else {
+      __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    }
   } else {
     FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+    if (is_64_bit_target) {
+      __ CopyRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>(),
                             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+                            main_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 9. Plant call to native code associated with method.
@@ -289,25 +332,29 @@
     if (instruction_set == kMips && main_jni_conv->GetReturnType() == Primitive::kPrimDouble &&
         return_save_location.Uint32Value() % 8 != 0) {
       // Ensure doubles are 8-byte aligned for MIPS
-      return_save_location = FrameOffset(return_save_location.Uint32Value() + kPointerSize);
+      return_save_location = FrameOffset(return_save_location.Uint32Value() + kMipsPointerSize);
     }
     CHECK_LT(return_save_location.Uint32Value(), frame_size+main_out_arg_size);
     __ Store(return_save_location, main_jni_conv->ReturnRegister(), main_jni_conv->SizeOfReturnValue());
   }
 
   // 12. Call into JNI method end, possibly passing a returned reference, the method and the current
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-  ThreadOffset<4> jni_end(-1);
+  ThreadOffset<4> jni_end32(-1);
+  ThreadOffset<8> jni_end64(-1);
   if (reference_return) {
     // Pass result.
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReference);
     SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
     end_jni_conv->Next();
   } else {
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEnd);
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -334,12 +381,21 @@
   }
   if (end_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
-    __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end),
-            end_jni_conv->InterproceduralScratchRegister());
+    if (is_64_bit_target) {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end64),
+              end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end32),
+              end_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
                         end_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(ThreadOffset<4>(jni_end), end_jni_conv->InterproceduralScratchRegister());
+    if (is_64_bit_target) {
+      __ CallFromThread64(ThreadOffset<8>(jni_end64), end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(ThreadOffset<4>(jni_end32), end_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 13. Reload return value
@@ -360,6 +416,10 @@
   // 17. Finalize code generation
   __ EmitSlowPaths();
   size_t cs = __ CodeSize();
+  if (instruction_set == kArm64) {
+    // Test that we do not exceed the buffer size.
+    CHECK(cs < arm64::kBufferSizeArm64);
+  }
   std::vector<uint8_t> managed_code(cs);
   MemoryRegion code(&managed_code[0], managed_code.size());
   __ FinalizeInstructions(code);
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index 51a3f54..8e1c0c7 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -149,7 +149,7 @@
   // Method*, LR and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 8b440ed..153f953 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -126,7 +126,7 @@
   // Method*, return address and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 21e0bd7..4dfa29a 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -141,7 +141,7 @@
   // Method*, return address and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus link_ (pointer) and number_of_references_ (uint32_t) for SIRT header
-  size_t sirt_size = kFramePointerSize + sizeof(uint32_t) + (ReferenceCount() * sirt_pointer_size_);
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
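All three backends now defer to one helper instead of open-coding the SIRT
size. Going by the x86-64 comment above (a link_ pointer plus a uint32_t
count, then the reference slots), a plausible sketch is the following; the
per-reference slot width and the alignment quantum are assumptions:

    // Assumed implementation; the real one lives with StackIndirectReferenceTable.
    size_t GetAlignedSirtSizeTarget(size_t pointer_size, uint32_t refs) {
      size_t header = pointer_size + sizeof(uint32_t);  // link_ + number_of_references_
      return RoundUp(header + refs * pointer_size, pointer_size);
    }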
diff --git a/compiler/llvm/llvm_compilation_unit.cc b/compiler/llvm/llvm_compilation_unit.cc
index 1d027f9..fe60959 100644
--- a/compiler/llvm/llvm_compilation_unit.cc
+++ b/compiler/llvm/llvm_compilation_unit.cc
@@ -314,23 +314,8 @@
 // section if the section alignment is greater than kArchAlignment.
 void LlvmCompilationUnit::CheckCodeAlign(uint32_t align) const {
   InstructionSet insn_set = GetInstructionSet();
-  switch (insn_set) {
-  case kThumb2:
-  case kArm:
-    CHECK_LE(align, static_cast<uint32_t>(kArmAlignment));
-    break;
-
-  case kX86:
-    CHECK_LE(align, static_cast<uint32_t>(kX86Alignment));
-    break;
-
-  case kMips:
-    CHECK_LE(align, static_cast<uint32_t>(kMipsAlignment));
-    break;
-
-  default:
-    LOG(FATAL) << "Unknown instruction set: " << insn_set;
-  }
+  size_t insn_set_align = GetInstructionSetAlignment(insn_set);
+  CHECK_LE(align, static_cast<uint32_t>(insn_set_align));
 }
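The deleted switch is a faithful record of what the shared helper must return
for the ISAs handled here; a sketch reconstructed from it (treatment of
kArm64 and kX86_64 is an assumption; the real body lives in
instruction_set.h):

    // Reconstructed from the switch removed above; not the actual helper.
    size_t GetInstructionSetAlignment(InstructionSet isa) {
      switch (isa) {
        case kThumb2:
        case kArm:  return kArmAlignment;
        case kX86:  return kX86Alignment;
        case kMips: return kMipsAlignment;
        default:
          LOG(FATAL) << "Unknown instruction set: " << isa;
          return 0;
      }
    }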
 
 
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 2d45a2f..dc66e9c 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -376,7 +376,9 @@
     } else {
       CHECK(quick_code != nullptr);
       offset = compiled_method->AlignCode(offset);
-      DCHECK_ALIGNED(offset, kArmAlignment);
+      DCHECK_ALIGNED_PARAM(offset,
+                           GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
+
       uint32_t code_size = quick_code->size() * sizeof(uint8_t);
       CHECK_NE(code_size, 0U);
       uint32_t thumb_offset = compiled_method->CodeDelta();
@@ -508,11 +510,7 @@
           refs++;
         }
       }
-      InstructionSet trg_isa = compiler_driver_->GetInstructionSet();
-      size_t pointer_size = 4;
-      if (trg_isa == kArm64 || trg_isa == kX86_64) {
-        pointer_size = 8;
-      }
+      size_t pointer_size = GetInstructionSetPointerSize(compiler_driver_->GetInstructionSet());
       size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(pointer_size, refs);
 
       // Get the generic spill masks and base frame size.
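The deleted branch pins down what the new helper returns for these ISAs; an
equivalent sketch (other 64-bit ISAs assumed to follow the same rule):

    // Mirrors the inlined logic removed above; not the real helper body.
    size_t GetInstructionSetPointerSize(InstructionSet isa) {
      return (isa == kArm64 || isa == kX86_64) ? 8 : 4;
    }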
@@ -826,7 +824,9 @@
         relative_offset += aligned_code_delta;
         DCHECK_OFFSET();
       }
-      DCHECK_ALIGNED(relative_offset, kArmAlignment);
+      DCHECK_ALIGNED_PARAM(relative_offset,
+                           GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
+
       uint32_t code_size = quick_code->size() * sizeof(uint8_t);
       CHECK_NE(code_size, 0U);
 
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 0554876..1efdd38 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -25,7 +25,8 @@
 
 namespace art {
 
-void HGraphBuilder::InitializeLocals(int count) {
+void HGraphBuilder::InitializeLocals(uint16_t count) {
+  graph_->SetNumberOfVRegs(count);
   locals_.SetSize(count);
   for (int i = 0; i < count; i++) {
     HLocal* local = new (arena_) HLocal(i);
@@ -34,15 +35,81 @@
   }
 }
 
+bool HGraphBuilder::InitializeParameters(uint16_t number_of_parameters) {
+  // dex_compilation_unit_ is null only when unit testing.
+  if (dex_compilation_unit_ == nullptr) {
+    return true;
+  }
+
+  graph_->SetNumberOfInVRegs(number_of_parameters);
+  const char* shorty = dex_compilation_unit_->GetShorty();
+  int locals_index = locals_.Size() - number_of_parameters;
+  int parameter_index = 0;
+
+  if (!dex_compilation_unit_->IsStatic()) {
+    // Add the implicit 'this' argument, not expressed in the signature.
+    HParameterValue* parameter =
+        new (arena_) HParameterValue(parameter_index++, Primitive::kPrimNot);
+    entry_block_->AddInstruction(parameter);
+    HLocal* local = GetLocalAt(locals_index++);
+    entry_block_->AddInstruction(new (arena_) HStoreLocal(local, parameter));
+    number_of_parameters--;
+  }
+
+  uint32_t pos = 1;
+  for (int i = 0; i < number_of_parameters; i++) {
+    switch (shorty[pos++]) {
+      case 'F':
+      case 'D': {
+        return false;
+      }
+
+      default: {
+        // Integer and reference parameters.
+        HParameterValue* parameter =
+            new (arena_) HParameterValue(parameter_index++, Primitive::GetType(shorty[pos - 1]));
+        entry_block_->AddInstruction(parameter);
+        HLocal* local = GetLocalAt(locals_index++);
+        // Store the parameter value in the local that the dex code will use
+        // to reference that parameter.
+        entry_block_->AddInstruction(new (arena_) HStoreLocal(local, parameter));
+        if (parameter->GetType() == Primitive::kPrimLong) {
+          i++;
+          locals_index++;
+          parameter_index++;
+        }
+        break;
+      }
+    }
+  }
+  return true;
+}
+
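For reference, how the loop above lays out parameters, under the Dex rule
that incoming arguments occupy the last ins_size_ vregs and a long spans two
of them (method and counts hypothetical):

    // Virtual method "void f(int a, long b)", shorty "VIJ",
    // registers_size == 6, ins_size_ == 4, so locals_index starts at 6 - 4 = 2:
    //   v2    <- this  (implicit receiver, kPrimNot)
    //   v3    <- a     (shorty[1] == 'I')
    //   v4,v5 <- b     (shorty[2] == 'J'; i, locals_index and parameter_index
    //                   all take an extra step for the high half)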
 static bool CanHandleCodeItem(const DexFile::CodeItem& code_item) {
   if (code_item.tries_size_ > 0) {
     return false;
-  } else if (code_item.ins_size_ > 0) {
-    return false;
   }
   return true;
 }
 
+template<typename T>
+void HGraphBuilder::If_22t(const Instruction& instruction, int32_t dex_offset, bool is_not) {
+  HInstruction* first = LoadLocal(instruction.VRegA(), Primitive::kPrimInt);
+  HInstruction* second = LoadLocal(instruction.VRegB(), Primitive::kPrimInt);
+  current_block_->AddInstruction(new (arena_) T(first, second));
+  if (is_not) {
+    current_block_->AddInstruction(new (arena_) HNot(current_block_->GetLastInstruction()));
+  }
+  current_block_->AddInstruction(new (arena_) HIf(current_block_->GetLastInstruction()));
+  HBasicBlock* target = FindBlockStartingAt(instruction.GetTargetOffset() + dex_offset);
+  DCHECK(target != nullptr);
+  current_block_->AddSuccessor(target);
+  target = FindBlockStartingAt(dex_offset + instruction.SizeInCodeUnits());
+  DCHECK(target != nullptr);
+  current_block_->AddSuccessor(target);
+  current_block_ = nullptr;
+}
+
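IF_NE below reuses the template by negating the comparison, so until a
dedicated not-equal node exists the HIR for if-ne is HIf(HNot(HEqual(a, b))).
What the call emits, step by step:

    // If_22t<HEqual>(instruction, dex_offset, /* is_not= */ true) for
    // "if-ne vA, vB, +off" appends, in order:
    //   HEqual(LoadLocal(vA), LoadLocal(vB))
    //   HNot(<the HEqual above>)
    //   HIf(<the HNot above>)   // successors: branch target, then fall-through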
 HGraph* HGraphBuilder::BuildGraph(const DexFile::CodeItem& code_item) {
   if (!CanHandleCodeItem(code_item)) {
     return nullptr;
@@ -66,6 +133,10 @@
   // start a new block, and create these blocks.
   ComputeBranchTargets(code_ptr, code_end);
 
+  if (!InitializeParameters(code_item.ins_size_)) {
+    return nullptr;
+  }
+
   size_t dex_offset = 0;
   while (code_ptr < code_end) {
     // Update the current block if dex_offset starts a new block.
@@ -139,6 +210,112 @@
   return branch_targets_.Get(index);
 }
 
+template<typename T>
+void HGraphBuilder::Binop_32x(const Instruction& instruction, Primitive::Type type) {
+  HInstruction* first = LoadLocal(instruction.VRegB(), type);
+  HInstruction* second = LoadLocal(instruction.VRegC(), type);
+  current_block_->AddInstruction(new (arena_) T(type, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+template<typename T>
+void HGraphBuilder::Binop_12x(const Instruction& instruction, Primitive::Type type) {
+  HInstruction* first = LoadLocal(instruction.VRegA(), type);
+  HInstruction* second = LoadLocal(instruction.VRegB(), type);
+  current_block_->AddInstruction(new (arena_) T(type, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+template<typename T>
+void HGraphBuilder::Binop_22s(const Instruction& instruction, bool reverse) {
+  HInstruction* first = LoadLocal(instruction.VRegB(), Primitive::kPrimInt);
+  HInstruction* second = GetIntConstant(instruction.VRegC_22s());
+  if (reverse) {
+    std::swap(first, second);
+  }
+  current_block_->AddInstruction(new (arena_) T(Primitive::kPrimInt, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+template<typename T>
+void HGraphBuilder::Binop_22b(const Instruction& instruction, bool reverse) {
+  HInstruction* first = LoadLocal(instruction.VRegB(), Primitive::kPrimInt);
+  HInstruction* second = GetIntConstant(instruction.VRegC_22b());
+  if (reverse) {
+    std::swap(first, second);
+  }
+  current_block_->AddInstruction(new (arena_) T(Primitive::kPrimInt, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
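The four templates cover the Dex binary-op encodings; the reverse flag exists
because rsub encodes vA := literal - vB while the HIR node always evaluates
first op second. In summary:

    // Binop_32x<T>: vA := vB T vC       (add-int, sub-long, ...)
    // Binop_12x<T>: vA := vA T vB       (the 2addr forms)
    // Binop_22s<T>: vA := vB T #+lit16  (reverse=true swaps operands: rsub-int)
    // Binop_22b<T>: vA := vB T #+lit8   (reverse=true: rsub-int/lit8)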
+void HGraphBuilder::BuildReturn(const Instruction& instruction, Primitive::Type type) {
+  if (type == Primitive::kPrimVoid) {
+    current_block_->AddInstruction(new (arena_) HReturnVoid());
+  } else {
+    HInstruction* value = LoadLocal(instruction.VRegA(), type);
+    current_block_->AddInstruction(new (arena_) HReturn(value));
+  }
+  current_block_->AddSuccessor(exit_block_);
+  current_block_ = nullptr;
+}
+
+bool HGraphBuilder::BuildInvoke(const Instruction& instruction,
+                                uint32_t dex_offset,
+                                uint32_t method_idx,
+                                uint32_t number_of_vreg_arguments,
+                                bool is_range,
+                                uint32_t* args,
+                                uint32_t register_index) {
+  const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
+  const DexFile::ProtoId& proto_id = dex_file_->GetProtoId(method_id.proto_idx_);
+  const char* descriptor = dex_file_->StringDataByIdx(proto_id.shorty_idx_);
+  Primitive::Type return_type = Primitive::GetType(descriptor[0]);
+  bool is_instance_call =
+      instruction.Opcode() != Instruction::INVOKE_STATIC
+      && instruction.Opcode() != Instruction::INVOKE_STATIC_RANGE;
+  const size_t number_of_arguments = strlen(descriptor) - (is_instance_call ? 0 : 1);
+
+  // Treat invoke-direct like static calls for now.
+  HInvoke* invoke = new (arena_) HInvokeStatic(
+      arena_, number_of_arguments, return_type, dex_offset, method_idx);
+
+  size_t start_index = 0;
+  if (is_instance_call) {
+    HInstruction* arg = LoadLocal(is_range ? register_index : args[0], Primitive::kPrimNot);
+    invoke->SetArgumentAt(0, arg);
+    start_index = 1;
+  }
+
+  uint32_t descriptor_index = 1;
+  uint32_t argument_index = start_index;
+  for (size_t i = start_index; i < number_of_vreg_arguments; i++, argument_index++) {
+    Primitive::Type type = Primitive::GetType(descriptor[descriptor_index++]);
+    switch (type) {
+      case Primitive::kPrimFloat:
+      case Primitive::kPrimDouble:
+        return false;
+
+      default: {
+        if (!is_range && type == Primitive::kPrimLong && args[i] + 1 != args[i + 1]) {
+          LOG(WARNING) << "Non-sequential register pair in " << dex_compilation_unit_->GetSymbol()
+                       << " at " << dex_offset;
+          // We do not implement non-sequential register pairs.
+          return false;
+        }
+        HInstruction* arg = LoadLocal(is_range ? register_index + i : args[i], type);
+        invoke->SetArgumentAt(argument_index, arg);
+        if (type == Primitive::kPrimLong) {
+          i++;
+        }
+      }
+    }
+  }
+
+  DCHECK_EQ(argument_index, number_of_arguments);
+  current_block_->AddInstruction(invoke);
+  return true;
+}
+
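The argument count comes from the shorty: its length includes the return
type, so a static call drops one character while an instance call trades that
character for the implicit receiver. A worked case (shorty hypothetical):

    // invoke-virtual, shorty "VIJ" (void return, int and long parameters):
    //   strlen("VIJ") == 3, is_instance_call == true
    //   number_of_arguments = 3 - 0 = 3   // this, int, long
    // Static flavor of the same shorty: 3 - 1 = 2   // int, long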
 bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_t dex_offset) {
   if (current_block_ == nullptr) {
     return true;  // Dead code
@@ -147,30 +324,57 @@
   switch (instruction.Opcode()) {
     case Instruction::CONST_4: {
       int32_t register_index = instruction.VRegA();
-      HIntConstant* constant = GetConstant(instruction.VRegB_11n());
+      HIntConstant* constant = GetIntConstant(instruction.VRegB_11n());
       UpdateLocal(register_index, constant);
       break;
     }
 
+    case Instruction::CONST_16: {
+      int32_t register_index = instruction.VRegA();
+      HIntConstant* constant = GetIntConstant(instruction.VRegB_21s());
+      UpdateLocal(register_index, constant);
+      break;
+    }
+
+    case Instruction::CONST_WIDE_16: {
+      int32_t register_index = instruction.VRegA();
+      HLongConstant* constant = GetLongConstant(instruction.VRegB_21s());
+      UpdateLocal(register_index, constant);
+      break;
+    }
+
+    case Instruction::CONST_WIDE_32: {
+      int32_t register_index = instruction.VRegA();
+      HLongConstant* constant = GetLongConstant(instruction.VRegB_31i());
+      UpdateLocal(register_index, constant);
+      break;
+    }
+
+    case Instruction::CONST_WIDE: {
+      int32_t register_index = instruction.VRegA();
+      HLongConstant* constant = GetLongConstant(instruction.VRegB_51l());
+      UpdateLocal(register_index, constant);
+      break;
+    }
+
+    case Instruction::MOVE: {
+      HInstruction* value = LoadLocal(instruction.VRegB(), Primitive::kPrimInt);
+      UpdateLocal(instruction.VRegA(), value);
+      break;
+    }
+
     case Instruction::RETURN_VOID: {
-      current_block_->AddInstruction(new (arena_) HReturnVoid());
-      current_block_->AddSuccessor(exit_block_);
-      current_block_ = nullptr;
+      BuildReturn(instruction, Primitive::kPrimVoid);
       break;
     }
 
     case Instruction::IF_EQ: {
-      HInstruction* first = LoadLocal(instruction.VRegA());
-      HInstruction* second = LoadLocal(instruction.VRegB());
-      current_block_->AddInstruction(new (arena_) HEqual(first, second));
-      current_block_->AddInstruction(new (arena_) HIf(current_block_->GetLastInstruction()));
-      HBasicBlock* target = FindBlockStartingAt(instruction.GetTargetOffset() + dex_offset);
-      DCHECK(target != nullptr);
-      current_block_->AddSuccessor(target);
-      target = FindBlockStartingAt(dex_offset + instruction.SizeInCodeUnits());
-      DCHECK(target != nullptr);
-      current_block_->AddSuccessor(target);
-      current_block_ = nullptr;
+      If_22t<HEqual>(instruction, dex_offset, false);
+      break;
+    }
+
+    case Instruction::IF_NE: {
+      If_22t<HEqual>(instruction, dex_offset, true);
       break;
     }
 
@@ -186,93 +390,112 @@
     }
 
     case Instruction::RETURN: {
-      HInstruction* value = LoadLocal(instruction.VRegA());
-      current_block_->AddInstruction(new (arena_) HReturn(value));
-      current_block_->AddSuccessor(exit_block_);
-      current_block_ = nullptr;
+      BuildReturn(instruction, Primitive::kPrimInt);
       break;
     }
 
-    case Instruction::INVOKE_STATIC: {
+    case Instruction::RETURN_OBJECT: {
+      BuildReturn(instruction, Primitive::kPrimNot);
+      break;
+    }
+
+    case Instruction::RETURN_WIDE: {
+      BuildReturn(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::INVOKE_STATIC:
+    case Instruction::INVOKE_DIRECT: {
       uint32_t method_idx = instruction.VRegB_35c();
-      const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
-      uint32_t return_type_idx = dex_file_->GetProtoId(method_id.proto_idx_).return_type_idx_;
-      const char* descriptor = dex_file_->StringByTypeIdx(return_type_idx);
-      const size_t number_of_arguments = instruction.VRegA_35c();
-
-      if (Primitive::GetType(descriptor[0]) != Primitive::kPrimVoid) {
-        return false;
-      }
-
-      HInvokeStatic* invoke = new (arena_) HInvokeStatic(
-          arena_, number_of_arguments, dex_offset, method_idx);
-
+      uint32_t number_of_vreg_arguments = instruction.VRegA_35c();
       uint32_t args[5];
       instruction.GetArgs(args);
-
-      for (size_t i = 0; i < number_of_arguments; i++) {
-        HInstruction* arg = LoadLocal(args[i]);
-        HInstruction* push = new (arena_) HPushArgument(arg, i);
-        current_block_->AddInstruction(push);
-        invoke->SetArgumentAt(i, push);
+      if (!BuildInvoke(instruction, dex_offset, method_idx,
+                       number_of_vreg_arguments, false, args, -1)) {
+        return false;
       }
-
-      current_block_->AddInstruction(invoke);
       break;
     }
 
-    case Instruction::INVOKE_STATIC_RANGE: {
+    case Instruction::INVOKE_STATIC_RANGE:
+    case Instruction::INVOKE_DIRECT_RANGE: {
       uint32_t method_idx = instruction.VRegB_3rc();
-      const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
-      uint32_t return_type_idx = dex_file_->GetProtoId(method_id.proto_idx_).return_type_idx_;
-      const char* descriptor = dex_file_->StringByTypeIdx(return_type_idx);
-      const size_t number_of_arguments = instruction.VRegA_3rc();
-
-      if (Primitive::GetType(descriptor[0]) != Primitive::kPrimVoid) {
+      uint32_t number_of_vreg_arguments = instruction.VRegA_3rc();
+      uint32_t register_index = instruction.VRegC();
+      if (!BuildInvoke(instruction, dex_offset, method_idx,
+                       number_of_vreg_arguments, true, nullptr, register_index)) {
         return false;
       }
-
-      HInvokeStatic* invoke = new (arena_) HInvokeStatic(
-          arena_, number_of_arguments, dex_offset, method_idx);
-      int32_t register_index = instruction.VRegC();
-      for (size_t i = 0; i < number_of_arguments; i++) {
-        HInstruction* arg = LoadLocal(register_index + i);
-        HInstruction* push = new (arena_) HPushArgument(arg, i);
-        current_block_->AddInstruction(push);
-        invoke->SetArgumentAt(i, push);
-      }
-      current_block_->AddInstruction(invoke);
       break;
     }
 
     case Instruction::ADD_INT: {
-      HInstruction* first = LoadLocal(instruction.VRegB());
-      HInstruction* second = LoadLocal(instruction.VRegC());
-      current_block_->AddInstruction(new (arena_) HAdd(Primitive::kPrimInt, first, second));
-      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      Binop_32x<HAdd>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::ADD_LONG: {
+      Binop_32x<HAdd>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::SUB_INT: {
+      Binop_32x<HSub>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SUB_LONG: {
+      Binop_32x<HSub>(instruction, Primitive::kPrimLong);
       break;
     }
 
     case Instruction::ADD_INT_2ADDR: {
-      HInstruction* first = LoadLocal(instruction.VRegA());
-      HInstruction* second = LoadLocal(instruction.VRegB());
-      current_block_->AddInstruction(new (arena_) HAdd(Primitive::kPrimInt, first, second));
-      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      Binop_12x<HAdd>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::ADD_LONG_2ADDR: {
+      Binop_12x<HAdd>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::SUB_INT_2ADDR: {
+      Binop_12x<HSub>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SUB_LONG_2ADDR: {
+      Binop_12x<HSub>(instruction, Primitive::kPrimLong);
       break;
     }
 
     case Instruction::ADD_INT_LIT16: {
-      HInstruction* first = LoadLocal(instruction.VRegB());
-      HInstruction* second = GetConstant(instruction.VRegC_22s());
-      current_block_->AddInstruction(new (arena_) HAdd(Primitive::kPrimInt, first, second));
-      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      Binop_22s<HAdd>(instruction, false);
+      break;
+    }
+
+    case Instruction::RSUB_INT: {
+      Binop_22s<HSub>(instruction, true);
       break;
     }
 
     case Instruction::ADD_INT_LIT8: {
-      HInstruction* first = LoadLocal(instruction.VRegB());
-      HInstruction* second = GetConstant(instruction.VRegC_22b());
-      current_block_->AddInstruction(new (arena_) HAdd(Primitive::kPrimInt, first, second));
+      Binop_22b<HAdd>(instruction, false);
+      break;
+    }
+
+    case Instruction::RSUB_INT_LIT8: {
+      Binop_22b<HSub>(instruction, true);
+      break;
+    }
+
+    case Instruction::NEW_INSTANCE: {
+      current_block_->AddInstruction(
+          new (arena_) HNewInstance(dex_offset, instruction.VRegB_21c()));
+      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      break;
+    }
+
+    case Instruction::MOVE_RESULT_WIDE: {
       UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
       break;
     }
@@ -286,7 +509,7 @@
   return true;
 }
 
-HIntConstant* HGraphBuilder::GetConstant0() {
+HIntConstant* HGraphBuilder::GetIntConstant0() {
   if (constant0_ != nullptr) {
     return constant0_;
   }
@@ -295,7 +518,7 @@
   return constant0_;
 }
 
-HIntConstant* HGraphBuilder::GetConstant1() {
+HIntConstant* HGraphBuilder::GetIntConstant1() {
   if (constant1_ != nullptr) {
     return constant1_;
   }
@@ -304,10 +527,10 @@
   return constant1_;
 }
 
-HIntConstant* HGraphBuilder::GetConstant(int constant) {
+HIntConstant* HGraphBuilder::GetIntConstant(int32_t constant) {
   switch (constant) {
-    case 0: return GetConstant0();
-    case 1: return GetConstant1();
+    case 0: return GetIntConstant0();
+    case 1: return GetIntConstant1();
     default: {
       HIntConstant* instruction = new (arena_) HIntConstant(constant);
       entry_block_->AddInstruction(instruction);
@@ -316,6 +539,12 @@
   }
 }
 
+HLongConstant* HGraphBuilder::GetLongConstant(int64_t constant) {
+  HLongConstant* instruction = new (arena_) HLongConstant(constant);
+  entry_block_->AddInstruction(instruction);
+  return instruction;
+}
+
 HLocal* HGraphBuilder::GetLocalAt(int register_index) const {
   return locals_.Get(register_index);
 }
@@ -325,9 +554,9 @@
   current_block_->AddInstruction(new (arena_) HStoreLocal(local, instruction));
 }
 
-HInstruction* HGraphBuilder::LoadLocal(int register_index) const {
+HInstruction* HGraphBuilder::LoadLocal(int register_index, Primitive::Type type) const {
   HLocal* local = GetLocalAt(register_index);
-  current_block_->AddInstruction(new (arena_) HLoadLocal(local));
+  current_block_->AddInstruction(new (arena_) HLoadLocal(local, type));
   return current_block_->GetLastInstruction();
 }
 
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 46ca9aa..108514a 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -19,6 +19,7 @@
 
 #include "dex_file.h"
 #include "driver/dex_compilation_unit.h"
+#include "primitive.h"
 #include "utils/allocation.h"
 #include "utils/growable_array.h"
 
@@ -29,13 +30,14 @@
 class HBasicBlock;
 class HGraph;
 class HIntConstant;
+class HLongConstant;
 class HInstruction;
 class HLocal;
 
 class HGraphBuilder : public ValueObject {
  public:
   HGraphBuilder(ArenaAllocator* arena,
-                const DexCompilationUnit* dex_compilation_unit = nullptr,
+                DexCompilationUnit* dex_compilation_unit = nullptr,
                 const DexFile* dex_file = nullptr)
       : arena_(arena),
         branch_targets_(arena, 0),
@@ -63,13 +65,43 @@
   void MaybeUpdateCurrentBlock(size_t index);
   HBasicBlock* FindBlockStartingAt(int32_t index) const;
 
-  HIntConstant* GetConstant0();
-  HIntConstant* GetConstant1();
-  HIntConstant* GetConstant(int constant);
-  void InitializeLocals(int count);
+  HIntConstant* GetIntConstant0();
+  HIntConstant* GetIntConstant1();
+  HIntConstant* GetIntConstant(int32_t constant);
+  HLongConstant* GetLongConstant(int64_t constant);
+  void InitializeLocals(uint16_t count);
   HLocal* GetLocalAt(int register_index) const;
   void UpdateLocal(int register_index, HInstruction* instruction) const;
-  HInstruction* LoadLocal(int register_index) const;
+  HInstruction* LoadLocal(int register_index, Primitive::Type type) const;
+
+  // Temporary: returns whether the compiler currently supports the
+  // parameters of the method.
+  bool InitializeParameters(uint16_t number_of_parameters);
+
+  template<typename T>
+  void Binop_32x(const Instruction& instruction, Primitive::Type type);
+
+  template<typename T>
+  void Binop_12x(const Instruction& instruction, Primitive::Type type);
+
+  template<typename T>
+  void Binop_22b(const Instruction& instruction, bool reverse);
+
+  template<typename T>
+  void Binop_22s(const Instruction& instruction, bool reverse);
+
+  template<typename T> void If_22t(const Instruction& instruction, int32_t dex_offset, bool is_not);
+
+  void BuildReturn(const Instruction& instruction, Primitive::Type type);
+
+  // Builds an invocation node and returns whether the instruction is supported.
+  bool BuildInvoke(const Instruction& instruction,
+                   uint32_t dex_offset,
+                   uint32_t method_idx,
+                   uint32_t number_of_vreg_arguments,
+                   bool is_range,
+                   uint32_t* args,
+                   uint32_t register_index);
 
   ArenaAllocator* const arena_;
 
@@ -89,7 +121,7 @@
   HIntConstant* constant1_;
 
   const DexFile* const dex_file_;
-  const DexCompilationUnit* const dex_compilation_unit_;
+  DexCompilationUnit* const dex_compilation_unit_;
 
   DISALLOW_COPY_AND_ASSIGN(HGraphBuilder);
 };
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 05e5d7b..7e63c69 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -30,12 +30,11 @@
 namespace art {
 
 void CodeGenerator::Compile(CodeAllocator* allocator) {
-  frame_size_ = GetGraph()->GetMaximumNumberOfOutVRegs() * kWordSize;
   const GrowableArray<HBasicBlock*>* blocks = GetGraph()->GetBlocks();
   DCHECK(blocks->Get(0) == GetGraph()->GetEntryBlock());
   DCHECK(GoesToNextBlock(GetGraph()->GetEntryBlock(), blocks->Get(1)));
-  CompileEntryBlock();
-  for (size_t i = 1; i < blocks->Size(); i++) {
+  GenerateFrameEntry();
+  for (size_t i = 0; i < blocks->Size(); i++) {
     CompileBlock(blocks->Get(i));
   }
   size_t code_size = GetAssembler()->CodeSize();
@@ -44,32 +43,11 @@
   GetAssembler()->FinalizeInstructions(code);
 }
 
-void CodeGenerator::CompileEntryBlock() {
-  HGraphVisitor* location_builder = GetLocationBuilder();
-  HGraphVisitor* instruction_visitor = GetInstructionVisitor();
-  // The entry block contains all locals for this method. By visiting the entry block,
-  // we're computing the required frame size.
-  for (HInstructionIterator it(GetGraph()->GetEntryBlock()); !it.Done(); it.Advance()) {
-    HInstruction* current = it.Current();
-    // Instructions in the entry block should not generate code.
-    if (kIsDebugBuild) {
-      current->Accept(location_builder);
-      DCHECK(current->GetLocations() == nullptr);
-    }
-    current->Accept(instruction_visitor);
-  }
-  GenerateFrameEntry();
-}
-
 void CodeGenerator::CompileBlock(HBasicBlock* block) {
   Bind(GetLabelOf(block));
   HGraphVisitor* location_builder = GetLocationBuilder();
   HGraphVisitor* instruction_visitor = GetInstructionVisitor();
   for (HInstructionIterator it(block); !it.Done(); it.Advance()) {
-    // For each instruction, we emulate a stack-based machine, where the inputs are popped from
-    // the runtime stack, and the result is pushed on the stack. We currently can do this because
-    // we do not perform any code motion, and the Dex format does not reference individual
-    // instructions but uses registers instead (our equivalent of HLocal).
     HInstruction* current = it.Current();
     current->Accept(location_builder);
     InitLocations(current);
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 01bbcc0..5c7cac1 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_H_
 #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_H_
 
+#include "base/bit_field.h"
 #include "globals.h"
 #include "instruction_set.h"
 #include "memory_region.h"
@@ -25,6 +26,8 @@
 
 namespace art {
 
+static size_t constexpr kVRegSize = 4;
+
 class DexCompilationUnit;
 
 class CodeAllocator {
@@ -49,30 +52,149 @@
  */
 class Location : public ValueObject {
  public:
-  template<typename T>
-  T reg() const { return static_cast<T>(reg_); }
+  enum Kind {
+    kInvalid = 0,
+    kStackSlot = 1,  // Word-size slot.
+    kDoubleStackSlot = 2,  // 64-bit stack slot.
+    kRegister = 3,
+    // On 32-bit architectures, Quick can pass a long where the
+    // low bits are in the last parameter register, and the high
+    // bits are in a stack slot. The kQuickParameter kind is for
+    // handling this special case.
+    kQuickParameter = 4,
+  };
 
-  Location() : reg_(kInvalid) { }
-  explicit Location(uword reg) : reg_(reg) { }
-
-  static Location RegisterLocation(uword reg) {
-    return Location(reg);
+  Location() : value_(kInvalid) {
+    DCHECK(!IsValid());
   }
 
-  bool IsValid() const { return reg_ != kInvalid; }
-
-  Location(const Location& other) : reg_(other.reg_) { }
+  Location(const Location& other) : ValueObject(), value_(other.value_) {}
 
   Location& operator=(const Location& other) {
-    reg_ = other.reg_;
+    value_ = other.value_;
     return *this;
   }
 
+  bool IsValid() const {
+    return value_ != kInvalid;
+  }
+
+  // Register locations.
+  static Location RegisterLocation(ManagedRegister reg) {
+    return Location(kRegister, reg.RegId());
+  }
+
+  bool IsRegister() const {
+    return GetKind() == kRegister;
+  }
+
+  ManagedRegister reg() const {
+    DCHECK(IsRegister());
+    return static_cast<ManagedRegister>(GetPayload());
+  }
+
+  static uword EncodeStackIndex(intptr_t stack_index) {
+    DCHECK(-kStackIndexBias <= stack_index);
+    DCHECK(stack_index < kStackIndexBias);
+    return static_cast<uword>(kStackIndexBias + stack_index);
+  }
+
+  static Location StackSlot(intptr_t stack_index) {
+    uword payload = EncodeStackIndex(stack_index);
+    Location loc(kStackSlot, payload);
+    // Ensure that sign is preserved.
+    DCHECK_EQ(loc.GetStackIndex(), stack_index);
+    return loc;
+  }
+
+  bool IsStackSlot() const {
+    return GetKind() == kStackSlot;
+  }
+
+  static Location DoubleStackSlot(intptr_t stack_index) {
+    uword payload = EncodeStackIndex(stack_index);
+    Location loc(kDoubleStackSlot, payload);
+    // Ensure that sign is preserved.
+    DCHECK_EQ(loc.GetStackIndex(), stack_index);
+    return loc;
+  }
+
+  bool IsDoubleStackSlot() const {
+    return GetKind() == kDoubleStackSlot;
+  }
+
+  intptr_t GetStackIndex() const {
+    DCHECK(IsStackSlot() || IsDoubleStackSlot());
+    // Decode stack index manually to preserve sign.
+    return GetPayload() - kStackIndexBias;
+  }
+
+  intptr_t GetHighStackIndex(uintptr_t word_size) const {
+    DCHECK(IsDoubleStackSlot());
+    // Decode stack index manually to preserve sign.
+    return GetPayload() - kStackIndexBias + word_size;
+  }
+
+  static Location QuickParameter(uint32_t parameter_index) {
+    return Location(kQuickParameter, parameter_index);
+  }
+
+  uint32_t GetQuickParameterIndex() const {
+    DCHECK(IsQuickParameter());
+    return GetPayload();
+  }
+
+  bool IsQuickParameter() const {
+    return GetKind() == kQuickParameter;
+  }
+
+  arm::ArmManagedRegister AsArm() const;
+  x86::X86ManagedRegister AsX86() const;
+
+  Kind GetKind() const {
+    return KindField::Decode(value_);
+  }
+
+  bool Equals(Location other) const {
+    return value_ == other.value_;
+  }
+
+  const char* DebugString() const {
+    switch (GetKind()) {
+      case kInvalid: return "?";
+      case kRegister: return "R";
+      case kStackSlot: return "S";
+      case kDoubleStackSlot: return "DS";
+      case kQuickParameter: return "Q";
+    }
+    return "?";
+  }
+
  private:
-  // The target register for that location.
-  // TODO: Support stack location.
-  uword reg_;
-  static const uword kInvalid = -1;
+  // Number of bits required to encode Kind value.
+  static constexpr uint32_t kBitsForKind = 4;
+  static constexpr uint32_t kBitsForPayload = kWordSize * kBitsPerByte - kBitsForKind;
+
+  explicit Location(uword value) : value_(value) {}
+
+  Location(Kind kind, uword payload)
+      : value_(KindField::Encode(kind) | PayloadField::Encode(payload)) {}
+
+  uword GetPayload() const {
+    return PayloadField::Decode(value_);
+  }
+
+  typedef BitField<Kind, 0, kBitsForKind> KindField;
+  typedef BitField<uword, kBitsForKind, kBitsForPayload> PayloadField;
+
+  // Layout for stack slots.
+  static const intptr_t kStackIndexBias =
+      static_cast<intptr_t>(1) << (kBitsForPayload - 1);
+
+  // Location either contains kind and payload fields or a tagged handle for
+  // a constant location. Values of enumeration Kind are selected in such a
+  // way that none of them can be interpreted as a kConstant tag.
+  uword value_;
 };
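A usage sketch of the encoding: the low kBitsForKind bits tag the kind and
the payload carries a register id, a biased stack index, or a Quick parameter
index; the bias keeps negative stack indices representable in an unsigned
payload. Relying only on the API declared above:

    Location slot = Location::StackSlot(-4);
    CHECK(slot.IsStackSlot());
    CHECK_EQ(slot.GetStackIndex(), -4);       // payload was kStackIndexBias - 4
    Location wide = Location::DoubleStackSlot(8);
    CHECK_EQ(wide.GetHighStackIndex(4), 12);  // low half at 8, high at 8 + word size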
 
 /**
@@ -145,6 +267,7 @@
   virtual HGraphVisitor* GetLocationBuilder() = 0;
   virtual HGraphVisitor* GetInstructionVisitor() = 0;
   virtual Assembler* GetAssembler() = 0;
+  virtual size_t GetWordSize() const = 0;
 
   uint32_t GetFrameSize() const { return frame_size_; }
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
@@ -179,7 +302,6 @@
  private:
   void InitLocations(HInstruction* instruction);
   void CompileBlock(HBasicBlock* block);
-  void CompileEntryBlock();
 
   HGraph* const graph_;
 
@@ -203,11 +325,10 @@
     return registers_[index];
   }
 
-  uint8_t GetStackOffsetOf(size_t index) const {
-    DCHECK_GE(index, number_of_registers_);
+  uint8_t GetStackOffsetOf(size_t index, size_t word_size) const {
     // We still reserve the space for parameters passed by registers.
-    // Add kWordSize for the method pointer.
-    return index * kWordSize + kWordSize;
+    // Add word_size for the method pointer.
+    return index * kVRegSize + word_size;
   }
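With the word size now a parameter, the same computation serves both targets:
argument slots stay vreg-sized, but the method pointer at the bottom of the
out-args area is pointer-sized. For example:

    // 32-bit target: GetStackOffsetOf(3, 4) == 3 * kVRegSize + 4 == 16
    // 64-bit target: GetStackOffsetOf(3, 8) == 3 * kVRegSize + 8 == 20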
 
  private:
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 09d6f7b..27691ac 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm.h"
 #include "utils/assembler.h"
 #include "utils/arm/assembler_arm.h"
+#include "utils/arm/managed_register_arm.h"
 
 #include "mirror/array.h"
 #include "mirror/art_method.h"
@@ -24,11 +25,20 @@
 #define __ reinterpret_cast<ArmAssembler*>(GetAssembler())->
 
 namespace art {
+
+arm::ArmManagedRegister Location::AsArm() const {
+  return reg().AsArm();
+}
+
 namespace arm {
 
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
+static Location ArmCoreLocation(Register reg) {
+  return Location::RegisterLocation(ArmManagedRegister::FromCoreRegister(reg));
+}
+
 InstructionCodeGeneratorARM::InstructionCodeGeneratorARM(HGraph* graph, CodeGeneratorARM* codegen)
       : HGraphVisitor(graph),
         assembler_(codegen->GetAssembler()),
@@ -38,15 +48,19 @@
   core_spill_mask_ |= (1 << LR);
   __ PushList((1 << LR));
 
-  // Add the current ART method to the frame size and the return PC.
-  SetFrameSize(RoundUp(GetFrameSize() + 2 * kWordSize, kStackAlignment));
-  // The retrn PC has already been pushed on the stack.
-  __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
+  SetFrameSize(RoundUp(
+      (GetGraph()->GetMaximumNumberOfOutVRegs() + GetGraph()->GetNumberOfVRegs()) * kVRegSize
+      + kVRegSize  // filler
+      + kArmWordSize  // Art method
+      + kNumberOfPushedRegistersAtEntry * kArmWordSize,
+      kStackAlignment));
+  // The return PC has already been pushed on the stack.
+  __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize));
   __ str(R0, Address(SP, 0));
 }
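A worked instance of the new frame-size formula (counts hypothetical;
kStackAlignment assumed to be 16):

    // 3 out vregs, 5 method vregs, ARM (kVRegSize == kArmWordSize == 4):
    //   (3 + 5) * 4   out args + locals  = 32
    //   + 4           filler             = 36
    //   + 4           ART method         = 40
    //   + 1 * 4       pushed LR          = 44
    //   RoundUp(44, 16)                  = 48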
 
 void CodeGeneratorARM::GenerateFrameExit() {
-  __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize);
+  __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize);
   __ PopList((1 << PC));
 }
 
@@ -55,21 +69,210 @@
 }
 
 int32_t CodeGeneratorARM::GetStackSlot(HLocal* local) const {
-  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kWordSize;
+  uint16_t reg_number = local->GetRegNumber();
+  uint16_t number_of_vregs = GetGraph()->GetNumberOfVRegs();
+  uint16_t number_of_in_vregs = GetGraph()->GetNumberOfInVRegs();
+  if (reg_number >= number_of_vregs - number_of_in_vregs) {
+    // Local is a parameter of the method. It is stored in the caller's frame.
+    return GetFrameSize() + kArmWordSize  // ART method
+                          + (reg_number - number_of_vregs + number_of_in_vregs) * kVRegSize;
+  } else {
+    // Local is a temporary in this method. It is stored in this method's frame.
+    return GetFrameSize() - (kNumberOfPushedRegistersAtEntry * kArmWordSize)
+                          - kVRegSize  // filler.
+                          - (number_of_vregs * kVRegSize)
+                          + (reg_number * kVRegSize);
+  }
+}
+
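How the two branches above resolve for a hypothetical method with
number_of_vregs == 5 and number_of_in_vregs == 2 (so v3 and v4 are the
parameters), writing F for GetFrameSize():

    //   v3 (parameter): F + kArmWordSize + (3 - 5 + 2) * kVRegSize = F + 4
    //   v4 (parameter): F + kArmWordSize + (4 - 5 + 2) * kVRegSize = F + 8
    //   v0 (temporary): F - 1 * kArmWordSize - kVRegSize
    //                     - 5 * kVRegSize + 0 * kVRegSize          = F - 28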
+Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot: {
+      uint32_t index = gp_index_++;
+      if (index < calling_convention.GetNumberOfRegisters()) {
+        return ArmCoreLocation(calling_convention.GetRegisterAt(index));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(index, kArmWordSize));
+      }
+    }
+
+    case Primitive::kPrimLong: {
+      uint32_t index = gp_index_;
+      gp_index_ += 2;
+      if (index + 1 < calling_convention.GetNumberOfRegisters()) {
+        return Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(
+            calling_convention.GetRegisterPairAt(index)));
+      } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
+        return Location::QuickParameter(index);
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index, kArmWordSize));
+      }
+    }
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented parameter type " << type;
+      break;
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Unexpected parameter type " << type;
+      break;
+  }
+  return Location();
+}
+
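The long case above has three outcomes depending on where gp_index_ lands
relative to the register count; the middle one is the kQuickParameter
situation documented in code_generator.h. Assuming the managed calling
convention exposes three core argument registers after the method register:

    //   gp_index 0, two registers free -> register pair (e.g. R1_R2)
    //   gp_index 2, one register free  -> QuickParameter(2): low half in the
    //                                     last register, high half on the stack
    //   gp_index 3, none free          -> DoubleStackSlot(...)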
+void CodeGeneratorARM::Move32(Location destination, Location source) {
+  if (source.Equals(destination)) {
+    return;
+  }
+  if (destination.IsRegister()) {
+    if (source.IsRegister()) {
+      __ Mov(destination.AsArm().AsCoreRegister(), source.AsArm().AsCoreRegister());
+    } else {
+      __ ldr(destination.AsArm().AsCoreRegister(), Address(SP, source.GetStackIndex()));
+    }
+  } else {
+    DCHECK(destination.IsStackSlot());
+    if (source.IsRegister()) {
+      __ str(source.AsArm().AsCoreRegister(), Address(SP, destination.GetStackIndex()));
+    } else {
+      __ ldr(R0, Address(SP, source.GetStackIndex()));
+      __ str(R0, Address(SP, destination.GetStackIndex()));
+    }
+  }
+}
+
+void CodeGeneratorARM::Move64(Location destination, Location source) {
+  if (source.Equals(destination)) {
+    return;
+  }
+  if (destination.IsRegister()) {
+    if (source.IsRegister()) {
+      __ Mov(destination.AsArm().AsRegisterPairLow(), source.AsArm().AsRegisterPairLow());
+      __ Mov(destination.AsArm().AsRegisterPairHigh(), source.AsArm().AsRegisterPairHigh());
+    } else if (source.IsQuickParameter()) {
+      uint32_t argument_index = source.GetQuickParameterIndex();
+      InvokeDexCallingConvention calling_convention;
+      __ Mov(destination.AsArm().AsRegisterPairLow(),
+             calling_convention.GetRegisterAt(argument_index));
+      __ ldr(destination.AsArm().AsRegisterPairHigh(),
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize) + GetFrameSize()));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      if (destination.AsArm().AsRegisterPair() == R1_R2) {
+        __ ldr(R1, Address(SP, source.GetStackIndex()));
+        __ ldr(R2, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      } else {
+        __ LoadFromOffset(kLoadWordPair, destination.AsArm().AsRegisterPairLow(),
+                          SP, source.GetStackIndex());
+      }
+    }
+  } else if (destination.IsQuickParameter()) {
+    InvokeDexCallingConvention calling_convention;
+    uint32_t argument_index = destination.GetQuickParameterIndex();
+    if (source.IsRegister()) {
+      __ Mov(calling_convention.GetRegisterAt(argument_index), source.AsArm().AsRegisterPairLow());
+      __ str(source.AsArm().AsRegisterPairHigh(),
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ ldr(calling_convention.GetRegisterAt(argument_index), Address(SP, source.GetStackIndex()));
+      __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      __ str(R0, Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
+    }
+  } else {
+    DCHECK(destination.IsDoubleStackSlot());
+    if (source.IsRegister()) {
+      if (source.AsArm().AsRegisterPair() == R1_R2) {
+        __ str(R1, Address(SP, destination.GetStackIndex()));
+        __ str(R2, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+      } else {
+        __ StoreToOffset(kStoreWordPair, source.AsArm().AsRegisterPairLow(),
+                         SP, destination.GetStackIndex());
+      }
+    } else if (source.IsQuickParameter()) {
+      InvokeDexCallingConvention calling_convention;
+      uint32_t argument_index = source.GetQuickParameterIndex();
+      __ str(calling_convention.GetRegisterAt(argument_index),
+             Address(SP, destination.GetStackIndex()));
+      __ ldr(R0,
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize) + GetFrameSize()));
+      __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ ldr(R0, Address(SP, source.GetStackIndex()));
+      __ str(R0, Address(SP, destination.GetStackIndex()));
+      __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+    }
+  }
 }
 
 void CodeGeneratorARM::Move(HInstruction* instruction, Location location, HInstruction* move_for) {
   if (instruction->AsIntConstant() != nullptr) {
-    __ LoadImmediate(location.reg<Register>(), instruction->AsIntConstant()->GetValue());
+    int32_t value = instruction->AsIntConstant()->GetValue();
+    if (location.IsRegister()) {
+      __ LoadImmediate(location.AsArm().AsCoreRegister(), value);
+    } else {
+      __ LoadImmediate(R0, value);
+      __ str(R0, Address(SP, location.GetStackIndex()));
+    }
+  } else if (instruction->AsLongConstant() != nullptr) {
+    int64_t value = instruction->AsLongConstant()->GetValue();
+    if (location.IsRegister()) {
+      __ LoadImmediate(location.AsArm().AsRegisterPairLow(), Low32Bits(value));
+      __ LoadImmediate(location.AsArm().AsRegisterPairHigh(), High32Bits(value));
+    } else {
+      __ LoadImmediate(R0, Low32Bits(value));
+      __ str(R0, Address(SP, location.GetStackIndex()));
+      __ LoadImmediate(R0, High32Bits(value));
+      __ str(R0, Address(SP, location.GetHighStackIndex(kArmWordSize)));
+    }
   } else if (instruction->AsLoadLocal() != nullptr) {
-    __ LoadFromOffset(kLoadWord, location.reg<Register>(),
-                      SP, GetStackSlot(instruction->AsLoadLocal()->GetLocal()));
+    uint32_t stack_slot = GetStackSlot(instruction->AsLoadLocal()->GetLocal());
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        Move32(location, Location::StackSlot(stack_slot));
+        break;
+
+      case Primitive::kPrimLong:
+        Move64(location, Location::DoubleStackSlot(stack_slot));
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+    }
   } else {
     // This can currently only happen when the instruction that requests the move
     // is the next to be compiled.
     DCHECK_EQ(instruction->GetNext(), move_for);
-    __ mov(location.reg<Register>(),
-           ShifterOperand(instruction->GetLocations()->Out().reg<Register>()));
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimNot:
+      case Primitive::kPrimInt:
+        Move32(location, instruction->GetLocations()->Out());
+        break;
+
+      case Primitive::kPrimLong:
+        Move64(location, instruction->GetLocations()->Out());
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+    }
   }
 }
 
@@ -99,13 +302,13 @@
 
 void LocationsBuilderARM::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  locations->SetInAt(0, Location(R0));
+  locations->SetInAt(0, ArmCoreLocation(R0));
   if_instr->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorARM::VisitIf(HIf* if_instr) {
   // TODO: Generate the input as a condition, instead of materializing in a register.
-  __ cmp(if_instr->GetLocations()->InAt(0).reg<Register>(), ShifterOperand(0));
+  __ cmp(if_instr->GetLocations()->InAt(0).AsArm().AsCoreRegister(), ShifterOperand(0));
   __ b(codegen_->GetLabelOf(if_instr->IfFalseSuccessor()), EQ);
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) {
     __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
@@ -114,18 +317,18 @@
 
 void LocationsBuilderARM::VisitEqual(HEqual* equal) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal);
-  locations->SetInAt(0, Location(R0));
-  locations->SetInAt(1, Location(R1));
-  locations->SetOut(Location(R0));
+  locations->SetInAt(0, ArmCoreLocation(R0));
+  locations->SetInAt(1, ArmCoreLocation(R1));
+  locations->SetOut(ArmCoreLocation(R0));
   equal->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorARM::VisitEqual(HEqual* equal) {
   LocationSummary* locations = equal->GetLocations();
-  __ teq(locations->InAt(0).reg<Register>(),
-         ShifterOperand(locations->InAt(1).reg<Register>()));
-  __ mov(locations->Out().reg<Register>(), ShifterOperand(1), EQ);
-  __ mov(locations->Out().reg<Register>(), ShifterOperand(0), NE);
+  __ teq(locations->InAt(0).AsArm().AsCoreRegister(),
+         ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister()));
+  __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(1), EQ);
+  __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(0), NE);
 }
 
 void LocationsBuilderARM::VisitLocal(HLocal* local) {
@@ -134,7 +337,6 @@
 
 void InstructionCodeGeneratorARM::VisitLocal(HLocal* local) {
   DCHECK_EQ(local->GetBlock(), GetGraph()->GetEntryBlock());
-  codegen_->SetFrameSize(codegen_->GetFrameSize() + kWordSize);
 }
 
 void LocationsBuilderARM::VisitLoadLocal(HLoadLocal* load) {
@@ -147,14 +349,27 @@
 
 void LocationsBuilderARM::VisitStoreLocal(HStoreLocal* store) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(store);
-  locations->SetInAt(1, Location(R0));
+  switch (store->InputAt(1)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+  }
   store->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorARM::VisitStoreLocal(HStoreLocal* store) {
-  LocationSummary* locations = store->GetLocations();
-  __ StoreToOffset(kStoreWord, locations->InAt(1).reg<Register>(),
-                   SP, codegen_->GetStackSlot(store->GetLocal()));
 }
 
 void LocationsBuilderARM::VisitIntConstant(HIntConstant* constant) {
@@ -165,6 +380,14 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderARM::VisitLongConstant(HLongConstant* constant) {
+  constant->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARM::VisitLongConstant(HLongConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -175,56 +398,83 @@
 
 void LocationsBuilderARM::VisitReturn(HReturn* ret) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(ret);
-  locations->SetInAt(0, Location(R0));
+  switch (ret->InputAt(0)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetInAt(0, ArmCoreLocation(R0));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+  }
+
   ret->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorARM::VisitReturn(HReturn* ret) {
-  DCHECK_EQ(ret->GetLocations()->InAt(0).reg<Register>(), R0);
+  if (kIsDebugBuild) {
+    switch (ret->InputAt(0)->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsArm().AsCoreRegister(), R0);
+        break;
+
+      case Primitive::kPrimLong:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsArm().AsRegisterPair(), R0_R1);
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+    }
+  }
   codegen_->GenerateFrameExit();
 }
 
-static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
-static constexpr int kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
-
-class InvokeStaticCallingConvention : public CallingConvention<Register> {
- public:
-  InvokeStaticCallingConvention()
-      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(InvokeStaticCallingConvention);
-};
-
-void LocationsBuilderARM::VisitPushArgument(HPushArgument* argument) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(argument);
-  InvokeStaticCallingConvention calling_convention;
-  if (argument->GetArgumentIndex() < calling_convention.GetNumberOfRegisters()) {
-    Location location = Location(calling_convention.GetRegisterAt(argument->GetArgumentIndex()));
-    locations->SetInAt(0, location);
-    locations->SetOut(location);
-  } else {
-    locations->SetInAt(0, Location(R0));
-  }
-  argument->SetLocations(locations);
-}
-
-void InstructionCodeGeneratorARM::VisitPushArgument(HPushArgument* argument) {
-  uint8_t argument_index = argument->GetArgumentIndex();
-  InvokeStaticCallingConvention calling_convention;
-  size_t parameter_registers = calling_convention.GetNumberOfRegisters();
-  LocationSummary* locations = argument->GetLocations();
-  if (argument_index >= parameter_registers) {
-    uint8_t offset = calling_convention.GetStackOffsetOf(argument_index);
-    __ StoreToOffset(kStoreWord, locations->InAt(0).reg<Register>(), SP, offset);
-  } else {
-    DCHECK_EQ(locations->Out().reg<Register>(), locations->InAt(0).reg<Register>());
-  }
-}
-
 void LocationsBuilderARM::VisitInvokeStatic(HInvokeStatic* invoke) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(invoke);
-  locations->AddTemp(Location(R0));
+  locations->AddTemp(ArmCoreLocation(R0));
+
+  InvokeDexCallingConventionVisitor calling_convention_visitor;
+  for (int i = 0; i < invoke->InputCount(); i++) {
+    HInstruction* input = invoke->InputAt(i);
+    locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
+  }
+
+  switch (invoke->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetOut(ArmCoreLocation(R0));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetOut(Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      break;
+
+    case Primitive::kPrimVoid:
+      break;
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
+      break;
+  }
+
   invoke->SetLocations(locations);
 }
 
@@ -233,9 +483,9 @@
 }
 
 void InstructionCodeGeneratorARM::VisitInvokeStatic(HInvokeStatic* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).reg<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsArm().AsCoreRegister();
   size_t index_in_cache = mirror::Array::DataOffset(sizeof(mirror::Object*)).Int32Value() +
-      invoke->GetIndexInDexCache() * kWordSize;
+      invoke->GetIndexInDexCache() * kArmWordSize;
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -263,13 +513,30 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(add);
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
-      locations->SetInAt(0, Location(R0));
-      locations->SetInAt(1, Location(R1));
-      locations->SetOut(Location(R0));
+      locations->SetInAt(0, ArmCoreLocation(R0));
+      locations->SetInAt(1, ArmCoreLocation(R1));
+      locations->SetOut(ArmCoreLocation(R0));
       break;
     }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(
+          0, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      locations->SetInAt(
+          1, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R2_R3)));
+      locations->SetOut(Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented";
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
   }
   add->SetLocations(locations);
 }
@@ -278,14 +545,154 @@
   LocationSummary* locations = add->GetLocations();
   switch (add->GetResultType()) {
     case Primitive::kPrimInt:
-      __ add(locations->Out().reg<Register>(),
-             locations->InAt(0).reg<Register>(),
-             ShifterOperand(locations->InAt(1).reg<Register>()));
+      __ add(locations->Out().AsArm().AsCoreRegister(),
+             locations->InAt(0).AsArm().AsCoreRegister(),
+             ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister()));
       break;
+
+    case Primitive::kPrimLong:
+      __ adds(locations->Out().AsArm().AsRegisterPairLow(),
+              locations->InAt(0).AsArm().AsRegisterPairLow(),
+              ShifterOperand(locations->InAt(1).AsArm().AsRegisterPairLow()));
+      __ adc(locations->Out().AsArm().AsRegisterPairHigh(),
+             locations->InAt(0).AsArm().AsRegisterPairHigh(),
+             ShifterOperand(locations->InAt(1).AsArm().AsRegisterPairHigh()));
+      break;
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented";
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
   }
 }
 
+void LocationsBuilderARM::VisitSub(HSub* sub) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(sub);
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt: {
+      locations->SetInAt(0, ArmCoreLocation(R0));
+      locations->SetInAt(1, ArmCoreLocation(R1));
+      locations->SetOut(ArmCoreLocation(R0));
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(
+          0, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      locations->SetInAt(
+          1, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R2_R3)));
+      locations->SetOut(Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+  sub->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorARM::VisitSub(HSub* sub) {
+  LocationSummary* locations = sub->GetLocations();
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt:
+      __ sub(locations->Out().AsArm().AsCoreRegister(),
+             locations->InAt(0).AsArm().AsCoreRegister(),
+             ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister()));
+      break;
+
+    case Primitive::kPrimLong:
+      __ subs(locations->Out().AsArm().AsRegisterPairLow(),
+              locations->InAt(0).AsArm().AsRegisterPairLow(),
+              ShifterOperand(locations->InAt(1).AsArm().AsRegisterPairLow()));
+      __ sbc(locations->Out().AsArm().AsRegisterPairHigh(),
+             locations->InAt(0).AsArm().AsRegisterPairHigh(),
+             ShifterOperand(locations->InAt(1).AsArm().AsRegisterPairHigh()));
+      break;
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+}
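As a cross-check of the adds/adc and subs/sbc sequences above (the x86 addl/adcl and subl/sbbl sequences later in this patch compute the same values), here is a minimal host-side sketch, not ART code, of the half-word decomposition:

    #include <cassert>
    #include <cstdint>

    // 64-bit add via 32-bit halves: ADDS produces the carry, ADC consumes it.
    static uint64_t Add64(uint32_t a_lo, uint32_t a_hi, uint32_t b_lo, uint32_t b_hi) {
      uint32_t lo = a_lo + b_lo;
      uint32_t carry = lo < a_lo ? 1u : 0u;     // carry out of the low add
      uint32_t hi = a_hi + b_hi + carry;        // the value ADC computes
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }

    // 64-bit sub via 32-bit halves: SUBS produces the borrow, SBC consumes it.
    static uint64_t Sub64(uint32_t a_lo, uint32_t a_hi, uint32_t b_lo, uint32_t b_hi) {
      uint32_t lo = a_lo - b_lo;
      uint32_t borrow = a_lo < b_lo ? 1u : 0u;  // borrow out of the low sub
      uint32_t hi = a_hi - b_hi - borrow;       // the value SBC computes
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }

    int main() {
      assert(Add64(0xFFFFFFFFu, 0u, 1u, 0u) == 0x100000000ull);  // carry propagates
      assert(Sub64(0u, 1u, 1u, 0u) == 0xFFFFFFFFull);            // borrow propagates
      return 0;
    }

(ARM's SBC actually consumes the inverted carry flag, but the value it produces is the same as the explicit borrow above.)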
+
+static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1 };
+static constexpr size_t kRuntimeParameterCoreRegistersLength =
+    arraysize(kRuntimeParameterCoreRegisters);
+
+class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeRuntimeCallingConvention()
+      : CallingConvention(kRuntimeParameterCoreRegisters,
+                          kRuntimeParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
+};
+
+void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetOut(ArmCoreLocation(R0));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) {
+  InvokeRuntimeCallingConvention calling_convention;
+  LoadCurrentMethod(calling_convention.GetRegisterAt(1));
+  __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
+
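+  // TR holds the current Thread; the quick runtime entrypoints are
+  // function pointers stored at fixed offsets from it, hence the
+  // indirect call through LR.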
+  int32_t offset = QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocObjectWithAccessCheck).Int32Value();
+  __ ldr(LR, Address(TR, offset));
+  __ blx(LR);
+
+  codegen_->RecordPcInfo(instruction->GetDexPc());
+}
+
+void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  Location location = parameter_visitor_.GetNextLocation(instruction->GetType());
+  if (location.IsStackSlot()) {
+    location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  } else if (location.IsDoubleStackSlot()) {
+    location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  }
+  locations->SetOut(location);
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorARM::VisitParameterValue(HParameterValue* instruction) {
+  // Nothing to do, the parameter is already at its location.
+}
+
+void LocationsBuilderARM::VisitNot(HNot* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetInAt(0, ArmCoreLocation(R0));
+  locations->SetOut(ArmCoreLocation(R0));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorARM::VisitNot(HNot* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  __ eor(locations->Out().AsArm().AsCoreRegister(),
+         locations->InAt(0).AsArm().AsCoreRegister(), ShifterOperand(1));
+}
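The eor with 1 above (and the matching xorl in the x86 file) implements logical negation only because HNot inputs are materialized as 0 or 1:

    0 ^ 1 == 1
    1 ^ 1 == 0

Any other input value would not be negated correctly.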
+
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 52d6b2e..ed35f94 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -22,15 +22,47 @@
 #include "utils/arm/assembler_arm.h"
 
 namespace art {
-
-class Assembler;
-class Label;
-
 namespace arm {
 
+class CodeGeneratorARM;
+
+static constexpr size_t kArmWordSize = 4;
+
+static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
+static constexpr RegisterPair kParameterCorePairRegisters[] = { R1_R2, R2_R3 };
+static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+
+class InvokeDexCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeDexCallingConvention()
+      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+
+  RegisterPair GetRegisterPairAt(size_t argument_index) {
+    DCHECK_LT(argument_index + 1, GetNumberOfRegisters());
+    return kParameterCorePairRegisters[argument_index];
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
+};
+
+class InvokeDexCallingConventionVisitor {
+ public:
+  InvokeDexCallingConventionVisitor() : gp_index_(0) {}
+
+  Location GetNextLocation(Primitive::Type type);
+
+ private:
+  InvokeDexCallingConvention calling_convention;
+  uint32_t gp_index_;
+
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+};
+
 class LocationsBuilderARM : public HGraphVisitor {
  public:
-  explicit LocationsBuilderARM(HGraph* graph) : HGraphVisitor(graph) { }
+  explicit LocationsBuilderARM(HGraph* graph, CodeGeneratorARM* codegen)
+      : HGraphVisitor(graph), codegen_(codegen) {}
 
 #define DECLARE_VISIT_INSTRUCTION(name)     \
   virtual void Visit##name(H##name* instr);
@@ -40,11 +72,12 @@
 #undef DECLARE_VISIT_INSTRUCTION
 
  private:
+  CodeGeneratorARM* const codegen_;
+  InvokeDexCallingConventionVisitor parameter_visitor_;
+
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM);
 };
 
-class CodeGeneratorARM;
-
 class InstructionCodeGeneratorARM : public HGraphVisitor {
  public:
   InstructionCodeGeneratorARM(HGraph* graph, CodeGeneratorARM* codegen);
@@ -70,7 +103,7 @@
  public:
   explicit CodeGeneratorARM(HGraph* graph)
       : CodeGenerator(graph),
-        location_builder_(graph),
+        location_builder_(graph, this),
         instruction_visitor_(graph, this) { }
   virtual ~CodeGeneratorARM() { }
 
@@ -79,6 +112,10 @@
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
 
+  virtual size_t GetWordSize() const OVERRIDE {
+    return kArmWordSize;
+  }
+
   virtual HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
@@ -94,6 +131,11 @@
   int32_t GetStackSlot(HLocal* local) const;
 
  private:
+  // Helper method to move a 32-bit value between two locations.
+  void Move32(Location destination, Location source);
+  // Helper method to move a 64-bit value between two locations.
+  void Move64(Location destination, Location source);
+
   LocationsBuilderARM location_builder_;
   InstructionCodeGeneratorARM instruction_visitor_;
   ArmAssembler assembler_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7b0a087..1142631 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -17,6 +17,7 @@
 #include "code_generator_x86.h"
 #include "utils/assembler.h"
 #include "utils/x86/assembler_x86.h"
+#include "utils/x86/managed_register_x86.h"
 
 #include "mirror/array.h"
 #include "mirror/art_method.h"
@@ -24,11 +25,20 @@
 #define __ reinterpret_cast<X86Assembler*>(GetAssembler())->
 
 namespace art {
+
+x86::X86ManagedRegister Location::AsX86() const {
+  return reg().AsX86();
+}
+
 namespace x86 {
 
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
+static Location X86CpuLocation(Register reg) {
+  return Location::RegisterLocation(X86ManagedRegister::FromCpuRegister(reg));
+}
+
 InstructionCodeGeneratorX86::InstructionCodeGeneratorX86(HGraph* graph, CodeGeneratorX86* codegen)
       : HGraphVisitor(graph),
         assembler_(codegen->GetAssembler()),
@@ -39,15 +49,20 @@
   static const int kFakeReturnRegister = 8;
   core_spill_mask_ |= (1 << kFakeReturnRegister);
 
-  // Add the current ART method to the frame size and the return PC.
-  SetFrameSize(RoundUp(GetFrameSize() + 2 * kWordSize, kStackAlignment));
+  SetFrameSize(RoundUp(
+      (GetGraph()->GetMaximumNumberOfOutVRegs() + GetGraph()->GetNumberOfVRegs()) * kVRegSize
+      + kVRegSize  // filler
+      + kX86WordSize  // ART method
+      + kNumberOfPushedRegistersAtEntry * kX86WordSize,
+      kStackAlignment));
+
   // The return PC has already been pushed on the stack.
-  __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
-  __ movl(Address(ESP, 0), EAX);
+  __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
+  __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
 }
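To make the frame layout concrete, a worked example assuming kVRegSize == 4 and kStackAlignment == 16 (the values these computations appear to rely on), for a method with 3 vregs and at most 2 out vregs:

    (2 + 3) * 4     =  20   // out area + vregs
    + 4             =  24   // filler
    + 4             =  28   // ART method
    + 1 * 4         =  32   // pushed fake return address
    RoundUp(32, 16) =  32   // final frame size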
 
 void CodeGeneratorX86::GenerateFrameExit() {
-  __ addl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kWordSize));
+  __ addl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
 }
 
 void CodeGeneratorX86::Bind(Label* label) {
@@ -59,21 +74,214 @@
 }
 
 int32_t CodeGeneratorX86::GetStackSlot(HLocal* local) const {
-  return (GetGraph()->GetMaximumNumberOfOutVRegs() + local->GetRegNumber()) * kWordSize;
+  uint16_t reg_number = local->GetRegNumber();
+  uint16_t number_of_vregs = GetGraph()->GetNumberOfVRegs();
+  uint16_t number_of_in_vregs = GetGraph()->GetNumberOfInVRegs();
+  if (reg_number >= number_of_vregs - number_of_in_vregs) {
+    // Local is a parameter of the method. It is stored in the caller's frame.
+    return GetFrameSize() + kX86WordSize  // ART method
+                          + (reg_number - number_of_vregs + number_of_in_vregs) * kVRegSize;
+  } else {
+    // Local is a temporary in this method. It is stored in this method's frame.
+    return GetFrameSize() - (kNumberOfPushedRegistersAtEntry * kX86WordSize)
+                          - kVRegSize  // filler.
+                          - (number_of_vregs * kVRegSize)
+                          + (reg_number * kVRegSize);
+  }
+}
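Continuing the 32-byte frame example above, with number_of_vregs == 3 and number_of_in_vregs == 1:

    vreg 2 (a parameter): 2 >= 3 - 1, so
        32 + 4 + (2 - 3 + 1) * 4 = 36          // caller's frame: past the
                                               // frame, then the ART method slot
    vreg 0 (a local):     0 <  3 - 1, so
        32 - 1 * 4 - 4 - 3 * 4 + 0 * 4 = 12    // [ESP + 12] in this frame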
+
+static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX };
+static constexpr size_t kRuntimeParameterCoreRegistersLength =
+    arraysize(kRuntimeParameterCoreRegisters);
+
+class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeRuntimeCallingConvention()
+      : CallingConvention(kRuntimeParameterCoreRegisters,
+                          kRuntimeParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
+};
+
+Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot: {
+      uint32_t index = gp_index_++;
+      if (index < calling_convention.GetNumberOfRegisters()) {
+        return X86CpuLocation(calling_convention.GetRegisterAt(index));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(index, kX86WordSize));
+      }
+    }
+
+    case Primitive::kPrimLong: {
+      uint32_t index = gp_index_;
+      gp_index_ += 2;
+      if (index + 1 < calling_convention.GetNumberOfRegisters()) {
+        return Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(
+            calling_convention.GetRegisterPairAt(index)));
+      } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
+        return Location::QuickParameter(index);
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index, kX86WordSize));
+      }
+    }
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented parameter type " << type;
+      break;
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Unexpected parameter type " << type;
+      break;
+  }
+  return Location();
+}
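Concretely, with the x86 convention (registers { ECX, EDX, EBX }, pairs { ECX_EDX, EDX_EBX }) this yields:

    (long, int)      -> long in ECX_EDX, int in EBX
    (int, long)      -> int in ECX, long in EDX_EBX
    (int, int, long) -> ints in ECX and EDX; the long's low word lands in
                        EBX and its high word on the stack
                        (Location::QuickParameter(2))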
+
+void CodeGeneratorX86::Move32(Location destination, Location source) {
+  if (source.Equals(destination)) {
+    return;
+  }
+  if (destination.IsRegister()) {
+    if (source.IsRegister()) {
+      __ movl(destination.AsX86().AsCpuRegister(), source.AsX86().AsCpuRegister());
+    } else {
+      DCHECK(source.IsStackSlot());
+      __ movl(destination.AsX86().AsCpuRegister(), Address(ESP, source.GetStackIndex()));
+    }
+  } else {
+    if (source.IsRegister()) {
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsX86().AsCpuRegister());
+    } else {
+      DCHECK(source.IsStackSlot());
+      __ movl(EAX, Address(ESP, source.GetStackIndex()));
+      __ movl(Address(ESP, destination.GetStackIndex()), EAX);
+    }
+  }
+}
+
+void CodeGeneratorX86::Move64(Location destination, Location source) {
+  if (source.Equals(destination)) {
+    return;
+  }
+  if (destination.IsRegister()) {
+    if (source.IsRegister()) {
+      __ movl(destination.AsX86().AsRegisterPairLow(), source.AsX86().AsRegisterPairLow());
+      __ movl(destination.AsX86().AsRegisterPairHigh(), source.AsX86().AsRegisterPairHigh());
+    } else if (source.IsQuickParameter()) {
+      uint32_t argument_index = source.GetQuickParameterIndex();
+      InvokeDexCallingConvention calling_convention;
+      __ movl(destination.AsX86().AsRegisterPairLow(),
+              calling_convention.GetRegisterAt(argument_index));
+      __ movl(destination.AsX86().AsRegisterPairHigh(), Address(ESP,
+          calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize) + GetFrameSize()));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movl(destination.AsX86().AsRegisterPairLow(), Address(ESP, source.GetStackIndex()));
+      __ movl(destination.AsX86().AsRegisterPairHigh(),
+              Address(ESP, source.GetHighStackIndex(kX86WordSize)));
+    }
+  } else if (destination.IsQuickParameter()) {
+    InvokeDexCallingConvention calling_convention;
+    uint32_t argument_index = destination.GetQuickParameterIndex();
+    if (source.IsRegister()) {
+      __ movl(calling_convention.GetRegisterAt(argument_index), source.AsX86().AsRegisterPairLow());
+      __ movl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize)),
+              source.AsX86().AsRegisterPairHigh());
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movl(calling_convention.GetRegisterAt(argument_index),
+              Address(ESP, source.GetStackIndex()));
+      __ movl(EAX, Address(ESP, source.GetHighStackIndex(kX86WordSize)));
+      __ movl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize)), EAX);
+    }
+  } else {
+    if (source.IsRegister()) {
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsX86().AsRegisterPairLow());
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
+              source.AsX86().AsRegisterPairHigh());
+    } else if (source.IsQuickParameter()) {
+      InvokeDexCallingConvention calling_convention;
+      uint32_t argument_index = source.GetQuickParameterIndex();
+      __ movl(Address(ESP, destination.GetStackIndex()),
+              calling_convention.GetRegisterAt(argument_index));
+      __ movl(EAX, Address(ESP,
+          calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize) + GetFrameSize()));
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)), EAX);
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movl(EAX, Address(ESP, source.GetStackIndex()));
+      __ movl(Address(ESP, destination.GetStackIndex()), EAX);
+      __ movl(EAX, Address(ESP, source.GetHighStackIndex(kX86WordSize)));
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)), EAX);
+    }
+  }
 }
 
 void CodeGeneratorX86::Move(HInstruction* instruction, Location location, HInstruction* move_for) {
   if (instruction->AsIntConstant() != nullptr) {
-    __ movl(location.reg<Register>(), Immediate(instruction->AsIntConstant()->GetValue()));
+    Immediate imm(instruction->AsIntConstant()->GetValue());
+    if (location.IsRegister()) {
+      __ movl(location.AsX86().AsCpuRegister(), imm);
+    } else {
+      __ movl(Address(ESP, location.GetStackIndex()), imm);
+    }
+  } else if (instruction->AsLongConstant() != nullptr) {
+    int64_t value = instruction->AsLongConstant()->GetValue();
+    if (location.IsRegister()) {
+      __ movl(location.AsX86().AsRegisterPairLow(), Immediate(Low32Bits(value)));
+      __ movl(location.AsX86().AsRegisterPairHigh(), Immediate(High32Bits(value)));
+    } else {
+      __ movl(Address(ESP, location.GetStackIndex()), Immediate(Low32Bits(value)));
+      __ movl(Address(ESP, location.GetHighStackIndex(kX86WordSize)), Immediate(High32Bits(value)));
+    }
   } else if (instruction->AsLoadLocal() != nullptr) {
-    __ movl(location.reg<Register>(),
-            Address(ESP, GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        Move32(location, Location::StackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+        break;
+
+      case Primitive::kPrimLong:
+        Move64(location, Location::DoubleStackSlot(
+            GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented local type " << instruction->GetType();
+    }
   } else {
     // This can currently only happen when the instruction that requests the move
     // is the next to be compiled.
     DCHECK_EQ(instruction->GetNext(), move_for);
-    __ movl(location.reg<Register>(),
-            instruction->GetLocations()->Out().reg<Register>());
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        Move32(location, instruction->GetLocations()->Out());
+        break;
+
+      case Primitive::kPrimLong:
+        Move64(location, instruction->GetLocations()->Out());
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+    }
   }
 }
 
@@ -103,13 +311,13 @@
 
 void LocationsBuilderX86::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  locations->SetInAt(0, Location(EAX));
+  locations->SetInAt(0, X86CpuLocation(EAX));
   if_instr->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorX86::VisitIf(HIf* if_instr) {
   // TODO: Generate the input as a condition, instead of materializing in a register.
-  __ cmpl(if_instr->GetLocations()->InAt(0).reg<Register>(), Immediate(0));
+  __ cmpl(if_instr->GetLocations()->InAt(0).AsX86().AsCpuRegister(), Immediate(0));
   __ j(kEqual, codegen_->GetLabelOf(if_instr->IfFalseSuccessor()));
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) {
     __ jmp(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
@@ -122,7 +330,6 @@
 
 void InstructionCodeGeneratorX86::VisitLocal(HLocal* local) {
   DCHECK_EQ(local->GetBlock(), GetGraph()->GetEntryBlock());
-  codegen_->SetFrameSize(codegen_->GetFrameSize() + kWordSize);
 }
 
 void LocationsBuilderX86::VisitLoadLocal(HLoadLocal* local) {
@@ -133,29 +340,43 @@
   // Nothing to do, this is driven by the code generator.
 }
 
-void LocationsBuilderX86::VisitStoreLocal(HStoreLocal* local) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(local);
-  locations->SetInAt(1, Location(EAX));
-  local->SetLocations(locations);
+void LocationsBuilderX86::VisitStoreLocal(HStoreLocal* store) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(store);
+  switch (store->InputAt(1)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+  }
+  store->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorX86::VisitStoreLocal(HStoreLocal* store) {
-  __ movl(Address(ESP, codegen_->GetStackSlot(store->GetLocal())),
-          store->GetLocations()->InAt(1).reg<Register>());
 }
 
 void LocationsBuilderX86::VisitEqual(HEqual* equal) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal);
-  locations->SetInAt(0, Location(EAX));
-  locations->SetInAt(1, Location(ECX));
-  locations->SetOut(Location(EAX));
+  locations->SetInAt(0, X86CpuLocation(EAX));
+  locations->SetInAt(1, X86CpuLocation(ECX));
+  locations->SetOut(X86CpuLocation(EAX));
   equal->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorX86::VisitEqual(HEqual* equal) {
-  __ cmpl(equal->GetLocations()->InAt(0).reg<Register>(),
-          equal->GetLocations()->InAt(1).reg<Register>());
-  __ setb(kEqual, equal->GetLocations()->Out().reg<Register>());
+  __ cmpl(equal->GetLocations()->InAt(0).AsX86().AsCpuRegister(),
+          equal->GetLocations()->InAt(1).AsX86().AsCpuRegister());
+  __ setb(kEqual, equal->GetLocations()->Out().AsX86().AsCpuRegister());
 }
 
 void LocationsBuilderX86::VisitIntConstant(HIntConstant* constant) {
@@ -166,6 +387,14 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86::VisitLongConstant(HLongConstant* constant) {
+  constant->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86::VisitLongConstant(HLongConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderX86::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -177,66 +406,91 @@
 
 void LocationsBuilderX86::VisitReturn(HReturn* ret) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(ret);
-  locations->SetInAt(0, Location(EAX));
+  switch (ret->InputAt(0)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetInAt(0, X86CpuLocation(EAX));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetInAt(
+          0, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+  }
   ret->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) {
-  DCHECK_EQ(ret->GetLocations()->InAt(0).reg<Register>(), EAX);
+  if (kIsDebugBuild) {
+    switch (ret->InputAt(0)->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86().AsCpuRegister(), EAX);
+        break;
+
+      case Primitive::kPrimLong:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86().AsRegisterPair(), EAX_EDX);
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+    }
+  }
   codegen_->GenerateFrameExit();
   __ ret();
 }
 
-static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
-static constexpr int kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
-
-class InvokeStaticCallingConvention : public CallingConvention<Register> {
- public:
-  InvokeStaticCallingConvention()
-      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(InvokeStaticCallingConvention);
-};
-
-void LocationsBuilderX86::VisitPushArgument(HPushArgument* argument) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(argument);
-  InvokeStaticCallingConvention calling_convention;
-  if (argument->GetArgumentIndex() < calling_convention.GetNumberOfRegisters()) {
-    Location location = Location(calling_convention.GetRegisterAt(argument->GetArgumentIndex()));
-    locations->SetInAt(0, location);
-    locations->SetOut(location);
-  } else {
-    locations->SetInAt(0, Location(EAX));
-  }
-  argument->SetLocations(locations);
-}
-
-void InstructionCodeGeneratorX86::VisitPushArgument(HPushArgument* argument) {
-  uint8_t argument_index = argument->GetArgumentIndex();
-  InvokeStaticCallingConvention calling_convention;
-  size_t parameter_registers = calling_convention.GetNumberOfRegisters();
-  if (argument_index >= parameter_registers) {
-    uint8_t offset = calling_convention.GetStackOffsetOf(argument_index);
-    __ movl(Address(ESP, offset),
-            argument->GetLocations()->InAt(0).reg<Register>());
-
-  } else {
-    DCHECK_EQ(argument->GetLocations()->Out().reg<Register>(),
-              argument->GetLocations()->InAt(0).reg<Register>());
-  }
-}
-
 void LocationsBuilderX86::VisitInvokeStatic(HInvokeStatic* invoke) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(invoke);
-  locations->AddTemp(Location(EAX));
+  locations->AddTemp(X86CpuLocation(EAX));
+
+  InvokeDexCallingConventionVisitor calling_convention_visitor;
+  for (int i = 0; i < invoke->InputCount(); i++) {
+    HInstruction* input = invoke->InputAt(i);
+    locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
+  }
+
+  switch (invoke->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetOut(X86CpuLocation(EAX));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetOut(Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      break;
+
+    case Primitive::kPrimVoid:
+      break;
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
+      break;
+  }
+
   invoke->SetLocations(locations);
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeStatic(HInvokeStatic* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).reg<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsX86().AsCpuRegister();
   size_t index_in_cache = mirror::Array::DataOffset(sizeof(mirror::Object*)).Int32Value() +
-      invoke->GetIndexInDexCache() * kWordSize;
+      invoke->GetIndexInDexCache() * kX86WordSize;
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -261,13 +515,29 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(add);
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
-      locations->SetInAt(0, Location(EAX));
-      locations->SetInAt(1, Location(ECX));
-      locations->SetOut(Location(EAX));
+      locations->SetInAt(0, X86CpuLocation(EAX));
+      locations->SetInAt(1, X86CpuLocation(ECX));
+      locations->SetOut(X86CpuLocation(EAX));
       break;
     }
+    case Primitive::kPrimLong: {
+      locations->SetInAt(
+          0, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      locations->SetInAt(
+          1, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(ECX_EBX)));
+      locations->SetOut(Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented";
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
   }
   add->SetLocations(locations);
 }
@@ -275,14 +545,147 @@
 void InstructionCodeGeneratorX86::VisitAdd(HAdd* add) {
   LocationSummary* locations = add->GetLocations();
   switch (add->GetResultType()) {
-    case Primitive::kPrimInt:
-      DCHECK_EQ(locations->InAt(0).reg<Register>(), locations->Out().reg<Register>());
-      __ addl(locations->InAt(0).reg<Register>(), locations->InAt(1).reg<Register>());
+    case Primitive::kPrimInt: {
+      DCHECK_EQ(locations->InAt(0).AsX86().AsCpuRegister(),
+                locations->Out().AsX86().AsCpuRegister());
+      __ addl(locations->InAt(0).AsX86().AsCpuRegister(),
+              locations->InAt(1).AsX86().AsCpuRegister());
       break;
+    }
+
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(locations->InAt(0).AsX86().AsRegisterPair(),
+                locations->Out().AsX86().AsRegisterPair());
+      __ addl(locations->InAt(0).AsX86().AsRegisterPairLow(),
+              locations->InAt(1).AsX86().AsRegisterPairLow());
+      __ adcl(locations->InAt(0).AsX86().AsRegisterPairHigh(),
+              locations->InAt(1).AsX86().AsRegisterPairHigh());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented";
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
   }
 }
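Unlike the three-address adds/adc on ARM above, x86 ALU instructions are two-address, the destination doubling as the first source:

    addl %ecx, %eax    # eax += ecx

which is why the locations builder pins the first input and the output to the same register (pair) and the DCHECKs above assert it.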
 
+void LocationsBuilderX86::VisitSub(HSub* sub) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(sub);
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt: {
+      locations->SetInAt(0, X86CpuLocation(EAX));
+      locations->SetInAt(1, X86CpuLocation(ECX));
+      locations->SetOut(X86CpuLocation(EAX));
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(
+          0, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      locations->SetInAt(
+          1, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(ECX_EBX)));
+      locations->SetOut(Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+  sub->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86::VisitSub(HSub* sub) {
+  LocationSummary* locations = sub->GetLocations();
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt: {
+      DCHECK_EQ(locations->InAt(0).AsX86().AsCpuRegister(),
+                locations->Out().AsX86().AsCpuRegister());
+      __ subl(locations->InAt(0).AsX86().AsCpuRegister(),
+              locations->InAt(1).AsX86().AsCpuRegister());
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(locations->InAt(0).AsX86().AsRegisterPair(),
+                locations->Out().AsX86().AsRegisterPair());
+      __ subl(locations->InAt(0).AsX86().AsRegisterPairLow(),
+              locations->InAt(1).AsX86().AsRegisterPairLow());
+      __ sbbl(locations->InAt(0).AsX86().AsRegisterPairHigh(),
+              locations->InAt(1).AsX86().AsRegisterPairHigh());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+}
+
+void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetOut(X86CpuLocation(EAX));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) {
+  InvokeRuntimeCallingConvention calling_convention;
+  LoadCurrentMethod(calling_convention.GetRegisterAt(1));
+  __ movl(calling_convention.GetRegisterAt(0),
+          Immediate(instruction->GetTypeIndex()));
+
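+  // On x86 the current Thread is reached through the %fs segment, so the
+  // entrypoint is a function pointer read from an fs-relative address.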
+  __ fs()->call(
+      Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocObjectWithAccessCheck)));
+
+  codegen_->RecordPcInfo(instruction->GetDexPc());
+}
+
+void LocationsBuilderX86::VisitParameterValue(HParameterValue* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  Location location = parameter_visitor_.GetNextLocation(instruction->GetType());
+  if (location.IsStackSlot()) {
+    location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  } else if (location.IsDoubleStackSlot()) {
+    location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  }
+  locations->SetOut(location);
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86::VisitParameterValue(HParameterValue* instruction) {
+  // Nothing to do, the parameter is already at its location.
+}
+
+void LocationsBuilderX86::VisitNot(HNot* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetInAt(0, X86CpuLocation(EAX));
+  locations->SetOut(X86CpuLocation(EAX));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86::VisitNot(HNot* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK_EQ(locations->InAt(0).AsX86().AsCpuRegister(), locations->Out().AsX86().AsCpuRegister());
+  __ xorl(locations->Out().AsX86().AsCpuRegister(), Immediate(1));
+}
+
 }  // namespace x86
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index dd5044f..f22890e 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -22,12 +22,47 @@
 #include "utils/x86/assembler_x86.h"
 
 namespace art {
-
 namespace x86 {
 
+static constexpr size_t kX86WordSize = 4;
+
+class CodeGeneratorX86;
+
+static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
+static constexpr RegisterPair kParameterCorePairRegisters[] = { ECX_EDX, EDX_EBX };
+static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+
+class InvokeDexCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeDexCallingConvention()
+      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+
+  RegisterPair GetRegisterPairAt(size_t argument_index) {
+    DCHECK_LT(argument_index + 1, GetNumberOfRegisters());
+    return kParameterCorePairRegisters[argument_index];
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
+};
+
+class InvokeDexCallingConventionVisitor {
+ public:
+  InvokeDexCallingConventionVisitor() : gp_index_(0) {}
+
+  Location GetNextLocation(Primitive::Type type);
+
+ private:
+  InvokeDexCallingConvention calling_convention;
+  uint32_t gp_index_;
+
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+};
+
 class LocationsBuilderX86 : public HGraphVisitor {
  public:
-  explicit LocationsBuilderX86(HGraph* graph) : HGraphVisitor(graph) { }
+  LocationsBuilderX86(HGraph* graph, CodeGeneratorX86* codegen)
+      : HGraphVisitor(graph), codegen_(codegen) {}
 
 #define DECLARE_VISIT_INSTRUCTION(name)     \
   virtual void Visit##name(H##name* instr);
@@ -37,11 +72,12 @@
 #undef DECLARE_VISIT_INSTRUCTION
 
  private:
+  CodeGeneratorX86* const codegen_;
+  InvokeDexCallingConventionVisitor parameter_visitor_;
+
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86);
 };
 
-class CodeGeneratorX86;
-
 class InstructionCodeGeneratorX86 : public HGraphVisitor {
  public:
   InstructionCodeGeneratorX86(HGraph* graph, CodeGeneratorX86* codegen);
@@ -68,7 +104,7 @@
  public:
   explicit CodeGeneratorX86(HGraph* graph)
       : CodeGenerator(graph),
-        location_builder_(graph),
+        location_builder_(graph, this),
         instruction_visitor_(graph, this) { }
   virtual ~CodeGeneratorX86() { }
 
@@ -77,6 +113,10 @@
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
 
+  virtual size_t GetWordSize() const OVERRIDE {
+    return kX86WordSize;
+  }
+
   virtual HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
@@ -92,6 +132,11 @@
   int32_t GetStackSlot(HLocal* local) const;
 
  private:
+  // Helper method to move a 32-bit value between two locations.
+  void Move32(Location destination, Location source);
+  // Helper method to move a 64-bit value between two locations.
+  void Move64(Location destination, Location source);
+
   LocationsBuilderX86 location_builder_;
   InstructionCodeGeneratorX86 instruction_visitor_;
   X86Assembler assembler_;
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 2b21905..3da9ed9 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -42,6 +42,8 @@
         blocks_(arena, kDefaultNumberOfBlocks),
         dominator_order_(arena, kDefaultNumberOfBlocks),
         maximum_number_of_out_vregs_(0),
+        number_of_vregs_(0),
+        number_of_in_vregs_(0),
         current_instruction_id_(0) { }
 
   ArenaAllocator* GetArena() const { return arena_; }
@@ -68,6 +70,23 @@
     maximum_number_of_out_vregs_ = std::max(new_value, maximum_number_of_out_vregs_);
   }
 
+  void SetNumberOfVRegs(uint16_t number_of_vregs) {
+    number_of_vregs_ = number_of_vregs;
+  }
+
+  uint16_t GetNumberOfVRegs() const {
+    return number_of_vregs_;
+  }
+
+  void SetNumberOfInVRegs(uint16_t value) {
+    number_of_in_vregs_ = value;
+  }
+
+  uint16_t GetNumberOfInVRegs() const {
+    return number_of_in_vregs_;
+  }
+
  private:
   HBasicBlock* FindCommonDominator(HBasicBlock* first, HBasicBlock* second) const;
   void VisitBlockForDominatorTree(HBasicBlock* block,
@@ -90,9 +109,15 @@
   HBasicBlock* entry_block_;
   HBasicBlock* exit_block_;
 
-  // The maximum number of arguments passed to a HInvoke in this graph.
+  // The maximum number of virtual registers passed as arguments to a HInvoke in this graph.
   uint16_t maximum_number_of_out_vregs_;
 
+  // The number of virtual registers in this method. Includes the parameters.
+  uint16_t number_of_vregs_;
+
+  // The number of virtual registers used by parameters of this method.
+  uint16_t number_of_in_vregs_;
+
   // The current id to assign to a newly added instruction. See HInstruction.id_.
   int current_instruction_id_;
 
@@ -201,10 +226,14 @@
   M(InvokeStatic)                                          \
   M(LoadLocal)                                             \
   M(Local)                                                 \
-  M(PushArgument)                                          \
+  M(LongConstant)                                          \
+  M(NewInstance)                                           \
+  M(Not)                                                   \
+  M(ParameterValue)                                        \
   M(Return)                                                \
   M(ReturnVoid)                                            \
   M(StoreLocal)                                            \
+  M(Sub)                                                   \
 
 #define FORWARD_DECLARATION(type) class H##type;
 FOR_EACH_INSTRUCTION(FORWARD_DECLARATION)
@@ -254,6 +283,8 @@
   virtual void Accept(HGraphVisitor* visitor) = 0;
   virtual const char* DebugName() const = 0;
 
+  virtual Primitive::Type GetType() const { return Primitive::kPrimVoid; }
+
   void AddUse(HInstruction* user) {
     uses_ = new (block_->GetGraph()->GetArena()) HUseListNode(user, uses_);
   }
@@ -505,6 +536,7 @@
   Primitive::Type GetResultType() const { return result_type_; }
 
   virtual bool IsCommutative() { return false; }
+  virtual Primitive::Type GetType() const { return GetResultType(); }
 
  private:
   const Primitive::Type result_type_;
@@ -521,6 +553,8 @@
 
   virtual bool IsCommutative() { return true; }
 
+  virtual Primitive::Type GetType() const { return Primitive::kPrimBoolean; }
+
   DECLARE_INSTRUCTION(Equal)
 
  private:
@@ -546,15 +580,19 @@
 // Load a given local. The local is an input of this instruction.
 class HLoadLocal : public HTemplateInstruction<1> {
  public:
-  explicit HLoadLocal(HLocal* local) {
+  explicit HLoadLocal(HLocal* local, Primitive::Type type) : type_(type) {
     SetRawInputAt(0, local);
   }
 
+  virtual Primitive::Type GetType() const { return type_; }
+
   HLocal* GetLocal() const { return reinterpret_cast<HLocal*>(InputAt(0)); }
 
   DECLARE_INSTRUCTION(LoadLocal)
 
  private:
+  const Primitive::Type type_;
+
   DISALLOW_COPY_AND_ASSIGN(HLoadLocal);
 };
 
@@ -582,6 +620,7 @@
   explicit HIntConstant(int32_t value) : value_(value) { }
 
   int32_t GetValue() const { return value_; }
+  virtual Primitive::Type GetType() const { return Primitive::kPrimInt; }
 
   DECLARE_INSTRUCTION(IntConstant)
 
@@ -591,10 +630,30 @@
   DISALLOW_COPY_AND_ASSIGN(HIntConstant);
 };
 
+class HLongConstant : public HTemplateInstruction<0> {
+ public:
+  explicit HLongConstant(int64_t value) : value_(value) { }
+
+  int64_t GetValue() const { return value_; }
+
+  virtual Primitive::Type GetType() const { return Primitive::kPrimLong; }
+
+  DECLARE_INSTRUCTION(LongConstant)
+
+ private:
+  const int64_t value_;
+
+  DISALLOW_COPY_AND_ASSIGN(HLongConstant);
+};
+
 class HInvoke : public HInstruction {
  public:
-  HInvoke(ArenaAllocator* arena, uint32_t number_of_arguments, int32_t dex_pc)
+  HInvoke(ArenaAllocator* arena,
+          uint32_t number_of_arguments,
+          Primitive::Type return_type,
+          uint32_t dex_pc)
     : inputs_(arena, number_of_arguments),
+      return_type_(return_type),
       dex_pc_(dex_pc) {
     inputs_.SetSize(number_of_arguments);
   }
@@ -606,11 +665,14 @@
     inputs_.Put(index, argument);
   }
 
-  int32_t GetDexPc() const { return dex_pc_; }
+  virtual Primitive::Type GetType() const { return return_type_; }
+
+  uint32_t GetDexPc() const { return dex_pc_; }
 
  protected:
   GrowableArray<HInstruction*> inputs_;
-  const int32_t dex_pc_;
+  const Primitive::Type return_type_;
+  const uint32_t dex_pc_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HInvoke);
@@ -620,9 +682,11 @@
  public:
   HInvokeStatic(ArenaAllocator* arena,
                 uint32_t number_of_arguments,
-                int32_t dex_pc,
-                int32_t index_in_dex_cache)
-      : HInvoke(arena, number_of_arguments, dex_pc), index_in_dex_cache_(index_in_dex_cache) {}
+                Primitive::Type return_type,
+                uint32_t dex_pc,
+                uint32_t index_in_dex_cache)
+      : HInvoke(arena, number_of_arguments, return_type, dex_pc),
+        index_in_dex_cache_(index_in_dex_cache) {}
 
   uint32_t GetIndexInDexCache() const { return index_in_dex_cache_; }
 
@@ -634,22 +698,22 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeStatic);
 };
 
-// HPushArgument nodes are inserted after the evaluation of an argument
-// of a call. Their mere purpose is to ease the code generator's work.
-class HPushArgument : public HTemplateInstruction<1> {
+class HNewInstance : public HTemplateInstruction<0> {
  public:
-  HPushArgument(HInstruction* argument, uint8_t argument_index) : argument_index_(argument_index) {
-    SetRawInputAt(0, argument);
-  }
+  HNewInstance(uint32_t dex_pc, uint16_t type_index) : dex_pc_(dex_pc), type_index_(type_index) {}
 
-  uint8_t GetArgumentIndex() const { return argument_index_; }
+  uint32_t GetDexPc() const { return dex_pc_; }
+  uint16_t GetTypeIndex() const { return type_index_; }
 
-  DECLARE_INSTRUCTION(PushArgument)
+  virtual Primitive::Type GetType() const { return Primitive::kPrimNot; }
+
+  DECLARE_INSTRUCTION(NewInstance)
 
  private:
-  const uint8_t argument_index_;
+  const uint32_t dex_pc_;
+  const uint16_t type_index_;
 
-  DISALLOW_COPY_AND_ASSIGN(HPushArgument);
+  DISALLOW_COPY_AND_ASSIGN(HNewInstance);
 };
 
 class HAdd : public HBinaryOperation {
@@ -665,6 +729,56 @@
   DISALLOW_COPY_AND_ASSIGN(HAdd);
 };
 
+class HSub : public HBinaryOperation {
+ public:
+  HSub(Primitive::Type result_type, HInstruction* left, HInstruction* right)
+      : HBinaryOperation(result_type, left, right) {}
+
+  virtual bool IsCommutative() { return false; }
+
+  DECLARE_INSTRUCTION(Sub);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HSub);
+};
+
+// The value of a parameter in this method. Its location depends on
+// the calling convention.
+class HParameterValue : public HTemplateInstruction<0> {
+ public:
+  HParameterValue(uint8_t index, Primitive::Type parameter_type)
+      : index_(index), parameter_type_(parameter_type) {}
+
+  uint8_t GetIndex() const { return index_; }
+
+  virtual Primitive::Type GetType() const { return parameter_type_; }
+
+  DECLARE_INSTRUCTION(ParameterValue);
+
+ private:
+  // The index of this parameter in the parameters list. Must be less
+  // than HGraph::number_of_in_vregs_.
+  const uint8_t index_;
+
+  const Primitive::Type parameter_type_;
+
+  DISALLOW_COPY_AND_ASSIGN(HParameterValue);
+};
+
+class HNot : public HTemplateInstruction<1> {
+ public:
+  explicit HNot(HInstruction* input) {
+    SetRawInputAt(0, input);
+  }
+
+  virtual Primitive::Type GetType() const { return Primitive::kPrimBoolean; }
+
+  DECLARE_INSTRUCTION(Not);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HNot);
+};
+
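The GetType() hook introduced above is what lets a backend pick move widths without switching on instruction kind. A minimal sketch, assuming a Move32 counterpart to the Move64 helper declared earlier (neither the name MoveValue nor Move32 is introduced by this change):

  // Illustrative only: dispatch on the value's primitive type.
  void CodeGeneratorX86::MoveValue(HInstruction* value, Location dst, Location src) {
    if (value->GetType() == Primitive::kPrimLong) {
      Move64(dst, src);  // long values occupy two 32-bit halves on x86
    } else {
      Move32(dst, src);  // hypothetical single-word counterpart
    }
  }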
 class HGraphVisitor : public ValueObject {
  public:
   explicit HGraphVisitor(HGraph* graph) : graph_(graph) { }
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index a11c2da..1d87eaa 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -50,11 +50,11 @@
 }
 
 void Arm64Assembler::GetCurrentThread(ManagedRegister tr) {
-  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR));
+  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR1));
 }
 
 void Arm64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister /* scratch */) {
-  StoreToOffset(TR, SP, offset.Int32Value());
+  StoreToOffset(TR1, SP, offset.Int32Value());
 }
 
 // See Arm64 PCS Section 5.2.2.1.
@@ -138,7 +138,8 @@
 void Arm64Assembler::StoreRef(FrameOffset offs, ManagedRegister m_src) {
   Arm64ManagedRegister src = m_src.AsArm64();
   CHECK(src.IsCoreRegister()) << src;
-  StoreToOffset(src.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, src.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
 
 void Arm64Assembler::StoreRawPtr(FrameOffset offs, ManagedRegister m_src) {
@@ -152,30 +153,31 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread64(ThreadOffset<8> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   AddConstant(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset<8> tr_offs) {
   // Arm64 does not support "str sp, [dest]"; therefore we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
-  StoreToOffset(IP1, TR, tr_offs.Int32Value());
+  StoreToOffset(IP1, TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::StoreSpanning(FrameOffset dest_off, ManagedRegister m_source,
@@ -254,9 +256,13 @@
     CHECK_EQ(4u, size) << dest;
     ___ Ldr(reg_w(dest.AsWRegister()), MEM_OP(reg_x(base), offset));
   } else if (dest.IsCoreRegister()) {
-    CHECK_EQ(8u, size) << dest;
     CHECK_NE(dest.AsCoreRegister(), SP) << dest;
-    ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    if (size == 4u) {
+      ___ Ldr(reg_w(dest.AsOverlappingCoreRegisterLow()), MEM_OP(reg_x(base), offset));
+    } else {
+      CHECK_EQ(8u, size) << dest;
+      ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    }
   } else if (dest.IsSRegister()) {
     ___ Ldr(reg_s(dest.AsSRegister()), MEM_OP(reg_x(base), offset));
   } else {
@@ -269,14 +275,14 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
-  return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
+void Arm64Assembler::LoadFromThread64(ManagedRegister m_dst, ThreadOffset<8> src, size_t size) {
+  return Load(m_dst.AsArm64(), TR1, src.Int32Value(), size);
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, FrameOffset offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), SP, offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), SP, offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, ManagedRegister m_base,
@@ -284,7 +290,8 @@
   Arm64ManagedRegister dst = m_dst.AsArm64();
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsCoreRegister() && base.IsCoreRegister());
-  LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), base.AsCoreRegister(),
+                  offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRawPtr(ManagedRegister m_dst, ManagedRegister m_base, Offset offs) {
@@ -294,10 +301,10 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
+void Arm64Assembler::LoadRawPtrFromThread64(ManagedRegister m_dst, ThreadOffset<8> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
+  LoadFromOffset(dst.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
 // Copying routines.
@@ -306,8 +313,16 @@
   Arm64ManagedRegister src = m_src.AsArm64();
   if (!dst.Equals(src)) {
     if (dst.IsCoreRegister()) {
-      CHECK(src.IsCoreRegister()) << src;
-      ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+      if (size == 4) {
+        CHECK(src.IsWRegister());
+        ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+      } else {
+        if (src.IsCoreRegister()) {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+        } else {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+        }
+      }
     } else if (dst.IsWRegister()) {
       CHECK(src.IsWRegister()) << src;
       ___ Mov(reg_w(dst.AsWRegister()), reg_w(src.AsWRegister()));
@@ -322,40 +337,42 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
-                                          ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                          ThreadOffset<8> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrToThread64(ThreadOffset<8> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadFromOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::CopyRef(FrameOffset dest, FrameOffset src,
                              ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
+  LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(),
+                  SP, src.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(),
+                 SP, dest.Int32Value());
 }
 
 void Arm64Assembler::Copy(FrameOffset dest, FrameOffset src,
                           ManagedRegister m_scratch, size_t size) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
-  CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
+  CHECK(scratch.IsCoreRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), SP, src.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), SP, dest.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP, src.Int32Value());
+    StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP, dest.Int32Value());
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
@@ -418,10 +435,17 @@
   CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
+    if (scratch.IsWRegister()) {
+      LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
                     src_offset.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
+      StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
                    dest_offset.Int32Value());
+    } else {
+      LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), src.AsCoreRegister(),
+                    src_offset.Int32Value());
+      StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), dest.AsCoreRegister(),
+                   dest_offset.Int32Value());
+    }
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), src.AsCoreRegister(), src_offset.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), dest.AsCoreRegister(), dest_offset.Int32Value());
@@ -486,7 +510,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread64(ThreadOffset<8> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -502,10 +526,11 @@
     // the address in the SIRT holding the reference.
     // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
     if (in_reg.IsNoRegister()) {
-      LoadFromOffset(out_reg.AsCoreRegister(), SP, sirt_offs.Int32Value());
+      LoadWFromOffset(kLoadWord, out_reg.AsOverlappingCoreRegisterLow(), SP,
+                      sirt_offs.Int32Value());
       in_reg = out_reg;
     }
-    ___ Cmp(reg_x(in_reg.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(in_reg.AsOverlappingCoreRegisterLow()), 0);
     if (!out_reg.Equals(in_reg)) {
       LoadImmediate(out_reg.AsCoreRegister(), 0, EQ);
     }
@@ -520,11 +545,12 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   if (null_allowed) {
-    LoadFromOffset(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                    sirt_offset.Int32Value());
     // Null values get a SIRT entry value of 0.  Otherwise, the sirt entry is
     // the address in the SIRT holding the reference.
     // e.g. scratch = (scratch == 0) ? 0 : (SP+sirt_offset)
-    ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(scratch.AsOverlappingCoreRegisterLow()), 0);
     // Move this logic into AddConstant with flags.
     AddConstant(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value(), NE);
   } else {
@@ -555,7 +581,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<4>().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,7 +595,11 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR1, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
@@ -590,6 +620,9 @@
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
   ___ PushCalleeSavedRegisters();
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR1), reg_x(TR));
+
   // Increase frame to required size - must be at least enough space to push Method*.
   CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
@@ -598,11 +631,27 @@
   // Write Method*.
   StoreToOffset(X0, SP, 0);
 
-  // Write out entry spills, treated as X regs.
-  // TODO: we can implement a %2 STRP variant of StoreToOffset.
+  // Write out entry spills.
+  int32_t offset = frame_size + kFramePointerSize;
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Register reg = entry_spills.at(i).AsArm64().AsCoreRegister();
-    StoreToOffset(reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
+    Arm64ManagedRegister reg = entry_spills.at(i).AsArm64();
+    if (reg.IsNoRegister()) {
+      // Only increment the stack offset.
+      ManagedRegisterSpill spill = entry_spills.at(i);
+      offset += spill.getSize();
+    } else if (reg.IsCoreRegister()) {
+      StoreToOffset(reg.AsCoreRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsWRegister()) {
+      StoreWToOffset(kStoreWord, reg.AsWRegister(), SP, offset);
+      offset += 4;
+    } else if (reg.IsDRegister()) {
+      StoreDToOffset(reg.AsDRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsSRegister()) {
+      StoreSToOffset(reg.AsSRegister(), SP, offset);
+      offset += 4;
+    }
   }
 }
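For context, a sketch of how a calling convention might populate entry_spills so the loop above lays arguments out correctly; it relies on the push_back(reg, size) overload added to ManagedRegisterEntrySpills later in this patch, and the register choices are illustrative:

  ManagedRegisterEntrySpills spills;
  spills.push_back(Arm64ManagedRegister::FromWRegister(W1), 4);  // 32-bit int argument
  spills.push_back(Arm64ManagedRegister::FromDRegister(D0), 8);  // double argument
  spills.push_back(Arm64ManagedRegister::NoRegister(), 4);       // stack-only slot, just skip 4 bytes
  // BuildFrame stores W1 at offset, D0 at offset + 4, then advances past the
  // register-less slot without emitting a store.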
 
@@ -618,6 +667,9 @@
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   // Pop callee saved and return to LR.
   ___ PopCalleeSavedRegisters();
   ___ Ret();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 0220724..97fb93a 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -81,8 +81,8 @@
 
 class Arm64Assembler FINAL : public Assembler {
  public:
-  Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
-  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
+  Arm64Assembler() : vixl_buf_(new byte[kBufferSizeArm64]),
+  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, kBufferSizeArm64)) {}
 
   virtual ~Arm64Assembler() {
     delete[] vixl_buf_;
@@ -114,27 +114,27 @@
   void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
-  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
       OVERRIDE;
-  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
                                   ManagedRegister scratch) OVERRIDE;
-  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
   void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
                      ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
   void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
-  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
   void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
   void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
   void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
-  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines.
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
-  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
                               ManagedRegister scratch) OVERRIDE;
-  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
       OVERRIDE;
   void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
   void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
@@ -183,7 +183,7 @@
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
-  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
@@ -197,6 +197,8 @@
     CHECK(code < kNumberOfCoreRegisters) << code;
     if (code == SP) {
       return vixl::sp;
+    } else if (code == XZR) {
+      return vixl::xzr;
     }
     return vixl::Register::XRegFromCode(code);
   }
@@ -232,9 +234,6 @@
   void AddConstant(Register rd, int32_t value, Condition cond = AL);
   void AddConstant(Register rd, Register rn, int32_t value, Condition cond = AL);
 
-  // Vixl buffer size.
-  static constexpr size_t BUF_SIZE = 4096;
-
   // Vixl buffer.
   byte* vixl_buf_;
 
@@ -243,6 +242,9 @@
 
   // List of exception blocks to generate at the end of the code cache.
   std::vector<Arm64Exception*> exception_blocks_;
+
+  // Used for testing.
+  friend class Arm64ManagedRegister_VixlRegisters_Test;
 };
 
 class Arm64Exception {
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index ecf9fbe..2a08c95 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -31,6 +31,9 @@
 
 constexpr unsigned int kCalleeSavedRegsSize = 20;
 
+// Vixl buffer size.
+constexpr size_t kBufferSizeArm64 = 4096 * 2;
+
 }  // arm64
 }  // art
 
diff --git a/compiler/utils/arm64/managed_register_arm64.cc b/compiler/utils/arm64/managed_register_arm64.cc
index de5cb8c..8977313 100644
--- a/compiler/utils/arm64/managed_register_arm64.cc
+++ b/compiler/utils/arm64/managed_register_arm64.cc
@@ -53,7 +53,7 @@
   CHECK(!IsNoRegister());
   int no;
   if (IsCoreRegister()) {
-    if (IsStackPointer()) {
+    if (IsZeroRegister()) {
       no = static_cast<int>(X31);
     } else {
       no = static_cast<int>(AsCoreRegister());
diff --git a/compiler/utils/arm64/managed_register_arm64.h b/compiler/utils/arm64/managed_register_arm64.h
index 80f17f5..a0f520f 100644
--- a/compiler/utils/arm64/managed_register_arm64.h
+++ b/compiler/utils/arm64/managed_register_arm64.h
@@ -24,7 +24,7 @@
 namespace art {
 namespace arm64 {
 
-const int kNumberOfCoreRegIds = 32;
+const int kNumberOfCoreRegIds = kNumberOfCoreRegisters;
 const int kNumberOfWRegIds = kNumberOfWRegisters;
 const int kNumberOfDRegIds = kNumberOfDRegisters;
 const int kNumberOfSRegIds = kNumberOfSRegisters;
@@ -78,7 +78,7 @@
 
   WRegister AsOverlappingCoreRegisterLow() const {
     CHECK(IsValidManagedRegister());
-    if (IsStackPointer()) return W31;
+    if (IsZeroRegister()) return W31;
     return static_cast<WRegister>(AsCoreRegister());
   }
 
@@ -189,6 +189,10 @@
     return IsCoreRegister() && (id_ == SP);
   }
 
+  bool IsZeroRegister() const {
+    return IsCoreRegister() && (id_ == XZR);
+  }
+
   int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
diff --git a/compiler/utils/arm64/managed_register_arm64_test.cc b/compiler/utils/arm64/managed_register_arm64_test.cc
index 88c01ee..f149f1b 100644
--- a/compiler/utils/arm64/managed_register_arm64_test.cc
+++ b/compiler/utils/arm64/managed_register_arm64_test.cc
@@ -15,6 +15,7 @@
  */
 
 #include "globals.h"
+#include "assembler_arm64.h"
 #include "managed_register_arm64.h"
 #include "gtest/gtest.h"
 
@@ -295,9 +296,8 @@
 
   Arm64ManagedRegister reg_X31 = Arm64ManagedRegister::FromCoreRegister(X31);
   EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::NoRegister()));
-  // TODO: Fix the infrastructure, then re-enable.
-  // EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::FromCoreRegister(SP)));
-  // EXPECT_TRUE(reg_X31.Equals(Arm64ManagedRegister::FromCoreRegister(XZR)));
+  EXPECT_TRUE(reg_X31.Equals(Arm64ManagedRegister::FromCoreRegister(SP)));
+  EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::FromCoreRegister(XZR)));
   EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::FromWRegister(W31)));
   EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::FromWRegister(WZR)));
   EXPECT_TRUE(!reg_X31.Equals(Arm64ManagedRegister::FromSRegister(S0)));
@@ -305,8 +305,7 @@
 
   Arm64ManagedRegister reg_SP = Arm64ManagedRegister::FromCoreRegister(SP);
   EXPECT_TRUE(!reg_SP.Equals(Arm64ManagedRegister::NoRegister()));
-  // TODO: We expect these to pass - SP has a different semantic than X31/XZR.
-  // EXPECT_TRUE(!reg_SP.Equals(Arm64ManagedRegister::FromCoreRegister(X31)));
+  EXPECT_TRUE(reg_SP.Equals(Arm64ManagedRegister::FromCoreRegister(X31)));
   EXPECT_TRUE(!reg_SP.Equals(Arm64ManagedRegister::FromCoreRegister(XZR)));
   EXPECT_TRUE(!reg_SP.Equals(Arm64ManagedRegister::FromWRegister(W31)));
   EXPECT_TRUE(!reg_SP.Equals(Arm64ManagedRegister::FromSRegister(S0)));
@@ -453,17 +452,17 @@
 
   reg = Arm64ManagedRegister::FromCoreRegister(XZR);
   reg_o = Arm64ManagedRegister::FromWRegister(WZR);
-  // TODO: Overlap not implemented, yet
-  // EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromCoreRegister(X31)));
+  EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromCoreRegister(X31)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromCoreRegister(X1)));
-  // EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromCoreRegister(SP)));
-  // EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromWRegister(W31)));
+  EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromCoreRegister(SP)));
+  EXPECT_TRUE(reg.Overlaps(Arm64ManagedRegister::FromWRegister(W31)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromWRegister(W1)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromWRegister(W12)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromWRegister(W19)));
   EXPECT_EQ(X31, reg_o.AsOverlappingWRegisterCore());
-  // TODO: XZR is not a core register right now.
-  // EXPECT_EQ(W31, reg.AsOverlappingCoreRegisterLow());
+  EXPECT_EQ(SP, reg_o.AsOverlappingWRegisterCore());
+  EXPECT_NE(XZR, reg_o.AsOverlappingWRegisterCore());
+  EXPECT_EQ(W31, reg.AsOverlappingCoreRegisterLow());
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromSRegister(S0)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromSRegister(S1)));
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromSRegister(S2)));
@@ -610,5 +609,154 @@
   EXPECT_TRUE(!reg.Overlaps(Arm64ManagedRegister::FromDRegister(D20)));
 }
 
+TEST(Arm64ManagedRegister, VixlRegisters) {
+  // X Registers.
+  EXPECT_TRUE(vixl::x0.Is(Arm64Assembler::reg_x(X0)));
+  EXPECT_TRUE(vixl::x1.Is(Arm64Assembler::reg_x(X1)));
+  EXPECT_TRUE(vixl::x2.Is(Arm64Assembler::reg_x(X2)));
+  EXPECT_TRUE(vixl::x3.Is(Arm64Assembler::reg_x(X3)));
+  EXPECT_TRUE(vixl::x4.Is(Arm64Assembler::reg_x(X4)));
+  EXPECT_TRUE(vixl::x5.Is(Arm64Assembler::reg_x(X5)));
+  EXPECT_TRUE(vixl::x6.Is(Arm64Assembler::reg_x(X6)));
+  EXPECT_TRUE(vixl::x7.Is(Arm64Assembler::reg_x(X7)));
+  EXPECT_TRUE(vixl::x8.Is(Arm64Assembler::reg_x(X8)));
+  EXPECT_TRUE(vixl::x9.Is(Arm64Assembler::reg_x(X9)));
+  EXPECT_TRUE(vixl::x10.Is(Arm64Assembler::reg_x(X10)));
+  EXPECT_TRUE(vixl::x11.Is(Arm64Assembler::reg_x(X11)));
+  EXPECT_TRUE(vixl::x12.Is(Arm64Assembler::reg_x(X12)));
+  EXPECT_TRUE(vixl::x13.Is(Arm64Assembler::reg_x(X13)));
+  EXPECT_TRUE(vixl::x14.Is(Arm64Assembler::reg_x(X14)));
+  EXPECT_TRUE(vixl::x15.Is(Arm64Assembler::reg_x(X15)));
+  EXPECT_TRUE(vixl::x16.Is(Arm64Assembler::reg_x(X16)));
+  EXPECT_TRUE(vixl::x17.Is(Arm64Assembler::reg_x(X17)));
+  EXPECT_TRUE(vixl::x18.Is(Arm64Assembler::reg_x(X18)));
+  EXPECT_TRUE(vixl::x19.Is(Arm64Assembler::reg_x(X19)));
+  EXPECT_TRUE(vixl::x20.Is(Arm64Assembler::reg_x(X20)));
+  EXPECT_TRUE(vixl::x21.Is(Arm64Assembler::reg_x(X21)));
+  EXPECT_TRUE(vixl::x22.Is(Arm64Assembler::reg_x(X22)));
+  EXPECT_TRUE(vixl::x23.Is(Arm64Assembler::reg_x(X23)));
+  EXPECT_TRUE(vixl::x24.Is(Arm64Assembler::reg_x(X24)));
+  EXPECT_TRUE(vixl::x25.Is(Arm64Assembler::reg_x(X25)));
+  EXPECT_TRUE(vixl::x26.Is(Arm64Assembler::reg_x(X26)));
+  EXPECT_TRUE(vixl::x27.Is(Arm64Assembler::reg_x(X27)));
+  EXPECT_TRUE(vixl::x28.Is(Arm64Assembler::reg_x(X28)));
+  EXPECT_TRUE(vixl::x29.Is(Arm64Assembler::reg_x(X29)));
+  EXPECT_TRUE(vixl::x30.Is(Arm64Assembler::reg_x(X30)));
+  // FIXME: X31 aliases both SP and XZR; reg_x(X31) currently maps to vixl::sp.
+  EXPECT_TRUE(vixl::sp.Is(Arm64Assembler::reg_x(X31)));
+  EXPECT_TRUE(!vixl::x31.Is(Arm64Assembler::reg_x(X31)));
+
+  EXPECT_TRUE(vixl::x18.Is(Arm64Assembler::reg_x(TR)));
+  EXPECT_TRUE(vixl::ip0.Is(Arm64Assembler::reg_x(IP0)));
+  EXPECT_TRUE(vixl::ip1.Is(Arm64Assembler::reg_x(IP1)));
+  EXPECT_TRUE(vixl::x29.Is(Arm64Assembler::reg_x(FP)));
+  EXPECT_TRUE(vixl::lr.Is(Arm64Assembler::reg_x(LR)));
+  EXPECT_TRUE(vixl::sp.Is(Arm64Assembler::reg_x(SP)));
+  EXPECT_TRUE(vixl::xzr.Is(Arm64Assembler::reg_x(XZR)));
+
+  // W Registers.
+  EXPECT_TRUE(vixl::w0.Is(Arm64Assembler::reg_w(W0)));
+  EXPECT_TRUE(vixl::w1.Is(Arm64Assembler::reg_w(W1)));
+  EXPECT_TRUE(vixl::w2.Is(Arm64Assembler::reg_w(W2)));
+  EXPECT_TRUE(vixl::w3.Is(Arm64Assembler::reg_w(W3)));
+  EXPECT_TRUE(vixl::w4.Is(Arm64Assembler::reg_w(W4)));
+  EXPECT_TRUE(vixl::w5.Is(Arm64Assembler::reg_w(W5)));
+  EXPECT_TRUE(vixl::w6.Is(Arm64Assembler::reg_w(W6)));
+  EXPECT_TRUE(vixl::w7.Is(Arm64Assembler::reg_w(W7)));
+  EXPECT_TRUE(vixl::w8.Is(Arm64Assembler::reg_w(W8)));
+  EXPECT_TRUE(vixl::w9.Is(Arm64Assembler::reg_w(W9)));
+  EXPECT_TRUE(vixl::w10.Is(Arm64Assembler::reg_w(W10)));
+  EXPECT_TRUE(vixl::w11.Is(Arm64Assembler::reg_w(W11)));
+  EXPECT_TRUE(vixl::w12.Is(Arm64Assembler::reg_w(W12)));
+  EXPECT_TRUE(vixl::w13.Is(Arm64Assembler::reg_w(W13)));
+  EXPECT_TRUE(vixl::w14.Is(Arm64Assembler::reg_w(W14)));
+  EXPECT_TRUE(vixl::w15.Is(Arm64Assembler::reg_w(W15)));
+  EXPECT_TRUE(vixl::w16.Is(Arm64Assembler::reg_w(W16)));
+  EXPECT_TRUE(vixl::w17.Is(Arm64Assembler::reg_w(W17)));
+  EXPECT_TRUE(vixl::w18.Is(Arm64Assembler::reg_w(W18)));
+  EXPECT_TRUE(vixl::w19.Is(Arm64Assembler::reg_w(W19)));
+  EXPECT_TRUE(vixl::w20.Is(Arm64Assembler::reg_w(W20)));
+  EXPECT_TRUE(vixl::w21.Is(Arm64Assembler::reg_w(W21)));
+  EXPECT_TRUE(vixl::w22.Is(Arm64Assembler::reg_w(W22)));
+  EXPECT_TRUE(vixl::w23.Is(Arm64Assembler::reg_w(W23)));
+  EXPECT_TRUE(vixl::w24.Is(Arm64Assembler::reg_w(W24)));
+  EXPECT_TRUE(vixl::w25.Is(Arm64Assembler::reg_w(W25)));
+  EXPECT_TRUE(vixl::w26.Is(Arm64Assembler::reg_w(W26)));
+  EXPECT_TRUE(vixl::w27.Is(Arm64Assembler::reg_w(W27)));
+  EXPECT_TRUE(vixl::w28.Is(Arm64Assembler::reg_w(W28)));
+  EXPECT_TRUE(vixl::w29.Is(Arm64Assembler::reg_w(W29)));
+  EXPECT_TRUE(vixl::w30.Is(Arm64Assembler::reg_w(W30)));
+  EXPECT_TRUE(vixl::w31.Is(Arm64Assembler::reg_w(W31)));
+  EXPECT_TRUE(vixl::wzr.Is(Arm64Assembler::reg_w(WZR)));
+
+  // D Registers.
+  EXPECT_TRUE(vixl::d0.Is(Arm64Assembler::reg_d(D0)));
+  EXPECT_TRUE(vixl::d1.Is(Arm64Assembler::reg_d(D1)));
+  EXPECT_TRUE(vixl::d2.Is(Arm64Assembler::reg_d(D2)));
+  EXPECT_TRUE(vixl::d3.Is(Arm64Assembler::reg_d(D3)));
+  EXPECT_TRUE(vixl::d4.Is(Arm64Assembler::reg_d(D4)));
+  EXPECT_TRUE(vixl::d5.Is(Arm64Assembler::reg_d(D5)));
+  EXPECT_TRUE(vixl::d6.Is(Arm64Assembler::reg_d(D6)));
+  EXPECT_TRUE(vixl::d7.Is(Arm64Assembler::reg_d(D7)));
+  EXPECT_TRUE(vixl::d8.Is(Arm64Assembler::reg_d(D8)));
+  EXPECT_TRUE(vixl::d9.Is(Arm64Assembler::reg_d(D9)));
+  EXPECT_TRUE(vixl::d10.Is(Arm64Assembler::reg_d(D10)));
+  EXPECT_TRUE(vixl::d11.Is(Arm64Assembler::reg_d(D11)));
+  EXPECT_TRUE(vixl::d12.Is(Arm64Assembler::reg_d(D12)));
+  EXPECT_TRUE(vixl::d13.Is(Arm64Assembler::reg_d(D13)));
+  EXPECT_TRUE(vixl::d14.Is(Arm64Assembler::reg_d(D14)));
+  EXPECT_TRUE(vixl::d15.Is(Arm64Assembler::reg_d(D15)));
+  EXPECT_TRUE(vixl::d16.Is(Arm64Assembler::reg_d(D16)));
+  EXPECT_TRUE(vixl::d17.Is(Arm64Assembler::reg_d(D17)));
+  EXPECT_TRUE(vixl::d18.Is(Arm64Assembler::reg_d(D18)));
+  EXPECT_TRUE(vixl::d19.Is(Arm64Assembler::reg_d(D19)));
+  EXPECT_TRUE(vixl::d20.Is(Arm64Assembler::reg_d(D20)));
+  EXPECT_TRUE(vixl::d21.Is(Arm64Assembler::reg_d(D21)));
+  EXPECT_TRUE(vixl::d22.Is(Arm64Assembler::reg_d(D22)));
+  EXPECT_TRUE(vixl::d23.Is(Arm64Assembler::reg_d(D23)));
+  EXPECT_TRUE(vixl::d24.Is(Arm64Assembler::reg_d(D24)));
+  EXPECT_TRUE(vixl::d25.Is(Arm64Assembler::reg_d(D25)));
+  EXPECT_TRUE(vixl::d26.Is(Arm64Assembler::reg_d(D26)));
+  EXPECT_TRUE(vixl::d27.Is(Arm64Assembler::reg_d(D27)));
+  EXPECT_TRUE(vixl::d28.Is(Arm64Assembler::reg_d(D28)));
+  EXPECT_TRUE(vixl::d29.Is(Arm64Assembler::reg_d(D29)));
+  EXPECT_TRUE(vixl::d30.Is(Arm64Assembler::reg_d(D30)));
+  EXPECT_TRUE(vixl::d31.Is(Arm64Assembler::reg_d(D31)));
+
+  // S Registers.
+  EXPECT_TRUE(vixl::s0.Is(Arm64Assembler::reg_s(S0)));
+  EXPECT_TRUE(vixl::s1.Is(Arm64Assembler::reg_s(S1)));
+  EXPECT_TRUE(vixl::s2.Is(Arm64Assembler::reg_s(S2)));
+  EXPECT_TRUE(vixl::s3.Is(Arm64Assembler::reg_s(S3)));
+  EXPECT_TRUE(vixl::s4.Is(Arm64Assembler::reg_s(S4)));
+  EXPECT_TRUE(vixl::s5.Is(Arm64Assembler::reg_s(S5)));
+  EXPECT_TRUE(vixl::s6.Is(Arm64Assembler::reg_s(S6)));
+  EXPECT_TRUE(vixl::s7.Is(Arm64Assembler::reg_s(S7)));
+  EXPECT_TRUE(vixl::s8.Is(Arm64Assembler::reg_s(S8)));
+  EXPECT_TRUE(vixl::s9.Is(Arm64Assembler::reg_s(S9)));
+  EXPECT_TRUE(vixl::s10.Is(Arm64Assembler::reg_s(S10)));
+  EXPECT_TRUE(vixl::s11.Is(Arm64Assembler::reg_s(S11)));
+  EXPECT_TRUE(vixl::s12.Is(Arm64Assembler::reg_s(S12)));
+  EXPECT_TRUE(vixl::s13.Is(Arm64Assembler::reg_s(S13)));
+  EXPECT_TRUE(vixl::s14.Is(Arm64Assembler::reg_s(S14)));
+  EXPECT_TRUE(vixl::s15.Is(Arm64Assembler::reg_s(S15)));
+  EXPECT_TRUE(vixl::s16.Is(Arm64Assembler::reg_s(S16)));
+  EXPECT_TRUE(vixl::s17.Is(Arm64Assembler::reg_s(S17)));
+  EXPECT_TRUE(vixl::s18.Is(Arm64Assembler::reg_s(S18)));
+  EXPECT_TRUE(vixl::s19.Is(Arm64Assembler::reg_s(S19)));
+  EXPECT_TRUE(vixl::s20.Is(Arm64Assembler::reg_s(S20)));
+  EXPECT_TRUE(vixl::s21.Is(Arm64Assembler::reg_s(S21)));
+  EXPECT_TRUE(vixl::s22.Is(Arm64Assembler::reg_s(S22)));
+  EXPECT_TRUE(vixl::s23.Is(Arm64Assembler::reg_s(S23)));
+  EXPECT_TRUE(vixl::s24.Is(Arm64Assembler::reg_s(S24)));
+  EXPECT_TRUE(vixl::s25.Is(Arm64Assembler::reg_s(S25)));
+  EXPECT_TRUE(vixl::s26.Is(Arm64Assembler::reg_s(S26)));
+  EXPECT_TRUE(vixl::s27.Is(Arm64Assembler::reg_s(S27)));
+  EXPECT_TRUE(vixl::s28.Is(Arm64Assembler::reg_s(S28)));
+  EXPECT_TRUE(vixl::s29.Is(Arm64Assembler::reg_s(S29)));
+  EXPECT_TRUE(vixl::s30.Is(Arm64Assembler::reg_s(S30)));
+  EXPECT_TRUE(vixl::s31.Is(Arm64Assembler::reg_s(S31)));
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index f007d28..bfb2829 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -70,11 +70,13 @@
     return ManagedRegister();
   }
 
+  int RegId() const { return id_; }
+  explicit ManagedRegister(int reg_id) : id_(reg_id) { }
+
  protected:
   static const int kNoRegister = -1;
 
   ManagedRegister() : id_(kNoRegister) { }
-  explicit ManagedRegister(int reg_id) : id_(reg_id) { }
 
   int id_;
 };
@@ -89,6 +91,9 @@
   explicit ManagedRegisterSpill(const ManagedRegister& other)
       : ManagedRegister(other), size_(-1), spill_offset_(-1) { }
 
+  explicit ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
+      : ManagedRegister(other), size_(size), spill_offset_(-1) { }
+
   int32_t getSpillOffset() {
     return spill_offset_;
   }
@@ -111,6 +116,11 @@
     std::vector<ManagedRegisterSpill>::push_back(spill);
   }
 
+  void push_back(ManagedRegister __x, int32_t __size) {
+    ManagedRegisterSpill spill(__x, __size);
+    std::vector<ManagedRegisterSpill>::push_back(spill);
+  }
+
   void push_back(ManagedRegisterSpill __x) {
     std::vector<ManagedRegisterSpill>::push_back(__x);
   }
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 6043c17..6a3efc5 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -863,6 +863,10 @@
   EmitOperand(dst, Operand(src));
 }
 
+void X86Assembler::xorl(Register dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(6, Operand(dst), imm);
+}
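The first argument to EmitComplex is the ModRM /digit of x86's group-1 immediate ALU encoding (ADD=0, OR=1, ADC=2, SBB=3, AND=4, SUB=5, XOR=6, CMP=7), so 6 selects XOR. A usage sketch:

  X86Assembler assembler;
  assembler.xorl(EAX, Immediate(1));  // flip the low bit of EAX, e.g. to negate a boolean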
 
 void X86Assembler::addl(Register reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index f8fc4c0..057c80a 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -354,6 +354,7 @@
   void orl(Register dst, Register src);
 
   void xorl(Register dst, Register src);
+  void xorl(Register dst, const Immediate& imm);
 
   void addl(Register dst, Register src);
   void addl(Register reg, const Immediate& imm);
diff --git a/compiler/utils/x86/managed_register_x86.cc b/compiler/utils/x86/managed_register_x86.cc
index 7fae7a8..034a795 100644
--- a/compiler/utils/x86/managed_register_x86.cc
+++ b/compiler/utils/x86/managed_register_x86.cc
@@ -33,7 +33,8 @@
   P(EDX, EDI)                 \
   P(ECX, EBX)                 \
   P(ECX, EDI)                 \
-  P(EBX, EDI)
+  P(EBX, EDI)                 \
+  P(ECX, EDX)
 
 
 struct RegisterPairDescriptor {
diff --git a/compiler/utils/x86/managed_register_x86.h b/compiler/utils/x86/managed_register_x86.h
index 0201a96..09d2b49 100644
--- a/compiler/utils/x86/managed_register_x86.h
+++ b/compiler/utils/x86/managed_register_x86.h
@@ -37,7 +37,8 @@
   ECX_EBX = 7,
   ECX_EDI = 8,
   EBX_EDI = 9,
-  kNumberOfRegisterPairs = 10,
+  ECX_EDX = 10,  // Dalvik-style argument passing.
+  kNumberOfRegisterPairs = 11,
   kNoRegisterPair = -1,
 };
 
@@ -121,6 +122,12 @@
     return FromRegId(AllocIdHigh()).AsCpuRegister();
   }
 
+  RegisterPair AsRegisterPair() const {
+    CHECK(IsRegisterPair());
+    return static_cast<RegisterPair>(id_ -
+        (kNumberOfCpuRegIds + kNumberOfXmmRegIds + kNumberOfX87RegIds));
+  }
+
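Worked example of the id arithmetic (assuming the usual x86 counts of 8 CPU, 8 XMM and 8 X87 reg ids): ECX_EDX is stored as managed id 8 + 8 + 8 + 10 = 34, and AsRegisterPair() recovers the enum value by subtracting the 24 scalar ids:

  X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(ECX_EDX);
  CHECK_EQ(ECX_EDX, pair.AsRegisterPair());   // 34 - 24 == 10
  CHECK_EQ(ECX, pair.AsRegisterPairLow());
  CHECK_EQ(EDX, pair.AsRegisterPairHigh());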
   bool IsCpuRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCpuRegIds);
diff --git a/dalvikvm/Android.mk b/dalvikvm/Android.mk
index 231fba1..e99c76f 100644
--- a/dalvikvm/Android.mk
+++ b/dalvikvm/Android.mk
@@ -29,7 +29,7 @@
 LOCAL_MULTILIB := both
 LOCAL_MODULE_STEM_32 := dalvikvm
 LOCAL_MODULE_STEM_64 := dalvikvm64
-include external/stlport/libstlport.mk
+include art/build/Android.libcxx.mk
 include $(BUILD_EXECUTABLE)
 ART_TARGET_EXECUTABLES += $(TARGET_OUT_EXECUTABLES)/$(LOCAL_MODULE)
 
@@ -43,6 +43,8 @@
 LOCAL_SHARED_LIBRARIES := libnativehelper
 LOCAL_LDFLAGS := -ldl -lpthread
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+LOCAL_IS_HOST_MODULE := true
+include art/build/Android.libcxx.mk
 include $(BUILD_HOST_EXECUTABLE)
 ART_HOST_EXECUTABLES += $(HOST_OUT_EXECUTABLES)/$(LOCAL_MODULE)
 endif
diff --git a/dalvikvm/dalvikvm.cc b/dalvikvm/dalvikvm.cc
index 3486c1d..8d71a7c 100644
--- a/dalvikvm/dalvikvm.cc
+++ b/dalvikvm/dalvikvm.cc
@@ -15,11 +15,10 @@
  */
 
 #include <signal.h>
+#include <stdio.h>
+#include <string.h>
 
 #include <algorithm>
-#include <cstdio>
-#include <cstring>
-#include <string>
 
 #include "jni.h"
 #include "JniInvocation.h"
diff --git a/dex2oat/Android.mk b/dex2oat/Android.mk
index 038f0a7..c17788e 100644
--- a/dex2oat/Android.mk
+++ b/dex2oat/Android.mk
@@ -21,11 +21,19 @@
 DEX2OAT_SRC_FILES := \
 	dex2oat.cc
 
+# TODO: Remove this when the framework (installd) supports pushing the
+# right instruction-set parameter for the primary architecture.
+ifneq ($(filter ro.zygote=zygote64,$(PRODUCT_DEFAULT_PROPERTY_OVERRIDES)),)
+  dex2oat_arch := 64
+else
+  dex2oat_arch := 32
+endif
+
 ifeq ($(ART_BUILD_TARGET_NDEBUG),true)
-  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler,art/compiler,target,ndebug,32))
+  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler,art/compiler,target,ndebug,$(dex2oat_arch)))
 endif
 ifeq ($(ART_BUILD_TARGET_DEBUG),true)
-  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler,art/compiler,target,debug,32))
+  $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler,art/compiler,target,debug,$(dex2oat_arch)))
 endif
 
 ifeq ($(WITH_HOST_DALVIK),true)
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index f665f5c..ac76c35 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -139,7 +139,7 @@
   UsageError("      Example: --android-root=out/host/linux-x86");
   UsageError("      Default: $ANDROID_ROOT");
   UsageError("");
-  UsageError("  --instruction-set=(arm|mips|x86|x86_64): compile for a particular instruction");
+  UsageError("  --instruction-set=(arm|arm64|mips|x86|x86_64): compile for a particular instruction");
   UsageError("      set.");
   UsageError("      Example: --instruction-set=x86");
   UsageError("      Default: arm");
@@ -743,19 +743,7 @@
   InstructionSetFeatures instruction_set_features =
       ParseFeatureList(Runtime::GetDefaultInstructionSetFeatures());
 
-#if defined(__arm__)
-  InstructionSet instruction_set = kThumb2;
-#elif defined(__aarch64__)
-  InstructionSet instruction_set = kArm64;
-#elif defined(__i386__)
-  InstructionSet instruction_set = kX86;
-#elif defined(__x86_64__)
-  InstructionSet instruction_set = kX86_64;
-#elif defined(__mips__)
-  InstructionSet instruction_set = kMips;
-#else
-  InstructionSet instruction_set = kNone;
-#endif
+  InstructionSet instruction_set = kRuntimeISA;
 
   // Profile file to use
   std::string profile_file;
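For reference, kRuntimeISA is the build-time constant from the runtime's instruction_set.h: roughly the same #if chain deleted above, defined once in the runtime header so every tool agrees with the runtime (a sketch from memory, not part of this diff):

  #if defined(__arm__)
  static constexpr InstructionSet kRuntimeISA = kArm;
  #elif defined(__aarch64__)
  static constexpr InstructionSet kRuntimeISA = kArm64;
  #elif defined(__i386__)
  static constexpr InstructionSet kRuntimeISA = kX86;
  #elif defined(__x86_64__)
  static constexpr InstructionSet kRuntimeISA = kX86_64;
  #elif defined(__mips__)
  static constexpr InstructionSet kRuntimeISA = kMips;
  #else
  static constexpr InstructionSet kRuntimeISA = kNone;
  #endif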
diff --git a/disassembler/Android.mk b/disassembler/Android.mk
index 56929fc..17828fd 100644
--- a/disassembler/Android.mk
+++ b/disassembler/Android.mk
@@ -43,11 +43,10 @@
   art_ndebug_or_debug := $(2)
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-    include external/stlport/libstlport.mk
-  else
-    LOCAL_IS_HOST_MODULE := true
+  ifeq ($$(art_target_or_host),host)
+     LOCAL_IS_HOST_MODULE := true
   endif
+  include art/build/Android.libcxx.mk
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   ifeq ($$(art_ndebug_or_debug),ndebug)
     LOCAL_MODULE := libart-disassembler
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 899aa78..d6d2058 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -115,6 +115,10 @@
   "tst", "rsb", "cmp", "cmn", "orr", "mul", "bic", "mvn",
 };
 
+static const char* const kThumb2ShiftOperations[] = {
+    "lsl", "lsr", "asr", "ror"
+};
+
 static const char* kThumbReverseOperations[] = {
     "rev", "rev16", "rbit", "revsh"
 };
@@ -359,6 +363,61 @@
   }
 }
 
+uint32_t VFPExpand32(uint32_t imm8) {
+  CHECK_EQ(imm8 & 0xffu, imm8);
+  uint32_t bit_a = (imm8 >> 7) & 1;
+  uint32_t bit_b = (imm8 >> 6) & 1;
+  uint32_t slice = imm8 & 0x3f;
+  return (bit_a << 31) | ((1 << 30) - (bit_b << 25)) | (slice << 19);
+}
+
+uint64_t VFPExpand64(uint32_t imm8) {
+  CHECK_EQ(imm8 & 0xffu, imm8);
+  uint64_t bit_a = (imm8 >> 7) & 1;
+  uint64_t bit_b = (imm8 >> 6) & 1;
+  uint64_t slice = imm8 & 0x3f;
+  return (bit_a << 63) | ((UINT64_C(1) << 62) - (bit_b << 54)) | (slice << 48);
+}
+
+uint64_t AdvSIMDExpand(uint32_t op, uint32_t cmode, uint32_t imm8) {
+  CHECK_EQ(op & 1, op);
+  CHECK_EQ(cmode & 0xf, cmode);
+  CHECK_EQ(imm8 & 0xff, imm8);
+  int32_t cmode321 = cmode >> 1;
+  if (imm8 == 0 && cmode321 != 0 && cmode321 != 4 && cmode321 != 7) {
+    return INT64_C(0x00000000deadbeef);  // UNPREDICTABLE
+  }
+  uint64_t imm = imm8;
+  switch (cmode321) {
+    case 3: imm <<= 8;  // Fall through.
+    case 2: imm <<= 8;  // Fall through.
+    case 1: imm <<= 8;  // Fall through.
+    case 0: return static_cast<int64_t>((imm << 32) | imm);
+    case 5: imm <<= 8;  // Fall through.
+    case 4: return static_cast<int64_t>((imm << 48) | (imm << 32) | (imm << 16) | imm);
+    case 6:
+      imm = ((imm + 1u) << ((cmode & 1) != 0 ? 16 : 8)) - 1u;  // Add 8 or 16 ones.
+      return static_cast<int64_t>((imm << 32) | imm);
+    default:
+      CHECK_EQ(cmode321, 7);
+      if ((cmode & 1) == 0 && op == 0) {
+        imm = (imm << 8) | imm;
+        return static_cast<int64_t>((imm << 48) | (imm << 32) | (imm << 16) | imm);
+      } else if ((cmode & 1) == 0 && op != 0) {
+        for (int i = 1; i != 8; ++i) {
+          imm |= ((imm >> i) & UINT64_C(1)) << (i * 8);
+        }
+        imm = imm & ~UINT64_C(0xfe);
+        return static_cast<int64_t>((imm << 8) - imm);
+      } else if ((cmode & 1) != 0 && op == 0) {
+        imm = static_cast<uint32_t>(VFPExpand32(imm8));
+        return static_cast<int64_t>((imm << 32) | imm);
+      } else {
+        return INT64_C(0xdeadbeef00000000);  // UNDEFINED
+      }
+  }
+}
+
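A quick hand-check of the expansion helpers (illustrative, not from this patch's tests): imm8 = 0x70 has bit_a = 0, bit_b = 1, slice = 0x30, which is the VFP immediate encoding of 1.0 in both widths:

  CHECK_EQ(0x3F800000u, VFPExpand32(0x70));                   // 1.0f
  CHECK_EQ(UINT64_C(0x3FF0000000000000), VFPExpand64(0x70));  // 1.0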
 size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) {
   uint32_t instr = (ReadU16(instr_ptr) << 16) | ReadU16(instr_ptr + 2);
   // |111|1 1|1000000|0000|1111110000000000|
@@ -757,83 +816,136 @@
             }
           } else if ((op3 >> 4) == 2 && op4 == 0) {     // 10xxxx, op = 0
             // fp data processing
+            // VMLA, VMLS, VMUL, VNMUL, VADD, VSUB, VDIV, VMOV, ...
+            // |1111|1100|0|0|00|0000|1111|110|0|0|0|0|0|0000|
+            // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7|6|5|4|3  0|
+            // |----|----|-|-|--|----|----|---|-|-|-|-|-|----|
+            // |3322|2222|2|2|22|1111|1111|110|0|0|0|0|0|0000|
+            // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7|6|5|4|3  0|
+            // |----|----|-|-|--|----|----|---|-|-|-|-|-|----|
+            // |1110|1110|  op3 | Vn | Vd |101|S|N|Q|M|0| Vm |
+            // |1110|1110|0|D|00| Vn | Vd |101|S|N|0|M|0| Vm | VMLA
+            // |1110|1110|0|D|00| Vn | Vd |101|S|N|1|M|0| Vm | VMLS
+            // |1110|1110|0|D|10| Vn | Vd |101|S|N|0|M|0| Vm | VMUL
+            // |1110|1110|0|D|10| Vn | Vd |101|S|N|1|M|0| Vm | VNMUL
+            // |1110|1110|0|D|11| Vn | Vd |101|S|N|0|M|0| Vm | VADD
+            // |1110|1110|0|D|11| Vn | Vd |101|S|N|1|M|0| Vm | VSUB
+            // |1110|1110|1|D|00| Vn | Vd |101|S|N|0|M|0| Vm | VDIV
+            // |1110|1110|1|D|11| iH | Vd |101|S|0|0|0|0| iL | VMOV (imm)
+            // |1110|1110|1|D|11|op5 | Vd |101|S|.|1|M|0| Vm | ... (see below)
+            uint32_t S = (instr >> 8) & 1;
+            uint32_t Q = (instr >> 6) & 1;
+            FpRegister d(instr, 12, 22);
+            FpRegister n(instr, 16, 7);
+            FpRegister m(instr, 0, 5);
             if ((op3 & 0xB) == 0) {  // 100x00
-              // VMLA, VMLS
-              // |1111|1100|0|0|00|0000|1111|110|0|0|0 |0|0|0000|
-              // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7|6 |5|4|3  0|
-              // |----|----|-|-|--|----|----|---|-|-|- |-|-|----|
-              // |3322|2222|2|2|22|1111|1111|110|0|0|0 |0|0|0000|
-              // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7|6 |5|4|3  0|
-              // |----|----|-|-|--|----|----|---|-|-|- |-|-|----|
-              // |1110|1110|0|D|00| Vn | Vd |101|S|N|op|M|0| Vm |
-              uint32_t op = (instr >> 6) & 1;
-              FpRegister d(instr, 12, 22);
-              FpRegister n(instr, 16, 7);
-              FpRegister m(instr, 0, 5);
-              opcode << (op == 0 ? "vmla" : "vmls");
+              opcode << (Q == 0 ? "vmla" : "vmls") << (S != 0 ? ".f64" : ".f32");
               args << d << ", " << n << ", " << m;
-            } else if ((op3 & 0xB) == 0xB) {  // 101x11
-              uint32_t Q = (instr >> 6) & 1;
-              if (Q == 1) {
-                // VCVT (floating-point conversion)
-                // |1111|1100|0|0|00|0000|1111|110|0|0 |0|0|0|0000|
-                // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7 |6|5|4|3  0|
-                // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
-                // |3322|2222|2|2|22|1111|1111|110|0|0 |0|0|0|0000|
-                // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7 |6|5|4|3  0|
-                // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
-                // |1110|1110|1|D|11|op5 | Vd |101|S|op|1|M|0| Vm |
-                uint32_t op5 = (instr >> 16) & 0xF;
-                uint32_t S = (instr >> 8) & 1;
-                uint32_t op = (instr >> 7) & 1;
-                // Register types in these instructions relies on the combination of op5 and S.
-                FpRegister Dd(instr, 12, 22, 1);
-                FpRegister Sd(instr, 12, 22, 0);
-                FpRegister Dm(instr, 0, 5, 1);
-                FpRegister Sm(instr, 0, 5, 0);
-                if (op5 == 0xD) {
+            } else if ((op3 & 0xB) == 0x2) {  // 100x10
+              opcode << (Q == 0 ? "vmul" : "vnmul") << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0x3) {  // 100x11
+              opcode << (Q == 0 ? "vadd" : "vsub") << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0x8 && Q == 0) {  // 101x00, Q == 0
+              opcode << "vdiv" << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << n << ", " << m;
+            } else if ((op3 & 0xB) == 0xB && Q == 0) {  // 101x11, Q == 0
+              uint32_t imm8 = ((instr & 0xf0000u) >> 12) | (instr & 0xfu);
+              opcode << "vmov" << (S != 0 ? ".f64" : ".f32");
+              args << d << ", " << (S != 0 ? StringPrintf("0x%016" PRIx64, VFPExpand64(imm8))
+                                           : StringPrintf("0x%08x", VFPExpand32(imm8)));
+              if ((instr & 0xa0) != 0) {
+                args << " (UNPREDICTABLE)";
+              }
+            } else if ((op3 & 0xB) == 0xB && Q == 1) {  // 101x11, Q == 1
+              // VNEG, VSQRT, VCMP, VCMPE, VCVT (floating-point conversion)
+              // |1111|1100|0|0|00|0000|1111|110|0|0 |0|0|0|0000|
+              // |5  2|1  8|7|6|54|3  0|5  2|1 9|8|7 |6|5|4|3  0|
+              // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
+              // |3322|2222|2|2|22|1111|1111|110|0|0 |0|0|0|0000|
+              // |1  8|7  4|3|2|10|9  6|5  2|1 9|8|7 |6|5|4|3  0|
+              // |----|----|-|-|--|----|----|---|-|- |-|-|-|----|
+              // |1110|1110|1|D|11|0000| Vd |101|S|0 |1|M|0| Vm | VMOV (reg)
+              // |1110|1110|1|D|11|0000| Vd |101|S|1 |1|M|0| Vm | VABS
+              // |1110|1110|1|D|11|0001| Vd |101|S|0 |1|M|0| Vm | VNEG
+              // |1110|1110|1|D|11|0001| Vd |101|S|1 |1|M|0| Vm | VSQRT
+              // |1110|1110|1|D|11|0100| Vd |101|S|op|1|M|0| Vm | VCMP
+              // |1110|1110|1|D|11|0101| Vd |101|S|op|1|0|0|0000| VCMPE
+              // |1110|1110|1|D|11|op5 | Vd |101|S|op|1|M|0| Vm | VCVT
+              uint32_t op5 = (instr >> 16) & 0xF;
+              uint32_t op = (instr >> 7) & 1;
+              // Register types in VCVT instructions rely on the combination of op5 and S.
+              FpRegister Dd(instr, 12, 22, 1);
+              FpRegister Sd(instr, 12, 22, 0);
+              FpRegister Dm(instr, 0, 5, 1);
+              FpRegister Sm(instr, 0, 5, 0);
+              if (op5 == 0) {
+                opcode << (op == 0 ? "vmov" : "vabs") << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+              } else if (op5 == 1) {
+                opcode << (op != 0 ? "vsqrt" : "vneg") << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+              } else if (op5 == 4) {
+                opcode << "vcmp" << (S != 0 ? ".f64" : ".f32");
+                args << d << ", " << m;
+                if (op != 0) {
+                  args << " (quiet nan)";
+                }
+              } else if (op5 == 5) {
+                opcode << "vcmpe" << (S != 0 ? ".f64" : ".f32");
+                args << d << ", #0.0";
+                if (op != 0) {
+                  args << " (quiet nan)";
+                }
+                if ((instr & 0x2f) != 0) {
+                  args << " (UNPREDICTABLE)";
+                }
+              } else if (op5 == 0xD) {
+                if (S == 1) {
+                  // vcvt{r}.s32.f64
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f64";
+                  args << Sd << ", " << Dm;
+                } else {
+                  // vcvt{r}.s32.f32
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0xC) {
+                if (S == 1) {
+                  // vcvt{r}.u32.f64
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f64";
+                  args << Sd << ", " << Dm;
+                } else {
+                  // vcvt{r}.u32.f32
+                  opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0x8) {
+                if (S == 1) {
+                  // vcvt.f64.<Tm>
+                  opcode << "vcvt.f64." << (op == 0 ? "u" : "s") << "32";
+                  args << Dd << ", " << Sm;
+                } else {
+                  // vcvt.f32.<Tm>
+                  opcode << "vcvt.f32." << (op == 0 ? "u" : "s") << "32";
+                  args << Sd << ", " << Sm;
+                }
+              } else if (op5 == 0x7) {
+                if (op == 1) {
                   if (S == 1) {
-                    // vcvt{r}.s32.f64
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f64";
-                    args << Sd << ", " << Dm;
-                  } else {
-                    // vcvt{r}.s32.f32
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".s32.f32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0xC) {
-                  if (S == 1) {
-                    // vcvt{r}.u32.f64
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f64";
-                    args << Sd << ", " << Dm;
-                  } else {
-                    // vcvt{r}.u32.f32
-                    opcode << "vcvt" << (op == 0 ? "r" : "") << ".u32.f32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0x8) {
-                  if (S == 1) {
-                    // vcvt.f64.<Tm>
-                    opcode << "vcvt.f64." << (op == 0 ? "u" : "s") << "32";
+                    // vcvt.f64.f32
+                    opcode << "vcvt.f64.f32";
                     args << Dd << ", " << Sm;
                   } else {
-                    // vcvt.f32.<Tm>
-                    opcode << "vcvt.f32." << (op == 0 ? "u" : "s") << "32";
-                    args << Sd << ", " << Sm;
-                  }
-                } else if (op5 == 0x7) {
-                  if (op == 1) {
-                    if (S == 1) {
-                      // vcvt.f64.f32
-                      opcode << "vcvt.f64.f32";
-                      args << Dd << ", " << Sm;
-                    } else {
-                      // vcvt.f32.f64
-                      opcode << "vcvt.f32.f64";
-                      args << Sd << ", " << Dm;
-                    }
+                    // vcvt.f32.f64
+                    opcode << "vcvt.f32.f64";
+                    args << Sd << ", " << Dm;
                   }
                 }
+              } else if ((op5 & 0xa) == 0xa) {
+                opcode << "vcvt";
+                args << "[undecoded: floating <-> fixed]";
               }
             }
           } else if ((op3 >> 4) == 2 && op4 == 1) {     // 10xxxx, op = 1
@@ -886,53 +998,6 @@
             }
           }
         }
-
-        if ((op3 & 0x30) == 0x20 && op4 == 0) {  // 10 xxxx ... 0
-          if ((coproc & 0xE) == 0xA) {
-            // VFP data-processing instructions
-            // |111|1|1100|0000|0000|1111|110|0|00  |0|0|0000|
-            // |5 3|2|1098|7654|3  0|54 2|10 |8|76  |5|4|3  0|
-            // |---|-|----|----|----|----|---|-|----|-|-|----|
-            // |332|2|2222|2222|1111|1111|110|0|00  |0|0|0000|
-            // |1 9|8|7654|3210|9  6|54 2|109|8|76  |5|4|3  0|
-            // |---|-|----|----|----|----|---|-|----|-|-|----|
-            // |111|T|1110|opc1|opc2|    |101| |opc3| | |    |
-            //  111 0 1110|1111 0100 1110 101 0 01   1 0 1001 - eef4ea69
-            uint32_t opc1 = (instr >> 20) & 0xF;
-            uint32_t opc2 = (instr >> 16) & 0xF;
-            uint32_t opc3 = (instr >> 6) & 0x3;
-            if ((opc1 & 0xB) == 0xB) {  // 1x11
-              // Other VFP data-processing instructions.
-              uint32_t sz = (instr >> 8) & 1;
-              FpRegister d(instr, 12, 22);
-              FpRegister m(instr, 0, 5);
-              switch (opc2) {
-                case 0x1:  // Vneg/Vsqrt
-                  //  1110 11101 D 11 0001 dddd 101s o1M0 mmmm
-                  opcode << (opc3 == 1 ? "vneg" : "vsqrt") << (sz == 1 ? ".f64" : ".f32");
-                  args << d << ", " << m;
-                  break;
-                case 0x4: case 0x5:  {  // Vector compare
-                  // 1110 11101 D 11 0100 dddd 101 sE1M0 mmmm
-                  opcode << (opc3 == 1 ? "vcmp" : "vcmpe") << (sz == 1 ? ".f64" : ".f32");
-                  args << d << ", " << m;
-                  break;
-                }
-              }
-            }
-          }
-        } else if ((op3 & 0x30) == 0x30) {  // 11 xxxx
-          // Advanced SIMD
-          if ((instr & 0xFFBF0ED0) == 0xeeb10ac0) {  // Vsqrt
-            //  1110 11101 D 11 0001 dddd 101S 11M0 mmmm
-            //  1110 11101 0 11 0001 1101 1011 1100 1000 - eeb1dbc8
-            uint32_t sz = (instr >> 8) & 1;
-            FpRegister d(instr, 12, 22);
-            FpRegister m(instr, 0, 5);
-            opcode << "vsqrt" << (sz == 1 ? ".f64" : ".f32");
-            args << d << ", " << m;
-          }
-        }
       }
       break;
     case 2:
@@ -1388,6 +1453,16 @@
       default:      // more formats
         if ((op2 >> 4) == 2) {      // 010xxxx
           // data processing (register)
+          if ((instr & 0x0080f0f0) == 0x0000f000) {
+            // LSL, LSR, ASR, ROR
+            uint32_t shift_op = (instr >> 21) & 3;
+            uint32_t S = (instr >> 20) & 1;
+            ArmRegister Rd(instr, 8);
+            ArmRegister Rn(instr, 16);
+            ArmRegister Rm(instr, 0);
+            opcode << kThumb2ShiftOperations[shift_op] << (S != 0 ? "s" : "");
+            args << Rd << ", " << Rn << ", " << Rm;
+          }
         } else if ((op2 >> 3) == 6) {       // 0110xxx
           // Multiply, multiply accumulate, and absolute difference
           op1 = (instr >> 20) & 0x7;
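
As a reading aid for the decode hunks above: the disassembler pulls each named field out of the 32-bit instruction word with a shift and a mask. A minimal standalone sketch of that pattern (the instruction value below is an illustrative encoding, not taken from this diff):

    // Sketch only: field extraction as used by the VFP/Thumb2 decode above.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t instr = 0xEEB10BC0;         // vsqrt.f64 d0, d0 (illustrative)
      uint32_t op5 = (instr >> 16) & 0xF;  // selects vmov/vabs/vneg/vsqrt/vcmp/vcvt
      uint32_t op  = (instr >> 7) & 1;     // picks the variant within an op5 group
      uint32_t S   = (instr >> 8) & 1;     // 0 = .f32, 1 = .f64
      std::printf("op5=%u op=%u S=%u\n", op5, op, S);  // op5=1 op=1 S=1 -> vsqrt.f64
      return 0;
    }
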
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 915c415..c191226 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -864,7 +864,7 @@
         }
       }
       // Dump the large objects separately.
-      heap->GetLargeObjectsSpace()->GetLiveObjects()->Walk(ImageDumper::Callback, this);
+      heap->GetLargeObjectsSpace()->GetLiveBitmap()->Walk(ImageDumper::Callback, this);
       indent_os << "\n";
       os_ = saved_os;
     }
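
The one-line oatdump fix swaps an accessor but keeps the same visitor shape: Walk() invokes a C-style callback once per live object. A hedged sketch of that shape (the types and names below are illustrative, not the actual ART declarations):

    #include <cstdio>

    struct Object {};
    using WalkCallback = void (*)(Object* obj, void* arg);

    struct LiveBitmap {
      void Walk(WalkCallback cb, void* arg) {
        static Object objs[2];               // stand-ins for marked large objects
        for (Object& o : objs) cb(&o, arg);  // one callback per live object
      }
    };

    static void DumpCallback(Object* obj, void* arg) {
      std::printf("dump %p (state %p)\n", static_cast<void*>(obj), arg);
    }

    int main() {
      LiveBitmap bitmap;
      bitmap.Walk(DumpCallback, nullptr);  // mirrors GetLiveBitmap()->Walk(Callback, this)
      return 0;
    }
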
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 9df69f0..d433fd5 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -344,9 +344,6 @@
   art_clang := $(3)
 
   include $(CLEAR_VARS)
-  ifeq ($$(art_target_or_host),target)
-    include external/stlport/libstlport.mk
-  endif
   LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
   ifeq ($$(art_ndebug_or_debug),ndebug)
     LOCAL_MODULE := libart
@@ -366,6 +363,8 @@
     LOCAL_IS_HOST_MODULE := true
   endif
 
+  include art/build/Android.libcxx.mk
+
   GENERATED_SRC_DIR := $$(call local-generated-sources-dir)
   ENUM_OPERATOR_OUT_CC_FILES := $$(patsubst %.h,%_operator_out.cc,$$(LIBART_ENUM_OPERATOR_OUT_HEADER_FILES))
   ENUM_OPERATOR_OUT_GEN := $$(addprefix $$(GENERATED_SRC_DIR)/,$$(ENUM_OPERATOR_OUT_CC_FILES))
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index aaba598..3bbec71 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -35,7 +35,7 @@
 
 extern "C" void art_quick_throw_null_pointer_exception();
 extern "C" void art_quick_throw_stack_overflow(void*);
-extern "C" void art_quick_test_suspend();
+extern "C" void art_quick_implicit_suspend();
 
 // Get the size of a thumb2 instruction in bytes.
 static uint32_t GetInstructionSize(uint8_t* pc) {
@@ -142,7 +142,7 @@
   if (found) {
     LOG(DEBUG) << "suspend check match";
     // This is a suspend check.  Arrange for the signal handler to return to
-    // art_quick_test_suspend.  Also set LR so that after the suspend check it
+    // art_quick_implicit_suspend.  Also set LR so that after the suspend check it
     // will resume the instruction (current PC + 2).  PC points to the
     // ldr r0,[r0,#0] instruction (r0 will be 0, set by the trigger).
 
@@ -151,7 +151,7 @@
     LOG(DEBUG) << "arm lr: " << std::hex << sc->arm_lr;
     LOG(DEBUG) << "arm pc: " << std::hex << sc->arm_pc;
     sc->arm_lr = sc->arm_pc + 3;      // +2 + 1 (for thumb)
-    sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_test_suspend);
+    sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_implicit_suspend);
 
     // Now remove the suspend trigger that caused this fault.
     Thread::Current()->RemoveSuspendTrigger();
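
The return-address arithmetic in this handler is worth spelling out: the faulting instruction is a 2-byte Thumb load, so the resume address is PC + 2, and the low bit is set to keep the core in Thumb state. A tiny sketch with an illustrative PC value:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uintptr_t faulting_pc = 0x4000;             // points at the 16-bit ldr r0, [r0, #0]
      uintptr_t resume_lr = faulting_pc + 2 + 1;  // skip the load; +1 keeps Thumb mode
      std::printf("lr = 0x%lx\n", static_cast<unsigned long>(resume_lr));
      return 0;
    }
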
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 71dcd7f..4903732 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -888,6 +888,14 @@
     RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
 END art_quick_test_suspend
 
+ENTRY art_quick_implicit_suspend
+    mov    r0, rSELF
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME          @ save callee saves for stack crawl
+    mov    r1, sp
+    bl     artTestSuspendFromCode             @ (Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+END art_quick_implicit_suspend
+
     /*
      * Called by managed code that is attempting to call a method on a proxy class. On entry
      * r0 holds the proxy method and r1 holds the receiver; r2 and r3 may contain arguments. The
diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc
index 3d63c36..c96ff60 100644
--- a/runtime/arch/arm64/context_arm64.cc
+++ b/runtime/arch/arm64/context_arm64.cc
@@ -116,7 +116,8 @@
   uint64_t gprs[32];
   uint64_t fprs[32];
 
-  for (size_t i = 0; i < kNumberOfCoreRegisters; ++i) {
+  // Do not use kNumberOfCoreRegisters here: it now counts SP and XZR as distinct registers.
+  for (size_t i = 0; i < 32; ++i) {
     gprs[i] = gprs_[i] != NULL ? *gprs_[i] : Arm64Context::kBadGprBase + i;
   }
   for (size_t i = 0; i < kNumberOfDRegisters; ++i) {
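
The loop above fills unsaved registers with a recognizable poison value so a bad long-jump target is easy to spot in a debugger. A standalone sketch of that pattern (the base constant is illustrative; the real value comes from Arm64Context::kBadGprBase):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t kBadGprBase = 0xebad6070;  // illustrative poison base
      uint64_t* gprs_[32] = {};                 // no saved locations in this demo
      uint64_t gprs[32];
      for (size_t i = 0; i < 32; ++i) {
        gprs[i] = gprs_[i] != nullptr ? *gprs_[i] : kBadGprBase + i;
      }
      std::printf("gprs[5] = 0x%llx\n", static_cast<unsigned long long>(gprs[5]));
      return 0;
    }
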
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index d2ed692..ba783ab 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -20,11 +20,76 @@
      * Jni dlsym lookup stub.
      */
     .extern artFindNativeMethod
-UNIMPLEMENTED art_jni_dlsym_lookup_stub
+
+    // TODO: Add CFI directives.
+ENTRY art_jni_dlsym_lookup_stub
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  bl  artFindNativeMethod
+  mov  x17, x0    // store result in scratch reg.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  cbz   x17, 1f   // is method code null?
+  br    x17       // if non-null, tail call to method's code.
+
+1:
+  ret             // restore regs and return to caller to handle exception.
+END art_jni_dlsym_lookup_stub
 
     /*
      * Entry point of native methods when JNI bug compatibility is enabled.
      */
     .extern artWorkAroundAppJniBugs
-UNIMPLEMENTED art_work_around_app_jni_bugs
+ENTRY art_work_around_app_jni_bugs
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  mov   x0, x19   // Thread::Current.
+  mov   x1, sp    // SP.
+  bl    artWorkAroundAppJniBugs   // (Thread*, SP).
+  mov   x17, x0   // save target return.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  // Tail call into JNI routine.
+  br    x17
+END art_work_around_app_jni_bugs
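
Both stubs follow the same shape: spill every argument register, call a C helper, reload the arguments, then branch to whatever the helper returned. A C-level sketch of the dlsym stub's control flow (the helper below is a stand-in; artFindNativeMethod's real signature is not shown in this diff):

    #include <cstdio>

    using NativeFn = void (*)();

    static void HelloNative() { std::printf("resolved native code\n"); }
    static NativeFn artFindNativeMethod() { return &HelloNative; }  // stand-in

    int main() {
      // stp ...: spill argument registers around the call.
      NativeFn code = artFindNativeMethod();
      // ldp ...: reload argument registers.
      if (code != nullptr) {
        code();     // br x17: tail call into the resolved method
        return 0;
      }
      return 1;     // ret: back to the caller, which handles the pending exception
    }
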
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 3082273..6ce5d06 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -60,26 +60,31 @@
 
     // Callee saved.
     stp xSELF, x19, [sp, #264]
-    stp x20, x21, [sp, #280]
-    stp x22, x23, [sp, #296]
-    stp x24, x25, [sp, #312]
-    stp x26, x27, [sp, #328]
-    stp x28, xFP, [sp, #344]    // Save FP.
-    str xLR, [sp, #360]
+    .cfi_rel_offset x18, 264
+    .cfi_rel_offset x19, 272
 
-    .cfi_offset x18,72
-    .cfi_offset x19,80
-    .cfi_offset x20,88
-    .cfi_offset x21,96
-    .cfi_offset x22,104
-    .cfi_offset x23,112
-    .cfi_offset x24,120
-    .cfi_offset x25,128
-    .cfi_offset x26,136
-    .cfi_offset x27,144
-    .cfi_offset x28,152
-    .cfi_offset x29,160
-    .cfi_offset x30,168
+    stp x20, x21, [sp, #280]
+    .cfi_rel_offset x20, 280
+    .cfi_rel_offset x21, 288
+
+    stp x22, x23, [sp, #296]
+    .cfi_rel_offset x22, 296
+    .cfi_rel_offset x23, 304
+
+    stp x24, x25, [sp, #312]
+    .cfi_rel_offset x24, 312
+    .cfi_rel_offset x25, 320
+
+    stp x26, x27, [sp, #328]
+    .cfi_rel_offset x26, 328
+    .cfi_rel_offset x27, 336
+
+    stp x28, xFP, [sp, #344]    // Save FP.
+    .cfi_rel_offset x28, 344
+    .cfi_rel_offset x29, 352
+
+    str xLR, [sp, #360]
+    .cfi_rel_offset x30, 360
 
     // Loads appropriate callee-save-method
     str x9, [sp]    // Store ArtMethod* Runtime::callee_save_methods_[kRefsAndArgs]
@@ -117,36 +122,44 @@
     stp d14, d15, [sp, #128]
 
     stp x1,  x2, [sp, #144]
-    stp x3,  x4, [sp, #160]
-    stp x5,  x6, [sp, #176]
-    stp x7,  xSELF, [sp, #192]
-    stp x19, x20, [sp, #208]
-    stp x21, x22, [sp, #224]
-    stp x23, x24, [sp, #240]
-    stp x25, x26, [sp, #256]
-    stp x27, x28, [sp, #272]
-    stp xFP, xLR, [sp, #288]
+    .cfi_rel_offset x1, 144
+    .cfi_rel_offset x2, 152
 
-    .cfi_offset x1,144
-    .cfi_offset x2,152
-    .cfi_offset x3,160
-    .cfi_offset x4,168
-    .cfi_offset x5,176
-    .cfi_offset x6,184
-    .cfi_offset x7,192
-    .cfi_offset x18,200
-    .cfi_offset x19,208
-    .cfi_offset x20,216
-    .cfi_offset x21,224
-    .cfi_offset x22,232
-    .cfi_offset x23,240
-    .cfi_offset x24,248
-    .cfi_offset x25,256
-    .cfi_offset x26,264
-    .cfi_offset x27,272
-    .cfi_offset x28,280
-    .cfi_offset x29,288
-    .cfi_offset x30,296
+    stp x3,  x4, [sp, #160]
+    .cfi_rel_offset x3, 160
+    .cfi_rel_offset x4, 168
+
+    stp x5,  x6, [sp, #176]
+    .cfi_rel_offset x5, 176
+    .cfi_rel_offset x6, 184
+
+    stp x7,  xSELF, [sp, #192]
+    .cfi_rel_offset x7, 192
+    .cfi_rel_offset x18, 200
+
+    stp x19, x20, [sp, #208]
+    .cfi_rel_offset x19, 208
+    .cfi_rel_offset x20, 216
+
+    stp x21, x22, [sp, #224]
+    .cfi_rel_offset x21, 224
+    .cfi_rel_offset x22, 232
+
+    stp x23, x24, [sp, #240]
+    .cfi_rel_offset x23, 240
+    .cfi_rel_offset x24, 248
+
+    stp x25, x26, [sp, #256]
+    .cfi_rel_offset x25, 256
+    .cfi_rel_offset x26, 264
+
+    stp x27, x28, [sp, #272]
+    .cfi_rel_offset x27, 272
+    .cfi_rel_offset x28, 280
+
+    stp xFP, xLR, [sp, #288]
+    .cfi_rel_offset x29, 288
+    .cfi_rel_offset x30, 296
 .endm
 
     /*
@@ -183,15 +196,44 @@
 
     // args.
     ldp x1,  x2, [sp, #144]
+    .cfi_restore x1
+    .cfi_restore x2
+
     ldp x3,  x4, [sp, #160]
+    .cfi_restore x3
+    .cfi_restore x4
+
     ldp x5,  x6, [sp, #176]
+    .cfi_restore x5
+    .cfi_restore x6
+
     ldp x7,  xSELF, [sp, #192]
+    .cfi_restore x7
+    .cfi_restore x18
+
     ldp x19, x20, [sp, #208]
+    .cfi_restore x19
+    .cfi_restore x20
+
     ldp x21, x22, [sp, #224]
+    .cfi_restore x21
+    .cfi_restore x22
+
     ldp x23, x24, [sp, #240]
+    .cfi_restore x23
+    .cfi_restore x24
+
     ldp x25, x26, [sp, #256]
+    .cfi_restore x25
+    .cfi_restore x26
+
     ldp x27, x28, [sp, #272]
+    .cfi_restore x27
+    .cfi_restore x28
+
     ldp xFP, xLR, [sp, #288]
+    .cfi_restore x29
+    .cfi_restore x30
 
     add sp, sp, #304
     .cfi_adjust_cfa_offset -304
@@ -210,15 +252,44 @@
 
     // args.
     ldp x1,  x2, [sp, #144]
+    .cfi_restore x1
+    .cfi_restore x2
+
     ldp x3,  x4, [sp, #160]
+    .cfi_restore x3
+    .cfi_restore x4
+
     ldp x5,  x6, [sp, #176]
+    .cfi_restore x5
+    .cfi_restore x6
+
     ldp x7,  xSELF, [sp, #192]
+    .cfi_restore x7
+    .cfi_restore x18
+
     ldp x19, x20, [sp, #208]
+    .cfi_restore x19
+    .cfi_restore x20
+
     ldp x21, x22, [sp, #224]
+    .cfi_restore x21
+    .cfi_restore x22
+
     ldp x23, x24, [sp, #240]
+    .cfi_restore x23
+    .cfi_restore x24
+
     ldp x25, x26, [sp, #256]
+    .cfi_restore x25
+    .cfi_restore x26
+
     ldp x27, x28, [sp, #272]
+    .cfi_restore x27
+    .cfi_restore x28
+
     ldp xFP, xLR, [sp, #288]
+    .cfi_restore x29
+    .cfi_restore x30
 
     add sp, sp, #304
     .cfi_adjust_cfa_offset -304
@@ -261,9 +332,14 @@
 END \c_name
 .endm
 
+// FIXME: Temporary fix for TR(XSELF).
 .macro ONE_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context.
+    mov x1, x19                       // pass Thread::Current.
+    mov x2, sp                        // pass SP.
+    b   \cxx_name                     // \cxx_name(Thread*, SP).
     brk 0
 END \c_name
 .endm
@@ -340,6 +416,113 @@
 INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
+
+.macro INVOKE_STUB_CREATE_FRAME
+
+SAVE_SIZE=5*8   // x4, x5, SP, LR & FP saved.
+SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
+
+    mov x9, sp                          // Save stack pointer.
+    .cfi_register sp,x9
+
+    add x10, x2, # SAVE_SIZE_AND_METHOD // calculate size of frame.
+    sub x10, sp, x10                    // Calculate SP position - saves + ArtMethod* +  args
+    and x10, x10, # ~0xf                // Enforce 16 byte stack alignment.
+    mov sp, x10                         // Set new SP.
+
+    sub x10, x9, #SAVE_SIZE             // Calculate new FP (later). Done here as we must move SP
+    .cfi_def_cfa_register x10           // before this.
+    .cfi_adjust_cfa_offset SAVE_SIZE
+
+    str x9, [x10, #32]                  // Save old stack pointer.
+    .cfi_rel_offset sp, 32
+
+    stp x4, x5, [x10, #16]              // Save result and shorty addresses.
+    .cfi_rel_offset x4, 16
+    .cfi_rel_offset x5, 24
+
+    stp xFP, xLR, [x10]                 // Store LR & FP.
+    .cfi_rel_offset x29, 0
+    .cfi_rel_offset x30, 8
+
+    mov xFP, x10                        // Use xFP now, as it's callee-saved.
+    .cfi_def_cfa_register x29
+    mov xSELF, x3                       // Move thread pointer into SELF register.
+
+    // Copy arguments into stack frame.
+    // Use simple copy routine for now.
+    // 4 bytes per slot.
+    // X1 - source address
+    // W2 - args length
+    // X9 - destination address.
+    // W10 - temporary
+    add x9, sp, #8     // Destination address is bottom of stack + 8 (past the null Method* slot).
+
+    // Use \@ to differentiate between macro invocations.
+.LcopyParams\@:
+    cmp w2, #0
+    beq .LendCopyParams\@
+    sub w2, w2, #4      // Need 65536 bytes of range.
+    ldr w10, [x1, x2]
+    str w10, [x9, x2]
+
+    b .LcopyParams\@
+
+.LendCopyParams\@:
+
+    // Store NULL into Method* at bottom of frame.
+    str xzr, [sp]
+
+.endm
+
+.macro INVOKE_STUB_CALL_AND_RETURN
+
+    // load method-> METHOD_QUICK_CODE_OFFSET
+    ldr x9, [x0 , #METHOD_QUICK_CODE_OFFSET]
+    // Branch to method.
+    blr x9
+
+    // Restore return value address and shorty address.
+    ldp x4,x5, [xFP, #16]
+    .cfi_restore x4
+    .cfi_restore x5
+
+    // Store result (w0/x0/s0/d0) appropriately, depending on resultType.
+    ldrb w10, [x5]
+
+    // Don't set anything for a void type.
+    cmp w10, #'V'
+    beq .Lexit_art_quick_invoke_stub\@
+
+    cmp w10, #'D'
+    bne .Lreturn_is_float\@
+    str d0, [x4]
+    b .Lexit_art_quick_invoke_stub\@
+
+.Lreturn_is_float\@:
+    cmp w10, #'F'
+    bne .Lreturn_is_int\@
+    str s0, [x4]
+    b .Lexit_art_quick_invoke_stub\@
+
+    // Just store x0. Doesn't matter if it is 64 or 32 bits.
+.Lreturn_is_int\@:
+    str x0, [x4]
+
+.Lexit_art_quick_invoke_stub\@:
+    ldr x2, [x29, #32]   // Restore stack pointer.
+    mov sp, x2
+    .cfi_restore sp
+
+    ldp x29, x30, [x29]    // Restore old frame pointer and link register.
+    .cfi_restore x29
+    .cfi_restore x30
+
+    ret
+
+.endm
+
+
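
INVOKE_STUB_CALL_AND_RETURN dispatches on one shorty character to decide which register holds the result and how wide the store is. A hedged C++ sketch of that dispatch, assuming the ART convention that 'V', 'D' and 'F' mark void, double and float returns and that the result slot is 8 bytes wide:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    struct RawResult { uint64_t x0; double d0; float s0; };  // register values after blr

    // out points at an 8-byte result slot, so the 8-byte default store is safe.
    static void StoreResult(char type, const RawResult& r, void* out) {
      switch (type) {
        case 'V': break;                                         // store nothing
        case 'D': std::memcpy(out, &r.d0, sizeof(r.d0)); break;  // str d0, [x4]
        case 'F': std::memcpy(out, &r.s0, sizeof(r.s0)); break;  // str s0, [x4]
        default:  std::memcpy(out, &r.x0, sizeof(r.x0)); break;  // str x0, [x4]
      }
    }

    int main() {
      RawResult r{};
      r.d0 = 2.5;
      double out = 0;
      StoreResult('D', r, &out);
      std::printf("%f\n", out);  // 2.500000
      return 0;
    }
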
 /*
  *  extern"C" void art_quick_invoke_stub(ArtMethod *method,   x0
  *                                       uint32_t  *args,     x1
@@ -377,63 +560,7 @@
  */
 ENTRY art_quick_invoke_stub
     // Spill registers as per AAPCS64 calling convention.
-
-SAVE_SIZE=5*8   // x4, x5, LR & FP saved.
-SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
-
-    mov x9, sp     // Save stack pointer.
-
-    mov x10, xFP   // Save frame pointer
-    .cfi_register x29,x10
-    add x11, x2, # SAVE_SIZE_AND_METHOD // calculate size of frame.
-
-    sub x11, sp, x11 // Calculate SP position - saves + ArtMethod* +  args
-
-    and x11, x11, # ~0xf  // Enforce 16 byte stack alignment.
-
-    sub xFP, x9, #SAVE_SIZE   // Calculate new FP. Don't store here until SP moved.
-    .cfi_def_cfa_register x29
-
-    mov sp, x11        // set new SP.
-
-    str x9, [xFP, #32]     // Save old stack pointer.
-
-    .cfi_offset x9, 32
-
-    stp x4, x5, [xFP, #16]  // Save result and shorty addresses.
-
-    .cfi_offset x4, 16
-    .cfi_offset x5, 24
-
-    stp x10, xLR, [xFP]   // Store lr & old fp @ fp
-
-    .cfi_offset x30, 0
-    .cfi_offset x10, 8
-
-    mov xSELF, x3       // Move thread pointer into SELF register.
-
-    // Copy arguments into stack frame.
-    // Use simple copy routine for now.
-    // 4 bytes per slot.
-    // X1 - source address
-    // W2 - args length
-    // X10 - destination address.
-    add x9, sp, #8     // Destination address is bottom of stack + NULL.
-
-    // w2 = argsize parameter.
-.LcopyParams:
-    cmp w2, #0
-    beq .LendCopyParams
-    sub w2, w2, #4      // Need 65536 bytes of range.
-    ldr w10, [x1, x2]
-    str w10, [x9, x2]
-
-    b .LcopyParams
-
-.LendCopyParams:
-
-    // Store NULL into Method* at bottom of frame.
-    str xzr, [sp]
+    INVOKE_STUB_CREATE_FRAME
 
     // Fill registers x/w1 to x/w7 and s/d0 to s/d7 with parameters.
     // Parse the passed shorty to determine which register to load.
@@ -460,7 +587,7 @@
     bne .LisDouble
 
     cmp x15, # 8*12         // Skip this load if all registers full.
-    beq .LfillRegisters
+    beq .Ladvance4
 
     add x17, x13, x15       // Calculate subroutine to jump to.
     br  x17
@@ -470,8 +597,7 @@
     bne .LisLong
 
     cmp x15, # 8*12         // Skip this load if all registers full.
-    beq .LfillRegisters
-
+    beq .Ladvance8
 
     add x17, x14, x15       // Calculate subroutine to jump to.
     br x17
@@ -481,18 +607,26 @@
     bne .LisOther
 
     cmp x8, # 6*12          // Skip this load if all registers full.
-    beq .LfillRegisters
+    beq .Ladvance8
 
     add x17, x12, x8        // Calculate subroutine to jump to.
     br x17
 
-
 .LisOther:                  // Everything else takes one vReg.
     cmp x8, # 6*12          // Skip this load if all registers full.
-    beq .LfillRegisters
+    beq .Ladvance4
+
     add x17, x11, x8        // Calculate subroutine to jump to.
     br x17
 
+.Ladvance4:
+    add x9, x9, #4
+    b .LfillRegisters
+
+.Ladvance8:
+    add x9, x9, #8
+    b .LfillRegisters
+
 // Macro for loading a parameter into a register.
 //  counter - the register with offset into these tables
 //  size - the size of the register - 4 or 8 bytes.
@@ -546,48 +680,8 @@
 
 .LcallFunction:
 
-    // load method-> METHOD_QUICK_CODE_OFFSET
-    ldr x9, [x0 , #METHOD_QUICK_CODE_OFFSET]
-    // Branch to method.
-    blr x9
+    INVOKE_STUB_CALL_AND_RETURN
 
-    // Restore return value address and shorty address.
-    ldp x4,x5, [xFP, #16]
-    .cfi_restore x4
-    .cfi_restore x5
-
-    // Store result (w0/x0/s0/d0) appropriately, depending on resultType.
-    ldrb w10, [x5]
-
-    // Don't set anything for a void type.
-    cmp w10, #'V'
-    beq .Lexit_art_quick_invoke_stub
-
-    cmp w10, #'D'
-    bne .Lreturn_is_float
-    str d0, [x4]
-    b .Lexit_art_quick_invoke_stub
-
-.Lreturn_is_float:
-    cmp w10, #'F'
-    bne .Lreturn_is_int
-    str s0, [x4]
-    b .Lexit_art_quick_invoke_stub
-
-    // Just store x0. Doesn't matter if it is 64 or 32 bits.
-.Lreturn_is_int:
-    str x0, [x4]
-
-.Lexit_art_quick_invoke_stub:
-    ldr x2, [x29, #32]   // Restore stack pointer.
-    mov sp, x2
-    .cfi_restore sp
-
-    ldp x29, x30, [x29]    // Restore old frame pointer and link register.
-    .cfi_restore x29
-    .cfi_restore x30
-
-    ret
 END art_quick_invoke_stub
 
 /*  extern"C"
@@ -600,64 +694,7 @@
  */
 ENTRY art_quick_invoke_static_stub
     // Spill registers as per AAPCS64 calling convention.
-
-SAVE_SIZE=5*8   // x4, x5, SP, LR & FP saved
-SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
-
-    mov x9, sp     // Save stack pointer.
-
-    mov x10, xFP   // Save frame pointer
-    .cfi_register x29,x10
-    add x11, x2, # SAVE_SIZE_AND_METHOD // calculate size of frame.
-
-    sub x11, sp, x11 // Calculate SP position - saves + ArtMethod* +  args
-
-    and x11, x11, # ~0xf  // Enforce 16 byte stack alignment.
-
-    sub xFP, x9, #SAVE_SIZE   // Calculate new FP. Don't store here until SP moved.
-
-    mov sp, x11        // set new SP.
-
-    .cfi_def_cfa_register   29
-
-    str x9, [xFP, #32]     // Save old stack pointer.
-
-    .cfi_offset x9, 32
-
-    stp x4, x5, [xFP, #16]  // Save result and shorty addresses.
-
-    .cfi_offset x4, 16
-    .cfi_offset x5, 24
-
-    stp x10, xLR, [x29]   // Store lr & old fp @ fp
-
-    .cfi_offset x30, 0
-    .cfi_offset x10, 8
-
-    mov xSELF, x3       // Move thread pointer into SELF register.
-
-    // Copy arguments into stack frame.
-    // Use simple copy routine for now.
-    // 4 bytes per slot.
-    // X1 - source address
-    // W2 - args length
-    // X10 - destination address.
-    add x9, sp, #8     // Destination address is bottom of stack + NULL.
-
-    // w2 = argsize parameter.
-.LcopyParams2:
-    cmp w2, #0
-    beq .LendCopyParams2
-    sub w2, w2, #4      // Need 65536 bytes of range.
-    ldr w10, [x1, x2]
-    str w10, [x9, x2]
-
-    b .LcopyParams2
-
-.LendCopyParams2:
-
-    // Store NULL into Method* at bottom of frame.
-    str xzr, [sp]
+    INVOKE_STUB_CREATE_FRAME
 
     // Fill registers x/w1 to x/w7 and s/d0 to s/d7 with parameters.
     // Parse the passed shorty to determine which register to load.
@@ -683,7 +720,7 @@
     bne .LisDouble2
 
     cmp x15, # 8*12         // Skip this load if all registers full.
-    beq .LfillRegisters2
+    beq .Ladvance4_2
 
     add x17, x13, x15       // Calculate subroutine to jump to.
     br  x17
@@ -693,8 +730,7 @@
     bne .LisLong2
 
     cmp x15, # 8*12         // Skip this load if all registers full.
-    beq .LfillRegisters2
-
+    beq .Ladvance8_2
 
     add x17, x14, x15       // Calculate subroutine to jump to.
     br x17
@@ -704,18 +740,26 @@
     bne .LisOther2
 
     cmp x8, # 7*12          // Skip this load if all registers full.
-    beq .LfillRegisters2
+    beq .Ladvance8_2
 
     add x17, x12, x8        // Calculate subroutine to jump to.
     br x17
 
-
 .LisOther2:                 // Everything else takes one vReg.
     cmp x8, # 7*12          // Skip this load if all registers full.
-    beq .LfillRegisters2
+    beq .Ladvance4_2
+
     add x17, x11, x8        // Calculate subroutine to jump to.
     br x17
 
+.Ladvance4_2:
+    add x9, x9, #4
+    b .LfillRegisters2
+
+.Ladvance8_2:
+    add x9, x9, #8
+    b .LfillRegisters2
+
 // Store ints.
 .LstoreW1_2:
     LOADREG x8 4 w1 .LfillRegisters2
@@ -761,52 +805,11 @@
 
 .LcallFunction2:
 
-    // load method-> METHOD_QUICK_CODE_OFFSET.
-    ldr x9, [x0 , #METHOD_QUICK_CODE_OFFSET]
-    // Branch to method.
-    blr x9
+    INVOKE_STUB_CALL_AND_RETURN
 
-    // Restore return value address and shorty address.
-    ldp x4, x5, [xFP, #16]
-    .cfi_restore x4
-    .cfi_restore x5
-
-    // Store result (w0/x0/s0/d0) appropriately, depending on resultType.
-    ldrb w10, [x5]
-
-    // Don't set anything for a void type.
-    cmp w10, #'V'
-    beq .Lexit_art_quick_invoke_stub2
-
-    cmp w10, #'D'
-    bne .Lreturn_is_float2
-    str d0, [x4]
-    b .Lexit_art_quick_invoke_stub2
-
-.Lreturn_is_float2:
-    cmp w10, #'F'
-    bne .Lreturn_is_int2
-    str s0, [x4]
-    b .Lexit_art_quick_invoke_stub2
-
-    // Just store x0. Doesn't matter if it is 64 or 32 bits.
-.Lreturn_is_int2:
-    str x0, [x4]
-
-.Lexit_art_quick_invoke_stub2:
-
-    ldr x2, [xFP, #32]   // Restore stack pointer.
-    mov sp, x2
-    .cfi_restore sp
-
-    ldp xFP, xLR, [xFP]    // Restore old frame pointer and link register.
-    .cfi_restore x29
-    .cfi_restore x30
-
-    ret
 END art_quick_invoke_static_stub
 
-// UNIMPLEMENTED art_quick_do_long_jump
+
 
     /*
      * On entry x0 is uintptr_t* gprs_ and x1 is uint64_t* fprs_
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index ca904bc..2503918 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -56,15 +56,16 @@
   X29 = 29,
   X30 = 30,
   X31 = 31,
-  TR  = 18,     // ART Thread Register.
+  TR  = 18,     // ART Thread Register - needs to be one of the callee-saved regs.
+  TR1 = 19,     // FIXME!
   IP0 = 16,     // Used as scratch by VIXL.
   IP1 = 17,     // Used as scratch by ART JNI Assembler.
   FP  = 29,
   LR  = 30,
   SP  = 31,     // SP is X31 and overlaps with XRZ but we encode it as a
                 // special register, due to the different instruction semantics.
-  XZR = 32,     // FIXME This needs to be reconciled with the JNI assembler.
-  kNumberOfCoreRegisters = 32,
+  XZR = 32,
+  kNumberOfCoreRegisters = 33,
   kNoRegister = -1,
 };
 std::ostream& operator<<(std::ostream& os, const Register& rhs);
@@ -103,7 +104,6 @@
   W29 = 29,
   W30 = 30,
   W31 = 31,
-  WSP = 31,
   WZR = 31,
   kNumberOfWRegisters = 32,
   kNoWRegister = -1,
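
The enum change is about encodings versus identities: SP and XZR share machine encoding 31, and the instruction context decides which one register number 31 means. Giving XZR its own enumerator (32) is what pushes kNumberOfCoreRegisters to 33. An illustrative sketch:

    #include <cstdio>

    enum Register { X0 = 0, FP = 29, LR = 30, SP = 31, XZR = 32, kNumberOfCoreRegisters = 33 };

    // Illustrative helper: both SP and XZR encode as 31 in an instruction.
    static int Encoding(Register r) { return r == XZR ? 31 : static_cast<int>(r); }

    int main() {
      std::printf("SP encodes as %d, XZR encodes as %d\n", Encoding(SP), Encoding(XZR));
      return 0;
    }
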
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 4bde8b7..12460b9 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -43,7 +43,7 @@
 END_MACRO
 
 MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
-    addl MACRO_LITERAL(16), %esp  // Unwind stack up to return address
+    addl MACRO_LITERAL(16), %esp  // Unwind stack up to saved values
     CFI_ADJUST_CFA_OFFSET(-16)
     POP ebp  // Restore callee saves (ebx is saved/restored by the upcall)
     POP esi
@@ -532,12 +532,12 @@
     movl  %ecx, %eax                       // restore eax
     jmp  .Lretry_lock
 .Lalready_thin:
-    cmpw %ax, %dx                         // do we hold the lock already?
+    cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
-    addl LITERAL(65536), %eax             // increment recursion count
-    test LITERAL(0xC0000000), %eax        // overflowed if either of top two bits are set
+    addl LITERAL(65536), %ecx             // increment recursion count
+    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
     jne  .Lslow_lock                      // count overflowed so go slow
-    movl %eax, LOCK_WORD_OFFSET(%ecx)     // update lockword, cmpxchg not necessary as we hold lock
+    movl %ecx, LOCK_WORD_OFFSET(%eax)     // update lockword, cmpxchg not necessary as we hold lock
     ret
 .Lslow_lock:
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
@@ -561,8 +561,8 @@
     jz   .Lslow_unlock
     movl LOCK_WORD_OFFSET(%eax), %ecx     // ecx := lock word
     movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test %ecx, %ecx
-    jb   .Lslow_unlock                    // lock word contains a monitor
+    test LITERAL(0xC0000000), %ecx
+    jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     cmpl LITERAL(65536), %ecx
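
The locking fix is easier to follow against the thin-lock word layout implied by the constants above: owner thread id in the low 16 bits, recursion count starting at bit 16, and the top two bits reserved (a set top bit means the word holds a monitor or hash instead of a thin lock). A hedged C++ sketch of the reentrant-lock path, with that layout as an assumption:

    #include <cstdint>
    #include <cstdio>

    static bool ThinLockReenter(uint32_t* lock_word, uint16_t thread_id) {
      if (static_cast<uint16_t>(*lock_word) != thread_id) return false;  // cmpw %cx, %dx
      uint32_t bumped = *lock_word + 65536;          // increment recursion count
      if ((bumped & 0xC0000000) != 0) return false;  // count overflowed: take slow path
      *lock_word = bumped;  // no cmpxchg needed, we already hold the lock
      return true;
    }

    int main() {
      uint32_t lock_word = 42;  // thin lock held once by thread id 42
      bool ok = ThinLockReenter(&lock_word, 42);
      std::printf("reentered=%d word=0x%x\n", ok, lock_word);  // reentered=1 word=0x1002a
      return 0;
    }
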
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index d03a474..a9f69f5 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -137,4 +137,12 @@
     SIZE(\name, 0)
 END_MACRO
 
+MACRO0(UNREACHABLE)
+    int3
+END_MACRO
+
+MACRO0(UNTESTED)
+    int3
+END_MACRO
+
 #endif  // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_S_
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 17b8556..6509a9b 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -26,7 +26,7 @@
     // R10 := Runtime::Current()
     movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
     movq (%r10), %r10
-    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    // Save callee save registers to agree with core spills bitmap.
     PUSH r15  // Callee save.
     PUSH r14  // Callee save.
     PUSH r13  // Callee save.
@@ -35,7 +35,7 @@
     PUSH rbx  // Callee save.
     subq MACRO_LITERAL(8), %rsp  // Space for Method* (also aligns the frame).
     CFI_ADJUST_CFA_OFFSET(8)
-    // R10 := ArtMethod* for ref and args callee save frame method.
+    // R10 := ArtMethod* for save all callee save frame method.
     movq RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
@@ -46,13 +46,36 @@
      * Runtime::CreateCalleeSaveMethod(kRefsOnly)
      */
 MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME)
-    int3
-    int3
+    UNTESTED
+    // R10 := Runtime::Current()
+    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
+    movq (%r10), %r10
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r15  // Callee save.
+    PUSH r14  // Callee save.
+    PUSH r13  // Callee save.
+    PUSH r12  // Callee save.
+    PUSH rbp  // Callee save.
+    PUSH rbx  // Callee save.
+    subq MACRO_LITERAL(8), %rsp  // Space for Method* (also aligns the frame).
+    CFI_ADJUST_CFA_OFFSET(8)
+    // R10 := ArtMethod* for refs only callee save frame method.
+    movq RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    // Store ArtMethod* to bottom of stack.
+    movq %r10, 0(%rsp)
 END_MACRO
 
 MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
-    int3
-    int3
+    UNTESTED
+    addq MACRO_LITERAL(8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8)
+    // TODO: optimize by not restoring callee-saves restored by the ABI
+    POP rbx
+    POP rbp
+    POP r12
+    POP r13
+    POP r14
+    POP r15
 END_MACRO
 
     /*
@@ -130,13 +153,18 @@
     movq %gs:THREAD_SELF_OFFSET, %rdi
     movq %rsp, %rsi
     call PLT_SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*, SP)
-    int3                                     // unreached
+    UNREACHABLE
 END_MACRO
 
 MACRO2(NO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
+    // Outgoing argument set up
+    movq %rsp, %rsi                    // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)     // cxx_name(Thread*, SP)
+    UNREACHABLE
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
@@ -144,17 +172,22 @@
     DEFINE_FUNCTION VAR(c_name, 0)
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
     // Outgoing argument set up
-    mov %rsp, %rdx                    // pass SP
-    mov %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
+    movq %rsp, %rdx                    // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
     call PLT_VAR(cxx_name, 1)     // cxx_name(arg1, Thread*, SP)
-    int3                          // unreached
+    UNREACHABLE
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO2(TWO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
+    // Outgoing argument set up
+    movq %rsp, %rcx                    // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rdx  // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)     // cxx_name(arg1, arg2, Thread*, SP)
+    UNREACHABLE
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
@@ -314,7 +347,7 @@
     PUSH rbp                      // Save rbp.
     PUSH r8                       // Save r8/result*.
     PUSH r9                       // Save r9/shorty*.
-    mov %rsp, %rbp                // Copy value of stack pointer into base pointer.
+    movq %rsp, %rbp               // Copy value of stack pointer into base pointer.
     CFI_DEF_CFA_REGISTER(rbp)
     movl %edx, %r10d
     addl LITERAL(64), %edx        // Reserve space for return addr, method*, rbp, r8 and r9 in frame.
@@ -385,7 +418,7 @@
     PUSH rbp                      // Save rbp.
     PUSH r8                       // Save r8/result*.
     PUSH r9                       // Save r9/shorty*.
-    mov %rsp, %rbp                // Copy value of stack pointer into base pointer.
+    movq %rsp, %rbp               // Copy value of stack pointer into base pointer.
     CFI_DEF_CFA_REGISTER(rbp)
     movl %edx, %r10d
     addl LITERAL(64), %edx        // Reserve space for return addr, method*, rbp, r8 and r9 in frame.
@@ -429,43 +462,67 @@
 
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    // Outgoing argument set up
+    movq %rsp, %rsi                   // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rdi // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)         // cxx_name(Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    CALL_MACRO(return_macro, 2)       // return or deliver exception
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO3(ONE_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME   // save ref containing registers for GC
+    // Outgoing argument set up
+    movq %rsp, %rdx                    // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)          // cxx_name(arg0, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    CALL_MACRO(return_macro, 2)        // return or deliver exception
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO3(TWO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME   // save ref containing registers for GC
+    // Outgoing argument set up
+    movq %rsp, %rcx                    // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rdx  // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)          // cxx_name(arg0, arg1, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    CALL_MACRO(return_macro, 2)       // return or deliver exception
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO3(THREE_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    int3
-    int3
+    UNTESTED
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME   // save ref containing registers for GC
+    // Outgoing argument set up
+    movq %rsp, %r8                     // pass SP
+    movq %gs:THREAD_SELF_OFFSET, %rcx  // pass Thread::Current()
+    call PLT_VAR(cxx_name, 1)          // cxx_name(arg0, arg1, arg2, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    CALL_MACRO(return_macro, 2)        // return or deliver exception
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO0(RETURN_IF_RESULT_IS_NON_ZERO)
-    int3
-    testl %eax, %eax               // eax == 0 ?
-    jz  1f                         // if eax == 0 goto 1
+    UNTESTED
+    testq %rax, %rax               // rax == 0 ?
+    jz  1f                         // if rax == 0 goto 1
     ret                            // return
 1:                                 // deliver exception on current thread
     DELIVER_PENDING_EXCEPTION
 END_MACRO
 
 MACRO0(RETURN_IF_EAX_ZERO)
-    int3
+    UNTESTED
     testl %eax, %eax               // eax == 0 ?
     jnz  1f                        // if eax != 0 goto 1
     ret                            // return
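
All of the downcall macros funnel into one of two return conventions, checked on the value the C++ runtime function leaves in rax/eax. A C-level sketch of both (function names are illustrative):

    #include <cstdint>
    #include <cstdio>

    static void DeliverPendingException() { std::printf("deliver exception\n"); }

    // RETURN_IF_RESULT_IS_NON_ZERO: a non-zero result means success.
    static void ReturnIfResultIsNonZero(uint64_t rax) {
      if (rax != 0) return;        // ret
      DeliverPendingException();   // 1: deliver exception on current thread
    }

    // RETURN_IF_EAX_ZERO: a zero result means success.
    static void ReturnIfEaxZero(uint32_t eax) {
      if (eax == 0) return;        // ret
      DeliverPendingException();   // 1: deliver exception on current thread
    }

    int main() {
      ReturnIfResultIsNonZero(1);  // returns quietly
      ReturnIfEaxZero(7);          // prints "deliver exception"
      return 0;
    }
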
diff --git a/runtime/base/bit_field.h b/runtime/base/bit_field.h
new file mode 100644
index 0000000..e041bd0
--- /dev/null
+++ b/runtime/base/bit_field.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_BIT_FIELD_H_
+#define ART_RUNTIME_BASE_BIT_FIELD_H_
+
+#include "globals.h"
+#include "logging.h"
+
+namespace art {
+
+static const uword kUwordOne = 1U;
+
+// BitField is a template for encoding and decoding a bit field inside
+// an unsigned machine word.
+template<typename T, int position, int size>
+class BitField {
+ public:
+  // Tells whether the provided value fits into the bit field.
+  static bool IsValid(T value) {
+    return (static_cast<uword>(value) & ~((kUwordOne << size) - 1)) == 0;
+  }
+
+  // Returns a uword mask of the bit field.
+  static uword Mask() {
+    return (kUwordOne << size) - 1;
+  }
+
+  // Returns a uword mask of the bit field which can be applied directly to
+  // the raw unshifted bits.
+  static uword MaskInPlace() {
+    return ((kUwordOne << size) - 1) << position;
+  }
+
+  // Returns the shift count needed to right-shift the bit field to
+  // the least-significant bits.
+  static int Shift() {
+    return position;
+  }
+
+  // Returns the size of the bit field.
+  static int BitSize() {
+    return size;
+  }
+
+  // Returns a uword with the bit field value encoded.
+  static uword Encode(T value) {
+    DCHECK(IsValid(value));
+    return static_cast<uword>(value) << position;
+  }
+
+  // Extracts the bit field from the value.
+  static T Decode(uword value) {
+    return static_cast<T>((value >> position) & ((kUwordOne << size) - 1));
+  }
+
+  // Returns a uword with the bit field value encoded based on the
+  // original value. Only the bits corresponding to this bit field
+  // will be changed.
+  static uword Update(T value, uword original) {
+    DCHECK(IsValid(value));
+    return (static_cast<uword>(value) << position) |
+        (~MaskInPlace() & original);
+  }
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_BASE_BIT_FIELD_H_
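
A short usage sketch for the new template (uword comes from globals.h; a 64-bit word is assumed here, and the trimmed copy below exists only to keep the demo standalone):

    #include <cstdint>
    #include <cstdio>

    using uword = uint64_t;  // assumption: the machine word type from globals.h

    template<typename T, int position, int size>
    struct BitField {  // trimmed re-statement of the class above, for the demo
      static uword Encode(T value) { return static_cast<uword>(value) << position; }
      static T Decode(uword value) {
        return static_cast<T>((value >> position) & ((uword{1} << size) - 1));
      }
    };

    using Kind = BitField<int, 1, 8>;  // an 8-bit field starting at bit 1

    int main() {
      uword word = Kind::Encode(16);   // 16 << 1 == 32
      std::printf("word=%llu decoded=%d\n",
                  static_cast<unsigned long long>(word), Kind::Decode(word));
      return 0;
    }
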
diff --git a/runtime/base/bit_field_test.cc b/runtime/base/bit_field_test.cc
new file mode 100644
index 0000000..afeb2c4
--- /dev/null
+++ b/runtime/base/bit_field_test.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bit_field.h"
+#include "globals.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+TEST(BitFields, Test1) {
+  class TestBitFields : public BitField<int32_t, 1, 8> {};
+  ASSERT_TRUE(TestBitFields::IsValid(16));
+  ASSERT_TRUE(!TestBitFields::IsValid(256));
+  ASSERT_EQ(0x00ffU, TestBitFields::Mask());
+  ASSERT_EQ(0x001feU, TestBitFields::MaskInPlace());
+  ASSERT_EQ(1, TestBitFields::Shift());
+  ASSERT_EQ(8, TestBitFields::BitSize());
+  ASSERT_EQ(32U, TestBitFields::Encode(16));
+  ASSERT_EQ(16, TestBitFields::Decode(32));
+  ASSERT_EQ(2U, TestBitFields::Update(1, 16));
+}
+
+}  // namespace art
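
For the final assertion, the arithmetic is worth writing out. With BitField<int32_t, 1, 8>, MaskInPlace() is 0x1FE (bits 1 through 8), so

    Update(1, 16) = (1 << 1) | (16 & ~0x1FE) = 2 | 0 = 2

because bit 4 of the original word (16 = 0x10) lies inside the field and is replaced; only bits outside the field survive an Update().
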
diff --git a/runtime/base/logging.cc b/runtime/base/logging.cc
index 46b8ff2..730a2c2 100644
--- a/runtime/base/logging.cc
+++ b/runtime/base/logging.cc
@@ -26,6 +26,8 @@
 
 LogVerbosity gLogVerbosity;
 
+std::vector<std::string> gVerboseMethods;
+
 unsigned int gAborting = 0;
 
 static LogSeverity gMinimumLogSeverity = INFO;
diff --git a/runtime/base/logging.h b/runtime/base/logging.h
index fcec733..bd5ae85 100644
--- a/runtime/base/logging.h
+++ b/runtime/base/logging.h
@@ -22,6 +22,7 @@
 #include <iostream>  // NOLINT
 #include <sstream>
 #include <signal.h>
+#include <vector>
 #include "base/macros.h"
 #include "log_severity.h"
 #include "UniquePtr.h"
@@ -299,6 +300,8 @@
 
 extern LogVerbosity gLogVerbosity;
 
+extern std::vector<std::string> gVerboseMethods;
+
 // Used on fatal exit. Prevents recursive aborts. Allows us to disable
 // some error checking to ensure fatal shutdown makes forward progress.
 extern unsigned int gAborting;
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 4b881f6..b50c098 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -76,6 +76,7 @@
   kClassLinkerClassesLock,
   kBreakpointLock,
   kMonitorLock,
+  kMonitorListLock,
   kThreadListLock,
   kBreakpointInvokeLock,
   kDeoptimizationLock,
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 78b7cc0..9ca0b78 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -692,7 +692,7 @@
     while (true) {
       file_.reset(OS::OpenFileWithFlags(filename, O_CREAT | O_RDWR));
       if (file_.get() == NULL) {
-        *error_msg = StringPrintf("Failed to open file '%s'", filename);
+        *error_msg = StringPrintf("Failed to open file '%s': %s", filename, strerror(errno));
         return false;
       }
       int flock_result = TEMP_FAILURE_RETRY(flock(file_->Fd(), LOCK_EX));
@@ -741,51 +741,57 @@
 const DexFile* ClassLinker::FindOrCreateOatFileForDexLocation(const char* dex_location,
                                                               uint32_t dex_location_checksum,
                                                               const char* oat_location,
-                                                              std::string* error_msg) {
+                                                              std::vector<std::string>* error_msgs) {
   // We play a locking game here so that if two different processes
   // race to generate (or worse, one tries to open a partial generated
   // file) we will be okay. This is actually common with apps that use
   // DexClassLoader to work around the dex method reference limit and
   // that have a background service running in a separate process.
   ScopedFlock scoped_flock;
-  if (!scoped_flock.Init(oat_location, error_msg)) {
+  std::string error_msg;
+  if (!scoped_flock.Init(oat_location, &error_msg)) {
+    error_msgs->push_back(error_msg);
     return nullptr;
   }
 
   // Check if we already have an up-to-date output file
   const DexFile* dex_file = FindDexFileInOatLocation(dex_location, dex_location_checksum,
-                                                     oat_location, error_msg);
+                                                     oat_location, &error_msg);
   if (dex_file != nullptr) {
     return dex_file;
   }
-  VLOG(class_linker) << "Failed to find dex file '" << dex_location << "' in oat location '"
-      << oat_location << "': " << *error_msg;
-  error_msg->clear();
+  std::string compound_msg = StringPrintf("Failed to find dex file '%s' in oat location '%s': %s",
+                                          dex_location, oat_location, error_msg.c_str());
+  VLOG(class_linker) << compound_msg;
+  error_msgs->push_back(compound_msg);
 
   // Generate the output oat file for the dex file
   VLOG(class_linker) << "Generating oat file " << oat_location << " for " << dex_location;
-  if (!GenerateOatFile(dex_location, scoped_flock.GetFile().Fd(), oat_location, error_msg)) {
-    CHECK(!error_msg->empty());
+  if (!GenerateOatFile(dex_location, scoped_flock.GetFile().Fd(), oat_location, &error_msg)) {
+    CHECK(!error_msg.empty());
+    error_msgs->push_back(error_msg);
     return nullptr;
   }
   const OatFile* oat_file = OatFile::Open(oat_location, oat_location, NULL,
                                           !Runtime::Current()->IsCompiler(),
-                                          error_msg);
+                                          &error_msg);
   if (oat_file == nullptr) {
-    *error_msg = StringPrintf("Failed to open generated oat file '%s': %s",
-                              oat_location, error_msg->c_str());
+    compound_msg = StringPrintf("\nFailed to open generated oat file '%s': %s",
+                                oat_location, error_msg.c_str());
+    error_msgs->push_back(compound_msg);
     return nullptr;
   }
   oat_file = RegisterOatFile(oat_file);
   const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location,
                                                                     &dex_location_checksum);
   if (oat_dex_file == nullptr) {
-    *error_msg = StringPrintf("Failed to find dex file '%s' (checksum 0x%x) in generated out file "
-                              "'%s'", dex_location, dex_location_checksum, oat_location);
+    error_msg = StringPrintf("\nFailed to find dex file '%s' (checksum 0x%x) in generated oat file "
+                             "'%s'", dex_location, dex_location_checksum, oat_location);
+    error_msgs->push_back(error_msg);
     return nullptr;
   }
-  const DexFile* result = oat_dex_file->OpenDexFile(error_msg);
-  CHECK(result != nullptr) << *error_msg;
+  const DexFile* result = oat_dex_file->OpenDexFile(&error_msg);
+  CHECK(result != nullptr) << error_msgs << ", " << error_msg;
   CHECK_EQ(dex_location_checksum, result->GetLocationChecksum())
           << "dex_location=" << dex_location << " oat_location=" << oat_location << std::hex
           << " dex_location_checksum=" << dex_location_checksum
@@ -880,27 +886,34 @@
 
 const DexFile* ClassLinker::FindDexFileInOatFileFromDexLocation(const char* dex_location,
                                                                 const uint32_t* const dex_location_checksum,
-                                                                std::string* error_msg) {
+                                                                std::vector<std::string>* error_msgs) {
   const OatFile* open_oat_file = FindOpenedOatFileFromDexLocation(dex_location,
                                                                   dex_location_checksum);
   if (open_oat_file != nullptr) {
     const OatFile::OatDexFile* oat_dex_file = open_oat_file->GetOatDexFile(dex_location,
                                                                            dex_location_checksum);
-    return oat_dex_file->OpenDexFile(error_msg);
+    std::string error_msg;
+    const DexFile* ret = oat_dex_file->OpenDexFile(&error_msg);
+    if (ret == nullptr) {
+      error_msgs->push_back(error_msg);
+    }
+    return ret;
   }
 
   // Look for an existing file next to dex. for example, for
   // /foo/bar/baz.jar, look for /foo/bar/baz.odex.
   std::string odex_filename(OatFile::DexFilenameToOdexFilename(dex_location));
   bool open_failed;
+  std::string error_msg;
   const DexFile* dex_file = VerifyAndOpenDexFileFromOatFile(odex_filename, dex_location,
-                                                            error_msg, &open_failed);
+                                                            &error_msg, &open_failed);
   if (dex_file != nullptr) {
     return dex_file;
   }
   if (dex_location_checksum == nullptr) {
-    *error_msg = StringPrintf("Failed to open oat file from %s and no classes.dex found in %s: %s",
-                              odex_filename.c_str(), dex_location, error_msg->c_str());
+    error_msgs->push_back(StringPrintf("Failed to open oat file from %s and no classes.dex found in "
+                                       "%s: %s", odex_filename.c_str(), dex_location,
+                                       error_msg.c_str()));
     return nullptr;
   }
 
@@ -914,14 +927,15 @@
   if (!open_failed && TEMP_FAILURE_RETRY(unlink(cache_location.c_str())) != 0) {
     PLOG(FATAL) << "Failed to remove obsolete oat file from " << cache_location;
   }
-  VLOG(class_linker) << "Failed to open oat file from " << odex_filename
-      << " (error '" << *error_msg << "') or " << cache_location
-      << " (error '" << cache_error_msg << "').";
+  std::string compound_msg = StringPrintf("Failed to open oat file from %s (error '%s') or %s "
+                                          "(error '%s').", odex_filename.c_str(), error_msg.c_str(),
+                                          cache_location.c_str(), cache_error_msg.c_str());
+  VLOG(class_linker) << compound_msg;
+  error_msgs->push_back(compound_msg);
 
   // Try to generate oat file if it wasn't found or was obsolete.
-  error_msg->clear();
   return FindOrCreateOatFileForDexLocation(dex_location, *dex_location_checksum,
-                                           cache_location.c_str(), error_msg);
+                                           cache_location.c_str(), error_msgs);
 }
 
 const OatFile* ClassLinker::FindOpenedOatFileFromOatLocation(const std::string& oat_location) {
@@ -1620,9 +1634,10 @@
   if (method->IsProxyMethod()) {
     return GetPortableProxyInvokeHandler();
   }
-  const void* result = GetOatMethodFor(method).GetPortableCode();
+  const OatFile::OatMethod oat_method = GetOatMethodFor(method);
+  const void* result = oat_method.GetPortableCode();
   if (result == nullptr) {
-    if (GetOatMethodFor(method).GetQuickCode() == nullptr) {
+    if (oat_method.GetQuickCode() == nullptr) {
       // No code? You must mean to go into the interpreter.
       result = GetPortableToInterpreterBridge();
     } else {
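
The error_msgs plumbing changes the reporting contract: instead of each fallback step clobbering a single string, every failure along the chain is appended, so the caller can surface all of them. A minimal sketch of that shape (the messages are invented for the demo):

    #include <cstdio>
    #include <string>
    #include <vector>

    static const void* FindDexFile(std::vector<std::string>* error_msgs) {
      error_msgs->push_back("Failed to find dex file in oat location: stale checksum");
      error_msgs->push_back("Failed to open generated oat file: disk full");
      return nullptr;  // every attempt failed; the vector explains why
    }

    int main() {
      std::vector<std::string> error_msgs;
      if (FindDexFile(&error_msgs) == nullptr) {
        for (const std::string& msg : error_msgs) std::printf("%s\n", msg.c_str());
      }
      return 0;
    }
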
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 701e62e..d684ad5 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -273,14 +273,14 @@
   const DexFile* FindOrCreateOatFileForDexLocation(const char* dex_location,
                                                    uint32_t dex_location_checksum,
                                                    const char* oat_location,
-                                                   std::string* error_msg)
+                                                   std::vector<std::string>* error_msgs)
       LOCKS_EXCLUDED(dex_lock_, Locks::mutator_lock_);
   // Find a DexFile within an OatFile given a DexFile location. Note
   // that this returns null if the location checksum of the DexFile
   // does not match the OatFile.
   const DexFile* FindDexFileInOatFileFromDexLocation(const char* location,
                                                      const uint32_t* const location_checksum,
-                                                     std::string* error_msg)
+                                                     std::vector<std::string>* error_msgs)
       LOCKS_EXCLUDED(dex_lock_, Locks::mutator_lock_);
 
 
diff --git a/runtime/common_throws.cc b/runtime/common_throws.cc
index 4b6d82b..315f274 100644
--- a/runtime/common_throws.cc
+++ b/runtime/common_throws.cc
@@ -66,6 +66,28 @@
   }
 }
 
+static void ThrowWrappedException(const ThrowLocation* throw_location,
+                                  const char* exception_descriptor,
+                                  mirror::Class* referrer, const char* fmt, va_list* args = NULL)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  std::ostringstream msg;
+  if (args != NULL) {
+    std::string vmsg;
+    StringAppendV(&vmsg, fmt, *args);
+    msg << vmsg;
+  } else {
+    msg << fmt;
+  }
+  AddReferrerLocation(msg, referrer);
+  Thread* self = Thread::Current();
+  if (throw_location == NULL) {
+    ThrowLocation computed_throw_location = self->GetCurrentLocationForThrow();
+    self->ThrowNewWrappedException(computed_throw_location, exception_descriptor, msg.str().c_str());
+  } else {
+    self->ThrowNewWrappedException(*throw_location, exception_descriptor, msg.str().c_str());
+  }
+}
+
 // AbstractMethodError
 
 void ThrowAbstractMethodError(mirror::ArtMethod* method) {
@@ -243,6 +265,13 @@
   va_end(args);
 }
 
+void ThrowWrappedIOException(const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  ThrowWrappedException(NULL, "Ljava/io/IOException;", NULL, fmt, &args);
+  va_end(args);
+}
+
 // LinkageError
 
 void ThrowLinkageError(mirror::Class* referrer, const char* fmt, ...) {
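
ThrowWrappedIOException above is a thin printf-style wrapper over ThrowWrappedException, which raises a new exception whose cause is the one currently pending. A self-contained sketch of the va_list forwarding it uses, with fprintf standing in for the actual throw machinery:

    #include <cstdarg>
    #include <cstdio>

    // Shared helper: formats the message, then hands it off (here to fprintf,
    // standing in for ThrowNewWrappedException).
    static void ThrowV(const char* descriptor, const char* fmt, va_list* args) {
      char buf[256];
      vsnprintf(buf, sizeof(buf), fmt, *args);
      std::fprintf(stderr, "throwing %s: %s\n", descriptor, buf);
    }

    // Public printf-style entry point, mirroring ThrowWrappedIOException.
    void ThrowWrappedIOExceptionDemo(const char* fmt, ...) {
      va_list args;
      va_start(args, fmt);
      ThrowV("Ljava/io/IOException;", fmt, &args);
      va_end(args);
    }
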
diff --git a/runtime/common_throws.h b/runtime/common_throws.h
index c06763e..ebedae0 100644
--- a/runtime/common_throws.h
+++ b/runtime/common_throws.h
@@ -126,6 +126,9 @@
 void ThrowIOException(const char* fmt, ...) __attribute__((__format__(__printf__, 1, 2)))
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) COLD_ATTR;
 
+void ThrowWrappedIOException(const char* fmt, ...) __attribute__((__format__(__printf__, 1, 2)))
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) COLD_ATTR;
+
 // LinkageError
 
 void ThrowLinkageError(mirror::Class* referrer, const char* fmt, ...)
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 2872a02..9012f00 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -126,14 +126,14 @@
   return os;
 }
 
-class DebugInstrumentationListener : public instrumentation::InstrumentationListener {
+class DebugInstrumentationListener FINAL : public instrumentation::InstrumentationListener {
  public:
   DebugInstrumentationListener() {}
   virtual ~DebugInstrumentationListener() {}
 
-  virtual void MethodEntered(Thread* thread, mirror::Object* this_object,
-                             mirror::ArtMethod* method, uint32_t dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void MethodEntered(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                     uint32_t dex_pc)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (method->IsNative()) {
       // TODO: posting location events is a suspension point and native method entry stubs aren't.
       return;
@@ -141,10 +141,9 @@
     Dbg::PostLocationEvent(method, 0, this_object, Dbg::kMethodEntry, nullptr);
   }
 
-  virtual void MethodExited(Thread* thread, mirror::Object* this_object,
-                            mirror::ArtMethod* method,
-                            uint32_t dex_pc, const JValue& return_value)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void MethodExited(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                    uint32_t dex_pc, const JValue& return_value)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (method->IsNative()) {
       // TODO: posting location events is a suspension point and native method entry stubs aren't.
       return;
@@ -152,26 +151,41 @@
     Dbg::PostLocationEvent(method, dex_pc, this_object, Dbg::kMethodExit, &return_value);
   }
 
-  virtual void MethodUnwind(Thread* thread, mirror::Object* this_object,
-                            mirror::ArtMethod* method, uint32_t dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void MethodUnwind(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                    uint32_t dex_pc)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // We're not registered to listen to this kind of event, so complain.
     LOG(ERROR) << "Unexpected method unwind event in debugger " << PrettyMethod(method)
                << " " << dex_pc;
   }
 
-  virtual void DexPcMoved(Thread* thread, mirror::Object* this_object,
-                          mirror::ArtMethod* method, uint32_t new_dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void DexPcMoved(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                  uint32_t new_dex_pc)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     Dbg::UpdateDebugger(thread, this_object, method, new_dex_pc);
   }
 
-  virtual void ExceptionCaught(Thread* thread, const ThrowLocation& throw_location,
-                               mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
-                               mirror::Throwable* exception_object)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    Dbg::PostException(thread, throw_location, catch_method, catch_dex_pc, exception_object);
+  void FieldRead(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                 uint32_t dex_pc, mirror::ArtField* field)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    Dbg::PostFieldAccessEvent(method, dex_pc, this_object, field);
   }
+
+  void FieldWritten(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                    uint32_t dex_pc, mirror::ArtField* field, const JValue& field_value)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    Dbg::PostFieldModificationEvent(method, dex_pc, this_object, field, &field_value);
+  }
+
+  void ExceptionCaught(Thread* thread, const ThrowLocation& throw_location,
+                       mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
+                       mirror::Throwable* exception_object)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    Dbg::PostException(throw_location, catch_method, catch_dex_pc, exception_object);
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(DebugInstrumentationListener);
 } gDebugInstrumentationListener;
 
 // JDWP is allowed unless the Zygote forbids it.
@@ -211,6 +225,7 @@
 Mutex* Dbg::deoptimization_lock_ = nullptr;
 std::vector<DeoptimizationRequest> Dbg::deoptimization_requests_;
 size_t Dbg::full_deoptimization_event_count_ = 0;
+size_t Dbg::delayed_full_undeoptimization_count_ = 0;
 
 // Breakpoints.
 static std::vector<Breakpoint> gBreakpoints GUARDED_BY(Locks::breakpoint_lock_);
@@ -231,6 +246,14 @@
   }
 }
 
+void DebugInvokeReq::Clear() {
+  invoke_needed = false;
+  receiver = nullptr;
+  thread = nullptr;
+  klass = nullptr;
+  method = nullptr;
+}
+
 void SingleStepControl::VisitRoots(RootCallback* callback, void* arg, uint32_t tid,
                                    RootType root_type) {
   if (method != nullptr) {
@@ -238,6 +261,16 @@
   }
 }
 
+bool SingleStepControl::ContainsDexPc(uint32_t dex_pc) const {
+  return dex_pcs.find(dex_pc) != dex_pcs.end();
+}
+
+void SingleStepControl::Clear() {
+  is_active = false;
+  method = nullptr;
+  dex_pcs.clear();
+}
+
 void DeoptimizationRequest::VisitRoots(RootCallback* callback, void* arg) {
   if (method != nullptr) {
     callback(reinterpret_cast<mirror::Object**>(&method), arg, 0, kRootDebugger);
@@ -607,6 +640,14 @@
   return gDisposed;
 }
 
+// All the instrumentation events the debugger is registered for.
+static constexpr uint32_t kListenerEvents = instrumentation::Instrumentation::kMethodEntered |
+                                            instrumentation::Instrumentation::kMethodExited |
+                                            instrumentation::Instrumentation::kDexPcMoved |
+                                            instrumentation::Instrumentation::kFieldRead |
+                                            instrumentation::Instrumentation::kFieldWritten |
+                                            instrumentation::Instrumentation::kExceptionCaught;
+
 void Dbg::GoActive() {
   // Enable all debugging features, including scans for breakpoints.
   // This is a no-op if we're already active.
@@ -625,6 +666,7 @@
     MutexLock mu(Thread::Current(), *deoptimization_lock_);
     CHECK_EQ(deoptimization_requests_.size(), 0U);
     CHECK_EQ(full_deoptimization_event_count_, 0U);
+    CHECK_EQ(delayed_full_undeoptimization_count_, 0U);
   }
 
   Runtime* runtime = Runtime::Current();
@@ -633,11 +675,7 @@
   ThreadState old_state = self->SetStateUnsafe(kRunnable);
   CHECK_NE(old_state, kRunnable);
   runtime->GetInstrumentation()->EnableDeoptimization();
-  runtime->GetInstrumentation()->AddListener(&gDebugInstrumentationListener,
-                                             instrumentation::Instrumentation::kMethodEntered |
-                                             instrumentation::Instrumentation::kMethodExited |
-                                             instrumentation::Instrumentation::kDexPcMoved |
-                                             instrumentation::Instrumentation::kExceptionCaught);
+  runtime->GetInstrumentation()->AddListener(&gDebugInstrumentationListener, kListenerEvents);
   gDebuggerActive = true;
   CHECK_EQ(self->SetStateUnsafe(old_state), kRunnable);
   runtime->GetThreadList()->ResumeAll();
@@ -667,12 +705,9 @@
       MutexLock mu(Thread::Current(), *deoptimization_lock_);
       deoptimization_requests_.clear();
       full_deoptimization_event_count_ = 0U;
+      delayed_full_undeoptimization_count_ = 0U;
     }
-    runtime->GetInstrumentation()->RemoveListener(&gDebugInstrumentationListener,
-                                                  instrumentation::Instrumentation::kMethodEntered |
-                                                  instrumentation::Instrumentation::kMethodExited |
-                                                  instrumentation::Instrumentation::kDexPcMoved |
-                                                  instrumentation::Instrumentation::kExceptionCaught);
+    runtime->GetInstrumentation()->RemoveListener(&gDebugInstrumentationListener, kListenerEvents);
     runtime->GetInstrumentation()->DisableDeoptimization();
     gDebuggerActive = false;
   }
@@ -1217,7 +1252,8 @@
     LOG(WARNING) << __FUNCTION__ << " access out of bounds: offset=" << offset << "; count=" << count;
     return JDWP::ERR_INVALID_LENGTH;
   }
-  const char* descriptor = ClassHelper(dst->GetClass()).GetDescriptor();
+  ClassHelper ch(dst->GetClass());
+  const char* descriptor = ch.GetDescriptor();
   JDWP::JdwpTag tag = BasicTagFromDescriptor(descriptor + 1);
 
   if (IsPrimitiveTag(tag)) {
@@ -1571,6 +1607,13 @@
   OutputJValue(tag, return_value, pReply);
 }
 
+void Dbg::OutputFieldValue(JDWP::FieldId field_id, const JValue* field_value,
+                           JDWP::ExpandBuf* pReply) {
+  mirror::ArtField* f = FromFieldId(field_id);
+  JDWP::JdwpTag tag = BasicTagFromDescriptor(FieldHelper(f).GetTypeDescriptor());
+  OutputJValue(tag, field_value, pReply);
+}
+
 JDWP::JdwpError Dbg::GetBytecodes(JDWP::RefTypeId, JDWP::MethodId method_id,
                                   std::vector<uint8_t>& bytecodes)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -2443,21 +2486,70 @@
   return visitor.error_;
 }
 
+JDWP::ObjectId Dbg::GetThisObjectIdForEvent(mirror::Object* this_object) {
+  // If 'this_object' isn't already in the registry, we know that we're not looking for it, so
+  // there's no point adding it to the registry and burning through ids.
+  // When registering an event request with an instance filter, we've been given an existing object
+  // id so it must already be present in the registry when the event fires.
+  JDWP::ObjectId this_id = 0;
+  if (this_object != nullptr && gRegistry->Contains(this_object)) {
+    this_id = gRegistry->Add(this_object);
+  }
+  return this_id;
+}
+
 void Dbg::PostLocationEvent(mirror::ArtMethod* m, int dex_pc, mirror::Object* this_object,
                             int event_flags, const JValue* return_value) {
+  if (!IsDebuggerActive()) {
+    return;
+  }
+  DCHECK(m != nullptr);
+  DCHECK_EQ(m->IsStatic(), this_object == nullptr);
   JDWP::JdwpLocation location;
   SetLocation(location, m, dex_pc);
 
-  // If 'this_object' isn't already in the registry, we know that we're not looking for it,
-  // so there's no point adding it to the registry and burning through ids.
-  JDWP::ObjectId this_id = 0;
-  if (gRegistry->Contains(this_object)) {
-    this_id = gRegistry->Add(this_object);
-  }
+  // We need 'this' for InstanceOnly filters only.
+  JDWP::ObjectId this_id = GetThisObjectIdForEvent(this_object);
   gJdwpState->PostLocationEvent(&location, this_id, event_flags, return_value);
 }
 
-void Dbg::PostException(Thread* thread, const ThrowLocation& throw_location,
+void Dbg::PostFieldAccessEvent(mirror::ArtMethod* m, int dex_pc,
+                               mirror::Object* this_object, mirror::ArtField* f) {
+  if (!IsDebuggerActive()) {
+    return;
+  }
+  DCHECK(m != nullptr);
+  DCHECK(f != nullptr);
+  JDWP::JdwpLocation location;
+  SetLocation(location, m, dex_pc);
+
+  JDWP::RefTypeId type_id = gRegistry->AddRefType(f->GetDeclaringClass());
+  JDWP::FieldId field_id = ToFieldId(f);
+  JDWP::ObjectId this_id = gRegistry->Add(this_object);
+
+  gJdwpState->PostFieldEvent(&location, type_id, field_id, this_id, nullptr, false);
+}
+
+void Dbg::PostFieldModificationEvent(mirror::ArtMethod* m, int dex_pc,
+                                     mirror::Object* this_object, mirror::ArtField* f,
+                                     const JValue* field_value) {
+  if (!IsDebuggerActive()) {
+    return;
+  }
+  DCHECK(m != nullptr);
+  DCHECK(f != nullptr);
+  DCHECK(field_value != nullptr);
+  JDWP::JdwpLocation location;
+  SetLocation(location, m, dex_pc);
+
+  JDWP::RefTypeId type_id = gRegistry->AddRefType(f->GetDeclaringClass());
+  JDWP::FieldId field_id = ToFieldId(f);
+  JDWP::ObjectId this_id = gRegistry->Add(this_object);
+
+  gJdwpState->PostFieldEvent(&location, type_id, field_id, this_id, field_value, true);
+}
+
+void Dbg::PostException(const ThrowLocation& throw_location,
                         mirror::ArtMethod* catch_method,
                         uint32_t catch_dex_pc, mirror::Throwable* exception_object) {
   if (!IsDebuggerActive()) {
@@ -2469,8 +2561,8 @@
   JDWP::JdwpLocation catch_location;
   SetLocation(catch_location, catch_method, catch_dex_pc);
 
-  // We need 'this' for InstanceOnly filters.
-  JDWP::ObjectId this_id = gRegistry->Add(throw_location.GetThis());
+  // We need 'this' for InstanceOnly filters only.
+  JDWP::ObjectId this_id = GetThisObjectIdForEvent(throw_location.GetThis());
   JDWP::ObjectId exception_id = gRegistry->Add(exception_object);
   JDWP::RefTypeId exception_class_id = gRegistry->AddRefType(exception_object->GetClass());
 
@@ -2520,7 +2612,7 @@
       } else if (single_step_control->step_size == JDWP::SS_MIN) {
         event_flags |= kSingleStep;
         VLOG(jdwp) << "SS new instruction";
-      } else if (single_step_control->dex_pcs.find(dex_pc) == single_step_control->dex_pcs.end()) {
+      } else if (!single_step_control->ContainsDexPc(dex_pc)) {
         event_flags |= kSingleStep;
         VLOG(jdwp) << "SS new line";
       }
@@ -2542,7 +2634,7 @@
         if (single_step_control->step_size == JDWP::SS_MIN) {
           event_flags |= kSingleStep;
           VLOG(jdwp) << "SS new instruction";
-        } else if (single_step_control->dex_pcs.find(dex_pc) == single_step_control->dex_pcs.end()) {
+        } else if (!single_step_control->ContainsDexPc(dex_pc)) {
           event_flags |= kSingleStep;
           VLOG(jdwp) << "SS new line";
         }
@@ -2579,20 +2671,24 @@
       LOG(WARNING) << "Ignoring empty deoptimization request.";
       break;
     case DeoptimizationRequest::kFullDeoptimization:
-      VLOG(jdwp) << "Deoptimize the world";
+      VLOG(jdwp) << "Deoptimize the world ...";
       instrumentation->DeoptimizeEverything();
+      VLOG(jdwp) << "Deoptimize the world DONE";
       break;
     case DeoptimizationRequest::kFullUndeoptimization:
-      VLOG(jdwp) << "Undeoptimize the world";
+      VLOG(jdwp) << "Undeoptimize the world ...";
       instrumentation->UndeoptimizeEverything();
+      VLOG(jdwp) << "Undeoptimize the world DONE";
       break;
     case DeoptimizationRequest::kSelectiveDeoptimization:
-      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method);
+      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " ...";
       instrumentation->Deoptimize(request.method);
+      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " DONE";
       break;
     case DeoptimizationRequest::kSelectiveUndeoptimization:
-      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method);
+      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " ...";
       instrumentation->Undeoptimize(request.method);
+      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " DONE";
       break;
     default:
       LOG(FATAL) << "Unsupported deoptimization request kind " << request.kind;
@@ -2600,17 +2696,43 @@
   }
 }
 
+void Dbg::DelayFullUndeoptimization() {
+  MutexLock mu(Thread::Current(), *deoptimization_lock_);
+  ++delayed_full_undeoptimization_count_;
+  DCHECK_LE(delayed_full_undeoptimization_count_, full_deoptimization_event_count_);
+}
+
+void Dbg::ProcessDelayedFullUndeoptimizations() {
+  // TODO: avoid taking the lock twice (once here and once in ManageDeoptimization).
+  {
+    MutexLock mu(Thread::Current(), *deoptimization_lock_);
+    while (delayed_full_undeoptimization_count_ > 0) {
+      DeoptimizationRequest req;
+      req.kind = DeoptimizationRequest::kFullUndeoptimization;
+      req.method = nullptr;
+      RequestDeoptimizationLocked(req);
+      --delayed_full_undeoptimization_count_;
+    }
+  }
+  ManageDeoptimization();
+}
+
 void Dbg::RequestDeoptimization(const DeoptimizationRequest& req) {
   if (req.kind == DeoptimizationRequest::kNothing) {
     // Nothing to do.
     return;
   }
   MutexLock mu(Thread::Current(), *deoptimization_lock_);
+  RequestDeoptimizationLocked(req);
+}
+
+void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) {
   switch (req.kind) {
     case DeoptimizationRequest::kFullDeoptimization: {
       DCHECK(req.method == nullptr);
       if (full_deoptimization_event_count_ == 0) {
-        VLOG(jdwp) << "Request full deoptimization";
+        VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
+                   << " for full deoptimization";
         deoptimization_requests_.push_back(req);
       }
       ++full_deoptimization_event_count_;
@@ -2621,20 +2743,23 @@
       DCHECK_GT(full_deoptimization_event_count_, 0U);
       --full_deoptimization_event_count_;
       if (full_deoptimization_event_count_ == 0) {
-        VLOG(jdwp) << "Request full undeoptimization";
+        VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
+                   << " for full undeoptimization";
         deoptimization_requests_.push_back(req);
       }
       break;
     }
     case DeoptimizationRequest::kSelectiveDeoptimization: {
       DCHECK(req.method != nullptr);
-      VLOG(jdwp) << "Request deoptimization of " << PrettyMethod(req.method);
+      VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
+                 << " for deoptimization of " << PrettyMethod(req.method);
       deoptimization_requests_.push_back(req);
       break;
     }
     case DeoptimizationRequest::kSelectiveUndeoptimization: {
       DCHECK(req.method != nullptr);
-      VLOG(jdwp) << "Request undeoptimization of " << PrettyMethod(req.method);
+      VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
+                 << " for undeoptimization of " << PrettyMethod(req.method);
       deoptimization_requests_.push_back(req);
       break;
     }
@@ -2662,7 +2787,9 @@
   const ThreadState old_state = self->SetStateUnsafe(kRunnable);
   {
     MutexLock mu(self, *deoptimization_lock_);
+    size_t req_index = 0;
     for (const DeoptimizationRequest& request : deoptimization_requests_) {
+      VLOG(jdwp) << "Process deoptimization request #" << req_index++;
       ProcessDeoptimizationRequest(request);
     }
     deoptimization_requests_.clear();
@@ -2909,8 +3036,9 @@
   //
 
   struct DebugCallbackContext {
-    explicit DebugCallbackContext(SingleStepControl* single_step_control, int32_t line_number)
-      : single_step_control_(single_step_control), line_number_(line_number),
+    explicit DebugCallbackContext(SingleStepControl* single_step_control, int32_t line_number,
+                                  const DexFile::CodeItem* code_item)
+      : single_step_control_(single_step_control), line_number_(line_number), code_item_(code_item),
         last_pc_valid(false), last_pc(0) {
     }
 
@@ -2937,7 +3065,7 @@
     ~DebugCallbackContext() {
       // If the line number was the last in the position table...
       if (last_pc_valid) {
-        size_t end = MethodHelper(single_step_control_->method).GetCodeItem()->insns_size_in_code_units_;
+        size_t end = code_item_->insns_size_in_code_units_;
         for (uint32_t dex_pc = last_pc; dex_pc < end; ++dex_pc) {
           single_step_control_->dex_pcs.insert(dex_pc);
         }
@@ -2946,15 +3074,17 @@
 
     SingleStepControl* const single_step_control_;
     const int32_t line_number_;
+    const DexFile::CodeItem* const code_item_;
     bool last_pc_valid;
     uint32_t last_pc;
   };
   single_step_control->dex_pcs.clear();
   mirror::ArtMethod* m = single_step_control->method;
   if (!m->IsNative()) {
-    DebugCallbackContext context(single_step_control, line_number);
     MethodHelper mh(m);
-    mh.GetDexFile().DecodeDebugInfo(mh.GetCodeItem(), m->IsStatic(), m->GetDexMethodIndex(),
+    const DexFile::CodeItem* const code_item = mh.GetCodeItem();
+    DebugCallbackContext context(single_step_control, line_number, code_item);
+    mh.GetDexFile().DecodeDebugInfo(code_item, m->IsStatic(), m->GetDexMethodIndex(),
                                     DebugCallbackContext::Callback, NULL, &context);
   }
 
@@ -2974,8 +3104,8 @@
     VLOG(jdwp) << "Single-step current line: " << line_number;
     VLOG(jdwp) << "Single-step current stack depth: " << single_step_control->stack_depth;
     VLOG(jdwp) << "Single-step dex_pc values:";
-    for (std::set<uint32_t>::iterator it = single_step_control->dex_pcs.begin(); it != single_step_control->dex_pcs.end(); ++it) {
-      VLOG(jdwp) << StringPrintf(" %#x", *it);
+    for (uint32_t dex_pc : single_step_control->dex_pcs) {
+      VLOG(jdwp) << StringPrintf(" %#x", dex_pc);
     }
   }
 
@@ -2990,8 +3120,7 @@
   if (error == JDWP::ERR_NONE) {
     SingleStepControl* single_step_control = thread->GetSingleStepControl();
     DCHECK(single_step_control != nullptr);
-    single_step_control->is_active = false;
-    single_step_control->dex_pcs.clear();
+    single_step_control->Clear();
   }
 }
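
With the ContainsDexPc helper, the single-step "new line" test above reads directly: the DebugCallbackContext records every dex pc belonging to the source line being stepped over, and a kSingleStep event fires once execution reaches a pc outside that set. A reduced sketch with a toy StepControl (not the real SingleStepControl):

    #include <cstdint>
    #include <set>

    struct StepControl {
      std::set<uint32_t> dex_pcs;  // pcs of the source line being stepped over
      bool ContainsDexPc(uint32_t dex_pc) const {
        return dex_pcs.find(dex_pc) != dex_pcs.end();
      }
    };

    // For line-granularity stepping, a step event fires once the pc leaves the line.
    bool IsNewLine(const StepControl& ssc, uint32_t dex_pc) {
      return !ssc.ContainsDexPc(dex_pc);
    }
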
 
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 23c9c6a..bef708c 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -35,6 +35,7 @@
 
 namespace art {
 namespace mirror {
+class ArtField;
 class ArtMethod;
 class Class;
 class Object;
@@ -85,6 +86,8 @@
   void VisitRoots(RootCallback* callback, void* arg, uint32_t tid, RootType root_type)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void Clear();
+
  private:
   DISALLOW_COPY_AND_ASSIGN(DebugInvokeReq);
 };
@@ -118,6 +121,10 @@
   void VisitRoots(RootCallback* callback, void* arg, uint32_t tid, RootType root_type)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  bool ContainsDexPc(uint32_t dex_pc) const;
+
+  void Clear();
+
  private:
   DISALLOW_COPY_AND_ASSIGN(SingleStepControl);
 };
@@ -297,6 +304,9 @@
   static void OutputMethodReturnValue(JDWP::MethodId method_id, const JValue* return_value,
                                       JDWP::ExpandBuf* pReply)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void OutputFieldValue(JDWP::FieldId field_id, const JValue* field_value,
+                               JDWP::ExpandBuf* pReply)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static JDWP::JdwpError GetBytecodes(JDWP::RefTypeId class_id, JDWP::MethodId method_id,
                                       std::vector<uint8_t>& bytecodes)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -411,8 +421,14 @@
                                 mirror::Object* thisPtr, int eventFlags,
                                 const JValue* return_value)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void PostException(Thread* thread, const ThrowLocation& throw_location,
-                            mirror::ArtMethod* catch_method,
+  static void PostFieldAccessEvent(mirror::ArtMethod* m, int dex_pc, mirror::Object* this_object,
+                                   mirror::ArtField* f)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void PostFieldModificationEvent(mirror::ArtMethod* m, int dex_pc,
+                                         mirror::Object* this_object, mirror::ArtField* f,
+                                         const JValue* field_value)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void PostException(const ThrowLocation& throw_location, mirror::ArtMethod* catch_method,
                             uint32_t catch_dex_pc, mirror::Throwable* exception)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void PostThreadStart(Thread* t)
@@ -432,6 +448,13 @@
       LOCKS_EXCLUDED(deoptimization_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Support delayed full undeoptimization requests. This is currently only used for single-step
+  // events.
+  static void DelayFullUndeoptimization() LOCKS_EXCLUDED(deoptimization_lock_);
+  static void ProcessDelayedFullUndeoptimizations()
+      LOCKS_EXCLUDED(deoptimization_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Manage deoptimization after updating JDWP events list. Suspends all threads, processes each
   // request and finally resumes all threads.
   static void ManageDeoptimization()
@@ -538,9 +561,16 @@
   static void PostThreadStartOrStop(Thread*, uint32_t)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  static JDWP::ObjectId GetThisObjectIdForEvent(mirror::Object* this_object)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static void ProcessDeoptimizationRequest(const DeoptimizationRequest& request)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  static void RequestDeoptimizationLocked(const DeoptimizationRequest& req)
+      EXCLUSIVE_LOCKS_REQUIRED(deoptimization_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static Mutex* alloc_tracker_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
 
   static AllocRecord* recent_allocation_records_ PT_GUARDED_BY(alloc_tracker_lock_);
@@ -562,6 +592,10 @@
   // undeoptimize when the last event is unregistered (when the counter is set to 0).
   static size_t full_deoptimization_event_count_ GUARDED_BY(deoptimization_lock_);
 
+  // Count the number of full undeoptimization requests delayed until the next resume or the
+  // end of the debug session.
+  static size_t delayed_full_undeoptimization_count_ GUARDED_BY(deoptimization_lock_);
+
   DISALLOW_COPY_AND_ASSIGN(Dbg);
 };
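
The delayed_full_undeoptimization_count_ machinery declared above can be pictured with this simplified model (no locking, simplified types): a single-step end event cannot undeoptimize immediately, so it bumps a counter that is converted into queued full-undeoptimization requests on the next resume.

    #include <cstddef>
    #include <vector>

    enum class Kind { kFullUndeoptimization };
    struct Request { Kind kind; };

    struct DeoptState {
      size_t delayed_full_undeoptimization_count = 0;
      std::vector<Request> deoptimization_requests;

      // Event side: called instead of queueing an immediate undeoptimization.
      void DelayFullUndeoptimization() { ++delayed_full_undeoptimization_count; }

      // Resume side: replay every delayed count as a real queued request.
      void ProcessDelayedFullUndeoptimizations() {
        while (delayed_full_undeoptimization_count > 0) {
          deoptimization_requests.push_back(Request{Kind::kFullUndeoptimization});
          --delayed_full_undeoptimization_count;
        }
        // ...the queue is then drained with all threads suspended.
      }
    };
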
 
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 0c8a4f0..01ca60f 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -22,6 +22,7 @@
 #include "base/logging.h"
 #include "base/stl_util.h"
 #include "utils.h"
+#include "instruction_set.h"
 
 namespace art {
 
@@ -773,6 +774,40 @@
 
 bool ElfFile::Load(bool executable, std::string* error_msg) {
   CHECK(program_header_only_) << file_->GetPath();
+
+  if (executable) {
+    InstructionSet elf_ISA = kNone;
+    switch (GetHeader().e_machine) {
+      case EM_ARM: {
+        elf_ISA = kArm;
+        break;
+      }
+      case EM_AARCH64: {
+        elf_ISA = kArm64;
+        break;
+      }
+      case EM_386: {
+        elf_ISA = kX86;
+        break;
+      }
+      case EM_X86_64: {
+        elf_ISA = kX86_64;
+        break;
+      }
+      case EM_MIPS: {
+        elf_ISA = kMips;
+        break;
+      }
+    }
+
+    if (elf_ISA != kRuntimeISA) {
+      std::ostringstream oss;
+      oss << "Expected ISA " << kRuntimeISA << " but found " << elf_ISA;
+      *error_msg = oss.str();
+      return false;
+    }
+  }
+
   for (Elf32_Word i = 0; i < GetProgramHeaderNum(); i++) {
     Elf32_Phdr& program_header = GetProgramHeader(i);
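
The ISA guard added to ElfFile::Load boils down to mapping the ELF header's e_machine field onto the runtime's InstructionSet and refusing to load executable code for a foreign architecture (say, an x86 oat file on an ARM device). Reduced sketch, with a local InstructionSet enum and the EM_* constants from <elf.h>:

    #include <elf.h>
    #include <cstdint>

    enum InstructionSet { kNone, kArm, kArm64, kX86, kX86_64, kMips };

    InstructionSet FromEMachine(uint16_t e_machine) {
      switch (e_machine) {
        case EM_ARM:     return kArm;
        case EM_AARCH64: return kArm64;
        case EM_386:     return kX86;
        case EM_X86_64:  return kX86_64;
        case EM_MIPS:    return kMips;
        default:         return kNone;  // unmapped machines then fail the ISA check
      }
    }
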
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 963c3d1..2b29591 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -250,6 +250,7 @@
         if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
           return fpr_args_ + (fpr_index_ * kBytesPerFprSpillLocation);
         }
+        return stack_args_ + (stack_index_ * kBytesStackArgLocation);
       }
     }
     if (gpr_index_ < kNumQuickGprArgs) {
@@ -283,6 +284,12 @@
   }
 
   void VisitArguments() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // This implementation doesn't support a reg-spill area for hard-float
+    // ABI targets such as x86_64 and aarch64. So, for those targets where
+    // 'kQuickSoftFloatAbi' is 'false':
+    //     (a) 'stack_args_' should point to the method's first argument, and
+    //     (b) whatever the argument type, 'stack_index_' should be advanced
+    //         for every argument visited.
     gpr_index_ = 0;
     fpr_index_ = 0;
     stack_index_ = 0;
@@ -290,10 +297,11 @@
       cur_type_ = Primitive::kPrimNot;
       is_split_long_or_double_ = false;
       Visit();
+      if (!kQuickSoftFloatAbi || kNumQuickGprArgs == 0) {
+        stack_index_++;
+      }
       if (kNumQuickGprArgs > 0) {
         gpr_index_++;
-      } else {
-        stack_index_++;
       }
     }
     for (uint32_t shorty_index = 1; shorty_index < shorty_len_; ++shorty_index) {
@@ -307,10 +315,11 @@
         case Primitive::kPrimInt:
           is_split_long_or_double_ = false;
           Visit();
+          if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
+            stack_index_++;
+          }
           if (gpr_index_ < kNumQuickGprArgs) {
             gpr_index_++;
-          } else {
-            stack_index_++;
           }
           break;
         case Primitive::kPrimFloat:
@@ -325,9 +334,8 @@
           } else {
             if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
               fpr_index_++;
-            } else {
-              stack_index_++;
             }
+            stack_index_++;
           }
           break;
         case Primitive::kPrimDouble:
@@ -336,16 +344,7 @@
             is_split_long_or_double_ = (kBytesPerGprSpillLocation == 4) &&
                 ((gpr_index_ + 1) == kNumQuickGprArgs);
             Visit();
-            if (gpr_index_ < kNumQuickGprArgs) {
-              gpr_index_++;
-              if (kBytesPerGprSpillLocation == 4) {
-                if (gpr_index_ < kNumQuickGprArgs) {
-                  gpr_index_++;
-                } else {
-                  stack_index_++;
-                }
-              }
-            } else {
+            if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
               if (kBytesStackArgLocation == 4) {
                 stack_index_+= 2;
               } else {
@@ -353,6 +352,16 @@
                 stack_index_++;
               }
             }
+            if (gpr_index_ < kNumQuickGprArgs) {
+              gpr_index_++;
+              if (kBytesPerGprSpillLocation == 4) {
+                if (gpr_index_ < kNumQuickGprArgs) {
+                  gpr_index_++;
+                } else if (kQuickSoftFloatAbi) {
+                  stack_index_++;
+                }
+              }
+            }
           } else {
             is_split_long_or_double_ = (kBytesPerFprSpillLocation == 4) &&
                 ((fpr_index_ + 1) == kNumQuickFprArgs);
@@ -362,17 +371,14 @@
               if (kBytesPerFprSpillLocation == 4) {
                 if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
                   fpr_index_++;
-                } else {
-                  stack_index_++;
                 }
               }
+            }
+            if (kBytesStackArgLocation == 4) {
+              stack_index_+= 2;
             } else {
-              if (kBytesStackArgLocation == 4) {
-                stack_index_+= 2;
-              } else {
-                CHECK_EQ(kBytesStackArgLocation, 8U);
-                stack_index_++;
-              }
+              CHECK_EQ(kBytesStackArgLocation, 8U);
+              stack_index_++;
             }
           }
           break;
@@ -389,59 +395,10 @@
       CHECK_EQ(kNumQuickFprArgs, 0U);
       return (kNumQuickGprArgs * kBytesPerGprSpillLocation) + kBytesPerGprSpillLocation /* ArtMethod* */;
     } else {
-      size_t offset = kBytesPerGprSpillLocation;  // Skip Method*.
-      size_t gprs_seen = 0;
-      size_t fprs_seen = 0;
-      if (!is_static && (gprs_seen < kNumQuickGprArgs)) {
-        gprs_seen++;
-        offset += kBytesStackArgLocation;
-      }
-      for (uint32_t i = 1; i < shorty_len; ++i) {
-        switch (shorty[i]) {
-          case 'Z':
-          case 'B':
-          case 'C':
-          case 'S':
-          case 'I':
-          case 'L':
-            if (gprs_seen < kNumQuickGprArgs) {
-              gprs_seen++;
-              offset += kBytesStackArgLocation;
-            }
-            break;
-          case 'J':
-            if (gprs_seen < kNumQuickGprArgs) {
-              gprs_seen++;
-              offset += 2 * kBytesStackArgLocation;
-              if (kBytesPerGprSpillLocation == 4) {
-                if (gprs_seen < kNumQuickGprArgs) {
-                  gprs_seen++;
-                }
-              }
-            }
-            break;
-          case 'F':
-            if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
-              fprs_seen++;
-              offset += kBytesStackArgLocation;
-            }
-            break;
-          case 'D':
-            if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
-              fprs_seen++;
-              offset += 2 * kBytesStackArgLocation;
-              if (kBytesPerFprSpillLocation == 4) {
-                if ((kNumQuickFprArgs != 0) && (fprs_seen + 1 < kNumQuickFprArgs + 1)) {
-                  fprs_seen++;
-                }
-              }
-            }
-            break;
-          default:
-            LOG(FATAL) << "Unexpected shorty character: " << shorty[i] << " in " << shorty;
-        }
-      }
-      return offset;
+      // For now, there is no reg-spill area for hard-float ABI targets.
+      // So, return the offset of the method's first parameter ('this'
+      // for non-static methods).
+      return kBytesPerGprSpillLocation;  // Skip Method*.
     }
   }
 
@@ -1496,6 +1453,22 @@
 
 extern "C" void* artFindNativeMethod();
 
+uint64_t artQuickGenericJniEndJNIRef(Thread* self, uint32_t cookie, jobject l, jobject lock) {
+  if (lock != nullptr) {
+    return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(l, cookie, lock, self));
+  } else {
+    return reinterpret_cast<uint64_t>(JniMethodEndWithReference(l, cookie, self));
+  }
+}
+
+void artQuickGenericJniEndJNINonRef(Thread* self, uint32_t cookie, jobject lock) {
+  if (lock != nullptr) {
+    JniMethodEndSynchronized(cookie, lock, self);
+  } else {
+    JniMethodEnd(cookie, self);
+  }
+}
+
 /*
  * Initializes an alloca region assumed to be directly below sp for a native call:
  * Create a Sirt and call stack and fill a mini stack with values to be pushed to registers.
@@ -1555,6 +1528,15 @@
 
     if (nativeCode == nullptr) {
       DCHECK(self->IsExceptionPending());    // There should be an exception pending now.
+
+      // End JNI, as the assembly will move to deliver the exception.
+      jobject lock = called->IsSynchronized() ? visitor.GetFirstSirtEntry() : nullptr;
+      if (mh.GetShorty()[0] == 'L') {
+        artQuickGenericJniEndJNIRef(self, cookie, nullptr, lock);
+      } else {
+        artQuickGenericJniEndJNINonRef(self, cookie, lock);
+      }
+
       return -1;
     }
     // Note that the native code pointer will be automatically set by artFindNativeMethod().
@@ -1580,33 +1562,21 @@
   mirror::ArtMethod* called = *sp;
   uint32_t cookie = *(sp32 - 1);
 
+  jobject lock = nullptr;
+  if (called->IsSynchronized()) {
+    StackIndirectReferenceTable* table =
+        reinterpret_cast<StackIndirectReferenceTable*>(
+            reinterpret_cast<uint8_t*>(sp) + kPointerSize);
+    lock = reinterpret_cast<jobject>(table->GetStackReference(0));
+  }
+
   MethodHelper mh(called);
   char return_shorty_char = mh.GetShorty()[0];
 
   if (return_shorty_char == 'L') {
-    // the only special ending call
-    if (called->IsSynchronized()) {
-      StackIndirectReferenceTable* table =
-          reinterpret_cast<StackIndirectReferenceTable*>(
-              reinterpret_cast<uint8_t*>(sp) + kPointerSize);
-      jobject tmp = reinterpret_cast<jobject>(table->GetStackReference(0));
-
-      return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie, tmp,
-                                                                              self));
-    } else {
-      return reinterpret_cast<uint64_t>(JniMethodEndWithReference(result.l, cookie, self));
-    }
+    return artQuickGenericJniEndJNIRef(self, cookie, result.l, lock);
   } else {
-    if (called->IsSynchronized()) {
-      StackIndirectReferenceTable* table =
-          reinterpret_cast<StackIndirectReferenceTable*>(
-              reinterpret_cast<uint8_t*>(sp) + kPointerSize);
-      jobject tmp = reinterpret_cast<jobject>(table->GetStackReference(0));
-
-      JniMethodEndSynchronized(cookie, tmp, self);
-    } else {
-      JniMethodEnd(cookie, self);
-    }
+    artQuickGenericJniEndJNINonRef(self, cookie, lock);
 
     switch (return_shorty_char) {
       case 'F':  // Fall-through.
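
The reordered index bookkeeping in the argument visitor above follows one rule: on hard-float targets (kQuickSoftFloatAbi is false) every argument owns a stack slot, so stack_index_ advances on every visit even when the value actually lives in a register, while soft-float targets consume a slot only once the GPRs run out. A simplified model of the integer-argument case (register counts are illustrative):

    #include <cstddef>

    constexpr bool kQuickSoftFloatAbi = false;  // e.g. an x86_64-like target
    constexpr size_t kNumQuickGprArgs = 5;      // illustrative register count

    void VisitIntArg(size_t* gpr_index, size_t* stack_index) {
      // Hard float: always reserve the stack slot.
      // Soft float: only once the GPRs are exhausted.
      if (!kQuickSoftFloatAbi || *gpr_index == kNumQuickGprArgs) {
        ++*stack_index;
      }
      if (*gpr_index < kNumQuickGprArgs) {
        ++*gpr_index;
      }
    }
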
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index 564168e..a1d001e 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -43,7 +43,7 @@
 }
 
 template <typename Visitor>
-inline size_t CardTable::Scan(SpaceBitmap* bitmap, byte* scan_begin, byte* scan_end,
+inline size_t CardTable::Scan(ContinuousSpaceBitmap* bitmap, byte* scan_begin, byte* scan_end,
                               const Visitor& visitor, const byte minimum_age) const {
   DCHECK(bitmap->HasAddress(scan_begin));
   DCHECK(bitmap->HasAddress(scan_end - 1));  // scan_end is the byte after the last byte we scan.
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 8b7bfd3..8d5dc07 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -38,7 +38,7 @@
 
 namespace accounting {
 
-class SpaceBitmap;
+template<size_t kAlignment> class SpaceBitmap;
 
 // Maintain a card table from the the write barrier. All writes of
 // non-NULL values to heap addresses should go through an entry in
@@ -102,7 +102,8 @@
   // For every dirty card of at least minimum age between begin and end, invoke the visitor with the
   // specified argument. Returns how many cards the visitor was run on.
   template <typename Visitor>
-  size_t Scan(SpaceBitmap* bitmap, byte* scan_begin, byte* scan_end, const Visitor& visitor,
+  size_t Scan(SpaceBitmap<kObjectAlignment>* bitmap, byte* scan_begin, byte* scan_end,
+              const Visitor& visitor,
               const byte minimum_age = kCardDirty) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/accounting/heap_bitmap-inl.h b/runtime/gc/accounting/heap_bitmap-inl.h
index 04e85d2..c67542f 100644
--- a/runtime/gc/accounting/heap_bitmap-inl.h
+++ b/runtime/gc/accounting/heap_bitmap-inl.h
@@ -30,44 +30,73 @@
   for (const auto& bitmap : continuous_space_bitmaps_) {
     bitmap->VisitMarkedRange(bitmap->HeapBegin(), bitmap->HeapLimit(), visitor);
   }
-  DCHECK(!discontinuous_space_sets_.empty());
-  for (const auto& space_set : discontinuous_space_sets_) {
-    space_set->Visit(visitor);
+  for (const auto& bitmap : large_object_bitmaps_) {
+    bitmap->VisitMarkedRange(bitmap->HeapBegin(), bitmap->HeapLimit(), visitor);
   }
 }
 
 inline bool HeapBitmap::Test(const mirror::Object* obj) {
-  SpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
+  ContinuousSpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
   if (LIKELY(bitmap != nullptr)) {
     return bitmap->Test(obj);
-  } else {
-    return GetDiscontinuousSpaceObjectSet(obj) != NULL;
   }
+  for (const auto& bitmap : large_object_bitmaps_) {
+    if (LIKELY(bitmap->HasAddress(obj))) {
+      return bitmap->Test(obj);
+    }
+  }
+  LOG(FATAL) << "Invalid object " << obj;
+  return false;
 }
 
 inline void HeapBitmap::Clear(const mirror::Object* obj) {
-  SpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
+  ContinuousSpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
   if (LIKELY(bitmap != nullptr)) {
     bitmap->Clear(obj);
-  } else {
-    ObjectSet* set = GetDiscontinuousSpaceObjectSet(obj);
-    DCHECK(set != NULL);
-    set->Clear(obj);
+    return;
   }
+  for (const auto& bitmap : large_object_bitmaps_) {
+    if (LIKELY(bitmap->HasAddress(obj))) {
+      bitmap->Clear(obj);
+      return;
+    }
+  }
+  LOG(FATAL) << "Invalid object " << obj;
 }
 
-inline void HeapBitmap::Set(const mirror::Object* obj) {
-  SpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
-  if (LIKELY(bitmap != NULL)) {
-    bitmap->Set(obj);
-  } else {
-    ObjectSet* set = GetDiscontinuousSpaceObjectSet(obj);
-    DCHECK(set != NULL);
-    set->Set(obj);
+template<typename LargeObjectSetVisitor>
+inline bool HeapBitmap::Set(const mirror::Object* obj, const LargeObjectSetVisitor& visitor) {
+  ContinuousSpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
+  if (LIKELY(bitmap != nullptr)) {
+    return bitmap->Set(obj);
   }
+  visitor(obj);
+  for (const auto& bitmap : large_object_bitmaps_) {
+    if (LIKELY(bitmap->HasAddress(obj))) {
+      return bitmap->Set(obj);
+    }
+  }
+  LOG(FATAL) << "Invalid object " << obj;
+  return false;
 }
 
-inline SpaceBitmap* HeapBitmap::GetContinuousSpaceBitmap(const mirror::Object* obj) const {
+template<typename LargeObjectSetVisitor>
+inline bool HeapBitmap::AtomicTestAndSet(const mirror::Object* obj,
+                                         const LargeObjectSetVisitor& visitor) {
+  ContinuousSpaceBitmap* bitmap = GetContinuousSpaceBitmap(obj);
+  if (LIKELY(bitmap != nullptr)) {
+    return bitmap->AtomicTestAndSet(obj);
+  }
+  visitor(obj);
+  for (const auto& bitmap : large_object_bitmaps_) {
+    if (LIKELY(bitmap->HasAddress(obj))) {
+      return bitmap->AtomicTestAndSet(obj);
+    }
+  }
+  LOG(FATAL) << "Invalid object " << obj;
+  return false;
+}
+
+inline ContinuousSpaceBitmap* HeapBitmap::GetContinuousSpaceBitmap(const mirror::Object* obj) const {
   for (const auto& bitmap : continuous_space_bitmaps_) {
     if (bitmap->HasAddress(obj)) {
       return bitmap;
@@ -76,15 +105,6 @@
   return nullptr;
 }
 
-inline ObjectSet* HeapBitmap::GetDiscontinuousSpaceObjectSet(const mirror::Object* obj) const {
-  for (const auto& space_set : discontinuous_space_sets_) {
-    if (space_set->Test(obj)) {
-      return space_set;
-    }
-  }
-  return nullptr;
-}
-
 }  // namespace accounting
 }  // namespace gc
 }  // namespace art
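
Replacing the discontinuous object sets with large-object bitmaps gives every HeapBitmap lookup above the same shape: resolve the continuous-space bitmap by address range first, then scan the large-object bitmaps, and treat an address covered by neither as fatal. An illustrative sketch with a toy Bitmap type:

    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    struct Bitmap {
      uintptr_t begin, end;
      bool HasAddress(uintptr_t a) const { return a >= begin && a < end; }
      bool Test(uintptr_t /*a*/) const { return true; }  // the real code tests a bit
    };

    bool HeapTest(const std::vector<Bitmap>& continuous,
                  const std::vector<Bitmap>& large, uintptr_t a) {
      for (const Bitmap& b : continuous) {
        if (b.HasAddress(a)) return b.Test(a);
      }
      for (const Bitmap& b : large) {
        if (b.HasAddress(a)) return b.Test(a);
      }
      std::abort();  // invalid object, mirroring LOG(FATAL)
    }
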
diff --git a/runtime/gc/accounting/heap_bitmap.cc b/runtime/gc/accounting/heap_bitmap.cc
index f94cf24..a5d59bf 100644
--- a/runtime/gc/accounting/heap_bitmap.cc
+++ b/runtime/gc/accounting/heap_bitmap.cc
@@ -16,68 +16,67 @@
 
 #include "heap_bitmap.h"
 
+#include "gc/accounting/space_bitmap-inl.h"
 #include "gc/space/space.h"
 
 namespace art {
 namespace gc {
 namespace accounting {
 
-void HeapBitmap::ReplaceBitmap(SpaceBitmap* old_bitmap, SpaceBitmap* new_bitmap) {
-  for (auto& bitmap : continuous_space_bitmaps_) {
-    if (bitmap == old_bitmap) {
-      bitmap = new_bitmap;
-      return;
-    }
-  }
-  LOG(FATAL) << "bitmap " << static_cast<const void*>(old_bitmap) << " not found";
+void HeapBitmap::ReplaceBitmap(ContinuousSpaceBitmap* old_bitmap,
+                               ContinuousSpaceBitmap* new_bitmap) {
+  auto it = std::find(continuous_space_bitmaps_.begin(), continuous_space_bitmaps_.end(),
+                      old_bitmap);
+  CHECK(it != continuous_space_bitmaps_.end()) << " continuous space bitmap " << old_bitmap
+      << " not found";
+  *it = new_bitmap;
 }
 
-void HeapBitmap::ReplaceObjectSet(ObjectSet* old_set, ObjectSet* new_set) {
-  for (auto& space_set : discontinuous_space_sets_) {
-    if (space_set == old_set) {
-      space_set = new_set;
-      return;
-    }
-  }
-  LOG(FATAL) << "object set " << static_cast<const void*>(old_set) << " not found";
+void HeapBitmap::ReplaceLargeObjectBitmap(LargeObjectBitmap* old_bitmap,
+                                          LargeObjectBitmap* new_bitmap) {
+  auto it = std::find(large_object_bitmaps_.begin(), large_object_bitmaps_.end(), old_bitmap);
+  CHECK(it != large_object_bitmaps_.end()) << " large object bitmap " << old_bitmap
+      << " not found";
+  *it = new_bitmap;
 }
 
-void HeapBitmap::AddContinuousSpaceBitmap(accounting::SpaceBitmap* bitmap) {
-  DCHECK(bitmap != NULL);
-
-  // Check for interval overlap.
+void HeapBitmap::AddContinuousSpaceBitmap(accounting::ContinuousSpaceBitmap* bitmap) {
+  DCHECK(bitmap != nullptr);
+  // Check that there is no bitmap overlap.
   for (const auto& cur_bitmap : continuous_space_bitmaps_) {
-    CHECK(!(
-        bitmap->HeapBegin() < cur_bitmap->HeapLimit() &&
-        bitmap->HeapLimit() > cur_bitmap->HeapBegin()))
-        << "Bitmap " << bitmap->Dump() << " overlaps with existing bitmap " << cur_bitmap->Dump();
+    CHECK(bitmap->HeapBegin() >= cur_bitmap->HeapLimit() ||
+          bitmap->HeapLimit() <= cur_bitmap->HeapBegin())
+              << "Bitmap " << bitmap->Dump() << " overlaps with existing bitmap "
+              << cur_bitmap->Dump();
   }
   continuous_space_bitmaps_.push_back(bitmap);
 }
 
-void HeapBitmap::RemoveContinuousSpaceBitmap(accounting::SpaceBitmap* bitmap) {
+void HeapBitmap::RemoveContinuousSpaceBitmap(accounting::ContinuousSpaceBitmap* bitmap) {
+  DCHECK(bitmap != nullptr);
   auto it = std::find(continuous_space_bitmaps_.begin(), continuous_space_bitmaps_.end(), bitmap);
   DCHECK(it != continuous_space_bitmaps_.end());
   continuous_space_bitmaps_.erase(it);
 }
 
-void HeapBitmap::AddDiscontinuousObjectSet(ObjectSet* set) {
-  DCHECK(set != NULL);
-  discontinuous_space_sets_.push_back(set);
+void HeapBitmap::AddLargeObjectBitmap(LargeObjectBitmap* bitmap) {
+  DCHECK(bitmap != nullptr);
+  large_object_bitmaps_.push_back(bitmap);
 }
 
-void HeapBitmap::RemoveDiscontinuousObjectSet(ObjectSet* set) {
-  auto it = std::find(discontinuous_space_sets_.begin(), discontinuous_space_sets_.end(), set);
-  DCHECK(it != discontinuous_space_sets_.end());
-  discontinuous_space_sets_.erase(it);
+void HeapBitmap::RemoveLargeObjectBitmap(LargeObjectBitmap* bitmap) {
+  DCHECK(bitmap != nullptr);
+  auto it = std::find(large_object_bitmaps_.begin(), large_object_bitmaps_.end(), bitmap);
+  DCHECK(it != large_object_bitmaps_.end());
+  large_object_bitmaps_.erase(it);
 }
 
 void HeapBitmap::Walk(ObjectCallback* callback, void* arg) {
   for (const auto& bitmap : continuous_space_bitmaps_) {
     bitmap->Walk(callback, arg);
   }
-  for (const auto& space_set : discontinuous_space_sets_) {
-    space_set->Walk(callback, arg);
+  for (const auto& bitmap : large_object_bitmaps_) {
+    bitmap->Walk(callback, arg);
   }
 }
 
diff --git a/runtime/gc/accounting/heap_bitmap.h b/runtime/gc/accounting/heap_bitmap.h
index f729c0e..814dc06 100644
--- a/runtime/gc/accounting/heap_bitmap.h
+++ b/runtime/gc/accounting/heap_bitmap.h
@@ -33,9 +33,13 @@
  public:
   bool Test(const mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
   void Clear(const mirror::Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-  void Set(const mirror::Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-  SpaceBitmap* GetContinuousSpaceBitmap(const mirror::Object* obj) const;
-  ObjectSet* GetDiscontinuousSpaceObjectSet(const mirror::Object* obj) const;
+  template<typename LargeObjectSetVisitor>
+  bool Set(const mirror::Object* obj, const LargeObjectSetVisitor& visitor)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) ALWAYS_INLINE;
+  template<typename LargeObjectSetVisitor>
+  bool AtomicTestAndSet(const mirror::Object* obj, const LargeObjectSetVisitor& visitor)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) ALWAYS_INLINE;
+  ContinuousSpaceBitmap* GetContinuousSpaceBitmap(const mirror::Object* obj) const;
 
   void Walk(ObjectCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
@@ -46,11 +50,11 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Find and replace a bitmap pointer; this is used for the bitmap swapping in the GC.
-  void ReplaceBitmap(SpaceBitmap* old_bitmap, SpaceBitmap* new_bitmap)
+  void ReplaceBitmap(ContinuousSpaceBitmap* old_bitmap, ContinuousSpaceBitmap* new_bitmap)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Find and replace an object set pointer; this is used for the bitmap swapping in the GC.
-  void ReplaceObjectSet(ObjectSet* old_set, ObjectSet* new_set)
+  void ReplaceLargeObjectBitmap(LargeObjectBitmap* old_bitmap, LargeObjectBitmap* new_bitmap)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   explicit HeapBitmap(Heap* heap) : heap_(heap) {}
@@ -58,16 +62,17 @@
  private:
   const Heap* const heap_;
 
-  void AddContinuousSpaceBitmap(SpaceBitmap* bitmap);
-  void RemoveContinuousSpaceBitmap(SpaceBitmap* bitmap);
-  void AddDiscontinuousObjectSet(ObjectSet* set);
-  void RemoveDiscontinuousObjectSet(ObjectSet* set);
+  void AddContinuousSpaceBitmap(ContinuousSpaceBitmap* bitmap);
+  void RemoveContinuousSpaceBitmap(ContinuousSpaceBitmap* bitmap);
+  void AddLargeObjectBitmap(LargeObjectBitmap* bitmap);
+  void RemoveLargeObjectBitmap(LargeObjectBitmap* bitmap);
 
   // Bitmaps covering continuous spaces.
-  std::vector<SpaceBitmap*, GcAllocator<SpaceBitmap*>> continuous_space_bitmaps_;
+  std::vector<ContinuousSpaceBitmap*, GcAllocator<ContinuousSpaceBitmap*>>
+      continuous_space_bitmaps_;
 
   // Sets covering discontinuous spaces.
-  std::vector<ObjectSet*, GcAllocator<ObjectSet*>> discontinuous_space_sets_;
+  std::vector<LargeObjectBitmap*, GcAllocator<LargeObjectBitmap*>> large_object_bitmaps_;
 
   friend class art::gc::Heap;
 };
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index 34ca654..d744dee 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -19,6 +19,7 @@
 #include "base/stl_util.h"
 #include "card_table-inl.h"
 #include "heap_bitmap.h"
+#include "gc/accounting/space_bitmap-inl.h"
 #include "gc/collector/mark_sweep.h"
 #include "gc/collector/mark_sweep-inl.h"
 #include "gc/heap.h"
@@ -222,7 +223,7 @@
 
   // Check the references of each clean card which is also in the mod union table.
   CardTable* card_table = heap_->GetCardTable();
-  SpaceBitmap* live_bitmap = space_->GetLiveBitmap();
+  ContinuousSpaceBitmap* live_bitmap = space_->GetLiveBitmap();
   for (const auto& ref_pair : references_) {
     const byte* card = ref_pair.first;
     if (*card == CardTable::kCardClean) {
@@ -272,7 +273,7 @@
     uintptr_t end = start + CardTable::kCardSize;
     auto* space = heap_->FindContinuousSpaceFromObject(reinterpret_cast<Object*>(start), false);
     DCHECK(space != nullptr);
-    SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+    ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
     live_bitmap->VisitMarkedRange(start, end, add_visitor);
 
     // Update the corresponding references for the card.
@@ -312,7 +313,7 @@
                                                      void* arg) {
   CardTable* card_table = heap_->GetCardTable();
   ModUnionScanImageRootVisitor scan_visitor(callback, arg);
-  SpaceBitmap* bitmap = space_->GetLiveBitmap();
+  ContinuousSpaceBitmap* bitmap = space_->GetLiveBitmap();
   for (const byte* card_addr : cleared_cards_) {
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card_addr));
     DCHECK(space_->HasAddress(reinterpret_cast<Object*>(start)));
diff --git a/runtime/gc/accounting/mod_union_table.h b/runtime/gc/accounting/mod_union_table.h
index c3a90e2..5ae7c77 100644
--- a/runtime/gc/accounting/mod_union_table.h
+++ b/runtime/gc/accounting/mod_union_table.h
@@ -44,7 +44,6 @@
 
 namespace accounting {
 
-class SpaceBitmap;
 class HeapBitmap;
 
 // The mod-union table is the union of modified cards. It is used to allow the card table to be
diff --git a/runtime/gc/accounting/remembered_set.cc b/runtime/gc/accounting/remembered_set.cc
index 56f7caa..044216e 100644
--- a/runtime/gc/accounting/remembered_set.cc
+++ b/runtime/gc/accounting/remembered_set.cc
@@ -112,7 +112,7 @@
   bool contains_reference_to_target_space = false;
   RememberedSetObjectVisitor obj_visitor(callback, target_space,
                                          &contains_reference_to_target_space, arg);
-  SpaceBitmap* bitmap = space_->GetLiveBitmap();
+  ContinuousSpaceBitmap* bitmap = space_->GetLiveBitmap();
   CardSet remove_card_set;
   for (byte* const card_addr : dirty_cards_) {
     contains_reference_to_target_space = false;
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index 880ff1f..ed140e0 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -17,14 +17,26 @@
 #ifndef ART_RUNTIME_GC_ACCOUNTING_SPACE_BITMAP_INL_H_
 #define ART_RUNTIME_GC_ACCOUNTING_SPACE_BITMAP_INL_H_
 
+#include "space_bitmap.h"
+
 #include "base/logging.h"
+#include "dex_file-inl.h"
+#include "heap_bitmap.h"
+#include "mirror/art_field-inl.h"
+#include "mirror/class-inl.h"
+#include "mirror/object-inl.h"
+#include "mirror/object_array-inl.h"
+#include "object_utils.h"
+#include "space_bitmap-inl.h"
+#include "UniquePtr.h"
 #include "utils.h"
 
 namespace art {
 namespace gc {
 namespace accounting {
 
-inline bool SpaceBitmap::AtomicTestAndSet(const mirror::Object* obj) {
+template<size_t kAlignment>
+inline bool SpaceBitmap<kAlignment>::AtomicTestAndSet(const mirror::Object* obj) {
   uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
@@ -45,7 +57,8 @@
   return false;
 }
 
-inline bool SpaceBitmap::Test(const mirror::Object* obj) const {
+template<size_t kAlignment>
+inline bool SpaceBitmap<kAlignment>::Test(const mirror::Object* obj) const {
   uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
   DCHECK(HasAddress(obj)) << obj;
   DCHECK(bitmap_begin_ != NULL);
@@ -54,9 +67,9 @@
   return (bitmap_begin_[OffsetToIndex(offset)] & OffsetToMask(offset)) != 0;
 }
 
-template <typename Visitor>
-void SpaceBitmap::VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end,
-                                   const Visitor& visitor) const {
+template<size_t kAlignment> template<typename Visitor>
+inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end,
+                                                      const Visitor& visitor) const {
   DCHECK_LT(visit_begin, visit_end);
 #if 0
   for (uintptr_t i = visit_begin; i < visit_end; i += kAlignment) {
@@ -148,7 +161,8 @@
 #endif
 }
 
-inline bool SpaceBitmap::Modify(const mirror::Object* obj, bool do_set) {
+template<size_t kAlignment> template<bool kSetBit>
+inline bool SpaceBitmap<kAlignment>::Modify(const mirror::Object* obj) {
   uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
@@ -157,15 +171,24 @@
   DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
   uword* address = &bitmap_begin_[index];
   uword old_word = *address;
-  if (do_set) {
+  if (kSetBit) {
     *address = old_word | mask;
   } else {
     *address = old_word & ~mask;
   }
-  DCHECK_EQ(Test(obj), do_set);
+  DCHECK_EQ(Test(obj), kSetBit);
   return (old_word & mask) != 0;
 }
 
+template<size_t kAlignment>
+inline std::ostream& operator << (std::ostream& stream, const SpaceBitmap<kAlignment>& bitmap) {
+  return stream
+    << bitmap.GetName() << "["
+    << "begin=" << reinterpret_cast<const void*>(bitmap.HeapBegin())
+    << ",end=" << reinterpret_cast<const void*>(bitmap.HeapLimit())
+    << "]";
+}
+
 }  // namespace accounting
 }  // namespace gc
 }  // namespace art
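
The bitmap sizing used by Create and CreateFromMemMap in the next hunks follows from the new template parameter: one bit covers kAlignment bytes, so one bitmap word covers kAlignment * kBitsPerWord bytes of heap. A worked example assuming 8-byte object alignment on a 64-bit build:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kAlignment = 8;                 // kObjectAlignment
    constexpr size_t kWordSize = sizeof(uintptr_t);  // 8 on a 64-bit build
    constexpr size_t kBitsPerWord = kWordSize * 8;   // 64

    size_t BitmapSize(uint64_t heap_capacity) {
      const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord;  // 512 here
      return static_cast<size_t>(
          ((heap_capacity + kBytesCoveredPerWord - 1) / kBytesCoveredPerWord) * kWordSize);
    }
    // e.g. BitmapSize(64 * 1024 * 1024) == 1 MiB: one bit per 8 heap bytes.
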
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index 1957c21..31a1537 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -14,68 +14,52 @@
  * limitations under the License.
  */
 
-#include "base/logging.h"
-#include "dex_file-inl.h"
-#include "heap_bitmap.h"
-#include "mirror/art_field-inl.h"
-#include "mirror/class-inl.h"
-#include "mirror/object-inl.h"
-#include "mirror/object_array-inl.h"
-#include "object_utils.h"
 #include "space_bitmap-inl.h"
-#include "UniquePtr.h"
-#include "utils.h"
 
 namespace art {
 namespace gc {
 namespace accounting {
 
-std::string SpaceBitmap::GetName() const {
-  return name_;
-}
-
-void SpaceBitmap::SetName(const std::string& name) {
-  name_ = name;
-}
-
-std::string SpaceBitmap::Dump() const {
-  return StringPrintf("%s: %p-%p", name_.c_str(),
-                      reinterpret_cast<void*>(HeapBegin()),
-                      reinterpret_cast<void*>(HeapLimit()));
-}
-
-void ObjectSet::Walk(ObjectCallback* callback, void* arg) {
-  for (const mirror::Object* obj : contained_) {
-    callback(const_cast<mirror::Object*>(obj), arg);
-  }
-}
-
-SpaceBitmap* SpaceBitmap::CreateFromMemMap(const std::string& name, MemMap* mem_map,
-                                           byte* heap_begin, size_t heap_capacity) {
+template<size_t kAlignment>
+SpaceBitmap<kAlignment>* SpaceBitmap<kAlignment>::CreateFromMemMap(
+    const std::string& name, MemMap* mem_map, byte* heap_begin, size_t heap_capacity) {
   CHECK(mem_map != nullptr);
   uword* bitmap_begin = reinterpret_cast<uword*>(mem_map->Begin());
-  size_t bitmap_size = OffsetToIndex(RoundUp(heap_capacity, kAlignment * kBitsPerWord)) * kWordSize;
+  const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord;
+  size_t bitmap_size = (RoundUp(static_cast<uint64_t>(heap_capacity), kBytesCoveredPerWord) /
+      kBytesCoveredPerWord) * kWordSize;
   return new SpaceBitmap(name, mem_map, bitmap_begin, bitmap_size, heap_begin);
 }
 
-SpaceBitmap* SpaceBitmap::Create(const std::string& name, byte* heap_begin, size_t heap_capacity) {
-  CHECK(heap_begin != NULL);
+template<size_t kAlignment>
+SpaceBitmap<kAlignment>::SpaceBitmap(const std::string& name, MemMap* mem_map, uword* bitmap_begin,
+                                     size_t bitmap_size, const void* heap_begin)
+    : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
+      heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
+      name_(name) {
+  CHECK(bitmap_begin_ != nullptr);
+  CHECK_NE(bitmap_size, 0U);
+}
+
+template<size_t kAlignment>
+SpaceBitmap<kAlignment>* SpaceBitmap<kAlignment>::Create(
+    const std::string& name, byte* heap_begin, size_t heap_capacity) {
   // Round up since heap_capacity is not necessarily a multiple of kAlignment * kBitsPerWord.
-  size_t bitmap_size = OffsetToIndex(RoundUp(heap_capacity, kAlignment * kBitsPerWord)) * kWordSize;
+  const uint64_t kBytesCoveredPerWord = kAlignment * kBitsPerWord;
+  size_t bitmap_size = (RoundUp(static_cast<uint64_t>(heap_capacity), kBytesCoveredPerWord) /
+      kBytesCoveredPerWord) * kWordSize;
   std::string error_msg;
-  UniquePtr<MemMap> mem_map(MemMap::MapAnonymous(name.c_str(), NULL, bitmap_size,
+  UniquePtr<MemMap> mem_map(MemMap::MapAnonymous(name.c_str(), nullptr, bitmap_size,
                                                  PROT_READ | PROT_WRITE, false, &error_msg));
   if (UNLIKELY(mem_map.get() == nullptr)) {
     LOG(ERROR) << "Failed to allocate bitmap " << name << ": " << error_msg;
-    return NULL;
+    return nullptr;
   }
   return CreateFromMemMap(name, mem_map.release(), heap_begin, heap_capacity);
 }
 
-// Clean up any resources associated with the bitmap.
-SpaceBitmap::~SpaceBitmap() {}
-
-void SpaceBitmap::SetHeapLimit(uintptr_t new_end) {
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::SetHeapLimit(uintptr_t new_end) {
   DCHECK(IsAligned<kBitsPerWord * kAlignment>(new_end));
   size_t new_size = OffsetToIndex(new_end - heap_begin_) * kWordSize;
   if (new_size < bitmap_size_) {
@@ -85,7 +69,8 @@
   // should be marked.
 }
 
-void SpaceBitmap::Clear() {
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::Clear() {
   if (bitmap_begin_ != NULL) {
     // This returns the memory to the system.  Successive page faults will return zeroed memory.
     int result = madvise(bitmap_begin_, bitmap_size_, MADV_DONTNEED);
@@ -95,14 +80,14 @@
   }
 }
 
-void SpaceBitmap::CopyFrom(SpaceBitmap* source_bitmap) {
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::CopyFrom(SpaceBitmap* source_bitmap) {
   DCHECK_EQ(Size(), source_bitmap->Size());
   std::copy(source_bitmap->Begin(), source_bitmap->Begin() + source_bitmap->Size() / kWordSize, Begin());
 }
 
-// Visits set bits in address order.  The callback is not permitted to
-// change the bitmap bits or max during the traversal.
-void SpaceBitmap::Walk(ObjectCallback* callback, void* arg) {
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::Walk(ObjectCallback* callback, void* arg) {
   CHECK(bitmap_begin_ != NULL);
   CHECK(callback != NULL);
 
@@ -122,17 +107,13 @@
   }
 }
 
-// Walk through the bitmaps in increasing address order, and find the
-// object pointers that correspond to garbage objects.  Call
-// <callback> zero or more times with lists of these object pointers.
-//
-// The callback is not permitted to increase the max of either bitmap.
-void SpaceBitmap::SweepWalk(const SpaceBitmap& live_bitmap,
-                            const SpaceBitmap& mark_bitmap,
-                            uintptr_t sweep_begin, uintptr_t sweep_end,
-                            SpaceBitmap::SweepCallback* callback, void* arg) {
-  CHECK(live_bitmap.bitmap_begin_ != NULL);
-  CHECK(mark_bitmap.bitmap_begin_ != NULL);
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::SweepWalk(const SpaceBitmap<kAlignment>& live_bitmap,
+                                        const SpaceBitmap<kAlignment>& mark_bitmap,
+                                        uintptr_t sweep_begin, uintptr_t sweep_end,
+                                        SpaceBitmap::SweepCallback* callback, void* arg) {
+  CHECK(live_bitmap.bitmap_begin_ != nullptr);
+  CHECK(mark_bitmap.bitmap_begin_ != nullptr);
   CHECK_EQ(live_bitmap.heap_begin_, mark_bitmap.heap_begin_);
   CHECK_EQ(live_bitmap.bitmap_size_, mark_bitmap.bitmap_size_);
   CHECK(callback != NULL);
@@ -174,13 +155,10 @@
   }
 }
 
-static void WalkFieldsInOrder(SpaceBitmap* visited, ObjectCallback* callback, mirror::Object* obj,
-                              void* arg);
-
-// Walk instance fields of the given Class. Separate function to allow recursion on the super
-// class.
-static void WalkInstanceFields(SpaceBitmap* visited, ObjectCallback* callback, mirror::Object* obj,
-                               mirror::Class* klass, void* arg)
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::WalkInstanceFields(SpaceBitmap<kAlignment>* visited,
+                                                 ObjectCallback* callback, mirror::Object* obj,
+                                                 mirror::Class* klass, void* arg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Visit fields of parent classes first.
   mirror::Class* super = klass->GetSuperClass();
@@ -203,10 +181,10 @@
   }
 }
 
-// For an unvisited object, visit it then all its children found via fields.
-static void WalkFieldsInOrder(SpaceBitmap* visited, ObjectCallback* callback, mirror::Object* obj,
-                              void* arg)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::WalkFieldsInOrder(SpaceBitmap<kAlignment>* visited,
+                                                ObjectCallback* callback, mirror::Object* obj,
+                                                void* arg) {
   if (visited->Test(obj)) {
     return;
   }
@@ -244,14 +222,13 @@
   }
 }
 
-// Visits set bits with an in order traversal.  The callback is not permitted to change the bitmap
-// bits or max during the traversal.
-void SpaceBitmap::InOrderWalk(ObjectCallback* callback, void* arg) {
-  UniquePtr<SpaceBitmap> visited(Create("bitmap for in-order walk",
-                                       reinterpret_cast<byte*>(heap_begin_),
-                                       IndexToOffset(bitmap_size_ / kWordSize)));
-  CHECK(bitmap_begin_ != NULL);
-  CHECK(callback != NULL);
+template<size_t kAlignment>
+void SpaceBitmap<kAlignment>::InOrderWalk(ObjectCallback* callback, void* arg) {
+  UniquePtr<SpaceBitmap<kAlignment>> visited(
+      Create("bitmap for in-order walk", reinterpret_cast<byte*>(heap_begin_),
+             IndexToOffset(bitmap_size_ / kWordSize)));
+  CHECK(bitmap_begin_ != nullptr);
+  CHECK(callback != nullptr);
   uintptr_t end = Size() / kWordSize;
   for (uintptr_t i = 0; i < end; ++i) {
     // Need uint for unsigned shift.
@@ -268,13 +245,8 @@
   }
 }
 
-std::ostream& operator << (std::ostream& stream, const SpaceBitmap& bitmap) {
-  return stream
-    << bitmap.GetName() << "["
-    << "begin=" << reinterpret_cast<const void*>(bitmap.HeapBegin())
-    << ",end=" << reinterpret_cast<const void*>(bitmap.HeapLimit())
-    << "]";
-}
+template class SpaceBitmap<kObjectAlignment>;
+template class SpaceBitmap<kPageSize>;
 
 }  // namespace accounting
 }  // namespace gc
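
The sizing computation above changed from OffsetToIndex(RoundUp(...)) to explicit 64-bit arithmetic, presumably to avoid overflowing size_t on 32-bit hosts when heap_capacity approaches 4 GB, since the intermediate RoundUp result can exceed SIZE_MAX. A self-contained sketch of the rule (assuming 8-byte words; names illustrative):

    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t kWordSize = 8;                 // bytes per bitmap word
    constexpr uint64_t kBitsPerWord = kWordSize * 8;  // 64

    uint64_t RoundUp(uint64_t x, uint64_t n) { return ((x + n - 1) / n) * n; }

    // Bytes of bitmap needed to cover heap_capacity bytes at a given alignment.
    uint64_t BitmapBytes(uint64_t heap_capacity, uint64_t alignment) {
      const uint64_t bytes_covered_per_word = alignment * kBitsPerWord;
      return (RoundUp(heap_capacity, bytes_covered_per_word) / bytes_covered_per_word) *
             kWordSize;
    }

    int main() {
      // 16 MB heap, 8-byte alignment: each word covers 512 bytes,
      // so 32768 words = 256 KB of bitmap.
      std::printf("%llu\n", (unsigned long long)BitmapBytes(16ull << 20, 8));     // 262144
      // The same heap at 4 KB (page) alignment needs only 512 bytes.
      std::printf("%llu\n", (unsigned long long)BitmapBytes(16ull << 20, 4096));  // 512
      return 0;
    }

The two explicit instantiations at the end of the file (kObjectAlignment and kPageSize) are what allow the member definitions to live in this .cc file rather than in the header.
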
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index a88f3e4..df3fd37 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -38,11 +38,9 @@
 namespace gc {
 namespace accounting {
 
+template<size_t kAlignment>
 class SpaceBitmap {
  public:
-  // Alignment of objects within spaces.
-  static const size_t kAlignment = 8;
-
   typedef void ScanCallback(mirror::Object* obj, void* finger, void* arg);
 
   typedef void SweepCallback(size_t ptr_count, mirror::Object** ptrs, void* arg);
@@ -57,30 +55,31 @@
   static SpaceBitmap* CreateFromMemMap(const std::string& name, MemMap* mem_map,
                                        byte* heap_begin, size_t heap_capacity);
 
-  ~SpaceBitmap();
+  ~SpaceBitmap() {
+  }
 
   // <offset> is the difference from .base to a pointer address.
   // <index> is the index of .bits that contains the bit representing
   //         <offset>.
-  static size_t OffsetToIndex(size_t offset) {
+  static size_t OffsetToIndex(size_t offset) ALWAYS_INLINE {
     return offset / kAlignment / kBitsPerWord;
   }
 
-  static uintptr_t IndexToOffset(size_t index) {
+  static uintptr_t IndexToOffset(size_t index) ALWAYS_INLINE {
     return static_cast<uintptr_t>(index * kAlignment * kBitsPerWord);
   }
 
   // Bits are packed in the obvious way.
-  static uword OffsetToMask(uintptr_t offset) {
+  static uword OffsetToMask(uintptr_t offset) ALWAYS_INLINE {
     return (static_cast<size_t>(1)) << ((offset / kAlignment) % kBitsPerWord);
   }
 
-  inline bool Set(const mirror::Object* obj) {
-    return Modify(obj, true);
+  bool Set(const mirror::Object* obj) ALWAYS_INLINE {
+    return Modify<true>(obj);
   }
 
-  inline bool Clear(const mirror::Object* obj) {
-    return Modify(obj, false);
+  bool Clear(const mirror::Object* obj) ALWAYS_INLINE {
+    return Modify<false>(obj);
   }
 
   // Returns true if the object was previously marked.
@@ -123,20 +122,26 @@
     }
   }
 
-  /**
-   * Visit the live objects in the range [visit_begin, visit_end).
-   */
+  // Visit the live objects in the range [visit_begin, visit_end).
+  // TODO: Use lock annotations when clang is fixed.
+  // EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   template <typename Visitor>
   void VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end, const Visitor& visitor) const
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+      NO_THREAD_SAFETY_ANALYSIS;
 
+  // Visits set bits in address order.  The callback is not permitted to change the bitmap bits or
+  // max during the traversal.
   void Walk(ObjectCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  // Visits set bits with an in order traversal.  The callback is not permitted to change the bitmap
+  // bits or max during the traversal.
   void InOrderWalk(ObjectCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
+  // Walk through the bitmaps in increasing address order, and find the object pointers that
+  // correspond to garbage objects.  Call <callback> zero or more times with lists of these object
+  // pointers. The callback is not permitted to increase the max of either bitmap.
   static void SweepWalk(const SpaceBitmap& live, const SpaceBitmap& mark, uintptr_t base,
                         uintptr_t max, SweepCallback* thunk, void* arg);
 
@@ -169,10 +174,18 @@
   // Set the max address which can covered by the bitmap.
   void SetHeapLimit(uintptr_t new_end);
 
-  std::string GetName() const;
-  void SetName(const std::string& name);
+  std::string GetName() const {
+    return name_;
+  }
 
-  std::string Dump() const;
+  void SetName(const std::string& name) {
+    name_ = name;
+  }
+
+  std::string Dump() const {
+    return StringPrintf("%s: %p-%p", name_.c_str(), reinterpret_cast<void*>(HeapBegin()),
+                        reinterpret_cast<void*>(HeapLimit()));
+  }
 
   const void* GetObjectWordAddress(const mirror::Object* obj) const {
     uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
@@ -185,12 +198,19 @@
   // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
   // however, we document that this is expected on heap_end_
   SpaceBitmap(const std::string& name, MemMap* mem_map, uword* bitmap_begin, size_t bitmap_size,
-              const void* heap_begin)
-      : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
-        heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
-        name_(name) {}
+              const void* heap_begin);
 
-  bool Modify(const mirror::Object* obj, bool do_set);
+  template<bool kSetBit>
+  bool Modify(const mirror::Object* obj);
+
+  // For an unvisited object, visit it then all its children found via fields.
+  static void WalkFieldsInOrder(SpaceBitmap* visited, ObjectCallback* callback, mirror::Object* obj,
+                                void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  // Walk instance fields of the given Class. Separate function to allow recursion on the super
+  // class.
+  static void WalkInstanceFields(SpaceBitmap<kAlignment>* visited, ObjectCallback* callback,
+                                 mirror::Object* obj, mirror::Class* klass, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Backing storage for bitmap.
   UniquePtr<MemMap> mem_map_;
@@ -209,70 +229,11 @@
   std::string name_;
 };
 
-// Like a bitmap except it keeps track of objects using sets.
-class ObjectSet {
- public:
-  typedef std::set<
-      const mirror::Object*, std::less<const mirror::Object*>,
-      GcAllocator<const mirror::Object*> > Objects;
+typedef SpaceBitmap<kObjectAlignment> ContinuousSpaceBitmap;
+typedef SpaceBitmap<kLargeObjectAlignment> LargeObjectBitmap;
 
-  bool IsEmpty() const {
-    return contained_.empty();
-  }
-
-  inline void Set(const mirror::Object* obj) {
-    contained_.insert(obj);
-  }
-
-  inline void Clear(const mirror::Object* obj) {
-    Objects::iterator found = contained_.find(obj);
-    if (found != contained_.end()) {
-      contained_.erase(found);
-    }
-  }
-
-  void Clear() {
-    contained_.clear();
-  }
-
-  inline bool Test(const mirror::Object* obj) const {
-    return contained_.find(obj) != contained_.end();
-  }
-
-  const std::string& GetName() const {
-    return name_;
-  }
-
-  void SetName(const std::string& name) {
-    name_ = name;
-  }
-
-  void CopyFrom(const ObjectSet& space_set) {
-    contained_ = space_set.contained_;
-  }
-
-  void Walk(ObjectCallback* callback, void* arg) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  template <typename Visitor>
-  void Visit(const Visitor& visitor) NO_THREAD_SAFETY_ANALYSIS {
-    for (const mirror::Object* obj : contained_) {
-      visitor(const_cast<mirror::Object*>(obj));
-    }
-  }
-
-  explicit ObjectSet(const std::string& name) : name_(name) {}
-  ~ObjectSet() {}
-
-  Objects& GetObjects() {
-    return contained_;
-  }
-
- private:
-  std::string name_;
-  Objects contained_;
-};
-
-std::ostream& operator << (std::ostream& stream, const SpaceBitmap& bitmap);
+template<size_t kAlignment>
+std::ostream& operator << (std::ostream& stream, const SpaceBitmap<kAlignment>& bitmap);
 
 }  // namespace accounting
 }  // namespace gc
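
The old Modify(obj, do_set) took the set/clear flag as a runtime argument; the new Modify<kSetBit> takes it as a template parameter, so Set() and Clear() each compile down to branch-free straight-line code. A toy sketch of the pattern (illustrative names, not the real class):

    #include <cstdint>
    #include <cstdio>

    struct TinyBitmap {
      uint64_t word = 0;

      // kSetBit is resolved at compile time; each instantiation keeps only
      // one arm of the if. Returns whether the bit was previously set.
      template <bool kSetBit>
      bool Modify(unsigned bit) {
        const uint64_t mask = uint64_t{1} << bit;
        const uint64_t old_word = word;
        if (kSetBit) {
          word = old_word | mask;
        } else {
          word = old_word & ~mask;
        }
        return (old_word & mask) != 0;
      }

      bool Set(unsigned bit) { return Modify<true>(bit); }
      bool Clear(unsigned bit) { return Modify<false>(bit); }
    };

    int main() {
      TinyBitmap b;
      bool a = b.Set(3);    // false: bit was clear
      bool c = b.Set(3);    // true: already set
      bool d = b.Clear(3);  // true: was set before clearing
      std::printf("%d %d %d\n", a, c, d);  // 0 1 1
      return 0;
    }
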
diff --git a/runtime/gc/accounting/space_bitmap_test.cc b/runtime/gc/accounting/space_bitmap_test.cc
index 68994a8..972f94d 100644
--- a/runtime/gc/accounting/space_bitmap_test.cc
+++ b/runtime/gc/accounting/space_bitmap_test.cc
@@ -32,14 +32,15 @@
 TEST_F(SpaceBitmapTest, Init) {
   byte* heap_begin = reinterpret_cast<byte*>(0x10000000);
   size_t heap_capacity = 16 * MB;
-  UniquePtr<SpaceBitmap> space_bitmap(SpaceBitmap::Create("test bitmap",
-                                                          heap_begin, heap_capacity));
+  UniquePtr<ContinuousSpaceBitmap> space_bitmap(
+      ContinuousSpaceBitmap::Create("test bitmap", heap_begin, heap_capacity));
   EXPECT_TRUE(space_bitmap.get() != NULL);
 }
 
 class BitmapVerify {
  public:
-  BitmapVerify(SpaceBitmap* bitmap, const mirror::Object* begin, const mirror::Object* end)
+  BitmapVerify(ContinuousSpaceBitmap* bitmap, const mirror::Object* begin,
+               const mirror::Object* end)
     : bitmap_(bitmap),
       begin_(begin),
       end_(end) {}
@@ -50,7 +51,7 @@
     EXPECT_EQ(bitmap_->Test(obj), ((reinterpret_cast<uintptr_t>(obj) & 0xF) != 0));
   }
 
-  SpaceBitmap* bitmap_;
+  ContinuousSpaceBitmap* bitmap_;
   const mirror::Object* begin_;
   const mirror::Object* end_;
 };
@@ -59,14 +60,14 @@
   byte* heap_begin = reinterpret_cast<byte*>(0x10000000);
   size_t heap_capacity = 16 * MB;
 
-  UniquePtr<SpaceBitmap> space_bitmap(SpaceBitmap::Create("test bitmap",
-                                                          heap_begin, heap_capacity));
+  UniquePtr<ContinuousSpaceBitmap> space_bitmap(
+      ContinuousSpaceBitmap::Create("test bitmap", heap_begin, heap_capacity));
   EXPECT_TRUE(space_bitmap.get() != NULL);
 
   // Set all the odd bits in the first BitsPerWord * 3 to one.
   for (size_t j = 0; j < kBitsPerWord * 3; ++j) {
     const mirror::Object* obj =
-        reinterpret_cast<mirror::Object*>(heap_begin + j * SpaceBitmap::kAlignment);
+        reinterpret_cast<mirror::Object*>(heap_begin + j * kObjectAlignment);
     if (reinterpret_cast<uintptr_t>(obj) & 0xF) {
       space_bitmap->Set(obj);
     }
@@ -77,10 +78,10 @@
   // words.
   for (size_t i = 0; i < static_cast<size_t>(kBitsPerWord); ++i) {
     mirror::Object* start =
-        reinterpret_cast<mirror::Object*>(heap_begin + i * SpaceBitmap::kAlignment);
+        reinterpret_cast<mirror::Object*>(heap_begin + i * kObjectAlignment);
     for (size_t j = 0; j < static_cast<size_t>(kBitsPerWord * 2); ++j) {
       mirror::Object* end =
-          reinterpret_cast<mirror::Object*>(heap_begin + (i + j) * SpaceBitmap::kAlignment);
+          reinterpret_cast<mirror::Object*>(heap_begin + (i + j) * kObjectAlignment);
       BitmapVerify(space_bitmap.get(), start, end);
     }
   }
@@ -109,7 +110,8 @@
   uint32_t val_;
 };
 
-void compat_test() NO_THREAD_SAFETY_ANALYSIS {
+template <size_t kAlignment>
+void RunTest() NO_THREAD_SAFETY_ANALYSIS {
   byte* heap_begin = reinterpret_cast<byte*>(0x10000000);
   size_t heap_capacity = 16 * MB;
 
@@ -118,11 +120,11 @@
 
 
   for (int i = 0; i < 5 ; ++i) {
-    UniquePtr<SpaceBitmap> space_bitmap(SpaceBitmap::Create("test bitmap",
-                                                            heap_begin, heap_capacity));
+    UniquePtr<ContinuousSpaceBitmap> space_bitmap(
+        ContinuousSpaceBitmap::Create("test bitmap", heap_begin, heap_capacity));
 
     for (int j = 0; j < 10000; ++j) {
-      size_t offset = (r.next() % heap_capacity) & ~(0x7);
+      size_t offset = RoundDown(r.next() % heap_capacity, kAlignment);
       bool set = r.next() % 2 == 1;
 
       if (set) {
@@ -136,15 +138,15 @@
       size_t count = 0;
       SimpleCounter c(&count);
 
-      size_t offset = (r.next() % heap_capacity) & ~(0x7);
+      size_t offset = RoundDown(r.next() % heap_capacity, kAlignment);
       size_t remain = heap_capacity - offset;
-      size_t end = offset + ((r.next() % (remain + 1)) & ~(0x7));
+      size_t end = offset + RoundDown(r.next() % (remain + 1), kAlignment);
 
       space_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(heap_begin) + offset,
                                      reinterpret_cast<uintptr_t>(heap_begin) + end, c);
 
       size_t manual = 0;
-      for (uintptr_t k = offset; k < end; k += kObjectAlignment) {
+      for (uintptr_t k = offset; k < end; k += kAlignment) {
         if (space_bitmap->Test(reinterpret_cast<mirror::Object*>(heap_begin + k))) {
           manual++;
         }
@@ -155,8 +157,12 @@
   }
 }
 
-TEST_F(SpaceBitmapTest, Visitor) {
-  compat_test();
+TEST_F(SpaceBitmapTest, VisitorObjectAlignment) {
+  RunTest<kObjectAlignment>();
+}
+
+TEST_F(SpaceBitmapTest, VisitorPageAlignment) {
+  RunTest<kPageSize>();
 }
 
 }  // namespace accounting
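
Replacing the hard-coded "& ~(0x7)" with RoundDown(x, kAlignment) is what lets the same test body run at both object and page alignment. A small sketch of the equivalence (division-based for clarity; ART's RoundDown in utils.h may be implemented differently):

    #include <cassert>
    #include <cstddef>

    template <size_t kAlignment>
    size_t RoundDown(size_t x) {
      return (x / kAlignment) * kAlignment;  // round x down to a multiple
    }

    int main() {
      assert(RoundDown<8>(519) == (519 & ~size_t{0x7}));  // 512 either way
      assert(RoundDown<4096>(8191) == 4096);              // page alignment
      return 0;
    }
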
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 920741f..cbefa6a 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -2005,6 +2005,61 @@
   }
 }
 
+size_t RosAlloc::ReleasePages() {
+  VLOG(heap) << "RosAlloc::ReleasePages()";
+  DCHECK(!DoesReleaseAllPages());
+  Thread* self = Thread::Current();
+  size_t reclaimed_bytes = 0;
+  size_t i = 0;
+  while (true) {
+    MutexLock mu(self, lock_);
+    // Check the page map size which might have changed due to grow/shrink.
+    size_t pm_end = page_map_size_;
+    if (i >= pm_end) {
+      // Reached the end.
+      break;
+    }
+    byte pm = page_map_[i];
+    switch (pm) {
+      case kPageMapEmpty: {
+        // The start of a free page run. Release pages.
+        FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
+        DCHECK(free_page_runs_.find(fpr) != free_page_runs_.end());
+        size_t fpr_size = fpr->ByteSize(this);
+        DCHECK(IsAligned<kPageSize>(fpr_size));
+        byte* start = reinterpret_cast<byte*>(fpr);
+        if (kIsDebugBuild) {
+          // In the debug build, the first page of a free page run
+          // contains a magic number for debugging. Exclude it.
+          start = reinterpret_cast<byte*>(fpr) + kPageSize;
+        }
+        byte* end = reinterpret_cast<byte*>(fpr) + fpr_size;
+        CHECK_EQ(madvise(start, end - start, MADV_DONTNEED), 0);
+        reclaimed_bytes += fpr_size;
+        size_t num_pages = fpr_size / kPageSize;
+        if (kIsDebugBuild) {
+          for (size_t j = i + 1; j < i + num_pages; ++j) {
+            DCHECK_EQ(page_map_[j], kPageMapEmpty);
+          }
+        }
+        i += num_pages;
+        DCHECK_LE(i, pm_end);
+        break;
+      }
+      case kPageMapLargeObject:      // Fall through.
+      case kPageMapLargeObjectPart:  // Fall through.
+      case kPageMapRun:              // Fall through.
+      case kPageMapRunPart:          // Fall through.
+        ++i;
+        break;  // Skip.
+      default:
+        LOG(FATAL) << "Unreachable - page map type: " << pm;
+        break;
+    }
+  }
+  return reclaimed_bytes;
+}
+
 }  // namespace allocator
 }  // namespace gc
 }  // namespace art
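
ReleasePages() returns physical memory without unmapping: MADV_DONTNEED leaves the virtual range intact, and (for private anonymous mappings on Linux) subsequent faults observe zero-filled pages. A standalone sketch of that behavior, assuming Linux semantics:

    #include <sys/mman.h>
    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t kSize = 16 * 4096;
      void* mem = mmap(nullptr, kSize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      assert(mem != MAP_FAILED);
      char* base = static_cast<char*>(mem);
      base[0] = 42;                                  // dirty the first page
      int rc = madvise(base, kSize, MADV_DONTNEED);  // give pages back to the kernel
      assert(rc == 0);
      std::printf("%d\n", base[0]);                  // 0: refault yields a zeroed page
      munmap(base, kSize);
      return 0;
    }

Note that in debug builds the code above deliberately excludes the first page of a free page run, so the magic number stored there survives the madvise.
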
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 0b4b189..5d9d75c 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -539,6 +539,8 @@
   void InspectAll(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg),
                   void* arg)
       LOCKS_EXCLUDED(lock_);
+  // Release empty pages.
+  size_t ReleasePages() LOCKS_EXCLUDED(lock_);
   // Returns the current footprint.
   size_t Footprint() LOCKS_EXCLUDED(lock_);
   // Returns the current capacity, maximum footprint.
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 07951e0..6380cba 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -174,8 +174,8 @@
     if (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyAlwaysCollect ||
         (gc_type == kGcTypeFull &&
          space->GetGcRetentionPolicy() == space::kGcRetentionPolicyFullCollect)) {
-      accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
-      accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+      accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+      accounting::ContinuousSpaceBitmap* mark_bitmap = space->GetMarkBitmap();
       if (live_bitmap != nullptr && live_bitmap != mark_bitmap) {
         heap_->GetLiveBitmap()->ReplaceBitmap(live_bitmap, mark_bitmap);
         heap_->GetMarkBitmap()->ReplaceBitmap(mark_bitmap, live_bitmap);
@@ -185,12 +185,12 @@
     }
   }
   for (const auto& disc_space : GetHeap()->GetDiscontinuousSpaces()) {
-    space::LargeObjectSpace* space = down_cast<space::LargeObjectSpace*>(disc_space);
-    accounting::ObjectSet* live_set = space->GetLiveObjects();
-    accounting::ObjectSet* mark_set = space->GetMarkObjects();
-    heap_->GetLiveBitmap()->ReplaceObjectSet(live_set, mark_set);
-    heap_->GetMarkBitmap()->ReplaceObjectSet(mark_set, live_set);
-    down_cast<space::LargeObjectSpace*>(space)->SwapBitmaps();
+    space::LargeObjectSpace* space = disc_space->AsLargeObjectSpace();
+    accounting::LargeObjectBitmap* live_set = space->GetLiveBitmap();
+    accounting::LargeObjectBitmap* mark_set = space->GetMarkBitmap();
+    heap_->GetLiveBitmap()->ReplaceLargeObjectBitmap(live_set, mark_set);
+    heap_->GetMarkBitmap()->ReplaceLargeObjectBitmap(mark_set, live_set);
+    space->SwapBitmaps();
   }
 }
 
@@ -201,7 +201,15 @@
 
 uint64_t GarbageCollector::GetEstimatedLastIterationThroughput() const {
   // Add 1ms to prevent possible division by 0.
-  return (freed_bytes_ * 1000) / (NsToMs(GetDurationNs()) + 1);
+  return (static_cast<uint64_t>(freed_bytes_) * 1000) / (NsToMs(GetDurationNs()) + 1);
+}
+
+void GarbageCollector::ResetMeasurements() {
+  cumulative_timings_.Reset();
+  pause_histogram_.Reset();
+  total_time_ns_ = 0;
+  total_freed_objects_ = 0;
+  total_freed_bytes_ = 0;
 }
 
 }  // namespace collector
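
The new static_cast<uint64_t> in GetEstimatedLastIterationThroughput() matters on 32-bit targets, where freed_bytes_ * 1000 wraps once roughly 4 MB has been freed in an iteration. A sketch of the failure mode using explicit 32-bit arithmetic (values are illustrative):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t freed_bytes = 8u * 1024 * 1024;  // 8 MB freed this iteration
      uint64_t duration_ms = 100;

      // 32-bit multiply wraps at 2^32 before any widening happens.
      uint64_t wrong = static_cast<uint64_t>(freed_bytes * 1000u) / (duration_ms + 1);
      // Widening first keeps the full product.
      uint64_t right = (static_cast<uint64_t>(freed_bytes) * 1000) / (duration_ms + 1);

      std::printf("wrong=%llu right=%llu\n",
                  (unsigned long long)wrong, (unsigned long long)right);
      return 0;
    }
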
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 5b7b8a2..b19ac3f 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -110,6 +110,9 @@
     return pause_histogram_;
   }
 
+  // Reset the cumulative timings and pause histogram.
+  void ResetMeasurements();
+
   // Returns the estimated throughput in bytes / second.
   uint64_t GetEstimatedMeanThroughput() const;
 
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index ca2d0bd..8af4fd8 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -76,7 +76,7 @@
 
 // Turn off kCheckLocks when profiling the GC since it slows the GC down by up to 40%.
 static constexpr bool kCheckLocks = kDebugLocking;
-static constexpr bool kVerifyRoots = kIsDebugBuild;
+static constexpr bool kVerifyRootsMarked = kIsDebugBuild;
 
 // If true, revoke the rosalloc thread-local buffers at the
 // checkpoint, as opposed to during the pause.
@@ -99,7 +99,6 @@
                        name_prefix +
                        (is_concurrent ? "concurrent mark sweep": "mark sweep")),
       gc_barrier_(new Barrier(0)),
-      large_object_lock_("mark sweep large object lock", kMarkSweepLargeObjectLock),
       mark_stack_lock_("mark sweep mark stack lock", kMarkSweepMarkStackLock),
       is_concurrent_(is_concurrent) {
 }
@@ -123,13 +122,15 @@
   mark_immune_count_ = 0;
   mark_fastpath_count_ = 0;
   mark_slowpath_count_ = 0;
-  FindDefaultSpaceBitmap();
   {
     // TODO: I don't think we should need heap bitmap lock to get the mark bitmap.
     ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
     mark_bitmap_ = heap_->GetMarkBitmap();
   }
-
+  if (!clear_soft_references_) {
+    // Always clear soft references if this is a non-sticky collection.
+    clear_soft_references_ = GetGcType() != collector::kGcTypeSticky;
+  }
   // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
   heap_->PreGcVerification(this);
@@ -290,15 +291,21 @@
 void MarkSweep::FindDefaultSpaceBitmap() {
   TimingLogger::ScopedSplit split("FindDefaultMarkBitmap", &timings_);
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
-    accounting::SpaceBitmap* bitmap = space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* bitmap = space->GetMarkBitmap();
+    // We want the main space's bitmap instead of the non-moving space's if possible.
     if (bitmap != nullptr &&
         space->GetGcRetentionPolicy() == space::kGcRetentionPolicyAlwaysCollect) {
       current_space_bitmap_ = bitmap;
-      return;
+      // If this is not the non-moving space, exit the loop early since this bitmap is good enough.
+      if (space != heap_->GetNonMovingSpace()) {
+        break;
+      }
     }
   }
-  GetHeap()->DumpSpaces();
-  LOG(FATAL) << "Could not find a default mark bitmap";
+  if (current_space_bitmap_ == nullptr) {
+    heap_->DumpSpaces();
+    LOG(FATAL) << "Could not find a default mark bitmap";
+  }
 }
 
 void MarkSweep::ExpandMarkStack() {
@@ -320,7 +327,7 @@
 }
 
 inline void MarkSweep::MarkObjectNonNullParallel(Object* obj) {
-  DCHECK(obj != NULL);
+  DCHECK(obj != nullptr);
   if (MarkObjectParallel(obj)) {
     MutexLock mu(Thread::Current(), mark_stack_lock_);
     if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) {
@@ -341,6 +348,31 @@
   reinterpret_cast<MarkSweep*>(arg)->MarkObject(ref->AsMirrorPtr());
 }
 
+class MarkSweepMarkObjectSlowPath {
+ public:
+  explicit MarkSweepMarkObjectSlowPath(MarkSweep* mark_sweep) : mark_sweep_(mark_sweep) {
+  }
+
+  void operator()(const Object* obj) const ALWAYS_INLINE {
+    if (kProfileLargeObjects) {
+      // TODO: Differentiate between marking and testing somehow.
+      ++mark_sweep_->large_object_test_;
+      ++mark_sweep_->large_object_mark_;
+    }
+    space::LargeObjectSpace* large_object_space = mark_sweep_->GetHeap()->GetLargeObjectsSpace();
+    if (UNLIKELY(!IsAligned<kPageSize>(obj) ||
+                 (kIsDebugBuild && !large_object_space->Contains(obj)))) {
+      LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
+      LOG(ERROR) << "Attempting see if it's a bad root";
+      mark_sweep_->VerifyRoots();
+      LOG(FATAL) << "Can't mark invalid object";
+    }
+  }
+
+ private:
+  MarkSweep* const mark_sweep_;
+};
+
 inline void MarkSweep::MarkObjectNonNull(Object* obj) {
   DCHECK(obj != nullptr);
   if (kUseBakerOrBrooksReadBarrier) {
@@ -351,27 +383,24 @@
     if (kCountMarkedObjects) {
       ++mark_immune_count_;
     }
-    DCHECK(IsMarked(obj));
-    return;
-  }
-  // Try to take advantage of locality of references within a space, failing this find the space
-  // the hard way.
-  accounting::SpaceBitmap* object_bitmap = current_space_bitmap_;
-  if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
-    object_bitmap = mark_bitmap_->GetContinuousSpaceBitmap(obj);
+    DCHECK(mark_bitmap_->Test(obj));
+  } else if (LIKELY(current_space_bitmap_->HasAddress(obj))) {
+    if (kCountMarkedObjects) {
+      ++mark_fastpath_count_;
+    }
+    if (UNLIKELY(!current_space_bitmap_->Set(obj))) {
+      PushOnMarkStack(obj);  // This object was not previously marked.
+    }
+  } else {
     if (kCountMarkedObjects) {
       ++mark_slowpath_count_;
     }
-    if (UNLIKELY(object_bitmap == nullptr)) {
-      MarkLargeObject(obj, true);
-      return;
+    MarkSweepMarkObjectSlowPath visitor(this);
+    // TODO: We already know that the object is not in the current_space_bitmap_ but MarkBitmap::Set
+    // will check again.
+    if (!mark_bitmap_->Set(obj, visitor)) {
+      PushOnMarkStack(obj);  // Was not already marked, push.
     }
-  } else if (kCountMarkedObjects) {
-    ++mark_fastpath_count_;
-  }
-  // This object was not previously marked.
-  if (!object_bitmap->Set(obj)) {
-    PushOnMarkStack(obj);
   }
 }
 
@@ -385,34 +414,6 @@
   mark_stack_->PushBack(obj);
 }
 
-// Rare case, probably not worth inlining since it will increase instruction cache miss rate.
-bool MarkSweep::MarkLargeObject(const Object* obj, bool set) {
-  // TODO: support >1 discontinuous space.
-  space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
-  accounting::ObjectSet* large_objects = large_object_space->GetMarkObjects();
-  if (kProfileLargeObjects) {
-    ++large_object_test_;
-  }
-  if (UNLIKELY(!large_objects->Test(obj))) {
-    if (!large_object_space->Contains(obj)) {
-      LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
-      LOG(ERROR) << "Attempting see if it's a bad root";
-      VerifyRoots();
-      LOG(FATAL) << "Can't mark bad root";
-    }
-    if (kProfileLargeObjects) {
-      ++large_object_mark_;
-    }
-    if (set) {
-      large_objects->Set(obj);
-    } else {
-      large_objects->Clear(obj);
-    }
-    return true;
-  }
-  return false;
-}
-
 inline bool MarkSweep::MarkObjectParallel(const Object* obj) {
   DCHECK(obj != nullptr);
   if (kUseBakerOrBrooksReadBarrier) {
@@ -425,20 +426,12 @@
   }
   // Try to take advantage of locality of references within a space, failing this find the space
   // the hard way.
-  accounting::SpaceBitmap* object_bitmap = current_space_bitmap_;
-  if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
-    accounting::SpaceBitmap* new_bitmap = mark_bitmap_->GetContinuousSpaceBitmap(obj);
-    if (new_bitmap != NULL) {
-      object_bitmap = new_bitmap;
-    } else {
-      // TODO: Remove the Thread::Current here?
-      // TODO: Convert this to some kind of atomic marking?
-      MutexLock mu(Thread::Current(), large_object_lock_);
-      return MarkLargeObject(obj, true);
-    }
+  accounting::ContinuousSpaceBitmap* object_bitmap = current_space_bitmap_;
+  if (LIKELY(object_bitmap->HasAddress(obj))) {
+    return !object_bitmap->AtomicTestAndSet(obj);
   }
-  // Return true if the object was not previously marked.
-  return !object_bitmap->AtomicTestAndSet(obj);
+  MarkSweepMarkObjectSlowPath visitor(this);
+  return !mark_bitmap_->AtomicTestAndSet(obj, visitor);
 }
 
 // Used to mark objects when processing the mark stack. If an object is null, it is not marked.
@@ -466,16 +459,17 @@
 }
 
 void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
-                                   const StackVisitor* visitor) {
-  reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor);
+                                   const StackVisitor* visitor, RootType root_type) {
+  reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor, root_type);
 }
 
-void MarkSweep::VerifyRoot(const Object* root, size_t vreg, const StackVisitor* visitor) {
+void MarkSweep::VerifyRoot(const Object* root, size_t vreg, const StackVisitor* visitor,
+                           RootType root_type) {
   // See if the root is on any space bitmap.
-  if (GetHeap()->GetLiveBitmap()->GetContinuousSpaceBitmap(root) == NULL) {
+  if (heap_->GetLiveBitmap()->GetContinuousSpaceBitmap(root) == nullptr) {
     space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
     if (!large_object_space->Contains(root)) {
-      LOG(ERROR) << "Found invalid root: " << root;
+      LOG(ERROR) << "Found invalid root: " << root << " with type " << root_type;
       if (visitor != NULL) {
         LOG(ERROR) << visitor->DescribeLocation() << " in VReg: " << vreg;
       }
@@ -682,7 +676,8 @@
 
 class CardScanTask : public MarkStackTask<false> {
  public:
-  CardScanTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, accounting::SpaceBitmap* bitmap,
+  CardScanTask(ThreadPool* thread_pool, MarkSweep* mark_sweep,
+               accounting::ContinuousSpaceBitmap* bitmap,
                byte* begin, byte* end, byte minimum_age, size_t mark_stack_size,
                Object** mark_stack_obj)
       : MarkStackTask<false>(thread_pool, mark_sweep, mark_stack_size, mark_stack_obj),
@@ -693,7 +688,7 @@
   }
 
  protected:
-  accounting::SpaceBitmap* const bitmap_;
+  accounting::ContinuousSpaceBitmap* const bitmap_;
   byte* const begin_;
   byte* const end_;
   const byte minimum_age_;
@@ -715,7 +710,7 @@
 
 size_t MarkSweep::GetThreadCount(bool paused) const {
   if (heap_->GetThreadPool() == nullptr || !heap_->CareAboutPauseTimes()) {
-    return 0;
+    return 1;
   }
   if (paused) {
     return heap_->GetParallelGCThreadCount() + 1;
@@ -729,7 +724,7 @@
   ThreadPool* thread_pool = GetHeap()->GetThreadPool();
   size_t thread_count = GetThreadCount(paused);
   // The parallel version with only one thread is faster for card scanning, TODO: fix.
-  if (kParallelCardScan && thread_count > 0) {
+  if (kParallelCardScan && thread_count > 1) {
     Thread* self = Thread::Current();
     // Can't have a different split for each space since multiple spaces can have their cards being
     // scanned at the same time.
@@ -816,7 +811,7 @@
 class RecursiveMarkTask : public MarkStackTask<false> {
  public:
   RecursiveMarkTask(ThreadPool* thread_pool, MarkSweep* mark_sweep,
-                    accounting::SpaceBitmap* bitmap, uintptr_t begin, uintptr_t end)
+                    accounting::ContinuousSpaceBitmap* bitmap, uintptr_t begin, uintptr_t end)
       : MarkStackTask<false>(thread_pool, mark_sweep, 0, NULL),
         bitmap_(bitmap),
         begin_(begin),
@@ -824,7 +819,7 @@
   }
 
  protected:
-  accounting::SpaceBitmap* const bitmap_;
+  accounting::ContinuousSpaceBitmap* const bitmap_;
   const uintptr_t begin_;
   const uintptr_t end_;
 
@@ -918,7 +913,7 @@
                                                           kVisitRootFlagStopLoggingNewRoots |
                                                           kVisitRootFlagClearRootLog));
   timings_.EndSplit();
-  if (kVerifyRoots) {
+  if (kVerifyRootsMarked) {
     timings_.StartSplit("(Paused)VerifyRoots");
     Runtime::Current()->VisitRoots(VerifyRootMarked, this);
     timings_.EndSplit();
@@ -940,14 +935,11 @@
 
 void MarkSweep::VerifyIsLive(const Object* obj) {
   if (!heap_->GetLiveBitmap()->Test(obj)) {
-    space::LargeObjectSpace* large_object_space = heap_->GetLargeObjectsSpace();
-    if (!large_object_space->GetLiveObjects()->Test(obj)) {
-      if (std::find(heap_->allocation_stack_->Begin(), heap_->allocation_stack_->End(), obj) ==
-          heap_->allocation_stack_->End()) {
-        // Object not found!
-        heap_->DumpSpaces();
-        LOG(FATAL) << "Found dead object " << obj;
-      }
+    if (std::find(heap_->allocation_stack_->Begin(), heap_->allocation_stack_->End(), obj) ==
+        heap_->allocation_stack_->End()) {
+      // Object not found!
+      heap_->DumpSpaces();
+      LOG(FATAL) << "Found dead object " << obj;
     }
   }
 }
@@ -1041,8 +1033,8 @@
   // Start by sweeping the continuous spaces.
   for (space::ContinuousSpace* space : sweep_spaces) {
     space::AllocSpace* alloc_space = space->AsAllocSpace();
-    accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
-    accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = space->GetMarkBitmap();
     if (swap_bitmaps) {
       std::swap(live_bitmap, mark_bitmap);
     }
@@ -1082,8 +1074,8 @@
   }
   // Handle the large object space.
   space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
-  accounting::ObjectSet* large_live_objects = large_object_space->GetLiveObjects();
-  accounting::ObjectSet* large_mark_objects = large_object_space->GetMarkObjects();
+  accounting::LargeObjectBitmap* large_live_objects = large_object_space->GetLiveBitmap();
+  accounting::LargeObjectBitmap* large_mark_objects = large_object_space->GetMarkBitmap();
   if (swap_bitmaps) {
     std::swap(large_live_objects, large_mark_objects);
   }
@@ -1127,7 +1119,6 @@
   timings_.EndSplit();
 
   DCHECK(mark_stack_->IsEmpty());
-  TimingLogger::ScopedSplit("Sweep", &timings_);
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsContinuousMemMapAllocSpace()) {
       space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace();
@@ -1145,13 +1136,13 @@
 }
 
 void MarkSweep::SweepLargeObjects(bool swap_bitmaps) {
-  TimingLogger::ScopedSplit("SweepLargeObjects", &timings_);
+  TimingLogger::ScopedSplit split("SweepLargeObjects", &timings_);
   size_t freed_objects = 0;
   size_t freed_bytes = 0;
-  GetHeap()->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes);
+  heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes);
   freed_large_objects_.FetchAndAdd(freed_objects);
   freed_large_object_bytes_.FetchAndAdd(freed_bytes);
-  GetHeap()->RecordFree(freed_objects, freed_bytes);
+  heap_->RecordFree(freed_objects, freed_bytes);
 }
 
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index f1fd546..41a7764 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -22,6 +22,7 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "garbage_collector.h"
+#include "gc/accounting/space_bitmap.h"
 #include "immune_region.h"
 #include "object_callbacks.h"
 #include "offsets.h"
@@ -45,7 +46,6 @@
 namespace accounting {
   template<typename T> class AtomicStack;
   typedef AtomicStack<mirror::Object*> ObjectStack;
-  class SpaceBitmap;
 }  // namespace accounting
 
 namespace collector {
@@ -227,11 +227,6 @@
   // Marks an object atomically, safe to use from multiple threads.
   void MarkObjectNonNullParallel(mirror::Object* obj);
 
-  // Marks or unmarks a large object based on whether or not set is true. If set is true, then we
-  // mark, otherwise we unmark.
-  bool MarkLargeObject(const mirror::Object* obj, bool set)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) LOCKS_EXCLUDED(large_object_lock_);
-
   // Returns true if we need to add obj to a mark stack.
   bool MarkObjectParallel(const mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
 
@@ -249,10 +244,10 @@
   size_t GetThreadCount(bool paused) const;
 
   static void VerifyRootCallback(const mirror::Object* root, void* arg, size_t vreg,
-                                 const StackVisitor *visitor);
+                                 const StackVisitor *visitor, RootType root_type);
 
-  void VerifyRoot(const mirror::Object* root, size_t vreg, const StackVisitor* visitor)
-      NO_THREAD_SAFETY_ANALYSIS;
+  void VerifyRoot(const mirror::Object* root, size_t vreg, const StackVisitor* visitor,
+                  RootType root_type) NO_THREAD_SAFETY_ANALYSIS;
 
   // Push a single reference on a mark stack.
   void PushOnMarkStack(mirror::Object* obj);
@@ -283,7 +278,7 @@
 
   // Current space, we check this space first to avoid searching for the appropriate space for an
   // object.
-  accounting::SpaceBitmap* current_space_bitmap_;
+  accounting::ContinuousSpaceBitmap* current_space_bitmap_;
   // Cache the heap's mark bitmap to prevent having to do 2 loads during slow path marking.
   accounting::HeapBitmap* mark_bitmap_;
 
@@ -315,7 +310,6 @@
   size_t live_stack_freeze_size_;
 
   UniquePtr<Barrier> gc_barrier_;
-  Mutex large_object_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   Mutex mark_stack_lock_ ACQUIRED_AFTER(Locks::classlinker_classes_lock_);
 
   const bool is_concurrent_;
@@ -326,8 +320,6 @@
   friend class CheckBitmapVisitor;
   friend class CheckReferenceVisitor;
   friend class art::gc::Heap;
-  friend class InternTableEntryIsUnmarked;
-  friend class MarkIfReachesAllocspaceVisitor;
   friend class MarkObjectVisitor;
   friend class ModUnionCheckReferences;
   friend class ModUnionClearCardVisitor;
@@ -336,10 +328,9 @@
   friend class ModUnionTableBitmap;
   friend class ModUnionTableReferenceCache;
   friend class ModUnionScanImageRootVisitor;
-  friend class ScanBitmapVisitor;
-  friend class ScanImageRootVisitor;
   template<bool kUseFinger> friend class MarkStackTask;
   friend class FifoMarkStackChunk;
+  friend class MarkSweepMarkObjectSlowPath;
 
   DISALLOW_COPY_AND_ASSIGN(MarkSweep);
 };
diff --git a/runtime/gc/collector/semi_space-inl.h b/runtime/gc/collector/semi_space-inl.h
index df731ff..55140f6 100644
--- a/runtime/gc/collector/semi_space-inl.h
+++ b/runtime/gc/collector/semi_space-inl.h
@@ -26,9 +26,24 @@
 namespace gc {
 namespace collector {
 
+class BitmapSetSlowPathVisitor {
+ public:
+  explicit BitmapSetSlowPathVisitor(SemiSpace* semi_space) : semi_space_(semi_space) {
+  }
+
+  void operator()(const mirror::Object* obj) const {
+    CHECK(!semi_space_->to_space_->HasAddress(obj)) << "Marking " << obj << " in to_space_";
+    // Marking a large object, make sure it's aligned as a sanity check.
+    CHECK(IsAligned<kPageSize>(obj));
+  }
+
+ private:
+  SemiSpace* const semi_space_;
+};
+
 inline mirror::Object* SemiSpace::GetForwardingAddressInFromSpace(mirror::Object* obj) const {
   DCHECK(from_space_->HasAddress(obj));
-  LockWord lock_word = obj->GetLockWord();
+  LockWord lock_word = obj->GetLockWord(false);
   if (lock_word.GetState() != LockWord::kForwardingAddress) {
     return nullptr;
   }
@@ -53,37 +68,29 @@
     if (from_space_->HasAddress(obj)) {
       mirror::Object* forward_address = GetForwardingAddressInFromSpace(obj);
       // If the object has already been moved, return the new forward address.
-      if (forward_address == nullptr) {
+      if (UNLIKELY(forward_address == nullptr)) {
         forward_address = MarkNonForwardedObject(obj);
         DCHECK(forward_address != nullptr);
         // Make sure to only update the forwarding address AFTER you copy the object so that the
         // monitor word doesn't get stomped over.
-        obj->SetLockWord(LockWord::FromForwardingAddress(
-            reinterpret_cast<size_t>(forward_address)));
+        obj->SetLockWord(
+            LockWord::FromForwardingAddress(reinterpret_cast<size_t>(forward_address)), false);
         // Push the object onto the mark stack for later processing.
         MarkStackPush(forward_address);
       }
       obj_ptr->Assign(forward_address);
     } else {
-      accounting::SpaceBitmap* object_bitmap =
-          heap_->GetMarkBitmap()->GetContinuousSpaceBitmap(obj);
-      if (LIKELY(object_bitmap != nullptr)) {
-        if (generational_) {
-          // If a bump pointer space only collection, we should not
-          // reach here as we don't/won't mark the objects in the
-          // non-moving space (except for the promoted objects.)  Note
-          // the non-moving space is added to the immune space.
-          DCHECK(whole_heap_collection_);
-        }
-        if (!object_bitmap->Set(obj)) {
-          // This object was not previously marked.
-          MarkStackPush(obj);
-        }
-      } else {
-        CHECK(!to_space_->HasAddress(obj)) << "Marking " << obj << " in to_space_";
-        if (MarkLargeObject(obj)) {
-          MarkStackPush(obj);
-        }
+      BitmapSetSlowPathVisitor visitor(this);
+      if (kIsDebugBuild && mark_bitmap_->GetContinuousSpaceBitmap(obj) != nullptr) {
+        // If a bump pointer space only collection, we should not
+        // reach here as we don't/won't mark the objects in the
+        // non-moving space (except for the promoted objects.)  Note
+        // the non-moving space is added to the immune space.
+        DCHECK(!generational_ || whole_heap_collection_);
+      }
+      if (!mark_bitmap_->Set(obj, visitor)) {
+        // This object was not previously marked.
+        MarkStackPush(obj);
       }
     }
   }
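
GetForwardingAddressInFromSpace() reads back the forwarding pointer that MarkObject stores into the object's lock word once the object has been copied. A toy sketch of the forwarding-address-in-header trick (the tag encoding here is illustrative; ART's LockWord has its own state bits):

    #include <cassert>
    #include <cstdint>

    struct Header {
      uintptr_t word = 0;
      static constexpr uintptr_t kForwardTag = 1;

      bool IsForwarded() const { return (word & kForwardTag) != 0; }

      void SetForwardingAddress(uintptr_t addr) {
        assert((addr & kForwardTag) == 0);  // objects are at least 2-byte aligned
        word = addr | kForwardTag;
      }

      uintptr_t ForwardingAddress() const { return word & ~kForwardTag; }
    };

    int main() {
      Header h;
      assert(!h.IsForwarded());
      h.SetForwardingAddress(0x2000);  // object copied to its new location
      assert(h.IsForwarded() && h.ForwardingAddress() == 0x2000);
      return 0;
    }
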
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 1366858..b67bbb1 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -63,8 +63,9 @@
 namespace collector {
 
 static constexpr bool kProtectFromSpace = true;
-static constexpr bool kClearFromSpace = true;
 static constexpr bool kStoreStackTraces = false;
+static constexpr bool kUseBytesPromoted = true;
+static constexpr size_t kBytesPromotedThreshold = 4 * MB;
 
 void SemiSpace::BindBitmaps() {
   timings_.StartSplit("BindBitmaps");
@@ -102,8 +103,10 @@
       generational_(generational),
       last_gc_to_space_end_(nullptr),
       bytes_promoted_(0),
+      bytes_promoted_since_last_whole_heap_collection_(0),
       whole_heap_collection_(true),
-      whole_heap_collection_interval_counter_(0) {
+      whole_heap_collection_interval_counter_(0),
+      collector_name_(name_) {
 }
 
 void SemiSpace::InitializePhase() {
@@ -114,12 +117,20 @@
   immune_region_.Reset();
   is_large_object_space_immune_ = false;
   saved_bytes_ = 0;
+  bytes_moved_ = 0;
+  objects_moved_ = 0;
   self_ = Thread::Current();
   // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
   heap_->PreGcVerification(this);
+  CHECK(from_space_->CanMoveObjects()) << "Attempting to move from " << *from_space_;
   // Set the initial bitmap.
   to_space_live_bitmap_ = to_space_->GetLiveBitmap();
+  {
+    // TODO: I don't think we should need heap bitmap lock to get the mark bitmap.
+    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+    mark_bitmap_ = heap_->GetMarkBitmap();
+  }
 }
 
 void SemiSpace::ProcessReferences(Thread* self) {
@@ -150,20 +161,34 @@
       // collection, collect the whole heap (and reset the interval
       // counter to be consistent.)
       whole_heap_collection_ = true;
-      whole_heap_collection_interval_counter_ = 0;
+      if (!kUseBytesPromoted) {
+        whole_heap_collection_interval_counter_ = 0;
+      }
     }
     if (whole_heap_collection_) {
       VLOG(heap) << "Whole heap collection";
+      name_ = collector_name_ + " whole";
     } else {
       VLOG(heap) << "Bump pointer space only collection";
+      name_ = collector_name_ + " bps";
     }
   }
+
+  if (!clear_soft_references_) {
+    if (!generational_) {
+      // If non-generational, always clear soft references.
+      clear_soft_references_ = true;
+    } else {
+      // If generational, clear soft references if a whole heap collection.
+      if (whole_heap_collection_) {
+        clear_soft_references_ = true;
+      }
+    }
+  }
+
   Locks::mutator_lock_->AssertExclusiveHeld(self_);
 
   TimingLogger::ScopedSplit split("MarkingPhase", &timings_);
-  // Need to do this with mutators paused so that somebody doesn't accidentally allocate into the
-  // wrong space.
-  heap_->SwapSemiSpaces();
   if (generational_) {
     // If last_gc_to_space_end_ is out of the bounds of the from-space
     // (the to-space from last GC), then point it to the beginning of
@@ -294,8 +319,8 @@
   accounting::ObjectStack* live_stack = heap_->GetLiveStack();
   heap_->MarkAllocStackAsLive(live_stack);
   live_stack->Reset();
-  timings_.EndSplit();
 
+  timings_.NewSplit("UpdateAndMarkRememberedSets");
   for (auto& space : heap_->GetContinuousSpaces()) {
     // If the space is immune and has no mod union table (the
     // non-moving space when the bump pointer space only collection is
@@ -315,7 +340,7 @@
           // remain in the space, that is, the remembered set (and the
           // card table) didn't miss any from-space references in the
           // space.
-          accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+          accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
           SemiSpaceVerifyNoFromSpaceReferencesObjectVisitor visitor(this);
           live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
                                         reinterpret_cast<uintptr_t>(space->End()),
@@ -323,7 +348,7 @@
         }
       } else {
         DCHECK(rem_set == nullptr);
-        accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+        accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
         SemiSpaceScanObjectVisitor visitor(this);
         live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
                                       reinterpret_cast<uintptr_t>(space->End()),
@@ -333,6 +358,7 @@
   }
 
   if (is_large_object_space_immune_) {
+    timings_.NewSplit("VisitLargeObjects");
     DCHECK(generational_ && !whole_heap_collection_);
     // Delay copying the live set to the marked set until here from
     // BindBitmaps() as the large objects on the allocation stack may
@@ -344,13 +370,13 @@
     // classes (primitive array classes) that could move though they
     // don't contain any other references.
     space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
-    accounting::ObjectSet* large_live_objects = large_object_space->GetLiveObjects();
+    accounting::LargeObjectBitmap* large_live_bitmap = large_object_space->GetLiveBitmap();
     SemiSpaceScanObjectVisitor visitor(this);
-    for (const Object* obj : large_live_objects->GetObjects()) {
-      visitor(const_cast<Object*>(obj));
-    }
+    large_live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(large_object_space->Begin()),
+                                        reinterpret_cast<uintptr_t>(large_object_space->End()),
+                                        visitor);
   }
-
+  timings_.EndSplit();
   // Recursively process the mark stack.
   ProcessMarkStack();
 }
@@ -364,9 +390,9 @@
   }
   // Record freed memory.
   uint64_t from_bytes = from_space_->GetBytesAllocated();
-  uint64_t to_bytes = to_space_->GetBytesAllocated();
+  uint64_t to_bytes = bytes_moved_;
   uint64_t from_objects = from_space_->GetObjectsAllocated();
-  uint64_t to_objects = to_space_->GetObjectsAllocated();
+  uint64_t to_objects = objects_moved_;
   CHECK_LE(to_objects, from_objects);
   int64_t freed_bytes = from_bytes - to_bytes;
   int64_t freed_objects = from_objects - to_objects;
@@ -375,10 +401,10 @@
   // Note: Freed bytes can be negative if we copy form a compacted space to a free-list backed
   // space.
   heap_->RecordFree(freed_objects, freed_bytes);
+
   timings_.StartSplit("PreSweepingGcVerification");
   heap_->PreSweepingGcVerification(this);
   timings_.EndSplit();
-
   {
     WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
     // Reclaim unmarked objects.
@@ -393,11 +419,9 @@
     TimingLogger::ScopedSplit split("UnBindBitmaps", &timings_);
     GetHeap()->UnBindBitmaps();
   }
-  if (kClearFromSpace) {
-    // Release the memory used by the from space.
-    from_space_->Clear();
-  }
-  from_space_->Reset();
+  // TODO: Do this before doing verification since the from space may have objects which weren't
+  // moved and point to dead objects.
+  from_space_->Clear();
   // Protect the from space.
   VLOG(heap) << "Protecting space " << *from_space_;
   if (kProtectFromSpace) {
@@ -434,19 +458,6 @@
   mark_stack_->PushBack(obj);
 }
 
-// Rare case, probably not worth inlining since it will increase instruction cache miss rate.
-bool SemiSpace::MarkLargeObject(const Object* obj) {
-  // TODO: support >1 discontinuous space.
-  space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
-  DCHECK(large_object_space->Contains(obj));
-  accounting::ObjectSet* large_objects = large_object_space->GetMarkObjects();
-  if (UNLIKELY(!large_objects->Test(obj))) {
-    large_objects->Set(obj);
-    return true;
-  }
-  return false;
-}
-
 static inline size_t CopyAvoidingDirtyingPages(void* dest, const void* src, size_t size) {
   if (LIKELY(size <= static_cast<size_t>(kPageSize))) {
     // We will dirty the current page and somewhere in the middle of the next page. This means
@@ -505,23 +516,21 @@
     // If it's allocated before the last GC (older), move
     // (pseudo-promote) it to the main free list space (as sort
     // of an old generation.)
-    size_t bytes_promoted;
     space::MallocSpace* promo_dest_space = GetHeap()->GetPrimaryFreeListSpace();
-    forward_address = promo_dest_space->Alloc(self_, object_size, &bytes_promoted, nullptr);
-    if (forward_address == nullptr) {
+    forward_address = promo_dest_space->Alloc(self_, object_size, &bytes_allocated, nullptr);
+    if (UNLIKELY(forward_address == nullptr)) {
       // If out of space, fall back to the to-space.
       forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
     } else {
-      GetHeap()->num_bytes_allocated_.FetchAndAdd(bytes_promoted);
-      bytes_promoted_ += bytes_promoted;
+      bytes_promoted_ += bytes_allocated;
       // Dirty the card at the destination as it may contain
       // references (including the class pointer) to the bump pointer
       // space.
       GetHeap()->WriteBarrierEveryFieldOf(forward_address);
       // Handle the bitmaps marking.
-      accounting::SpaceBitmap* live_bitmap = promo_dest_space->GetLiveBitmap();
+      accounting::ContinuousSpaceBitmap* live_bitmap = promo_dest_space->GetLiveBitmap();
       DCHECK(live_bitmap != nullptr);
-      accounting::SpaceBitmap* mark_bitmap = promo_dest_space->GetMarkBitmap();
+      accounting::ContinuousSpaceBitmap* mark_bitmap = promo_dest_space->GetMarkBitmap();
       DCHECK(mark_bitmap != nullptr);
       DCHECK(!live_bitmap->Test(forward_address));
       if (!whole_heap_collection_) {
@@ -557,6 +566,8 @@
     // If it's allocated after the last GC (younger), copy it to the to-space.
     forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
   }
+  ++objects_moved_;
+  bytes_moved_ += bytes_allocated;
   // Copy over the object and add it to the mark stack since we still need to update its
   // references.
   saved_bytes_ +=
@@ -603,10 +614,9 @@
 
 // Marks all objects in the root set.
 void SemiSpace::MarkRoots() {
-  timings_.StartSplit("MarkRoots");
+  timings_.NewSplit("MarkRoots");
   // TODO: Visit up image roots as well?
   Runtime::Current()->VisitRoots(MarkRootCallback, this);
-  timings_.EndSplit();
 }
 
 mirror::Object* SemiSpace::MarkedForwardingAddressCallback(mirror::Object* object, void* arg) {
@@ -625,7 +635,7 @@
 
 void SemiSpace::Sweep(bool swap_bitmaps) {
   DCHECK(mark_stack_->IsEmpty());
-  TimingLogger::ScopedSplit("Sweep", &timings_);
+  TimingLogger::ScopedSplit split("Sweep", &timings_);
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsContinuousMemMapAllocSpace()) {
       space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace();
@@ -649,13 +659,13 @@
 
 void SemiSpace::SweepLargeObjects(bool swap_bitmaps) {
   DCHECK(!is_large_object_space_immune_);
-  TimingLogger::ScopedSplit("SweepLargeObjects", &timings_);
+  TimingLogger::ScopedSplit split("SweepLargeObjects", &timings_);
   size_t freed_objects = 0;
   size_t freed_bytes = 0;
-  GetHeap()->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes);
+  heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes);
   freed_large_objects_.FetchAndAdd(freed_objects);
   freed_large_object_bytes_.FetchAndAdd(freed_bytes);
-  GetHeap()->RecordFree(freed_objects, freed_bytes);
+  heap_->RecordFree(freed_objects, freed_bytes);
 }
 
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
@@ -694,8 +704,8 @@
 
 // Scan anything that's on the mark stack.
 void SemiSpace::ProcessMarkStack() {
-  space::MallocSpace* promo_dest_space = NULL;
-  accounting::SpaceBitmap* live_bitmap = NULL;
+  space::MallocSpace* promo_dest_space = nullptr;
+  accounting::ContinuousSpaceBitmap* live_bitmap = nullptr;
   if (generational_ && !whole_heap_collection_) {
     // If a bump pointer space only collection (and the promotion is
     // enabled,) we delay the live-bitmap marking of promoted objects
@@ -703,7 +713,7 @@
     promo_dest_space = GetHeap()->GetPrimaryFreeListSpace();
     live_bitmap = promo_dest_space->GetLiveBitmap();
     DCHECK(live_bitmap != nullptr);
-    accounting::SpaceBitmap* mark_bitmap = promo_dest_space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = promo_dest_space->GetMarkBitmap();
     DCHECK(mark_bitmap != nullptr);
     DCHECK_EQ(live_bitmap, mark_bitmap);
   }
@@ -762,18 +772,34 @@
   if (generational_) {
     // Decide whether to do a whole heap collection or a bump pointer
     // only space collection at the next collection by updating
-    // whole_heap_collection. Enable whole_heap_collection once every
-    // kDefaultWholeHeapCollectionInterval collections.
+    // whole_heap_collection.
     if (!whole_heap_collection_) {
-      --whole_heap_collection_interval_counter_;
-      DCHECK_GE(whole_heap_collection_interval_counter_, 0);
-      if (whole_heap_collection_interval_counter_ == 0) {
-        whole_heap_collection_ = true;
+      if (!kUseBytesPromoted) {
+        // Enable whole_heap_collection once every
+        // kDefaultWholeHeapCollectionInterval collections.
+        --whole_heap_collection_interval_counter_;
+        DCHECK_GE(whole_heap_collection_interval_counter_, 0);
+        if (whole_heap_collection_interval_counter_ == 0) {
+          whole_heap_collection_ = true;
+        }
+      } else {
+        // Enable whole_heap_collection if the bytes promoted since
+        // the last whole heap collection exceeds a threshold.
+        bytes_promoted_since_last_whole_heap_collection_ += bytes_promoted_;
+        if (bytes_promoted_since_last_whole_heap_collection_ >= kBytesPromotedThreshold) {
+          whole_heap_collection_ = true;
+        }
       }
     } else {
-      DCHECK_EQ(whole_heap_collection_interval_counter_, 0);
-      whole_heap_collection_interval_counter_ = kDefaultWholeHeapCollectionInterval;
-      whole_heap_collection_ = false;
+      if (!kUseBytesPromoted) {
+        DCHECK_EQ(whole_heap_collection_interval_counter_, 0);
+        whole_heap_collection_interval_counter_ = kDefaultWholeHeapCollectionInterval;
+        whole_heap_collection_ = false;
+      } else {
+        // Reset the counter to the bytes promoted by this whole heap collection.
+        bytes_promoted_since_last_whole_heap_collection_ = bytes_promoted_;
+        whole_heap_collection_ = false;
+      }
     }
   }
   // Clear all of the spaces' mark bitmaps.
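
A condensed model of the whole-heap-collection trigger rewritten above: either a fixed interval of bump-pointer-only collections, or a threshold on bytes promoted since the last whole heap collection. kBytesPromotedThreshold's real value is not visible in this diff, so the 4 MB below is an invented placeholder.

#include <cstdint>
#include <iostream>

constexpr bool kUseBytesPromoted = true;
constexpr uint64_t kBytesPromotedThreshold = 4 * 1024 * 1024;  // placeholder value
constexpr int kDefaultWholeHeapCollectionInterval = 5;

struct GenerationalState {
  bool whole_heap_collection = false;
  int interval_counter = kDefaultWholeHeapCollectionInterval;
  uint64_t bytes_promoted_since_whole_heap = 0;

  // Called at the end of each collection with the bytes promoted by that GC.
  void Update(uint64_t bytes_promoted) {
    if (!whole_heap_collection) {
      if (!kUseBytesPromoted) {
        if (--interval_counter == 0) whole_heap_collection = true;
      } else {
        bytes_promoted_since_whole_heap += bytes_promoted;
        if (bytes_promoted_since_whole_heap >= kBytesPromotedThreshold) {
          whole_heap_collection = true;
        }
      }
    } else {
      if (!kUseBytesPromoted) {
        interval_counter = kDefaultWholeHeapCollectionInterval;
      } else {
        bytes_promoted_since_whole_heap = bytes_promoted;  // restart from this GC's promotions
      }
      whole_heap_collection = false;
    }
  }
};

int main() {
  GenerationalState s;
  for (int gc = 0; gc < 5; ++gc) {
    s.Update(1 * 1024 * 1024);  // each young GC promotes 1 MB
    std::cout << "after GC " << gc << ": whole heap next? " << s.whole_heap_collection << "\n";
  }
  return 0;  // the threshold is crossed after the fourth GC
}
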
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index f067cb2..3d635f0 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -21,6 +21,7 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "garbage_collector.h"
+#include "gc/accounting/space_bitmap.h"
 #include "immune_region.h"
 #include "object_callbacks.h"
 #include "offsets.h"
@@ -42,7 +43,6 @@
 namespace accounting {
   template <typename T> class AtomicStack;
   typedef AtomicStack<mirror::Object*> ObjectStack;
-  class SpaceBitmap;
 }  // namespace accounting
 
 namespace space {
@@ -198,8 +198,11 @@
   // Destination and source spaces (can be any type of ContinuousMemMapAllocSpace which either has
   // a live bitmap or doesn't).
   space::ContinuousMemMapAllocSpace* to_space_;
-  accounting::SpaceBitmap* to_space_live_bitmap_;  // Cached live bitmap as an optimization.
+  // Cached live bitmap as an optimization.
+  accounting::ContinuousSpaceBitmap* to_space_live_bitmap_;
   space::ContinuousMemMapAllocSpace* from_space_;
+  // Cached mark bitmap as an optimization.
+  accounting::HeapBitmap* mark_bitmap_;
 
   Thread* self_;
 
@@ -217,6 +220,11 @@
   // bump pointer space to the non-moving space.
   uint64_t bytes_promoted_;
 
+  // Used for the generational mode. Keeps track of how many bytes of
+  // objects have been copied so far from the bump pointer space to
+  // the non-moving space, since the last whole heap collection.
+  uint64_t bytes_promoted_since_last_whole_heap_collection_;
+
   // Used for the generational mode. When true, collect the whole
   // heap. When false, collect only the bump pointer spaces.
   bool whole_heap_collection_;
@@ -225,15 +233,24 @@
   // whole_heap_collection_ once per interval.
   int whole_heap_collection_interval_counter_;
 
+  // How many objects and bytes we moved. Used so that we don't need to get the size of the
+  // to_space_ when calculating how many objects and bytes we freed.
+  size_t bytes_moved_;
+  size_t objects_moved_;
+
   // How many bytes we avoided dirtying.
   size_t saved_bytes_;
 
+  // The name of the collector.
+  std::string collector_name_;
+
   // Used for the generational mode. The default interval of the whole
   // heap collection. If N, the whole heap collection occurs every N
   // collections.
   static constexpr int kDefaultWholeHeapCollectionInterval = 5;
 
  private:
+  friend class BitmapSetSlowPathVisitor;
   DISALLOW_COPY_AND_ASSIGN(SemiSpace);
 };
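
The header now names ContinuousSpaceBitmap instead of forward-declaring SpaceBitmap; together with the SpaceBitmap<kObjectAlignment> usages later in heap.h, this suggests the new bitmap names are alignment-specialized typedefs of a templated SpaceBitmap. A hedged sketch of that pattern follows; the real definitions live in space_bitmap.h and may differ, and the alignment values below are assumptions.

#include <cstddef>

constexpr size_t kObjectAlignment = 8;          // assumed value
constexpr size_t kLargeObjectAlignment = 4096;  // assumed value

template <size_t kAlignment>
class SpaceBitmap {
  // One bit per kAlignment bytes of the covered space; Set/Test/VisitMarkedRange
  // would all scale addresses by kAlignment, so large object bitmaps stay small.
};

using ContinuousSpaceBitmap = SpaceBitmap<kObjectAlignment>;
using LargeObjectBitmap = SpaceBitmap<kLargeObjectAlignment>;

int main() {
  ContinuousSpaceBitmap continuous;  // both names instantiate the same template
  LargeObjectBitmap large;
  (void)continuous;
  (void)large;
  return 0;
}
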
 
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 915e54f..502da12 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -77,10 +77,20 @@
 // Minimum amount of remaining bytes before a concurrent GC is triggered.
 static constexpr size_t kMinConcurrentRemainingBytes = 128 * KB;
 static constexpr size_t kMaxConcurrentRemainingBytes = 512 * KB;
+// Sticky GC throughput adjustment (5/4 = 1.25). Increasing this causes sticky GC to occur more
+// relative to partial/full GC. This is desirable since sticky GCs interfere less with mutator
+// threads (lower pauses, use less memory bandwidth).
+static constexpr double kStickyGcThroughputAdjustment = 1.25;
+// Whether or not we use the free list large object space.
+static constexpr bool kUseFreeListSpaceForLOS = false;
+// Whether or not we compact the zygote in PreZygoteFork.
+static constexpr bool kCompactZygote = kMovingCollector;
+static constexpr size_t kNonMovingSpaceCapacity = 64 * MB;
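
kStickyGcThroughputAdjustment is consumed further down in GrowForUtilization, where the estimated throughput of the last sticky GC is scaled by it before being compared against the mean throughput of the non-sticky collector. A toy rendering of that comparison, with invented throughput numbers:

#include <iostream>

int main() {
  constexpr double kStickyGcThroughputAdjustment = 1.25;
  double sticky_throughput = 90.0;       // MB/s freed by the last sticky GC (invented)
  double non_sticky_throughput = 100.0;  // mean MB/s of partial/full GCs (invented)
  // A sticky GC only needs to reach 1/1.25 of the non-sticky throughput to be
  // scheduled again, which biases the heap toward low-pause sticky collections.
  bool do_sticky_next =
      sticky_throughput * kStickyGcThroughputAdjustment >= non_sticky_throughput;
  std::cout << (do_sticky_next ? "next GC: sticky" : "next GC: partial/full") << "\n";
  return 0;
}
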
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
-           double target_utilization, size_t capacity, const std::string& image_file_name,
-           CollectorType post_zygote_collector_type, CollectorType background_collector_type,
+           double target_utilization, double foreground_heap_growth_multiplier, size_t capacity,
+           const std::string& image_file_name,
+           CollectorType foreground_collector_type, CollectorType background_collector_type,
            size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
            size_t long_pause_log_threshold, size_t long_gc_log_threshold,
            bool ignore_max_footprint, bool use_tlab, bool verify_pre_gc_heap,
@@ -91,9 +101,9 @@
       dlmalloc_space_(nullptr),
       main_space_(nullptr),
       collector_type_(kCollectorTypeNone),
-      post_zygote_collector_type_(post_zygote_collector_type),
+      foreground_collector_type_(foreground_collector_type),
       background_collector_type_(background_collector_type),
-      desired_collector_type_(collector_type_),
+      desired_collector_type_(foreground_collector_type_),
       heap_trim_request_lock_(nullptr),
       last_trim_time_(0),
       heap_transition_target_time_(0),
@@ -145,6 +155,7 @@
       min_free_(min_free),
       max_free_(max_free),
       target_utilization_(target_utilization),
+      foreground_heap_growth_multiplier_(foreground_heap_growth_multiplier),
       total_wait_time_(0),
       total_allocation_time_(0),
       verify_object_mode_(kVerifyObjectModeDisabled),
@@ -158,15 +169,11 @@
   // If we aren't the zygote, switch to the default non zygote allocator. This may update the
   // entrypoints.
   if (!is_zygote) {
-    desired_collector_type_ = post_zygote_collector_type_;
     large_object_threshold_ = kDefaultLargeObjectThreshold;
-  } else {
-    if (kMovingCollector) {
-      // We are the zygote, use bump pointer allocation + semi space collector.
-      bool generational = post_zygote_collector_type_ == kCollectorTypeGSS;
-      desired_collector_type_ = generational ? kCollectorTypeGSS : kCollectorTypeSS;
-    } else {
-      desired_collector_type_ = post_zygote_collector_type_;
+    // Background compaction is currently not supported for command line runs.
+    if (background_collector_type_ != foreground_collector_type_) {
+      LOG(WARNING) << "Disabling background compaction for non zygote";
+      background_collector_type_ = foreground_collector_type_;
     }
   }
   ChangeCollector(desired_collector_type_);
@@ -183,73 +190,61 @@
     // isn't going to get in the middle
     byte* oat_file_end_addr = image_space->GetImageHeader().GetOatFileEnd();
     CHECK_GT(oat_file_end_addr, image_space->End());
-    if (oat_file_end_addr > requested_alloc_space_begin) {
-      requested_alloc_space_begin = AlignUp(oat_file_end_addr, kPageSize);
-    }
+    requested_alloc_space_begin = AlignUp(oat_file_end_addr, kPageSize);
   }
-  MemMap* malloc_space_mem_map = nullptr;
-  const char* malloc_space_name = is_zygote ? "zygote space" : "alloc space";
   if (is_zygote) {
-    // Allocate a single mem map that is split into the malloc space
-    // and the post zygote non-moving space to put them adjacent.
-    size_t post_zygote_non_moving_space_size = 64 * MB;
-    size_t non_moving_spaces_size = capacity + post_zygote_non_moving_space_size;
+    // Reserve the address range before we create the non moving space to make sure bitmaps don't
+    // take it.
     std::string error_str;
-    malloc_space_mem_map = MemMap::MapAnonymous(malloc_space_name, requested_alloc_space_begin,
-                                                non_moving_spaces_size, PROT_READ | PROT_WRITE,
-                                                true, &error_str);
-    CHECK(malloc_space_mem_map != nullptr) << error_str;
-    post_zygote_non_moving_space_mem_map_.reset(malloc_space_mem_map->RemapAtEnd(
-        malloc_space_mem_map->Begin() + capacity, "post zygote non-moving space",
-        PROT_READ | PROT_WRITE, &error_str));
-    CHECK(post_zygote_non_moving_space_mem_map_.get() != nullptr) << error_str;
-    VLOG(heap) << "malloc space mem map : " << malloc_space_mem_map;
-    VLOG(heap) << "post zygote non-moving space mem map : "
-               << post_zygote_non_moving_space_mem_map_.get();
+    MemMap* mem_map = MemMap::MapAnonymous(
+        "main space", requested_alloc_space_begin + kNonMovingSpaceCapacity, capacity,
+        PROT_READ | PROT_WRITE, true, &error_str);
+    CHECK(mem_map != nullptr) << error_str;
+    // Non moving space is always dlmalloc since we currently don't have support for multiple
+    // rosalloc spaces.
+    non_moving_space_ = space::DlMallocSpace::Create(
+        "zygote / non moving space", initial_size, kNonMovingSpaceCapacity, kNonMovingSpaceCapacity,
+        requested_alloc_space_begin, false);
+    non_moving_space_->SetFootprintLimit(non_moving_space_->Capacity());
+    CreateMainMallocSpace(mem_map, initial_size, growth_limit, capacity);
   } else {
-    // Allocate a mem map for the malloc space.
     std::string error_str;
-    malloc_space_mem_map = MemMap::MapAnonymous(malloc_space_name, requested_alloc_space_begin,
-                                                capacity, PROT_READ | PROT_WRITE, true, &error_str);
-    CHECK(malloc_space_mem_map != nullptr) << error_str;
-    VLOG(heap) << "malloc space mem map : " << malloc_space_mem_map;
+    MemMap* mem_map = MemMap::MapAnonymous("main/non-moving space", requested_alloc_space_begin,
+                                           capacity, PROT_READ | PROT_WRITE, true, &error_str);
+    CHECK(mem_map != nullptr) << error_str;
+    // Create the main free list space, which doubles as the non moving space. We can do this since
+    // non zygote means that we won't have any background compaction.
+    CreateMainMallocSpace(mem_map, initial_size, growth_limit, capacity);
+    non_moving_space_ = main_space_;
   }
-  CHECK(malloc_space_mem_map != nullptr);
-  space::MallocSpace* malloc_space;
-  if (kUseRosAlloc) {
-    malloc_space = space::RosAllocSpace::CreateFromMemMap(malloc_space_mem_map, malloc_space_name,
-                                                          kDefaultStartingSize, initial_size,
-                                                          growth_limit, capacity, low_memory_mode_);
-    CHECK(malloc_space != nullptr) << "Failed to create rosalloc space";
-  } else {
-    malloc_space = space::DlMallocSpace::CreateFromMemMap(malloc_space_mem_map, malloc_space_name,
-                                                          kDefaultStartingSize, initial_size,
-                                                          growth_limit, capacity);
-    CHECK(malloc_space != nullptr) << "Failed to create dlmalloc space";
-  }
-  VLOG(heap) << "malloc_space : " << malloc_space;
+  CHECK(non_moving_space_ != nullptr);
+
+  // We need to create the bump pointer spaces if the foreground collector is a compacting GC.
+  // Only one bump pointer space would be needed if the foreground collector is non-moving and
+  // only the background collector is moving, since the heap transition code will create the
+  // temp space by recycling the bitmap from the main space.
   if (kMovingCollector) {
     // TODO: Place bump-pointer spaces somewhere to minimize size of card table.
-    // TODO: Having 3+ spaces as big as the large heap size can cause virtual memory fragmentation
-    // issues.
-    const size_t bump_pointer_space_size = std::min(malloc_space->Capacity(), 128 * MB);
+    // TODO: Avoid creating all the bump pointer spaces when they are not necessary (currently
+    // only GSS needs both bump pointer spaces plus the main space) b/14059466. Divide the
+    // capacity by 2 as a temporary fix.
+    const size_t bump_pointer_space_capacity = capacity / 2;
     bump_pointer_space_ = space::BumpPointerSpace::Create("Bump pointer space",
-                                                          bump_pointer_space_size, nullptr);
+                                                          bump_pointer_space_capacity, nullptr);
     CHECK(bump_pointer_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(bump_pointer_space_);
-    temp_space_ = space::BumpPointerSpace::Create("Bump pointer space 2", bump_pointer_space_size,
-                                                  nullptr);
+    temp_space_ = space::BumpPointerSpace::Create("Bump pointer space 2",
+                                                  bump_pointer_space_capacity, nullptr);
     CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(temp_space_);
-    VLOG(heap) << "bump_pointer_space : " << bump_pointer_space_;
-    VLOG(heap) << "temp_space : " << temp_space_;
   }
-  non_moving_space_ = malloc_space;
-  malloc_space->SetFootprintLimit(malloc_space->Capacity());
-  AddSpace(malloc_space);
+  if (non_moving_space_ != main_space_) {
+    AddSpace(non_moving_space_);
+  }
+  if (main_space_ != nullptr) {
+    AddSpace(main_space_);
+  }
 
   // Allocate the large object space.
-  constexpr bool kUseFreeListSpaceForLOS = false;
   if (kUseFreeListSpaceForLOS) {
     large_object_space_ = space::FreeListSpace::Create("large object space", nullptr, capacity);
   } else {
@@ -264,11 +259,6 @@
   // Relies on the spaces being sorted.
   byte* heap_begin = continuous_spaces_.front()->Begin();
   byte* heap_end = continuous_spaces_.back()->Limit();
-  if (is_zygote) {
-    CHECK(post_zygote_non_moving_space_mem_map_.get() != nullptr);
-    heap_begin = std::min(post_zygote_non_moving_space_mem_map_->Begin(), heap_begin);
-    heap_end = std::max(post_zygote_non_moving_space_mem_map_->End(), heap_end);
-  }
   size_t heap_capacity = heap_end - heap_begin;
 
   // Allocate the card table.
@@ -288,6 +278,12 @@
         new accounting::RememberedSet("Non-moving space remembered set", this, non_moving_space_);
     CHECK(non_moving_space_rem_set != nullptr) << "Failed to create non-moving space remembered set";
     AddRememberedSet(non_moving_space_rem_set);
+    if (main_space_ != nullptr && main_space_ != non_moving_space_) {
+      accounting::RememberedSet* main_space_rem_set =
+          new accounting::RememberedSet("Main space remembered set", this, main_space_);
+      CHECK(main_space_rem_set != nullptr) << "Failed to create main space remembered set";
+      AddRememberedSet(main_space_rem_set);
+    }
   }
 
   // TODO: Count objects in the image space here.
@@ -325,8 +321,9 @@
   }
   if (kMovingCollector) {
     // TODO: Clean this up.
-    bool generational = post_zygote_collector_type_ == kCollectorTypeGSS;
-    semi_space_collector_ = new collector::SemiSpace(this, generational);
+    bool generational = foreground_collector_type_ == kCollectorTypeGSS;
+    semi_space_collector_ = new collector::SemiSpace(this, generational,
+                                                     generational ? "generational" : "");
     garbage_collectors_.push_back(semi_space_collector_);
 
     concurrent_copying_collector_ = new collector::ConcurrentCopying(this);
@@ -342,6 +339,37 @@
   }
 }
 
+void Heap::CreateMainMallocSpace(MemMap* mem_map, size_t initial_size, size_t growth_limit,
+                                 size_t capacity) {
+  // Is background compaction enabled?
+  bool can_move_objects = IsMovingGc(background_collector_type_) !=
+      IsMovingGc(foreground_collector_type_);
+  // If we are the zygote and don't yet have a zygote space, it means that the zygote fork will
+  // happen in the future. If this happens and we have kCompactZygote enabled we wish to compact
+  // from the main space to the zygote space. If background compaction is enabled, always pass in
+  // that we can move objects.
+  if (kCompactZygote && Runtime::Current()->IsZygote() && !can_move_objects) {
+    // After the zygote we want this to be false if we don't have background compaction enabled so
+    // that getting primitive array elements is faster.
+    can_move_objects = !have_zygote_space_;
+  }
+  if (kUseRosAlloc) {
+    main_space_ = space::RosAllocSpace::CreateFromMemMap(mem_map, "main rosalloc space",
+                                                         kDefaultStartingSize, initial_size,
+                                                         growth_limit, capacity, low_memory_mode_,
+                                                         can_move_objects);
+    CHECK(main_space_ != nullptr) << "Failed to create rosalloc space";
+  } else {
+    main_space_ = space::DlMallocSpace::CreateFromMemMap(mem_map, "main dlmalloc space",
+                                                         kDefaultStartingSize, initial_size,
+                                                         growth_limit, capacity,
+                                                         can_move_objects);
+    CHECK(main_space_ != nullptr) << "Failed to create dlmalloc space";
+  }
+  main_space_->SetFootprintLimit(main_space_->Capacity());
+  VLOG(heap) << "Created main space " << main_space_;
+}
+
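
The can_move_objects computation at the top of CreateMainMallocSpace boils down to: the main space must stay movable when exactly one of the two collector types is moving (background compaction), or while a zygote that may still compact into the zygote space has not forked yet. A condensed model of that decision; IsMovingGc and the enum values are simplified stand-ins, and the kCompactZygote flag is assumed true and omitted.

#include <iostream>

enum CollectorType { kCollectorTypeCMS, kCollectorTypeSS, kCollectorTypeGSS };

bool IsMovingGc(CollectorType type) {
  return type == kCollectorTypeSS || type == kCollectorTypeGSS;
}

bool CanMoveMainSpaceObjects(CollectorType foreground, CollectorType background,
                             bool is_zygote, bool have_zygote_space) {
  // Background compaction needs a movable main space.
  bool can_move = IsMovingGc(background) != IsMovingGc(foreground);
  // Before the zygote fork, the main space may still be compacted into the
  // zygote space, so keep it movable until the zygote space exists.
  if (is_zygote && !can_move) {
    can_move = !have_zygote_space;
  }
  return can_move;
}

int main() {
  std::cout << CanMoveMainSpaceObjects(kCollectorTypeCMS, kCollectorTypeSS,
                                       /*is_zygote=*/false, /*have_zygote_space=*/true)
            << "\n";  // 1: background compaction forces a movable main space
  return 0;
}
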
 void Heap::ChangeAllocator(AllocatorType allocator) {
   if (current_allocator_ != allocator) {
     // These two allocators are only used internally and don't have any entrypoints.
@@ -355,13 +383,13 @@
 }
 
 void Heap::DisableCompaction() {
-  if (IsCompactingGC(post_zygote_collector_type_)) {
-    post_zygote_collector_type_ = kCollectorTypeCMS;
+  if (IsMovingGc(foreground_collector_type_)) {
+    foreground_collector_type_ = kCollectorTypeCMS;
   }
-  if (IsCompactingGC(background_collector_type_)) {
-    background_collector_type_ = post_zygote_collector_type_;
+  if (IsMovingGc(background_collector_type_)) {
+    background_collector_type_ = foreground_collector_type_;
   }
-  TransitionCollector(post_zygote_collector_type_);
+  TransitionCollector(foreground_collector_type_);
 }
 
 std::string Heap::SafeGetClassDescriptor(mirror::Class* klass) {
@@ -423,14 +451,6 @@
         break;
       }
     }
-    if (space == nullptr) {
-      if (allocator_mem_map_.get() == nullptr || !allocator_mem_map_->HasAddress(obj)) {
-        stream << "obj " << obj << " not a valid heap address";
-        return;
-      } else if (allocator_mem_map_.get() != nullptr) {
-        allocator_mem_map_->Protect(PROT_READ | PROT_WRITE);
-      }
-    }
     // Unprotect all the spaces.
     for (const auto& space : continuous_spaces_) {
       mprotect(space->Begin(), space->Capacity(), PROT_READ | PROT_WRITE);
@@ -473,7 +493,7 @@
   ScopedThreadStateChange tsc(self, kWaitingForGcToComplete);
   MutexLock mu(self, *gc_complete_lock_);
   ++disable_moving_gc_count_;
-  if (IsCompactingGC(collector_type_running_)) {
+  if (IsMovingGc(collector_type_running_)) {
     WaitForGcToCompleteLocked(self);
   }
 }
@@ -491,12 +511,12 @@
       // Start at index 1 to avoid "is always false" warning.
       // Have iteration 1 always transition the collector.
       TransitionCollector((((i & 1) == 1) == (process_state_ == kProcessStateJankPerceptible))
-                          ? post_zygote_collector_type_ : background_collector_type_);
+                          ? foreground_collector_type_ : background_collector_type_);
       usleep(kCollectorTransitionStressWait);
     }
     if (process_state_ == kProcessStateJankPerceptible) {
       // Transition back to foreground right away to prevent jank.
-      RequestCollectorTransition(post_zygote_collector_type_, 0);
+      RequestCollectorTransition(foreground_collector_type_, 0);
     } else {
       // Don't delay for debug builds since we may want to stress test the GC.
       RequestCollectorTransition(background_collector_type_, kIsDebugBuild ? 0 :
@@ -549,7 +569,7 @@
     space2 = space1;
   }
   MarkAllocStack(space1->GetLiveBitmap(), space2->GetLiveBitmap(),
-                 large_object_space_->GetLiveObjects(), stack);
+                 large_object_space_->GetLiveBitmap(), stack);
 }
 
 void Heap::DeleteThreadPool() {
@@ -563,8 +583,8 @@
     DCHECK(!space->IsDiscontinuousSpace());
     space::ContinuousSpace* continuous_space = space->AsContinuousSpace();
     // Continuous spaces don't necessarily have bitmaps.
-    accounting::SpaceBitmap* live_bitmap = continuous_space->GetLiveBitmap();
-    accounting::SpaceBitmap* mark_bitmap = continuous_space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* live_bitmap = continuous_space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = continuous_space->GetMarkBitmap();
     if (live_bitmap != nullptr) {
       DCHECK(mark_bitmap != nullptr);
       live_bitmap_->AddContinuousSpaceBitmap(live_bitmap);
@@ -586,10 +606,8 @@
   } else {
     DCHECK(space->IsDiscontinuousSpace());
     space::DiscontinuousSpace* discontinuous_space = space->AsDiscontinuousSpace();
-    DCHECK(discontinuous_space->GetLiveObjects() != nullptr);
-    live_bitmap_->AddDiscontinuousObjectSet(discontinuous_space->GetLiveObjects());
-    DCHECK(discontinuous_space->GetMarkObjects() != nullptr);
-    mark_bitmap_->AddDiscontinuousObjectSet(discontinuous_space->GetMarkObjects());
+    live_bitmap_->AddLargeObjectBitmap(discontinuous_space->GetLiveBitmap());
+    mark_bitmap_->AddLargeObjectBitmap(discontinuous_space->GetMarkBitmap());
     discontinuous_spaces_.push_back(discontinuous_space);
   }
   if (space->IsAllocSpace()) {
@@ -604,8 +622,8 @@
     DCHECK(!space->IsDiscontinuousSpace());
     space::ContinuousSpace* continuous_space = space->AsContinuousSpace();
     // Continuous spaces don't necessarily have bitmaps.
-    accounting::SpaceBitmap* live_bitmap = continuous_space->GetLiveBitmap();
-    accounting::SpaceBitmap* mark_bitmap = continuous_space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* live_bitmap = continuous_space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = continuous_space->GetMarkBitmap();
     if (live_bitmap != nullptr) {
       DCHECK(mark_bitmap != nullptr);
       live_bitmap_->RemoveContinuousSpaceBitmap(live_bitmap);
@@ -621,14 +639,16 @@
     }
     if (continuous_space == main_space_) {
       main_space_ = nullptr;
+    } else if (continuous_space == bump_pointer_space_) {
+      bump_pointer_space_ = nullptr;
+    } else if (continuous_space == temp_space_) {
+      temp_space_ = nullptr;
     }
   } else {
     DCHECK(space->IsDiscontinuousSpace());
     space::DiscontinuousSpace* discontinuous_space = space->AsDiscontinuousSpace();
-    DCHECK(discontinuous_space->GetLiveObjects() != nullptr);
-    live_bitmap_->RemoveDiscontinuousObjectSet(discontinuous_space->GetLiveObjects());
-    DCHECK(discontinuous_space->GetMarkObjects() != nullptr);
-    mark_bitmap_->RemoveDiscontinuousObjectSet(discontinuous_space->GetMarkObjects());
+    live_bitmap_->RemoveLargeObjectBitmap(discontinuous_space->GetLiveBitmap());
+    mark_bitmap_->RemoveLargeObjectBitmap(discontinuous_space->GetMarkBitmap());
     auto it = std::find(discontinuous_spaces_.begin(), discontinuous_spaces_.end(),
                         discontinuous_space);
     DCHECK(it != discontinuous_spaces_.end());
@@ -657,12 +677,13 @@
   // Dump cumulative timings.
   os << "Dumping cumulative Gc timings\n";
   uint64_t total_duration = 0;
-
   // Dump cumulative loggers for each GC type.
   uint64_t total_paused_time = 0;
-  for (const auto& collector : garbage_collectors_) {
+  for (auto& collector : garbage_collectors_) {
     const CumulativeLogger& logger = collector->GetCumulativeTimings();
-    if (logger.GetTotalNs() != 0) {
+    const size_t iterations = logger.GetIterations();
+    const Histogram<uint64_t>& pause_histogram = collector->GetPauseHistogram();
+    if (iterations != 0 && pause_histogram.SampleSize() != 0) {
       os << ConstDumpable<CumulativeLogger>(logger);
       const uint64_t total_ns = logger.GetTotalNs();
       const uint64_t total_pause_ns = collector->GetTotalPausedTimeNs();
@@ -670,9 +691,10 @@
       const uint64_t freed_bytes = collector->GetTotalFreedBytes();
       const uint64_t freed_objects = collector->GetTotalFreedObjects();
       Histogram<uint64_t>::CumulativeData cumulative_data;
-      collector->GetPauseHistogram().CreateHistogram(&cumulative_data);
-      collector->GetPauseHistogram().PrintConfidenceIntervals(os, 0.99, cumulative_data);
-      os << collector->GetName() << " total time: " << PrettyDuration(total_ns) << "\n"
+      pause_histogram.CreateHistogram(&cumulative_data);
+      pause_histogram.PrintConfidenceIntervals(os, 0.99, cumulative_data);
+      os << collector->GetName() << " total time: " << PrettyDuration(total_ns)
+         << " mean time: " << PrettyDuration(total_ns / iterations) << "\n"
          << collector->GetName() << " freed: " << freed_objects
          << " objects with total size " << PrettySize(freed_bytes) << "\n"
          << collector->GetName() << " throughput: " << freed_objects / seconds << "/s / "
@@ -680,6 +702,7 @@
       total_duration += total_ns;
       total_paused_time += total_pause_ns;
     }
+    collector->ResetMeasurements();
   }
   uint64_t allocation_time = static_cast<uint64_t>(total_allocation_time_) * kTimeAdjust;
   if (total_duration != 0) {
@@ -914,8 +937,16 @@
   // Transition the collector if the desired collector type is not the same as the current
   // collector type.
   TransitionCollector(desired_collector_type);
-  // Do a heap trim if it is needed.
-  Trim();
+  if (!CareAboutPauseTimes()) {
+    // Deflate the monitors, this can cause a pause but shouldn't matter since we don't care
+    // about pauses.
+    Runtime* runtime = Runtime::Current();
+    runtime->GetThreadList()->SuspendAll();
+    runtime->GetMonitorList()->DeflateMonitors();
+    runtime->GetThreadList()->ResumeAll();
+    // Do a heap trim if it is needed.
+    Trim();
+  }
 }
 
 void Heap::Trim() {
@@ -951,8 +982,10 @@
       managed_reclaimed += alloc_space->Trim();
     }
   }
-  total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated() -
-      bump_pointer_space_->Size();
+  total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated();
+  if (bump_pointer_space_ != nullptr) {
+    total_alloc_space_allocated -= bump_pointer_space_->Size();
+  }
   const float managed_utilization = static_cast<float>(total_alloc_space_allocated) /
       static_cast<float>(total_alloc_space_size);
   uint64_t gc_heap_end_ns = NanoTime();
@@ -1013,7 +1046,7 @@
     return temp_space_->Contains(obj);
   }
   space::ContinuousSpace* c_space = FindContinuousSpaceFromObject(obj, true);
-  space::DiscontinuousSpace* d_space = NULL;
+  space::DiscontinuousSpace* d_space = nullptr;
   if (c_space != nullptr) {
     if (c_space->GetLiveBitmap()->Test(obj)) {
       return true;
@@ -1021,7 +1054,7 @@
   } else {
     d_space = FindDiscontinuousSpaceFromObject(obj, true);
     if (d_space != nullptr) {
-      if (d_space->GetLiveObjects()->Test(obj)) {
+      if (d_space->GetLiveBitmap()->Test(obj)) {
         return true;
       }
     }
@@ -1059,7 +1092,7 @@
     }
   } else {
     d_space = FindDiscontinuousSpaceFromObject(obj, true);
-    if (d_space != nullptr && d_space->GetLiveObjects()->Test(obj)) {
+    if (d_space != nullptr && d_space->GetLiveBitmap()->Test(obj)) {
       return true;
     }
   }
@@ -1068,8 +1101,8 @@
 
 void Heap::DumpSpaces(std::ostream& stream) {
   for (const auto& space : continuous_spaces_) {
-    accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
-    accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = space->GetMarkBitmap();
     stream << space << " " << *space << "\n";
     if (live_bitmap != nullptr) {
       stream << live_bitmap << " " << *live_bitmap << "\n";
@@ -1377,14 +1410,13 @@
   VLOG(heap) << "TransitionCollector: " << static_cast<int>(collector_type_)
              << " -> " << static_cast<int>(collector_type);
   uint64_t start_time = NanoTime();
-  uint32_t before_size  = GetTotalMemory();
   uint32_t before_allocated = num_bytes_allocated_.Load();
   ThreadList* tl = Runtime::Current()->GetThreadList();
   Thread* self = Thread::Current();
   ScopedThreadStateChange tsc(self, kWaitingPerformingGc);
   Locks::mutator_lock_->AssertNotHeld(self);
   const bool copying_transition =
-      IsCompactingGC(background_collector_type_) || IsCompactingGC(post_zygote_collector_type_);
+      IsMovingGc(background_collector_type_) || IsMovingGc(foreground_collector_type_);
   // Busy wait until we can GC (StartGC can fail if we have a non-zero
   // compacting_gc_disable_count_; this should rarely occur).
   for (;;) {
@@ -1393,6 +1425,13 @@
       MutexLock mu(self, *gc_complete_lock_);
       // Ensure there is only one GC at a time.
       WaitForGcToCompleteLocked(self);
+      // If someone else beat us to it and changed the collector before we could, exit.
+      // This is safe to do before the suspend all since we set the collector_type_running_ before
+      // we exit the loop. If another thread attempts to do the heap transition before we exit,
+      // then it would get blocked on WaitForGcToCompleteLocked.
+      if (collector_type == collector_type_) {
+        return;
+      }
       // GC can be disabled if someone has used GetPrimitiveArrayCritical but not yet released it.
       if (!copying_transition || disable_moving_gc_count_ == 0) {
         // TODO: Not hard code in semi-space collector?
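
The early return added above is a check-after-wait pattern: once WaitForGcToCompleteLocked returns under gc_complete_lock_, re-test whether another thread already performed the transition and bail out if so. A stripped-down sketch of the pattern; the types and locking here are simplified stand-ins for the real heap state.

#include <mutex>

enum CollectorType { kCollectorTypeCMS, kCollectorTypeSS };

class TransitionGuard {
 public:
  // Returns false if another thread already performed the transition.
  bool BeginTransition(CollectorType desired) {
    std::lock_guard<std::mutex> lock(gc_complete_lock_);
    // The real code only reaches this point after WaitForGcToCompleteLocked,
    // so current_ cannot change underneath us while the lock is held.
    if (desired == current_) {
      return false;  // someone beat us to it; nothing to do
    }
    current_ = desired;
    return true;
  }

 private:
  std::mutex gc_complete_lock_;
  CollectorType current_ = kCollectorTypeCMS;
};

int main() {
  TransitionGuard guard;
  bool first = guard.BeginTransition(kCollectorTypeSS);   // true: we do the transition
  bool second = guard.BeginTransition(kCollectorTypeSS);  // false: already transitioned
  return first && !second ? 0 : 1;
}
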
@@ -1408,42 +1447,20 @@
     case kCollectorTypeSS:
       // Fall-through.
     case kCollectorTypeGSS: {
-      mprotect(temp_space_->Begin(), temp_space_->Capacity(), PROT_READ | PROT_WRITE);
-      CHECK(main_space_ != nullptr);
-      Compact(temp_space_, main_space_);
-      DCHECK(allocator_mem_map_.get() == nullptr);
-      allocator_mem_map_.reset(main_space_->ReleaseMemMap());
-      madvise(main_space_->Begin(), main_space_->Size(), MADV_DONTNEED);
-      // RemoveSpace does not delete the removed space.
-      space::Space* old_space = main_space_;
-      RemoveSpace(old_space);
-      delete old_space;
+      if (!IsMovingGc(collector_type_)) {
+        // We are transitioning from non moving GC -> moving GC, since we copied from the bump
+        // pointer space last transition it will be protected.
+        bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+        Compact(bump_pointer_space_, main_space_);
+      }
       break;
     }
     case kCollectorTypeMS:
       // Fall through.
     case kCollectorTypeCMS: {
-      if (IsCompactingGC(collector_type_)) {
-        // TODO: Use mem-map from temp space?
-        MemMap* mem_map = allocator_mem_map_.release();
-        CHECK(mem_map != nullptr);
-        size_t starting_size = kDefaultStartingSize;
-        size_t initial_size = kDefaultInitialSize;
-        mprotect(mem_map->Begin(), initial_size, PROT_READ | PROT_WRITE);
-        CHECK(main_space_ == nullptr);
-        if (kUseRosAlloc) {
-          main_space_ =
-              space::RosAllocSpace::CreateFromMemMap(mem_map, "alloc space", starting_size,
-                                                     initial_size, mem_map->Size(),
-                                                     mem_map->Size(), low_memory_mode_);
-        } else {
-          main_space_ =
-              space::DlMallocSpace::CreateFromMemMap(mem_map, "alloc space", starting_size,
-                                                     initial_size, mem_map->Size(),
-                                                     mem_map->Size());
-        }
-        main_space_->SetFootprintLimit(main_space_->Capacity());
-        AddSpace(main_space_);
+      if (IsMovingGc(collector_type_)) {
+        // Compact to the main space from the bump pointer space; no need to swap semispaces.
+        main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(main_space_, bump_pointer_space_);
       }
       break;
@@ -1462,16 +1479,10 @@
   uint64_t duration = NanoTime() - start_time;
   GrowForUtilization(semi_space_collector_);
   FinishGC(self, collector::kGcTypeFull);
-  int32_t after_size = GetTotalMemory();
-  int32_t delta_size = before_size - after_size;
   int32_t after_allocated = num_bytes_allocated_.Load();
   int32_t delta_allocated = before_allocated - after_allocated;
-  const std::string saved_bytes_str =
-      delta_size < 0 ? "-" + PrettySize(-delta_size) : PrettySize(delta_size);
   LOG(INFO) << "Heap transition to " << process_state_ << " took "
-      << PrettyDuration(duration) << " " << PrettySize(before_size) << "->"
-      << PrettySize(after_size) << " from " << PrettySize(delta_allocated) << " to "
-      << PrettySize(delta_size) << " saved";
+      << PrettyDuration(duration) << " saved at least " << PrettySize(delta_allocated);
 }
 
 void Heap::ChangeCollector(CollectorType collector_type) {
@@ -1546,9 +1557,9 @@
   // Maps from bin sizes to locations.
   std::multimap<size_t, uintptr_t> bins_;
   // Live bitmap of the space which contains the bins.
-  accounting::SpaceBitmap* bin_live_bitmap_;
+  accounting::ContinuousSpaceBitmap* bin_live_bitmap_;
   // Mark bitmap of the space which contains the bins.
-  accounting::SpaceBitmap* bin_mark_bitmap_;
+  accounting::ContinuousSpaceBitmap* bin_mark_bitmap_;
 
   static void Callback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -1639,11 +1650,12 @@
   VLOG(heap) << "Starting PreZygoteFork";
   // Trim the pages at the end of the non moving space.
   non_moving_space_->Trim();
+  // The end of the non-moving space may be protected; unprotect it so that we can copy the
+  // zygote there.
   non_moving_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
   // Change the collector to the post zygote one.
-  ChangeCollector(post_zygote_collector_type_);
-  // TODO: Delete bump_pointer_space_ and temp_pointer_space_?
-  if (semi_space_collector_ != nullptr) {
+  if (kCompactZygote) {
+    DCHECK(semi_space_collector_ != nullptr);
     // Temporarily disable rosalloc verification because the zygote
     // compaction will mess up the rosalloc internal metadata.
     ScopedDisableRosAllocVerification disable_rosalloc_verif(this);
@@ -1653,18 +1665,47 @@
     space::BumpPointerSpace target_space("zygote bump space", non_moving_space_->End(),
                                          non_moving_space_->Limit());
     // Compact the bump pointer space to a new zygote bump pointer space.
-    temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
-    zygote_collector.SetFromSpace(bump_pointer_space_);
+    bool reset_main_space = false;
+    if (IsMovingGc(collector_type_)) {
+      zygote_collector.SetFromSpace(bump_pointer_space_);
+    } else {
+      CHECK(main_space_ != nullptr);
+      // Copy from the main space.
+      zygote_collector.SetFromSpace(main_space_);
+      reset_main_space = true;
+    }
     zygote_collector.SetToSpace(&target_space);
+
+    Runtime::Current()->GetThreadList()->SuspendAll();
     zygote_collector.Run(kGcCauseCollectorTransition, false);
-    CHECK(temp_space_->IsEmpty());
+    if (IsMovingGc(collector_type_)) {
+      SwapSemiSpaces();
+    }
+    Runtime::Current()->GetThreadList()->ResumeAll();
+
+    if (reset_main_space) {
+      main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+      madvise(main_space_->Begin(), main_space_->Capacity(), MADV_DONTNEED);
+      MemMap* mem_map = main_space_->ReleaseMemMap();
+      RemoveSpace(main_space_);
+      delete main_space_;
+      main_space_ = nullptr;
+      CreateMainMallocSpace(mem_map, kDefaultInitialSize, mem_map->Size(), mem_map->Size());
+      AddSpace(main_space_);
+    } else {
+      bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+    }
+    if (temp_space_ != nullptr) {
+      CHECK(temp_space_->IsEmpty());
+    }
     total_objects_freed_ever_ += semi_space_collector_->GetFreedObjects();
     total_bytes_freed_ever_ += semi_space_collector_->GetFreedBytes();
     // Update the end and write out image.
     non_moving_space_->SetEnd(target_space.End());
     non_moving_space_->SetLimit(target_space.Limit());
-    VLOG(heap) << "Zygote size " << non_moving_space_->Size() << " bytes";
+    VLOG(heap) << "Zygote space size " << non_moving_space_->Size() << " bytes";
   }
+  ChangeCollector(foreground_collector_type_);
   // Save the old space so that we can remove it after we complete creating the zygote space.
   space::MallocSpace* old_alloc_space = non_moving_space_;
   // Turn the current alloc space into a zygote space and obtain the new alloc space composed of
@@ -1684,18 +1725,12 @@
   }
   space::ZygoteSpace* zygote_space = old_alloc_space->CreateZygoteSpace("alloc space",
                                                                         low_memory_mode_,
-                                                                        &main_space_);
+                                                                        &non_moving_space_);
   delete old_alloc_space;
   CHECK(zygote_space != nullptr) << "Failed creating zygote space";
   AddSpace(zygote_space, false);
-  CHECK(main_space_ != nullptr);
-  if (main_space_->IsRosAllocSpace()) {
-    rosalloc_space_ = main_space_->AsRosAllocSpace();
-  } else if (main_space_->IsDlMallocSpace()) {
-    dlmalloc_space_ = main_space_->AsDlMallocSpace();
-  }
-  main_space_->SetFootprintLimit(main_space_->Capacity());
-  AddSpace(main_space_);
+  non_moving_space_->SetFootprintLimit(non_moving_space_->Capacity());
+  AddSpace(non_moving_space_);
   have_zygote_space_ = true;
   // Enable large object space allocations.
   large_object_threshold_ = kDefaultLargeObjectThreshold;
@@ -1705,23 +1740,6 @@
   CHECK(mod_union_table != nullptr) << "Failed to create zygote space mod-union table";
   AddModUnionTable(mod_union_table);
   if (collector::SemiSpace::kUseRememberedSet) {
-    // Add a new remembered set for the new main space.
-    accounting::RememberedSet* main_space_rem_set =
-        new accounting::RememberedSet("Main space remembered set", this, main_space_);
-    CHECK(main_space_rem_set != nullptr) << "Failed to create main space remembered set";
-    AddRememberedSet(main_space_rem_set);
-  }
-  // Can't use RosAlloc for non moving space due to thread local buffers.
-  // TODO: Non limited space for non-movable objects?
-  MemMap* mem_map = post_zygote_non_moving_space_mem_map_.release();
-  space::MallocSpace* new_non_moving_space =
-      space::DlMallocSpace::CreateFromMemMap(mem_map, "Non moving dlmalloc space", kPageSize,
-                                             2 * MB, mem_map->Size(), mem_map->Size());
-  AddSpace(new_non_moving_space, false);
-  CHECK(new_non_moving_space != nullptr) << "Failed to create new non-moving space";
-  new_non_moving_space->SetFootprintLimit(new_non_moving_space->Capacity());
-  non_moving_space_ = new_non_moving_space;
-  if (collector::SemiSpace::kUseRememberedSet) {
     // Add a new remembered set for the post-zygote non-moving space.
     accounting::RememberedSet* post_zygote_non_moving_space_rem_set =
         new accounting::RememberedSet("Post-zygote non-moving space remembered set", this,
@@ -1737,9 +1755,9 @@
   allocation_stack_->Reset();
 }
 
-void Heap::MarkAllocStack(accounting::SpaceBitmap* bitmap1,
-                          accounting::SpaceBitmap* bitmap2,
-                          accounting::ObjectSet* large_objects,
+void Heap::MarkAllocStack(accounting::ContinuousSpaceBitmap* bitmap1,
+                          accounting::ContinuousSpaceBitmap* bitmap2,
+                          accounting::LargeObjectBitmap* large_objects,
                           accounting::ObjectStack* stack) {
   DCHECK(bitmap1 != nullptr);
   DCHECK(bitmap2 != nullptr);
@@ -1759,9 +1777,9 @@
 }
 
 void Heap::SwapSemiSpaces() {
-  // Swap the spaces so we allocate into the space which we just evacuated.
+  CHECK(bump_pointer_space_ != nullptr);
+  CHECK(temp_space_ != nullptr);
   std::swap(bump_pointer_space_, temp_space_);
-  bump_pointer_space_->Clear();
 }
 
 void Heap::Compact(space::ContinuousMemMapAllocSpace* target_space,
@@ -1804,7 +1822,7 @@
     MutexLock mu(self, *gc_complete_lock_);
     // Ensure there is only one GC at a time.
     WaitForGcToCompleteLocked(self);
-    compacting_gc = IsCompactingGC(collector_type_);
+    compacting_gc = IsMovingGc(collector_type_);
     // GC can be disabled if someone has used GetPrimitiveArrayCritical.
     if (compacting_gc && disable_moving_gc_count_ != 0) {
       LOG(WARNING) << "Skipping GC due to disable moving GC count " << disable_moving_gc_count_;
@@ -1859,10 +1877,14 @@
       << "Could not find garbage collector with collector_type="
       << static_cast<size_t>(collector_type_) << " and gc_type=" << gc_type;
   ATRACE_BEGIN(StringPrintf("%s %s GC", PrettyCause(gc_cause), collector->GetName()).c_str());
-  if (!clear_soft_references) {
-    clear_soft_references = gc_type != collector::kGcTypeSticky;  // TODO: GSS?
+  if (compacting_gc) {
+    runtime->GetThreadList()->SuspendAll();
+    collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
+    SwapSemiSpaces();
+    runtime->GetThreadList()->ResumeAll();
+  } else {
+    collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
   }
-  collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
   total_objects_freed_ever_ += collector->GetFreedObjects();
   total_bytes_freed_ever_ += collector->GetFreedBytes();
   RequestHeapTrim();
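
For compacting collectors, CollectGarbageInternal now brackets the collection itself: suspend all mutators, run the copying collector, swap the semispaces so that subsequent allocations go into the just-evacuated space, then resume. A sketch of the sequence; ThreadList, Heap, and the spaces below are simplified stand-ins.

#include <iostream>
#include <utility>

struct ThreadList {
  void SuspendAll() { std::cout << "all mutators suspended\n"; }
  void ResumeAll() { std::cout << "all mutators resumed\n"; }
};

struct Space { const char* name; };

struct Heap {
  Space* bump_pointer_space_;
  Space* temp_space_;
  void SwapSemiSpaces() { std::swap(bump_pointer_space_, temp_space_); }
};

int main() {
  ThreadList tl;
  Space a{"space A"}, b{"space B"};
  Heap heap{&a, &b};
  tl.SuspendAll();
  // collector->Run(...) would copy live objects from space A into space B here.
  heap.SwapSemiSpaces();  // new allocations now land in the evacuated space
  tl.ResumeAll();
  std::cout << "allocating into " << heap.bump_pointer_space_->name << "\n";  // space B
  return 0;
}
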
@@ -2007,7 +2029,8 @@
           accounting::CardTable::kCardSize);
       LOG(ERROR) << "Card " << reinterpret_cast<void*>(card_addr) << " covers " << cover_begin
           << "-" << cover_end;
-      accounting::SpaceBitmap* bitmap = heap_->GetLiveBitmap()->GetContinuousSpaceBitmap(obj);
+      accounting::ContinuousSpaceBitmap* bitmap =
+          heap_->GetLiveBitmap()->GetContinuousSpaceBitmap(obj);
 
       if (bitmap == nullptr) {
         LOG(ERROR) << "Object " << obj << " has no bitmap";
@@ -2377,9 +2400,11 @@
       WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
       // Swapping bound bitmaps does nothing.
       gc->SwapBitmaps();
+      SwapSemiSpaces();
       if (!VerifyHeapReferences()) {
         LOG(FATAL) << "Pre sweeping " << gc->GetName() << " GC verification failed";
       }
+      SwapSemiSpaces();
       gc->SwapBitmaps();
     }
   }
@@ -2465,25 +2490,11 @@
 
 bool Heap::IsMovableObject(const mirror::Object* obj) const {
   if (kMovingCollector) {
-    DCHECK(!IsInTempSpace(obj));
-    if (bump_pointer_space_->HasAddress(obj)) {
-      return true;
+    space::Space* space = FindContinuousSpaceFromObject(obj, true);
+    if (space != nullptr) {
+      // TODO: Check large object?
+      return space->CanMoveObjects();
     }
-    // TODO: Refactor this logic into the space itself?
-    // Objects in the main space are only copied during background -> foreground transitions or
-    // visa versa.
-    if (main_space_ != nullptr && main_space_->HasAddress(obj) &&
-        (IsCompactingGC(background_collector_type_) ||
-            IsCompactingGC(post_zygote_collector_type_))) {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool Heap::IsInTempSpace(const mirror::Object* obj) const {
-  if (temp_space_->HasAddress(obj) && !temp_space_->Contains(obj)) {
-    return true;
   }
   return false;
 }
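
IsMovableObject now delegates to the space itself via CanMoveObjects() rather than hard-coding per-space rules in the heap: each space knows whether its objects can move. A tiny sketch of that delegation; the space classes here are stubs, not ART's.

#include <iostream>
#include <memory>
#include <vector>

struct Space {
  virtual ~Space() = default;
  virtual bool CanMoveObjects() const = 0;
  virtual bool HasAddress(const void* obj) const = 0;
};

struct BumpPointerSpace : Space {
  bool CanMoveObjects() const override { return true; }          // always compacted
  bool HasAddress(const void*) const override { return false; }  // stub
};

struct ZygoteSpace : Space {
  bool CanMoveObjects() const override { return false; }         // never moves
  bool HasAddress(const void*) const override { return true; }   // stub
};

bool IsMovableObject(const std::vector<std::unique_ptr<Space>>& spaces, const void* obj) {
  for (const auto& space : spaces) {
    if (space->HasAddress(obj)) return space->CanMoveObjects();
  }
  return false;
}

int main() {
  std::vector<std::unique_ptr<Space>> spaces;
  spaces.push_back(std::make_unique<BumpPointerSpace>());
  spaces.push_back(std::make_unique<ZygoteSpace>());
  int x = 0;
  std::cout << IsMovableObject(spaces, &x) << "\n";  // 0: the stub finds it in the zygote space
  return 0;
}
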
@@ -2511,22 +2522,33 @@
   return nullptr;
 }
 
+double Heap::HeapGrowthMultiplier() const {
+  // If we don't care about pause times we are background, so return 1.0.
+  if (!CareAboutPauseTimes() || IsLowMemoryMode()) {
+    return 1.0;
+  }
+  return foreground_heap_growth_multiplier_;
+}
+
 void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran) {
   // We know what our utilization is at this moment.
   // This doesn't actually resize any memory. It just lets the heap grow more when necessary.
-  const size_t bytes_allocated = GetBytesAllocated();
+  const uint64_t bytes_allocated = GetBytesAllocated();
   last_gc_size_ = bytes_allocated;
   last_gc_time_ns_ = NanoTime();
-  size_t target_size;
+  uint64_t target_size;
   collector::GcType gc_type = collector_ran->GetGcType();
   if (gc_type != collector::kGcTypeSticky) {
     // Grow the heap for non sticky GC.
-    target_size = bytes_allocated / GetTargetHeapUtilization();
-    if (target_size > bytes_allocated + max_free_) {
-      target_size = bytes_allocated + max_free_;
-    } else if (target_size < bytes_allocated + min_free_) {
-      target_size = bytes_allocated + min_free_;
-    }
+    // Use the multiplier to grow more for foreground.
+    const float multiplier = HeapGrowthMultiplier();
+    intptr_t delta = bytes_allocated / GetTargetHeapUtilization() - bytes_allocated;
+    CHECK_GE(delta, 0);
+    target_size = bytes_allocated + delta * multiplier;
+    target_size = std::min(target_size,
+                           bytes_allocated + static_cast<uint64_t>(max_free_ * multiplier));
+    target_size = std::max(target_size,
+                           bytes_allocated + static_cast<uint64_t>(min_free_ * multiplier));
     native_need_to_run_finalization_ = true;
     next_gc_type_ = collector::kGcTypeSticky;
   } else {
@@ -2539,7 +2561,7 @@
     // We also check that the bytes allocated aren't over the footprint limit in order to prevent a
     // pathological case where dead objects which aren't reclaimed by sticky could get accumulated
     // if the sticky GC throughput always remained >= the full/partial throughput.
-    if (collector_ran->GetEstimatedLastIterationThroughput() >=
+    if (collector_ran->GetEstimatedLastIterationThroughput() * kStickyGcThroughputAdjustment >=
         non_sticky_collector->GetEstimatedMeanThroughput() &&
         non_sticky_collector->GetIterations() > 0 &&
         bytes_allocated <= max_allowed_footprint_) {
@@ -2551,7 +2573,7 @@
     if (bytes_allocated + max_free_ < max_allowed_footprint_) {
       target_size = bytes_allocated + max_free_;
     } else {
-      target_size = std::max(bytes_allocated, max_allowed_footprint_);
+      target_size = std::max(bytes_allocated, static_cast<uint64_t>(max_allowed_footprint_));
     }
   }
   if (!ignore_max_footprint_) {
@@ -2575,7 +2597,8 @@
       // Start a concurrent GC when we get close to the estimated remaining bytes. When the
       // allocation rate is very high, remaining_bytes could tell us that we should start a GC
       // right away.
-      concurrent_start_bytes_ = std::max(max_allowed_footprint_ - remaining_bytes, bytes_allocated);
+      concurrent_start_bytes_ = std::max(max_allowed_footprint_ - remaining_bytes,
+                                         static_cast<size_t>(bytes_allocated));
     }
   }
 }
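
Worked numbers for the new growth formula above: delta is what full utilization would add, it is scaled by the foreground multiplier, and the result is clamped between min_free and max_free (both also scaled). The parameter values below are invented for illustration (target utilization 0.5, min_free 512 KB, max_free 8 MB, foreground multiplier 2.0).

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t bytes_allocated = 20ull * 1024 * 1024;  // 20 MB live after GC
  const double utilization = 0.5;
  const double multiplier = 2.0;  // foreground
  const uint64_t min_free = 512 * 1024;
  const uint64_t max_free = 8ull * 1024 * 1024;

  int64_t delta = static_cast<int64_t>(bytes_allocated / utilization - bytes_allocated);
  uint64_t target = bytes_allocated + static_cast<uint64_t>(delta * multiplier);
  target = std::min(target, bytes_allocated + static_cast<uint64_t>(max_free * multiplier));
  target = std::max(target, bytes_allocated + static_cast<uint64_t>(min_free * multiplier));
  // delta = 20 MB, unclamped target = 60 MB, clamped by 2 * max_free down to 36 MB.
  std::cout << "target footprint: " << target / (1024 * 1024) << " MB\n";
  return 0;
}
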
@@ -2663,6 +2686,10 @@
 }
 
 void Heap::RequestHeapTrim() {
+  // Request a heap trim only if we do not currently care about pause times.
+  if (CareAboutPauseTimes()) {
+    return;
+  }
   // GC completed and now we must decide whether to request a heap trim (advising pages back to the
   // kernel) or not. Issuing a request will also cause trimming of the libc heap. As a trim scans
   // a space it will hold its lock and can become a cause of jank.
@@ -2684,21 +2711,17 @@
     // as we don't hold the lock while requesting the trim).
     return;
   }
-
-  // Request a heap trim only if we do not currently care about pause times.
-  if (!CareAboutPauseTimes()) {
-    {
-      MutexLock mu(self, *heap_trim_request_lock_);
-      if (last_trim_time_ + kHeapTrimWait >= NanoTime()) {
-        // We have done a heap trim in the last kHeapTrimWait nanosecs, don't request another one
-        // just yet.
-        return;
-      }
-      heap_trim_request_pending_ = true;
+  {
+    MutexLock mu(self, *heap_trim_request_lock_);
+    if (last_trim_time_ + kHeapTrimWait >= NanoTime()) {
+      // We have done a heap trim in the last kHeapTrimWait nanosecs, don't request another one
+      // just yet.
+      return;
     }
-    // Notify the daemon thread which will actually do the heap trim.
-    SignalHeapTrimDaemon(self);
+    heap_trim_request_pending_ = true;
   }
+  // Notify the daemon thread which will actually do the heap trim.
+  SignalHeapTrimDaemon(self);
 }
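
RequestHeapTrim now reduces to two gates: never trim while pause times matter, and never trim more than once per kHeapTrimWait window. A minimal model of that throttling; the names and the 5-second window are illustrative, and the real code sets heap_trim_request_pending_ under heap_trim_request_lock_ and signals a daemon thread.

#include <chrono>
#include <iostream>

class TrimThrottle {
 public:
  using Clock = std::chrono::steady_clock;
  static constexpr std::chrono::seconds kHeapTrimWait{5};

  TrimThrottle() : last_trim_(Clock::now() - 2 * kHeapTrimWait) {}

  // Mirrors the shape of RequestHeapTrim: refuse in the foreground, and allow
  // at most one trim request per kHeapTrimWait window.
  bool MaybeRequest(bool care_about_pause_times) {
    if (care_about_pause_times) {
      return false;  // jank-perceptible: a trim would hold space locks and cause jank
    }
    const auto now = Clock::now();
    if (last_trim_ + kHeapTrimWait >= now) {
      return false;  // trimmed too recently
    }
    last_trim_ = now;
    return true;
  }

 private:
  Clock::time_point last_trim_;
};

int main() {
  TrimThrottle throttle;
  std::cout << throttle.MaybeRequest(false)        // 1: first request goes through
            << throttle.MaybeRequest(false)        // 0: within the wait window
            << throttle.MaybeRequest(true) << "\n";  // 0: foreground never trims
  return 0;
}
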
 
 void Heap::SignalHeapTrimDaemon(Thread* self) {
@@ -2785,7 +2808,7 @@
       if (IsGcConcurrent()) {
         RequestConcurrentGC(self);
       } else {
-        CollectGarbageInternal(gc_type, kGcCauseForAlloc, false);
+        CollectGarbageInternal(gc_type, kGcCauseForNativeAlloc, false);
       }
     }
   }
@@ -2854,14 +2877,14 @@
 void Heap::ClearMarkedObjects() {
   // Clear all of the spaces' mark bitmaps.
   for (const auto& space : GetContinuousSpaces()) {
-    accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = space->GetMarkBitmap();
     if (space->GetLiveBitmap() != mark_bitmap) {
       mark_bitmap->Clear();
     }
   }
   // Clear the marked objects in the discontinuous space object sets.
   for (const auto& space : GetDiscontinuousSpaces()) {
-    space->GetMarkObjects()->Clear();
+    space->GetMarkBitmap()->Clear();
   }
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index ffb4e59..ceba8b6 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -55,7 +55,6 @@
 namespace accounting {
   class HeapBitmap;
   class ModUnionTable;
-  class ObjectSet;
   class RememberedSet;
 }  // namespace accounting
 
@@ -115,6 +114,8 @@
 };
 std::ostream& operator<<(std::ostream& os, const ProcessState& process_state);
 
+std::ostream& operator<<(std::ostream& os, const RootType& root_type);
+
 class Heap {
  public:
   // If true, measure the total allocation time.
@@ -130,9 +131,8 @@
   static constexpr size_t kDefaultLongPauseLogThreshold = MsToNs(5);
   static constexpr size_t kDefaultLongGCLogThreshold = MsToNs(100);
   static constexpr size_t kDefaultTLABSize = 256 * KB;
-
-  // Default target utilization.
   static constexpr double kDefaultTargetUtilization = 0.5;
+  static constexpr double kDefaultHeapGrowthMultiplier = 2.0;
 
   // Used so that we don't overflow the allocation time atomic integer.
   static constexpr size_t kTimeAdjust = 1024;
@@ -146,9 +146,10 @@
   // image_file_names names specify Spaces to load based on
   // ImageWriter output.
   explicit Heap(size_t initial_size, size_t growth_limit, size_t min_free,
-                size_t max_free, double target_utilization, size_t capacity,
+                size_t max_free, double target_utilization,
+                double foreground_heap_growth_multiplier, size_t capacity,
                 const std::string& original_image_file_name,
-                CollectorType post_zygote_collector_type, CollectorType background_collector_type,
+                CollectorType foreground_collector_type, CollectorType background_collector_type,
                 size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
                 size_t long_pause_threshold, size_t long_gc_threshold,
                 bool ignore_max_footprint, bool use_tlab, bool verify_pre_gc_heap,
@@ -194,8 +195,6 @@
   void VisitObjects(ObjectCallback callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  void SwapSemiSpaces() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   void CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void ThrowOutOfMemoryError(size_t byte_count, bool large_object_allocation);
@@ -247,10 +246,6 @@
   // Returns true if there is any chance that the object (obj) will move.
   bool IsMovableObject(const mirror::Object* obj) const;
 
-  // Returns true if an object is in the temp space, if this happens its usually indicative of
-  // compaction related errors.
-  bool IsInTempSpace(const mirror::Object* obj) const;
-
   // Enables us to temporarily disable the moving GC until objects are released.
   void IncrementDisableMovingGC(Thread* self);
   void DecrementDisableMovingGC(Thread* self);
@@ -355,6 +350,10 @@
     return low_memory_mode_;
   }
 
+  // Returns the heap growth multiplier; this affects how much we grow the heap after a GC.
+  // Scales heap growth, min free, and max free.
+  double HeapGrowthMultiplier() const;
+
   // Freed bytes can be negative in cases where we copy objects from a compacted space to a
   // free-list backed space.
   void RecordFree(ssize_t freed_objects, ssize_t freed_bytes);
@@ -474,8 +473,11 @@
       LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_, Locks::thread_list_lock_);
 
   // Mark all the objects in the allocation stack in the specified bitmap.
-  void MarkAllocStack(accounting::SpaceBitmap* bitmap1, accounting::SpaceBitmap* bitmap2,
-                      accounting::ObjectSet* large_objects, accounting::ObjectStack* stack)
+  // TODO: Refactor?
+  void MarkAllocStack(accounting::SpaceBitmap<kObjectAlignment>* bitmap1,
+                      accounting::SpaceBitmap<kObjectAlignment>* bitmap2,
+                      accounting::SpaceBitmap<kLargeObjectAlignment>* large_objects,
+                      accounting::ObjectStack* stack)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Mark the specified allocation stack as live.
@@ -566,7 +568,8 @@
 
  private:
   void Compact(space::ContinuousMemMapAllocSpace* target_space,
-               space::ContinuousMemMapAllocSpace* source_space);
+               space::ContinuousMemMapAllocSpace* source_space)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void FinishGC(Thread* self, collector::GcType gc_type) LOCKS_EXCLUDED(gc_complete_lock_);
 
@@ -578,7 +581,7 @@
   static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
     return AllocatorHasAllocationStack(allocator_type);
   }
-  static bool IsCompactingGC(CollectorType collector_type) {
+  static bool IsMovingGc(CollectorType collector_type) {
     return collector_type == kCollectorTypeSS || collector_type == kCollectorTypeGSS ||
         collector_type == kCollectorTypeCC;
   }
@@ -607,6 +610,10 @@
                                size_t bytes)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Needs to be done with mutators paused so that a thread doesn't accidentally allocate
+  // into the wrong space.
+  void SwapSemiSpaces() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Try to allocate a number of bytes, this function never does any GCs. Needs to be inlined so
   // that the switch statement is constant optimized in the entrypoints.
   template <const bool kInstrumented, const bool kGrow>
@@ -666,6 +673,10 @@
   // Find a collector based on GC type.
   collector::GarbageCollector* FindCollectorByGcType(collector::GcType gc_type);
 
+  // Create the main free list space, typically either a RosAlloc space or a DlMalloc space.
+  void CreateMainMallocSpace(MemMap* mem_map, size_t initial_size, size_t growth_limit,
+                             size_t capacity);
+
   // Given the current contents of the alloc space, increase the allowed heap footprint to match
   // the target utilization ratio.  This should only be called immediately after a full garbage
   // collection.
@@ -735,17 +746,10 @@
   // A remembered set remembers all of the references from its space to the target space.
   SafeMap<space::Space*, accounting::RememberedSet*> remembered_sets_;
 
-  // Keep the free list allocator mem map lying around when we transition to background so that we
-  // don't have to worry about virtual address space fragmentation.
-  UniquePtr<MemMap> allocator_mem_map_;
-
-  // The mem-map which we will use for the non-moving space after the zygote is done forking:
-  UniquePtr<MemMap> post_zygote_non_moving_space_mem_map_;
-
   // The current collector type.
   CollectorType collector_type_;
-  // Which collector we will switch to after zygote fork.
-  CollectorType post_zygote_collector_type_;
+  // Which collector we use when the app is in the foreground.
+  CollectorType foreground_collector_type_;
   // Which collector we will use when the app is notified of a transition to background.
   CollectorType background_collector_type_;
   // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_.
@@ -927,6 +931,9 @@
   // Target ideal heap utilization ratio
   double target_utilization_;
 
+  // How much more we grow the heap when the app is in the foreground instead of the background.
+  double foreground_heap_growth_multiplier_;
+
   // Total time which mutators are paused or waiting for GC to complete.
   uint64_t total_wait_time_;
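
foreground_heap_growth_multiplier_ (default kDefaultHeapGrowthMultiplier = 2.0 above) scales the min-free/max-free headroom after a GC. A hedged sketch of the arithmetic; the clamp shape below is the usual utilization-based growth and is assumed, not lifted from this patch:

    #include <algorithm>
    #include <cstddef>

    // Sketch only: ideal footprint for the target utilization ratio,
    // clamped into [allocated + scaled min_free, allocated + scaled max_free].
    size_t GrowFootprintSketch(size_t bytes_allocated, double target_utilization,
                               double multiplier, size_t min_free, size_t max_free) {
      size_t target = static_cast<size_t>(bytes_allocated / target_utilization);
      const size_t scaled_min = static_cast<size_t>(min_free * multiplier);
      const size_t scaled_max = static_cast<size_t>(max_free * multiplier);
      target = std::max(target, bytes_allocated + scaled_min);
      target = std::min(target, bytes_allocated + scaled_max);
      return target;
    }

For example, with 50 MB allocated, utilization 0.5, min_free 512 KB, and max_free 2 MB, a background app (multiplier 1.0) is capped at 52 MB while a foreground app (multiplier 2.0) may grow to 54 MB.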
 
diff --git a/runtime/gc/heap_test.cc b/runtime/gc/heap_test.cc
index 07e5088..a85ad4d 100644
--- a/runtime/gc/heap_test.cc
+++ b/runtime/gc/heap_test.cc
@@ -60,13 +60,11 @@
 
 TEST_F(HeapTest, HeapBitmapCapacityTest) {
   byte* heap_begin = reinterpret_cast<byte*>(0x1000);
-  const size_t heap_capacity = accounting::SpaceBitmap::kAlignment * (sizeof(intptr_t) * 8 + 1);
-  UniquePtr<accounting::SpaceBitmap> bitmap(accounting::SpaceBitmap::Create("test bitmap",
-                                                                            heap_begin,
-                                                                            heap_capacity));
+  const size_t heap_capacity = kObjectAlignment * (sizeof(intptr_t) * 8 + 1);
+  UniquePtr<accounting::ContinuousSpaceBitmap> bitmap(
+      accounting::ContinuousSpaceBitmap::Create("test bitmap", heap_begin, heap_capacity));
   mirror::Object* fake_end_of_heap_object =
-      reinterpret_cast<mirror::Object*>(&heap_begin[heap_capacity -
-                                                    accounting::SpaceBitmap::kAlignment]);
+      reinterpret_cast<mirror::Object*>(&heap_begin[heap_capacity - kObjectAlignment]);
   bitmap->Set(fake_end_of_heap_object);
 }
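
The SpaceBitmap → ContinuousSpaceBitmap/LargeObjectBitmap rename that runs through this patch comes from templating the bitmap on the alignment of the chunks it tracks; heap.h above uses SpaceBitmap<kObjectAlignment> and SpaceBitmap<kLargeObjectAlignment> directly. A sketch of the aliases this implies (the exact spelling in space_bitmap.h and the alignment values are assumptions):

    // Assumed values for illustration: objects are 8-byte aligned (matching
    // BumpPointerSpace::kAlignment below); large objects are page aligned.
    static constexpr size_t kObjectAlignment = 8;
    static constexpr size_t kLargeObjectAlignment = 4096;

    template <size_t kAlignment>
    class SpaceBitmap;  // One bit per kAlignment bytes of the mapped range.

    typedef SpaceBitmap<kObjectAlignment> ContinuousSpaceBitmap;
    typedef SpaceBitmap<kLargeObjectAlignment> LargeObjectBitmap;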
 
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index a955cc8..90ffe59 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -38,6 +38,10 @@
   return new BumpPointerSpace(name, mem_map.release());
 }
 
+BumpPointerSpace* BumpPointerSpace::CreateFromMemMap(const std::string& name, MemMap* mem_map) {
+  return new BumpPointerSpace(name, mem_map);
+}
+
 BumpPointerSpace::BumpPointerSpace(const std::string& name, byte* begin, byte* limit)
     : ContinuousMemMapAllocSpace(name, nullptr, begin, begin, limit,
                                  kGcRetentionPolicyAlwaysCollect),
@@ -61,9 +65,6 @@
 void BumpPointerSpace::Clear() {
   // Release the pages back to the operating system.
   CHECK_NE(madvise(Begin(), Limit() - Begin(), MADV_DONTNEED), -1) << "madvise failed";
-}
-
-void BumpPointerSpace::Reset() {
   // Reset the end of the space back to the beginning; we move the end forward as we allocate
   // objects.
   SetEnd(Begin());
@@ -196,7 +197,7 @@
   }
 }
 
-accounting::SpaceBitmap::SweepCallback* BumpPointerSpace::GetSweepCallback() {
+accounting::ContinuousSpaceBitmap::SweepCallback* BumpPointerSpace::GetSweepCallback() {
   LOG(FATAL) << "Unimplemented";
   return nullptr;
 }
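
The merged Clear() can empty a bump pointer space simply because allocation is a pointer bump; once End() is rewound to Begin(), every prior allocation is implicitly gone. A simplified, non-atomic sketch of the idea (the real Alloc handles alignment, limits, and concurrency, and SetEnd is assumed accessible here):

    // Sketch: bump-pointer allocation. Emptying the space is just
    // SetEnd(Begin()), which the new Clear() does after the madvise.
    mirror::Object* AllocSketch(BumpPointerSpace* space, size_t num_bytes) {
      byte* old_end = space->End();
      if (old_end + num_bytes > space->Limit()) {
        return nullptr;  // Out of space.
      }
      space->SetEnd(old_end + num_bytes);
      return reinterpret_cast<mirror::Object*>(old_end);
    }
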
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 3ab5df4..e52a9a3 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -43,6 +43,7 @@
   // guaranteed to be granted; if it is required, the caller should call Begin on the returned
   // space to confirm the request was granted.
   static BumpPointerSpace* Create(const std::string& name, size_t capacity, byte* requested_begin);
+  static BumpPointerSpace* CreateFromMemMap(const std::string& name, MemMap* mem_map);
 
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -84,19 +85,16 @@
     return GetMemMap()->Size();
   }
 
-  accounting::SpaceBitmap* GetLiveBitmap() const OVERRIDE {
+  accounting::ContinuousSpaceBitmap* GetLiveBitmap() const OVERRIDE {
     return nullptr;
   }
 
-  accounting::SpaceBitmap* GetMarkBitmap() const OVERRIDE {
+  accounting::ContinuousSpaceBitmap* GetMarkBitmap() const OVERRIDE {
     return nullptr;
   }
 
-  // Madvise the memory back to the OS.
-  void Clear() OVERRIDE;
-
-  // Reset the pointer to the start of the space.
-  void Reset() OVERRIDE LOCKS_EXCLUDED(block_lock_);
+  // Reset the space to empty.
+  void Clear() OVERRIDE LOCKS_EXCLUDED(block_lock_);
 
   void Dump(std::ostream& os) const;
 
@@ -113,6 +111,9 @@
     return Begin() == End();
   }
 
+  bool CanMoveObjects() const OVERRIDE {
+    return true;
+  }
 
   bool Contains(const mirror::Object* obj) const {
     const byte* byte_obj = reinterpret_cast<const byte*>(obj);
@@ -137,7 +138,7 @@
   void Walk(ObjectCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  accounting::SpaceBitmap::SweepCallback* GetSweepCallback() OVERRIDE;
+  accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() OVERRIDE;
 
   // Object alignment within the space.
   static constexpr size_t kAlignment = 8;
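
CanMoveObjects() gives collectors a single query that varies by space kind: true here for bump pointer spaces, false for image and large object spaces, and the constructor-supplied flag for malloc spaces. A hypothetical call site (the helper and its wiring are illustrative, not from this patch):

    #include <vector>

    // Hypothetical helper: an object may be relocated only if the space
    // containing it reports CanMoveObjects().
    bool MayMove(const mirror::Object* obj,
                 const std::vector<space::ContinuousSpace*>& spaces) {
      for (space::ContinuousSpace* space : spaces) {
        if (space->Contains(obj)) {
          return space->CanMoveObjects();
        }
      }
      return false;  // Conservatively treat unknown objects as pinned.
    }
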
diff --git a/runtime/gc/space/dlmalloc_space-inl.h b/runtime/gc/space/dlmalloc_space-inl.h
index 02d8b54..4c8a35e 100644
--- a/runtime/gc/space/dlmalloc_space-inl.h
+++ b/runtime/gc/space/dlmalloc_space-inl.h
@@ -52,7 +52,7 @@
 inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(Thread* /*self*/, size_t num_bytes,
                                                                size_t* bytes_allocated,
                                                                size_t* usable_size) {
-  mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_for_alloc_, num_bytes));
+  mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_, num_bytes));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
       CHECK(Contains(result)) << "Allocation (" << reinterpret_cast<void*>(result)
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 60f566c..41a0458 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "dlmalloc_space.h"
-
 #include "dlmalloc_space-inl.h"
+
 #include "gc/accounting/card_table.h"
+#include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
@@ -36,15 +36,19 @@
 template class ValgrindMallocSpace<DlMallocSpace, void*>;
 
 DlMallocSpace::DlMallocSpace(const std::string& name, MemMap* mem_map, void* mspace, byte* begin,
-                             byte* end, byte* limit, size_t growth_limit)
-    : MallocSpace(name, mem_map, begin, end, limit, growth_limit),
-      mspace_(mspace), mspace_for_alloc_(mspace) {
+                             byte* end, byte* limit, size_t growth_limit,
+                             bool can_move_objects, size_t starting_size,
+                             size_t initial_size)
+    : MallocSpace(name, mem_map, begin, end, limit, growth_limit, true, can_move_objects,
+                  starting_size, initial_size),
+      mspace_(mspace) {
   CHECK(mspace != NULL);
 }
 
 DlMallocSpace* DlMallocSpace::CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                                size_t starting_size, size_t initial_size,
-                                               size_t growth_limit, size_t capacity) {
+                                               size_t growth_limit, size_t capacity,
+                                               bool can_move_objects) {
   DCHECK(mem_map != nullptr);
   void* mspace = CreateMspace(mem_map->Begin(), starting_size, initial_size);
   if (mspace == nullptr) {
@@ -62,14 +66,17 @@
   byte* begin = mem_map->Begin();
   if (Runtime::Current()->RunningOnValgrind()) {
     return new ValgrindMallocSpace<DlMallocSpace, void*>(
-        name, mem_map, mspace, begin, end, begin + capacity, growth_limit, initial_size);
+        name, mem_map, mspace, begin, end, begin + capacity, growth_limit, initial_size,
+        can_move_objects, starting_size);
   } else {
-    return new DlMallocSpace(name, mem_map, mspace, begin, end, begin + capacity, growth_limit);
+    return new DlMallocSpace(name, mem_map, mspace, begin, end, begin + capacity, growth_limit,
+                             can_move_objects, starting_size, initial_size);
   }
 }
 
-DlMallocSpace* DlMallocSpace::Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                                     size_t capacity, byte* requested_begin) {
+DlMallocSpace* DlMallocSpace::Create(const std::string& name, size_t initial_size,
+                                     size_t growth_limit, size_t capacity, byte* requested_begin,
+                                     bool can_move_objects) {
   uint64_t start_time = 0;
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     start_time = NanoTime();
@@ -93,7 +100,7 @@
     return nullptr;
   }
   DlMallocSpace* space = CreateFromMemMap(mem_map, name, starting_size, initial_size,
-                                          growth_limit, capacity);
+                                          growth_limit, capacity, can_move_objects);
   // We start out with only the initial size possibly containing objects.
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "DlMallocSpace::Create exiting (" << PrettyDuration(NanoTime() - start_time)
@@ -143,8 +150,10 @@
 
 MallocSpace* DlMallocSpace::CreateInstance(const std::string& name, MemMap* mem_map,
                                            void* allocator, byte* begin, byte* end,
-                                           byte* limit, size_t growth_limit) {
-  return new DlMallocSpace(name, mem_map, allocator, begin, end, limit, growth_limit);
+                                           byte* limit, size_t growth_limit,
+                                           bool can_move_objects) {
+  return new DlMallocSpace(name, mem_map, allocator, begin, end, limit, growth_limit,
+                           can_move_objects, starting_size_, initial_size_);
 }
 
 size_t DlMallocSpace::Free(Thread* self, mirror::Object* ptr) {
@@ -280,13 +289,13 @@
 }
 
 void DlMallocSpace::Clear() {
+  size_t footprint_limit = GetFootprintLimit();
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
-  GetLiveBitmap()->Clear();
-  GetMarkBitmap()->Clear();
-}
-
-void DlMallocSpace::Reset() {
-  // TODO: Delete and create new mspace here.
+  live_bitmap_->Clear();
+  mark_bitmap_->Clear();
+  end_ = Begin() + starting_size_;
+  mspace_ = CreateMspace(mem_map_->Begin(), starting_size_, initial_size_);
+  SetFootprintLimit(footprint_limit);
 }
 
 #ifndef NDEBUG
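
The new Clear() makes the space reusable in place rather than freed: save the footprint limit, madvise the pages away, wipe both bitmaps, rewind end_ to the starting size, rebuild the allocator, and restore the limit. RosAllocSpace::Clear() later in this patch follows the same recipe with a rosalloc instead of an mspace. The ordering, condensed as a sketch (only the accessors shown in the diff are used; the free function itself is illustrative):

    #include <sys/mman.h>

    // Sketch of the shared reset recipe; mirrors DlMallocSpace::Clear above.
    void ClearInPlaceSketch(MallocSpace* space) {
      size_t footprint_limit = space->GetFootprintLimit();   // 1. save the limit
      madvise(space->GetMemMap()->Begin(), space->GetMemMap()->Size(),
              MADV_DONTNEED);                                // 2. return pages to the OS
      space->GetLiveBitmap()->Clear();                       // 3. wipe the live bitmap
      space->GetMarkBitmap()->Clear();                       //    ... and the mark bitmap
      // 4.-5. Rewind end_ to Begin() + starting_size_ and recreate the
      // allocator: member-only steps in the real code.
      space->SetFootprintLimit(footprint_limit);             // 6. restore the limit
    }
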
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 76c4489..accd26b 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -36,14 +36,15 @@
   // Create a DlMallocSpace from an existing mem_map.
   static DlMallocSpace* CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                          size_t starting_size, size_t initial_size,
-                                         size_t growth_limit, size_t capacity);
+                                         size_t growth_limit, size_t capacity,
+                                         bool can_move_objects);
 
   // Create a DlMallocSpace with the requested sizes. The requested
   // base address is not guaranteed to be granted; if it is required,
   // the caller should call Begin on the returned space to confirm the
   // request was granted.
   static DlMallocSpace* Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                               size_t capacity, byte* requested_begin);
+                               size_t capacity, byte* requested_begin, bool can_move_objects);
 
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -107,13 +108,13 @@
   void SetFootprintLimit(size_t limit) OVERRIDE;
 
   MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                              byte* begin, byte* end, byte* limit, size_t growth_limit);
+                              byte* begin, byte* end, byte* limit, size_t growth_limit,
+                              bool can_move_objects);
 
   uint64_t GetBytesAllocated() OVERRIDE;
   uint64_t GetObjectsAllocated() OVERRIDE;
 
-  void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+  virtual void Clear() OVERRIDE;
 
   bool IsDlMallocSpace() const OVERRIDE {
     return true;
@@ -125,7 +126,8 @@
 
  protected:
   DlMallocSpace(const std::string& name, MemMap* mem_map, void* mspace, byte* begin, byte* end,
-                byte* limit, size_t growth_limit);
+                byte* limit, size_t growth_limit, bool can_move_objects, size_t starting_size,
+                size_t initial_size);
 
  private:
   mirror::Object* AllocWithoutGrowthLocked(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -142,11 +144,7 @@
   static const size_t kChunkOverhead = kWordSize;
 
   // Underlying malloc space.
-  void* const mspace_;
-
-  // An mspace pointer used for allocation. Equals  mspace_ or nullptr after InvalidateAllocator()
-  // is called.
-  void* mspace_for_alloc_;
+  void* mspace_;
 
   friend class collector::MarkSweep;
 
diff --git a/runtime/gc/space/dlmalloc_space_base_test.cc b/runtime/gc/space/dlmalloc_space_base_test.cc
index 508d869..129eace 100644
--- a/runtime/gc/space/dlmalloc_space_base_test.cc
+++ b/runtime/gc/space/dlmalloc_space_base_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_BASE(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/dlmalloc_space_random_test.cc b/runtime/gc/space/dlmalloc_space_random_test.cc
index 43a1bf0..c4f8bae 100644
--- a/runtime/gc/space/dlmalloc_space_random_test.cc
+++ b/runtime/gc/space/dlmalloc_space_random_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_RANDOM(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/dlmalloc_space_static_test.cc b/runtime/gc/space/dlmalloc_space_static_test.cc
index 4fbc81e..edaa198 100644
--- a/runtime/gc/space/dlmalloc_space_static_test.cc
+++ b/runtime/gc/space/dlmalloc_space_static_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_STATIC(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index faa539f..91d8820 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -35,7 +35,7 @@
 Atomic<uint32_t> ImageSpace::bitmap_index_(0);
 
 ImageSpace::ImageSpace(const std::string& name, MemMap* mem_map,
-                       accounting::SpaceBitmap* live_bitmap)
+                       accounting::ContinuousSpaceBitmap* live_bitmap)
     : MemMapSpace(name, mem_map, mem_map->Begin(), mem_map->End(), mem_map->End(),
                   kGcRetentionPolicyNeverCollect) {
   DCHECK(live_bitmap != nullptr);
@@ -197,10 +197,10 @@
   uint32_t bitmap_index = bitmap_index_.FetchAndAdd(1);
   std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u", image_file_name,
                                        bitmap_index));
-  UniquePtr<accounting::SpaceBitmap> bitmap(
-      accounting::SpaceBitmap::CreateFromMemMap(bitmap_name, image_map.release(),
-                                                reinterpret_cast<byte*>(map->Begin()),
-                                                map->Size()));
+  UniquePtr<accounting::ContinuousSpaceBitmap> bitmap(
+      accounting::ContinuousSpaceBitmap::CreateFromMemMap(bitmap_name, image_map.release(),
+                                                          reinterpret_cast<byte*>(map->Begin()),
+                                                          map->Size()));
   if (bitmap.get() == nullptr) {
     *error_msg = StringPrintf("Could not create bitmap '%s'", bitmap_name.c_str());
     return nullptr;
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index 116c498..f6daf89 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -17,6 +17,7 @@
 #ifndef ART_RUNTIME_GC_SPACE_IMAGE_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_IMAGE_SPACE_H_
 
+#include "gc/accounting/space_bitmap.h"
 #include "space.h"
 
 namespace art {
@@ -59,11 +60,11 @@
     return GetName();
   }
 
-  accounting::SpaceBitmap* GetLiveBitmap() const {
+  accounting::ContinuousSpaceBitmap* GetLiveBitmap() const OVERRIDE {
     return live_bitmap_.get();
   }
 
-  accounting::SpaceBitmap* GetMarkBitmap() const {
+  accounting::ContinuousSpaceBitmap* GetMarkBitmap() const OVERRIDE {
     // ImageSpaces have the same bitmap for both live and marked. This helps reduce the number of
     // special cases to test against.
     return live_bitmap_.get();
@@ -75,6 +76,10 @@
   void Sweep(bool /* swap_bitmaps */, size_t* /* freed_objects */, size_t* /* freed_bytes */) {
   }
 
+  bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
+
  private:
   // Tries to initialize an ImageSpace from the given image path,
   // returning NULL on error.
@@ -96,9 +101,10 @@
 
   static Atomic<uint32_t> bitmap_index_;
 
-  UniquePtr<accounting::SpaceBitmap> live_bitmap_;
+  UniquePtr<accounting::ContinuousSpaceBitmap> live_bitmap_;
 
-  ImageSpace(const std::string& name, MemMap* mem_map, accounting::SpaceBitmap* live_bitmap);
+  ImageSpace(const std::string& name, MemMap* mem_map,
+             accounting::ContinuousSpaceBitmap* live_bitmap);
 
   // The OatFile associated with the image during early startup to
   // reserve space contiguous to the image. It is later released to
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 0b353c7..ce11b3d 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -16,12 +16,14 @@
 
 #include "large_object_space.h"
 
+#include "gc/accounting/space_bitmap-inl.h"
 #include "base/logging.h"
 #include "base/mutex-inl.h"
 #include "base/stl_util.h"
 #include "UniquePtr.h"
 #include "image.h"
 #include "os.h"
+#include "space-inl.h"
 #include "thread-inl.h"
 #include "utils.h"
 
@@ -74,26 +76,27 @@
 };
 
 void LargeObjectSpace::SwapBitmaps() {
-  live_objects_.swap(mark_objects_);
+  live_bitmap_.swap(mark_bitmap_);
   // Swap names to get more descriptive diagnostics.
-  std::string temp_name = live_objects_->GetName();
-  live_objects_->SetName(mark_objects_->GetName());
-  mark_objects_->SetName(temp_name);
+  std::string temp_name = live_bitmap_->GetName();
+  live_bitmap_->SetName(mark_bitmap_->GetName());
+  mark_bitmap_->SetName(temp_name);
 }
 
-LargeObjectSpace::LargeObjectSpace(const std::string& name)
+LargeObjectSpace::LargeObjectSpace(const std::string& name, byte* begin, byte* end)
     : DiscontinuousSpace(name, kGcRetentionPolicyAlwaysCollect),
       num_bytes_allocated_(0), num_objects_allocated_(0), total_bytes_allocated_(0),
-      total_objects_allocated_(0) {
+      total_objects_allocated_(0), begin_(begin), end_(end) {
 }
 
 
 void LargeObjectSpace::CopyLiveToMarked() {
-  mark_objects_->CopyFrom(*live_objects_.get());
+  mark_bitmap_->CopyFrom(live_bitmap_.get());
 }
 
+// TODO: Use something cleaner than 0xFFFFFFFF.
 LargeObjectMapSpace::LargeObjectMapSpace(const std::string& name)
-    : LargeObjectSpace(name),
+    : LargeObjectSpace(name, reinterpret_cast<byte*>(0xFFFFFFFF), nullptr),
       lock_("large object map space lock", kAllocSpaceLock) {}
 
 LargeObjectMapSpace* LargeObjectMapSpace::Create(const std::string& name) {
@@ -118,7 +121,9 @@
   large_objects_.push_back(obj);
   mem_maps_.Put(obj, mem_map);
   size_t allocation_size = mem_map->Size();
-  DCHECK(bytes_allocated != NULL);
+  DCHECK(bytes_allocated != nullptr);
+  begin_ = std::min(begin_, reinterpret_cast<byte*>(obj));
+  end_ = std::max(end_, reinterpret_cast<byte*>(obj) + allocation_size);
   *bytes_allocated = allocation_size;
   if (usable_size != nullptr) {
     *usable_size = allocation_size;
@@ -191,9 +196,7 @@
 }
 
 FreeListSpace::FreeListSpace(const std::string& name, MemMap* mem_map, byte* begin, byte* end)
-    : LargeObjectSpace(name),
-      begin_(begin),
-      end_(end),
+    : LargeObjectSpace(name, begin, end),
       mem_map_(mem_map),
       lock_("free list space lock", kAllocSpaceLock) {
   free_end_ = end - begin;
@@ -389,27 +392,41 @@
   }
 }
 
-void LargeObjectSpace::Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes) {
-  // Sweep large objects
-  accounting::ObjectSet* large_live_objects = GetLiveObjects();
-  accounting::ObjectSet* large_mark_objects = GetMarkObjects();
-  if (swap_bitmaps) {
-    std::swap(large_live_objects, large_mark_objects);
-  }
-  DCHECK(freed_objects != nullptr);
-  DCHECK(freed_bytes != nullptr);
-  // O(n*log(n)) but hopefully there are not too many large objects.
-  size_t objects = 0;
-  size_t bytes = 0;
-  Thread* self = Thread::Current();
-  for (const mirror::Object* obj : large_live_objects->GetObjects()) {
-    if (!large_mark_objects->Test(obj)) {
-      bytes += Free(self, const_cast<mirror::Object*>(obj));
-      ++objects;
+void LargeObjectSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg) {
+  SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
+  space::LargeObjectSpace* space = context->space->AsLargeObjectSpace();
+  Thread* self = context->self;
+  Locks::heap_bitmap_lock_->AssertExclusiveHeld(self);
+  // If the bitmaps aren't swapped, we need to clear the bits here, since as an optimization
+  // the GC isn't going to re-swap the bitmaps.
+  if (!context->swap_bitmaps) {
+    accounting::LargeObjectBitmap* bitmap = space->GetLiveBitmap();
+    for (size_t i = 0; i < num_ptrs; ++i) {
+      bitmap->Clear(ptrs[i]);
     }
   }
-  *freed_objects += objects;
-  *freed_bytes += bytes;
+  context->freed_objects += num_ptrs;
+  context->freed_bytes += space->FreeList(self, num_ptrs, ptrs);
+}
+
+void LargeObjectSpace::Sweep(bool swap_bitmaps, size_t* out_freed_objects,
+                             size_t* out_freed_bytes) {
+  if (Begin() >= End()) {
+    return;
+  }
+  accounting::LargeObjectBitmap* live_bitmap = GetLiveBitmap();
+  accounting::LargeObjectBitmap* mark_bitmap = GetMarkBitmap();
+  if (swap_bitmaps) {
+    std::swap(live_bitmap, mark_bitmap);
+  }
+  DCHECK(out_freed_objects != nullptr);
+  DCHECK(out_freed_bytes != nullptr);
+  SweepCallbackContext scc(swap_bitmaps, this);
+  accounting::LargeObjectBitmap::SweepWalk(*live_bitmap, *mark_bitmap,
+                                           reinterpret_cast<uintptr_t>(Begin()),
+                                           reinterpret_cast<uintptr_t>(End()), SweepCallback, &scc);
+  *out_freed_objects += scc.freed_objects;
+  *out_freed_bytes += scc.freed_bytes;
 }
 
 }  // namespace space
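
Large object sweeping now reuses the generic bitmap walk: SweepWalk visits every object set in the live bitmap but clear in the mark bitmap within [begin, end) and hands the objects to a callback in batches. A minimal sketch of that contract (the callback signature matches the diff; the counting context is a made-up example):

    // Sketch only: count dead large objects without freeing them.
    struct CountContext {
      size_t dead_objects = 0;
    };

    static void CountDeadCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg) {
      static_cast<CountContext*>(arg)->dead_objects += num_ptrs;
    }

    size_t CountDead(accounting::LargeObjectBitmap& live,
                     accounting::LargeObjectBitmap& mark,
                     uintptr_t begin, uintptr_t end) {
      CountContext ctx;
      accounting::LargeObjectBitmap::SweepWalk(live, mark, begin, end,
                                               CountDeadCallback, &ctx);
      return ctx.dead_objects;
    }
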
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index eb01325..0daefba 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -49,11 +49,11 @@
     return num_objects_allocated_;
   }
 
-  uint64_t GetTotalBytesAllocated() {
+  uint64_t GetTotalBytesAllocated() const {
     return total_bytes_allocated_;
   }
 
-  uint64_t GetTotalObjectsAllocated() {
+  uint64_t GetTotalObjectsAllocated() const {
     return total_objects_allocated_;
   }
 
@@ -73,16 +73,36 @@
     return this;
   }
 
-  void Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes);
+  void Sweep(bool swap_bitmaps, size_t* out_freed_objects, size_t* out_freed_bytes);
+
+  virtual bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
+
+  // Current address at which the space begins, which may vary as the space is filled.
+  byte* Begin() const {
+    return begin_;
+  }
+
+  // Current address at which the space ends, which may vary as the space is filled.
+  byte* End() const {
+    return end_;
+  }
 
  protected:
-  explicit LargeObjectSpace(const std::string& name);
+  explicit LargeObjectSpace(const std::string& name, byte* begin, byte* end);
+
+  static void SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg);
 
   // Approximate numbers of bytes and objects which have been allocated into the space.
-  size_t num_bytes_allocated_;
-  size_t num_objects_allocated_;
-  size_t total_bytes_allocated_;
-  size_t total_objects_allocated_;
+  uint64_t num_bytes_allocated_;
+  uint64_t num_objects_allocated_;
+  uint64_t total_bytes_allocated_;
+  uint64_t total_objects_allocated_;
+
+  // Begin and end addresses, which may change as more large objects are allocated.
+  byte* begin_;
+  byte* end_;
 
   friend class Space;
 
@@ -238,9 +258,6 @@
   typedef std::set<AllocationHeader*, AllocationHeader::SortByPrevFree,
                    accounting::GcAllocator<AllocationHeader*> > FreeBlocks;
 
-  byte* const begin_;
-  byte* const end_;
-
   // There is no footer for any allocations at the end of the space, so we keep track of how much
   // free space there is at the end manually.
   UniquePtr<MemMap> mem_map_;
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index dac043e..7493c19 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -37,24 +37,26 @@
 
 MallocSpace::MallocSpace(const std::string& name, MemMap* mem_map,
                          byte* begin, byte* end, byte* limit, size_t growth_limit,
-                         bool create_bitmaps)
+                         bool create_bitmaps, bool can_move_objects, size_t starting_size,
+                         size_t initial_size)
     : ContinuousMemMapAllocSpace(name, mem_map, begin, end, limit, kGcRetentionPolicyAlwaysCollect),
       recent_free_pos_(0), lock_("allocation space lock", kAllocSpaceLock),
-      growth_limit_(growth_limit) {
+      growth_limit_(growth_limit), can_move_objects_(can_move_objects),
+      starting_size_(starting_size), initial_size_(initial_size) {
   if (create_bitmaps) {
     size_t bitmap_index = bitmap_index_++;
     static const uintptr_t kGcCardSize = static_cast<uintptr_t>(accounting::CardTable::kCardSize);
     CHECK(IsAligned<kGcCardSize>(reinterpret_cast<uintptr_t>(mem_map->Begin())));
     CHECK(IsAligned<kGcCardSize>(reinterpret_cast<uintptr_t>(mem_map->End())));
-    live_bitmap_.reset(accounting::SpaceBitmap::Create(
+    live_bitmap_.reset(accounting::ContinuousSpaceBitmap::Create(
         StringPrintf("allocspace %s live-bitmap %d", name.c_str(), static_cast<int>(bitmap_index)),
         Begin(), Capacity()));
-    DCHECK(live_bitmap_.get() != NULL) << "could not create allocspace live bitmap #"
+    DCHECK(live_bitmap_.get() != nullptr) << "could not create allocspace live bitmap #"
         << bitmap_index;
-    mark_bitmap_.reset(accounting::SpaceBitmap::Create(
+    mark_bitmap_.reset(accounting::ContinuousSpaceBitmap::Create(
         StringPrintf("allocspace %s mark-bitmap %d", name.c_str(), static_cast<int>(bitmap_index)),
         Begin(), Capacity()));
-    DCHECK(live_bitmap_.get() != NULL) << "could not create allocspace mark bitmap #"
+    DCHECK(live_bitmap_.get() != nullptr) << "could not create allocspace mark bitmap #"
         << bitmap_index;
   }
   for (auto& freed : recent_freed_objects_) {
@@ -178,11 +180,6 @@
              << "GrowthLimit " << growth_limit_ << "\n"
              << "Capacity " << Capacity();
   SetGrowthLimit(RoundUp(size, kPageSize));
-  SetFootprintLimit(RoundUp(size, kPageSize));
-
-  // TODO: Not hardcode these in?
-  const size_t starting_size = kPageSize;
-  const size_t initial_size = 2 * MB;
   // FIXME: Do we need reference counted pointers here?
   // Make the two spaces share the same mark bitmaps since the bitmaps span both of the spaces.
   VLOG(heap) << "Creating new AllocSpace: ";
@@ -194,14 +191,14 @@
   UniquePtr<MemMap> mem_map(GetMemMap()->RemapAtEnd(end_, alloc_space_name,
                                                     PROT_READ | PROT_WRITE, &error_msg));
   CHECK(mem_map.get() != nullptr) << error_msg;
-  void* allocator = CreateAllocator(end_, starting_size, initial_size, capacity, low_memory_mode);
+  void* allocator = CreateAllocator(end_, starting_size_, initial_size_, capacity, low_memory_mode);
   // Protect memory beyond the initial size.
-  byte* end = mem_map->Begin() + starting_size;
-  if (capacity - initial_size > 0) {
-    CHECK_MEMORY_CALL(mprotect, (end, capacity - initial_size, PROT_NONE), alloc_space_name);
+  byte* end = mem_map->Begin() + starting_size_;
+  if (capacity > initial_size_) {
+    CHECK_MEMORY_CALL(mprotect, (end, capacity - initial_size_, PROT_NONE), alloc_space_name);
   }
   *out_malloc_space = CreateInstance(alloc_space_name, mem_map.release(), allocator, end_, end,
-                                     limit_, growth_limit);
+                                     limit_, growth_limit, CanMoveObjects());
   SetLimit(End());
   live_bitmap_->SetHeapLimit(reinterpret_cast<uintptr_t>(End()));
   CHECK_EQ(live_bitmap_->HeapLimit(), reinterpret_cast<uintptr_t>(End()));
@@ -229,14 +226,13 @@
 
 void MallocSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg) {
   SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
-  DCHECK(context->space->IsMallocSpace());
   space::MallocSpace* space = context->space->AsMallocSpace();
   Thread* self = context->self;
   Locks::heap_bitmap_lock_->AssertExclusiveHeld(self);
   // If the bitmaps aren't swapped, we need to clear the bits here, since as an optimization
   // the GC isn't going to re-swap the bitmaps.
   if (!context->swap_bitmaps) {
-    accounting::SpaceBitmap* bitmap = space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* bitmap = space->GetLiveBitmap();
     for (size_t i = 0; i < num_ptrs; ++i) {
       bitmap->Clear(ptrs[i]);
     }
diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h
index fbcee5f..d24016c 100644
--- a/runtime/gc/space/malloc_space.h
+++ b/runtime/gc/space/malloc_space.h
@@ -114,7 +114,8 @@
   void SetGrowthLimit(size_t growth_limit);
 
   virtual MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                                      byte* begin, byte* end, byte* limit, size_t growth_limit) = 0;
+                                      byte* begin, byte* end, byte* limit, size_t growth_limit,
+                                      bool can_move_objects) = 0;
 
   // Splits this space into a zygote space and a new malloc space which has our unused memory. When true,
   // the low memory mode argument specifies that the heap wishes the created space to be more
@@ -127,9 +128,14 @@
   // Returns the class of a recently freed object.
   mirror::Class* FindRecentFreedObject(const mirror::Object* obj);
 
+  bool CanMoveObjects() const OVERRIDE {
+    return can_move_objects_;
+  }
+
  protected:
   MallocSpace(const std::string& name, MemMap* mem_map, byte* begin, byte* end,
-              byte* limit, size_t growth_limit, bool create_bitmaps = true);
+              byte* limit, size_t growth_limit, bool create_bitmaps, bool can_move_objects,
+              size_t starting_size, size_t initial_size);
 
   static MemMap* CreateMemMap(const std::string& name, size_t starting_size, size_t* initial_size,
                               size_t* growth_limit, size_t* capacity, byte* requested_begin);
@@ -143,7 +149,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
-  virtual accounting::SpaceBitmap::SweepCallback* GetSweepCallback() {
+  virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() {
     return &SweepCallback;
   }
 
@@ -167,6 +173,13 @@
   // one time by a call to ClearGrowthLimit.
   size_t growth_limit_;
 
+  // True if objects in the space are movable.
+  const bool can_move_objects_;
+
+  // Starting and initial sizes, used when the space is reset.
+  const size_t starting_size_;
+  const size_t initial_size_;
+
  private:
   static void SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index 012267b..5a7d941 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -15,10 +15,10 @@
  * limitations under the License.
  */
 
-#include "rosalloc_space.h"
-
 #include "rosalloc_space-inl.h"
+
 #include "gc/accounting/card_table.h"
+#include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
@@ -34,19 +34,23 @@
 
 static constexpr bool kPrefetchDuringRosAllocFreeList = true;
 
-template class ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
+// TODO: Fix
+// template class ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
 
 RosAllocSpace::RosAllocSpace(const std::string& name, MemMap* mem_map,
                              art::gc::allocator::RosAlloc* rosalloc, byte* begin, byte* end,
-                             byte* limit, size_t growth_limit)
-    : MallocSpace(name, mem_map, begin, end, limit, growth_limit), rosalloc_(rosalloc) {
-  CHECK(rosalloc != NULL);
+                             byte* limit, size_t growth_limit, bool can_move_objects,
+                             size_t starting_size, size_t initial_size, bool low_memory_mode)
+    : MallocSpace(name, mem_map, begin, end, limit, growth_limit, true, can_move_objects,
+                  starting_size, initial_size),
+      rosalloc_(rosalloc), low_memory_mode_(low_memory_mode) {
+  CHECK(rosalloc != nullptr);
 }
 
 RosAllocSpace* RosAllocSpace::CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                                size_t starting_size, size_t initial_size,
                                                size_t growth_limit, size_t capacity,
-                                               bool low_memory_mode) {
+                                               bool low_memory_mode, bool can_move_objects) {
   DCHECK(mem_map != nullptr);
   allocator::RosAlloc* rosalloc = CreateRosAlloc(mem_map->Begin(), starting_size, initial_size,
                                                  capacity, low_memory_mode);
@@ -66,10 +70,10 @@
   // TODO: Fix RosAllocSpace to support valgrind. There is currently some issues with
   // AllocationSize caused by redzones. b/12944686
   if (false && Runtime::Current()->GetHeap()->RunningOnValgrind()) {
-    return new ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>(
-        name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit, initial_size);
+    LOG(FATAL) << "Unimplemented";
   } else {
-    return new RosAllocSpace(name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit);
+    return new RosAllocSpace(name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit,
+                             can_move_objects, starting_size, initial_size, low_memory_mode);
   }
 }
 
@@ -79,7 +83,7 @@
 
 RosAllocSpace* RosAllocSpace::Create(const std::string& name, size_t initial_size,
                                      size_t growth_limit, size_t capacity, byte* requested_begin,
-                                     bool low_memory_mode) {
+                                     bool low_memory_mode, bool can_move_objects) {
   uint64_t start_time = 0;
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     start_time = NanoTime();
@@ -104,7 +108,8 @@
   }
 
   RosAllocSpace* space = CreateFromMemMap(mem_map, name, starting_size, initial_size,
-                                          growth_limit, capacity, low_memory_mode);
+                                          growth_limit, capacity, low_memory_mode,
+                                          can_move_objects);
   // We start out with only the initial size possibly containing objects.
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "RosAllocSpace::Create exiting (" << PrettyDuration(NanoTime() - start_time)
@@ -113,7 +118,8 @@
   return space;
 }
 
-allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start, size_t initial_size,
+allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start,
+                                                   size_t initial_size,
                                                    size_t maximum_size, bool low_memory_mode) {
   // clear errno to allow PLOG on error
   errno = 0;
@@ -154,9 +160,11 @@
 }
 
 MallocSpace* RosAllocSpace::CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                                           byte* begin, byte* end, byte* limit, size_t growth_limit) {
+                                           byte* begin, byte* end, byte* limit, size_t growth_limit,
+                                           bool can_move_objects) {
   return new RosAllocSpace(name, mem_map, reinterpret_cast<allocator::RosAlloc*>(allocator),
-                           begin, end, limit, growth_limit);
+                           begin, end, limit, growth_limit, can_move_objects, starting_size_,
+                           initial_size_, low_memory_mode_);
 }
 
 size_t RosAllocSpace::Free(Thread* self, mirror::Object* ptr) {
@@ -222,6 +230,7 @@
 }
 
 size_t RosAllocSpace::Trim() {
+  VLOG(heap) << "RosAllocSpace::Trim() ";
   {
     MutexLock mu(Thread::Current(), lock_);
     // Trim to release memory at the end of the space.
@@ -229,10 +238,7 @@
   }
   // Attempt to release pages if rosalloc does not automatically release all empty pages.
   if (!rosalloc_->DoesReleaseAllPages()) {
-    VLOG(heap) << "RosAllocSpace::Trim() ";
-    size_t reclaimed = 0;
-    InspectAllRosAlloc(DlmallocMadviseCallback, &reclaimed, false);
-    return reclaimed;
+    return rosalloc_->ReleasePages();
   }
   return 0;
 }
@@ -335,13 +341,15 @@
 }
 
 void RosAllocSpace::Clear() {
+  size_t footprint_limit = GetFootprintLimit();
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
-  GetLiveBitmap()->Clear();
-  GetMarkBitmap()->Clear();
-}
-
-void RosAllocSpace::Reset() {
-  // TODO: Delete and create new mspace here.
+  live_bitmap_->Clear();
+  mark_bitmap_->Clear();
+  end_ = begin_ + starting_size_;
+  delete rosalloc_;
+  rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_, Capacity(),
+                             low_memory_mode_);
+  SetFootprintLimit(footprint_limit);
 }
 
 }  // namespace space
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index 900e7a9..a156738 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -39,11 +39,12 @@
   // the caller should call Begin on the returned space to confirm the
   // request was granted.
   static RosAllocSpace* Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                               size_t capacity, byte* requested_begin, bool low_memory_mode);
+                               size_t capacity, byte* requested_begin, bool low_memory_mode,
+                               bool can_move_objects);
   static RosAllocSpace* CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                          size_t starting_size, size_t initial_size,
                                          size_t growth_limit, size_t capacity,
-                                         bool low_memory_mode);
+                                         bool low_memory_mode, bool can_move_objects);
 
   mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                   size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_);
@@ -80,9 +81,10 @@
   void SetFootprintLimit(size_t limit) OVERRIDE;
 
   void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+
   MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                              byte* begin, byte* end, byte* limit, size_t growth_limit);
+                              byte* begin, byte* end, byte* limit, size_t growth_limit,
+                              bool can_move_objects) OVERRIDE;
 
   uint64_t GetBytesAllocated() OVERRIDE;
   uint64_t GetObjectsAllocated() OVERRIDE;
@@ -110,7 +112,8 @@
 
  protected:
   RosAllocSpace(const std::string& name, MemMap* mem_map, allocator::RosAlloc* rosalloc,
-                byte* begin, byte* end, byte* limit, size_t growth_limit);
+                byte* begin, byte* end, byte* limit, size_t growth_limit, bool can_move_objects,
+                size_t starting_size, size_t initial_size, bool low_memory_mode);
 
  private:
   mirror::Object* AllocCommon(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -132,7 +135,9 @@
       LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_, Locks::thread_list_lock_);
 
   // Underlying rosalloc.
-  allocator::RosAlloc* const rosalloc_;
+  allocator::RosAlloc* rosalloc_;
+
+  const bool low_memory_mode_;
 
   friend class collector::MarkSweep;
 
diff --git a/runtime/gc/space/rosalloc_space_base_test.cc b/runtime/gc/space/rosalloc_space_base_test.cc
index df42076..c3157fa 100644
--- a/runtime/gc/space/rosalloc_space_base_test.cc
+++ b/runtime/gc/space/rosalloc_space_base_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_BASE(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/rosalloc_space_random_test.cc b/runtime/gc/space/rosalloc_space_random_test.cc
index 4d37c9e..864bbc9 100644
--- a/runtime/gc/space/rosalloc_space_random_test.cc
+++ b/runtime/gc/space/rosalloc_space_random_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_RANDOM(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/rosalloc_space_static_test.cc b/runtime/gc/space/rosalloc_space_static_test.cc
index 9f11fd0..c0e2ac8 100644
--- a/runtime/gc/space/rosalloc_space_static_test.cc
+++ b/runtime/gc/space/rosalloc_space_static_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_STATIC(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/space.cc b/runtime/gc/space/space.cc
index 4af65a9..4e28416 100644
--- a/runtime/gc/space/space.cc
+++ b/runtime/gc/space/space.cc
@@ -18,6 +18,7 @@
 
 #include "base/logging.h"
 #include "gc/accounting/heap_bitmap.h"
+#include "gc/accounting/space_bitmap-inl.h"
 #include "runtime.h"
 #include "thread-inl.h"
 
@@ -69,36 +70,34 @@
 
 DiscontinuousSpace::DiscontinuousSpace(const std::string& name,
                                        GcRetentionPolicy gc_retention_policy) :
-    Space(name, gc_retention_policy),
-    live_objects_(new accounting::ObjectSet("large live objects")),
-    mark_objects_(new accounting::ObjectSet("large marked objects")) {
+    Space(name, gc_retention_policy) {
+  // TODO: Fix this if we ever support objects not in the low 32 bits of the address space.
+  const size_t capacity = static_cast<size_t>(std::numeric_limits<uint32_t>::max());
+  live_bitmap_.reset(accounting::LargeObjectBitmap::Create("large live objects", nullptr,
+                                                           capacity));
+  CHECK(live_bitmap_.get() != nullptr);
+  mark_bitmap_.reset(accounting::LargeObjectBitmap::Create("large marked objects", nullptr,
+                                                           capacity));
+  CHECK(mark_bitmap_.get() != nullptr);
 }
 
 void ContinuousMemMapAllocSpace::Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes) {
   DCHECK(freed_objects != nullptr);
   DCHECK(freed_bytes != nullptr);
-  accounting::SpaceBitmap* live_bitmap = GetLiveBitmap();
-  accounting::SpaceBitmap* mark_bitmap = GetMarkBitmap();
+  accounting::ContinuousSpaceBitmap* live_bitmap = GetLiveBitmap();
+  accounting::ContinuousSpaceBitmap* mark_bitmap = GetMarkBitmap();
   // If the bitmaps are bound then sweeping this space clearly won't do anything.
   if (live_bitmap == mark_bitmap) {
     return;
   }
-  SweepCallbackContext scc;
-  scc.swap_bitmaps = swap_bitmaps;
-  scc.heap = Runtime::Current()->GetHeap();
-  scc.self = Thread::Current();
-  scc.space = this;
-  scc.freed_objects = 0;
-  scc.freed_bytes = 0;
+  SweepCallbackContext scc(swap_bitmaps, this);
   if (swap_bitmaps) {
     std::swap(live_bitmap, mark_bitmap);
   }
   // Bitmaps are pre-swapped as an optimization which enables sweeping with the heap unlocked.
-  accounting::SpaceBitmap::SweepWalk(*live_bitmap, *mark_bitmap,
-                                     reinterpret_cast<uintptr_t>(Begin()),
-                                     reinterpret_cast<uintptr_t>(End()),
-                                     GetSweepCallback(),
-                                     reinterpret_cast<void*>(&scc));
+  accounting::ContinuousSpaceBitmap::SweepWalk(
+      *live_bitmap, *mark_bitmap, reinterpret_cast<uintptr_t>(Begin()),
+      reinterpret_cast<uintptr_t>(End()), GetSweepCallback(), reinterpret_cast<void*>(&scc));
   *freed_objects += scc.freed_objects;
   *freed_bytes += scc.freed_bytes;
 }
@@ -106,9 +105,9 @@
 // Binds the live bitmap to the mark bitmap; the old mark bitmap is saved in temp_bitmap_.
 void ContinuousMemMapAllocSpace::BindLiveToMarkBitmap() {
   CHECK(!HasBoundBitmaps());
-  accounting::SpaceBitmap* live_bitmap = GetLiveBitmap();
+  accounting::ContinuousSpaceBitmap* live_bitmap = GetLiveBitmap();
   if (live_bitmap != mark_bitmap_.get()) {
-    accounting::SpaceBitmap* mark_bitmap = mark_bitmap_.release();
+    accounting::ContinuousSpaceBitmap* mark_bitmap = mark_bitmap_.release();
     Runtime::Current()->GetHeap()->GetMarkBitmap()->ReplaceBitmap(mark_bitmap, live_bitmap);
     temp_bitmap_.reset(mark_bitmap);
     mark_bitmap_.reset(live_bitmap);
@@ -122,7 +121,7 @@
 void ContinuousMemMapAllocSpace::UnBindBitmaps() {
   CHECK(HasBoundBitmaps());
   // At this point, the temp_bitmap holds our old mark bitmap.
-  accounting::SpaceBitmap* new_bitmap = temp_bitmap_.release();
+  accounting::ContinuousSpaceBitmap* new_bitmap = temp_bitmap_.release();
   Runtime::Current()->GetHeap()->GetMarkBitmap()->ReplaceBitmap(mark_bitmap_.get(), new_bitmap);
   CHECK_EQ(mark_bitmap_.release(), live_bitmap_.get());
   mark_bitmap_.reset(new_bitmap);
@@ -137,6 +136,11 @@
   mark_bitmap_->SetName(temp_name);
 }
 
+Space::SweepCallbackContext::SweepCallbackContext(bool swap_bitmaps, space::Space* space)
+    : swap_bitmaps(swap_bitmaps), space(space), self(Thread::Current()), freed_objects(0),
+      freed_bytes(0) {
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 37d7c80..0a87a16 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -34,10 +34,6 @@
 
 namespace gc {
 
-namespace accounting {
-  class SpaceBitmap;
-}  // namespace accounting
-
 class Heap;
 
 namespace space {
@@ -160,6 +156,9 @@
   }
   virtual ContinuousMemMapAllocSpace* AsContinuousMemMapAllocSpace();
 
+  // Returns true if objects in the space are movable.
+  virtual bool CanMoveObjects() const = 0;
+
   virtual ~Space() {}
 
  protected:
@@ -174,10 +173,11 @@
 
  protected:
   struct SweepCallbackContext {
-    bool swap_bitmaps;
-    Heap* heap;
-    space::Space* space;
-    Thread* self;
+   public:
+    SweepCallbackContext(bool swap_bitmaps, space::Space* space);
+    const bool swap_bitmaps;
+    space::Space* const space;
+    Thread* const self;
     size_t freed_objects;
     size_t freed_bytes;
   };
@@ -265,8 +265,8 @@
     return End() - Begin();
   }
 
-  virtual accounting::SpaceBitmap* GetLiveBitmap() const = 0;
-  virtual accounting::SpaceBitmap* GetMarkBitmap() const = 0;
+  virtual accounting::ContinuousSpaceBitmap* GetLiveBitmap() const = 0;
+  virtual accounting::ContinuousSpaceBitmap* GetMarkBitmap() const = 0;
 
   // Maximum size which the mapped space can grow to.
   virtual size_t Capacity() const {
@@ -314,15 +314,15 @@
 // is suitable for use for large primitive arrays.
 class DiscontinuousSpace : public Space {
  public:
-  accounting::ObjectSet* GetLiveObjects() const {
-    return live_objects_.get();
+  accounting::LargeObjectBitmap* GetLiveBitmap() const {
+    return live_bitmap_.get();
   }
 
-  accounting::ObjectSet* GetMarkObjects() const {
-    return mark_objects_.get();
+  accounting::LargeObjectBitmap* GetMarkBitmap() const {
+    return mark_bitmap_.get();
   }
 
-  virtual bool IsDiscontinuousSpace() const {
+  virtual bool IsDiscontinuousSpace() const OVERRIDE {
     return true;
   }
 
@@ -331,8 +331,8 @@
  protected:
   DiscontinuousSpace(const std::string& name, GcRetentionPolicy gc_retention_policy);
 
-  UniquePtr<accounting::ObjectSet> live_objects_;
-  UniquePtr<accounting::ObjectSet> mark_objects_;
+  UniquePtr<accounting::LargeObjectBitmap> live_bitmap_;
+  UniquePtr<accounting::LargeObjectBitmap> mark_bitmap_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(DiscontinuousSpace);
@@ -396,27 +396,24 @@
   // Swap the live and mark bitmaps of this space. This is used by the GC for concurrent sweeping.
   void SwapBitmaps();
 
-  // Free all memory associated with this space.
+  // Clear the space back to an empty state.
   virtual void Clear() = 0;
 
-  // Reset the space back to an empty space.
-  virtual void Reset() = 0;
-
-  accounting::SpaceBitmap* GetLiveBitmap() const {
+  accounting::ContinuousSpaceBitmap* GetLiveBitmap() const {
     return live_bitmap_.get();
   }
 
-  accounting::SpaceBitmap* GetMarkBitmap() const {
+  accounting::ContinuousSpaceBitmap* GetMarkBitmap() const {
     return mark_bitmap_.get();
   }
 
   void Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes);
-  virtual accounting::SpaceBitmap::SweepCallback* GetSweepCallback() = 0;
+  virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0;
 
  protected:
-  UniquePtr<accounting::SpaceBitmap> live_bitmap_;
-  UniquePtr<accounting::SpaceBitmap> mark_bitmap_;
-  UniquePtr<accounting::SpaceBitmap> temp_bitmap_;
+  UniquePtr<accounting::ContinuousSpaceBitmap> live_bitmap_;
+  UniquePtr<accounting::ContinuousSpaceBitmap> mark_bitmap_;
+  UniquePtr<accounting::ContinuousSpaceBitmap> temp_bitmap_;
 
   ContinuousMemMapAllocSpace(const std::string& name, MemMap* mem_map, byte* begin,
                              byte* end, byte* limit, GcRetentionPolicy gc_retention_policy)
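
To make the new CanMoveObjects() contract concrete: a minimal sketch of how a moving collector might consult it before relocating anything. MaybeCompact() and CompactSpace() are hypothetical names used for illustration, not part of this change.

    // Sketch only: a compacting phase skips spaces whose objects are pinned.
    void MaybeCompact(space::ContinuousSpace* space) {
      if (space->CanMoveObjects()) {
        CompactSpace(space);  // hypothetical helper; objects here may be relocated
      }
      // Spaces such as ZygoteSpace override CanMoveObjects() to return false
      // and are left untouched.
    }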
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/valgrind_malloc_space-inl.h
index ed97e60..966c276 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/valgrind_malloc_space-inl.h
@@ -95,8 +95,10 @@
 ValgrindMallocSpace<S, A>::ValgrindMallocSpace(const std::string& name, MemMap* mem_map,
                                                A allocator, byte* begin,
                                                byte* end, byte* limit, size_t growth_limit,
-                                               size_t initial_size) :
-    S(name, mem_map, allocator, begin, end, limit, growth_limit) {
+                                               size_t initial_size,
+                                               bool can_move_objects, size_t starting_size) :
+    S(name, mem_map, allocator, begin, end, limit, growth_limit, can_move_objects, starting_size,
+      initial_size) {
   VALGRIND_MAKE_MEM_UNDEFINED(mem_map->Begin() + initial_size, mem_map->Size() - initial_size);
 }
 
diff --git a/runtime/gc/space/valgrind_malloc_space.h b/runtime/gc/space/valgrind_malloc_space.h
index 6b755c4..200ad83 100644
--- a/runtime/gc/space/valgrind_malloc_space.h
+++ b/runtime/gc/space/valgrind_malloc_space.h
@@ -48,7 +48,7 @@
 
   ValgrindMallocSpace(const std::string& name, MemMap* mem_map, AllocatorType allocator,
                       byte* begin, byte* end, byte* limit, size_t growth_limit,
-                      size_t initial_size);
+                      size_t initial_size, bool can_move_objects, size_t starting_size);
   virtual ~ValgrindMallocSpace() {}
 
  private:
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index d1c3d03..0466413 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -40,8 +40,8 @@
 };
 
 ZygoteSpace* ZygoteSpace::Create(const std::string& name, MemMap* mem_map,
-                                 accounting::SpaceBitmap* live_bitmap,
-                                 accounting::SpaceBitmap* mark_bitmap) {
+                                 accounting::ContinuousSpaceBitmap* live_bitmap,
+                                 accounting::ContinuousSpaceBitmap* mark_bitmap) {
   DCHECK(live_bitmap != nullptr);
   DCHECK(mark_bitmap != nullptr);
   size_t objects_allocated = 0;
@@ -61,10 +61,6 @@
   LOG(FATAL) << "Unimplemented";
 }
 
-void ZygoteSpace::Reset() {
-  LOG(FATAL) << "Unimplemented";
-}
-
 ZygoteSpace::ZygoteSpace(const std::string& name, MemMap* mem_map, size_t objects_allocated)
     : ContinuousMemMapAllocSpace(name, mem_map, mem_map->Begin(), mem_map->End(), mem_map->End(),
                                  kGcRetentionPolicyFullCollect),
@@ -105,11 +101,11 @@
   DCHECK(context->space->IsZygoteSpace());
   ZygoteSpace* zygote_space = context->space->AsZygoteSpace();
   Locks::heap_bitmap_lock_->AssertExclusiveHeld(context->self);
-  accounting::CardTable* card_table = context->heap->GetCardTable();
+  accounting::CardTable* card_table = Runtime::Current()->GetHeap()->GetCardTable();
   // If the bitmaps aren't swapped we need to clear the bits since the GC isn't going to re-swap
   // the bitmaps as an optimization.
   if (!context->swap_bitmaps) {
-    accounting::SpaceBitmap* bitmap = zygote_space->GetLiveBitmap();
+    accounting::ContinuousSpaceBitmap* bitmap = zygote_space->GetLiveBitmap();
     for (size_t i = 0; i < num_ptrs; ++i) {
       bitmap->Clear(ptrs[i]);
     }
diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h
index 8880548..50fc62b 100644
--- a/runtime/gc/space/zygote_space.h
+++ b/runtime/gc/space/zygote_space.h
@@ -17,16 +17,13 @@
 #ifndef ART_RUNTIME_GC_SPACE_ZYGOTE_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_ZYGOTE_SPACE_H_
 
+#include "gc/accounting/space_bitmap.h"
 #include "malloc_space.h"
 #include "mem_map.h"
 
 namespace art {
 namespace gc {
 
-namespace accounting {
-class SpaceBitmap;
-}
-
 namespace space {
 
 // A zygote space is a space which you cannot allocate into or free from.
@@ -34,8 +31,8 @@
  public:
   // Returns the remaining storage in the out_map field.
   static ZygoteSpace* Create(const std::string& name, MemMap* mem_map,
-                             accounting::SpaceBitmap* live_bitmap,
-                             accounting::SpaceBitmap* mark_bitmap)
+                             accounting::ContinuousSpaceBitmap* live_bitmap,
+                             accounting::ContinuousSpaceBitmap* mark_bitmap)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void Dump(std::ostream& os) const;
@@ -72,10 +69,13 @@
   }
 
   void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+
+  bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
 
  protected:
-  virtual accounting::SpaceBitmap::SweepCallback* GetSweepCallback() {
+  virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() {
     return &SweepCallback;
   }
 
diff --git a/runtime/globals.h b/runtime/globals.h
index 7e85231..e3c54b8 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -50,9 +50,6 @@
 // Required stack alignment
 static constexpr size_t kStackAlignment = 16;
 
-// Required object alignment
-static constexpr size_t kObjectAlignment = 8;
-
 // ARM instruction alignment. ARM processors require code to be 4-byte aligned,
 // but ARM ELF requires 8.
 static constexpr size_t kArmAlignment = 8;
@@ -72,6 +69,10 @@
 // compile-time constant so the compiler can generate better code.
 static constexpr int kPageSize = 4096;
 
+// Required object alignment
+static constexpr size_t kObjectAlignment = 8;
+static constexpr size_t kLargeObjectAlignment = kPageSize;
+
 // Whether or not this is a debug build. Useful in conditionals where NDEBUG isn't.
 #if defined(NDEBUG)
 static constexpr bool kIsDebugBuild = false;
diff --git a/runtime/instruction_set.cc b/runtime/instruction_set.cc
index c964629..73d4279 100644
--- a/runtime/instruction_set.cc
+++ b/runtime/instruction_set.cc
@@ -16,8 +16,78 @@
 
 #include "instruction_set.h"
 
+#include "globals.h"
+#include "base/logging.h"  // Logging is required for FATAL in the helper functions.
+
 namespace art {
 
+size_t GetInstructionSetPointerSize(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      return kArmPointerSize;
+    case kArm64:
+      return kArm64PointerSize;
+    case kX86:
+      return kX86PointerSize;
+    case kX86_64:
+      return kX86_64PointerSize;
+    case kMips:
+      return kMipsPointerSize;
+    case kNone:
+      LOG(FATAL) << "ISA kNone does not have pointer size.";
+      return 0;
+    default:
+      LOG(FATAL) << "Unknown ISA " << isa;
+      return 0;
+  }
+}
+
+size_t GetInstructionSetAlignment(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      return kArmAlignment;
+    case kArm64:
+      return kArm64Alignment;
+    case kX86:
+      // Fall-through.
+    case kX86_64:
+      return kX86Alignment;
+    case kMips:
+      return kMipsAlignment;
+    case kNone:
+      LOG(FATAL) << "ISA kNone does not have alignment.";
+      return 0;
+    default:
+      LOG(FATAL) << "Unknown ISA " << isa;
+      return 0;
+  }
+}
+
+bool Is64BitInstructionSet(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+    case kThumb2:
+    case kX86:
+    case kMips:
+      return false;
+
+    case kArm64:
+    case kX86_64:
+      return true;
+
+    case kNone:
+      LOG(FATAL) << "ISA kNone does not have bit width.";
+      return false;
+    default:
+      LOG(FATAL) << "Unknown ISA " << isa;
+      return false;
+  }
+}
+
 std::string InstructionSetFeatures::GetFeatureString() const {
   std::string result;
   if ((mask_ & kHwDiv) != 0) {
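
A quick sketch of how the new ISA helpers fit together; DumpRuntimeIsaProperties() is an illustrative function, and kRuntimeISA comes from the header change below.

    // Sketch: querying the properties of the ISA the runtime was built for.
    void DumpRuntimeIsaProperties() {
      InstructionSet isa = kRuntimeISA;
      size_t pointer_size = GetInstructionSetPointerSize(isa);  // 4 for kArm, 8 for kArm64
      size_t code_alignment = GetInstructionSetAlignment(isa);  // e.g. kArmAlignment for kArm/kThumb2
      bool is_64bit = Is64BitInstructionSet(isa);               // consistent with pointer_size == 8
      LOG(INFO) << "isa=" << isa << " ptr_size=" << pointer_size
                << " alignment=" << code_alignment << " 64bit=" << is_64bit;
    }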
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index c5a4ec8..c746e06 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -35,6 +35,24 @@
 };
 std::ostream& operator<<(std::ostream& os, const InstructionSet& rhs);
 
+size_t GetInstructionSetPointerSize(InstructionSet isa);
+size_t GetInstructionSetAlignment(InstructionSet isa);
+bool Is64BitInstructionSet(InstructionSet isa);
+
+#if defined(__arm__)
+static constexpr InstructionSet kRuntimeISA = kArm;
+#elif defined(__aarch64__)
+static constexpr InstructionSet kRuntimeISA = kArm64;
+#elif defined(__mips__)
+static constexpr InstructionSet kRuntimeISA = kMips;
+#elif defined(__i386__)
+static constexpr InstructionSet kRuntimeISA = kX86;
+#elif defined(__x86_64__)
+static constexpr InstructionSet kRuntimeISA = kX86_64;
+#else
+static constexpr InstructionSet kRuntimeISA = kNone;
+#endif
+
 enum InstructionFeatures {
   kHwDiv = 1                  // Supports hardware divide.
 };
@@ -67,6 +85,10 @@
     return mask_ != peer.mask_;
   }
 
+  bool operator<=(const InstructionSetFeatures &peer) const {
+    return (mask_ & peer.mask_) == mask_;
+  }
+
  private:
   uint32_t mask_;
 };
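
The new operator<= is a subset test on the feature masks: (mask_ & peer.mask_) == mask_ holds exactly when every feature bit set here is also set in peer. A sketch of the intended use; OatIsRunnableHere() is an illustrative name.

    // Sketch: an oat file is runnable if every feature it was compiled
    // against is also present on the current device.
    bool OatIsRunnableHere(const InstructionSetFeatures& oat_requires,
                           const InstructionSetFeatures& device_has) {
      return oat_requires <= device_has;
    }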
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index c798fbf..2cd7f49 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -63,6 +63,7 @@
       interpret_only_(false), forced_interpret_only_(false),
       have_method_entry_listeners_(false), have_method_exit_listeners_(false),
       have_method_unwind_listeners_(false), have_dex_pc_listeners_(false),
+      have_field_read_listeners_(false), have_field_write_listeners_(false),
       have_exception_caught_listeners_(false),
       deoptimized_methods_lock_("deoptimized methods lock"),
       deoptimization_enabled_(false),
@@ -255,7 +256,7 @@
   visitor.WalkStack(true);
   CHECK_EQ(visitor.dex_pcs_.size(), thread->GetInstrumentationStack()->size());
 
-  if (!instrumentation->ShouldNotifyMethodEnterExitEvents()) {
+  if (instrumentation->ShouldNotifyMethodEnterExitEvents()) {
     // Create method enter events for all methods currently on the thread's stack. We only do this
     // if no debugger is attached, to avoid posting events twice.
     typedef std::deque<InstrumentationStackFrame>::const_reverse_iterator It;
@@ -302,8 +303,9 @@
       }
       bool removed_stub = false;
       // TODO: make this search more efficient?
-      for (InstrumentationStackFrame instrumentation_frame : *instrumentation_stack_) {
-        if (instrumentation_frame.frame_id_ == GetFrameId()) {
+      const size_t frame_id = GetFrameId();
+      for (const InstrumentationStackFrame& instrumentation_frame : *instrumentation_stack_) {
+        if (instrumentation_frame.frame_id_ == frame_id) {
           if (kVerboseInstrumentation) {
             LOG(INFO) << "  Removing exit stub in " << DescribeLocation();
           }
@@ -313,7 +315,7 @@
             CHECK(m == instrumentation_frame.method_) << PrettyMethod(m);
           }
           SetReturnPc(instrumentation_frame.return_pc_);
-          if (!instrumentation_->ShouldNotifyMethodEnterExitEvents()) {
+          if (instrumentation_->ShouldNotifyMethodEnterExitEvents()) {
             // Create the method exit events. As the methods didn't really exit, the result is 0.
             // We only do this if no debugger is attached, to avoid posting events twice.
             instrumentation_->MethodExitEvent(thread_, instrumentation_frame.this_object_, m,
@@ -373,6 +375,14 @@
     dex_pc_listeners_.push_back(listener);
     have_dex_pc_listeners_ = true;
   }
+  if ((events & kFieldRead) != 0) {
+    field_read_listeners_.push_back(listener);
+    have_field_read_listeners_ = true;
+  }
+  if ((events & kFieldWritten) != 0) {
+    field_write_listeners_.push_back(listener);
+    have_field_write_listeners_ = true;
+  }
   if ((events & kExceptionCaught) != 0) {
     exception_caught_listeners_.push_back(listener);
     have_exception_caught_listeners_ = true;
@@ -410,6 +420,22 @@
     }
     have_dex_pc_listeners_ = dex_pc_listeners_.size() > 0;
   }
+  if ((events & kFieldRead) != 0) {
+    bool contains = std::find(field_read_listeners_.begin(), field_read_listeners_.end(),
+                              listener) != field_read_listeners_.end();
+    if (contains) {
+      field_read_listeners_.remove(listener);
+    }
+    have_field_read_listeners_ = field_read_listeners_.size() > 0;
+  }
+  if ((events & kFieldWritten) != 0) {
+    bool contains = std::find(field_write_listeners_.begin(), field_write_listeners_.end(),
+                              listener) != field_write_listeners_.end();
+    if (contains) {
+      field_write_listeners_.remove(listener);
+    }
+    have_field_write_listeners_ = field_write_listeners_.size() > 0;
+  }
   if ((events & kExceptionCaught) != 0) {
     exception_caught_listeners_.remove(listener);
     have_exception_caught_listeners_ = exception_caught_listeners_.size() > 0;
@@ -439,7 +465,7 @@
     // We're already set.
     return;
   }
-  Thread* self = Thread::Current();
+  Thread* const self = Thread::Current();
   Runtime* runtime = Runtime::Current();
   Locks::thread_list_lock_->AssertNotHeld(self);
   if (desired_level > 0) {
@@ -451,7 +477,7 @@
     }
     runtime->GetClassLinker()->VisitClasses(InstallStubsClassVisitor, this);
     instrumentation_stubs_installed_ = true;
-    MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+    MutexLock mu(self, *Locks::thread_list_lock_);
     runtime->GetThreadList()->ForEach(InstrumentationInstallStack, this);
   } else {
     interpreter_stubs_installed_ = false;
@@ -657,7 +683,7 @@
 
 // Indicates if instrumentation should notify method enter/exit events to the listeners.
 bool Instrumentation::ShouldNotifyMethodEnterExitEvents() const {
-  return deoptimization_enabled_ || interpreter_stubs_installed_;
+  return !deoptimization_enabled_ && !interpreter_stubs_installed_;
 }
 
 void Instrumentation::DeoptimizeEverything() {
@@ -743,6 +769,30 @@
   }
 }
 
+void Instrumentation::FieldReadEventImpl(Thread* thread, mirror::Object* this_object,
+                                         mirror::ArtMethod* method, uint32_t dex_pc,
+                                         mirror::ArtField* field) const {
+  if (have_field_read_listeners_) {
+    // TODO: same comment as for DexPcMovedEventImpl.
+    std::list<InstrumentationListener*> copy(field_read_listeners_);
+    for (InstrumentationListener* listener : copy) {
+      listener->FieldRead(thread, this_object, method, dex_pc, field);
+    }
+  }
+}
+
+void Instrumentation::FieldWriteEventImpl(Thread* thread, mirror::Object* this_object,
+                                         mirror::ArtMethod* method, uint32_t dex_pc,
+                                         mirror::ArtField* field, const JValue& field_value) const {
+  if (have_field_write_listeners_) {
+    // TODO: same comment as for DexPcMovedEventImpl.
+    std::list<InstrumentationListener*> copy(field_write_listeners_);
+    for (InstrumentationListener* listener : copy) {
+      listener->FieldWritten(thread, this_object, method, dex_pc, field, field_value);
+    }
+  }
+}
+
 void Instrumentation::ExceptionCaughtEvent(Thread* thread, const ThrowLocation& throw_location,
                                            mirror::ArtMethod* catch_method,
                                            uint32_t catch_dex_pc,
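
Both new event dispatchers iterate over a copy of their listener list; the comment the TODO points at explains why: a listener may unregister itself from inside its callback, and iterating a snapshot keeps the loop valid. A standalone sketch of the pattern, with Listener and NotifyAll as illustrative names:

    #include <list>

    struct Listener {
      virtual ~Listener() {}
      virtual void Notify() = 0;  // may call back into the remove-listener path
    };

    void NotifyAll(const std::list<Listener*>& listeners) {
      std::list<Listener*> copy(listeners);  // snapshot taken before dispatch
      for (Listener* listener : copy) {
        listener->Notify();  // safe even if this mutates the original list
      }
    }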
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 2a9c35f..3de0728 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -28,6 +28,7 @@
 
 namespace art {
 namespace mirror {
+  class ArtField;
   class ArtMethod;
   class Class;
   class Object;
@@ -78,6 +79,14 @@
                           mirror::ArtMethod* method, uint32_t new_dex_pc)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
 
+  // Call-back for when we read from a field.
+  virtual void FieldRead(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                         uint32_t dex_pc, mirror::ArtField* field) = 0;
+
+  // Call-back for when we write into a field.
+  virtual void FieldWritten(Thread* thread, mirror::Object* this_object, mirror::ArtMethod* method,
+                            uint32_t dex_pc, mirror::ArtField* field, const JValue& field_value) = 0;
+
   // Call-back when an exception is caught.
   virtual void ExceptionCaught(Thread* thread, const ThrowLocation& throw_location,
                                mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
@@ -92,11 +101,13 @@
 class Instrumentation {
  public:
   enum InstrumentationEvent {
-    kMethodEntered = 1,
-    kMethodExited = 2,
-    kMethodUnwind = 4,
-    kDexPcMoved = 8,
-    kExceptionCaught = 16
+    kMethodEntered =   1 << 0,
+    kMethodExited =    1 << 1,
+    kMethodUnwind =    1 << 2,
+    kDexPcMoved =      1 << 3,
+    kFieldRead =       1 << 4,
+    kFieldWritten =    1 << 5,
+    kExceptionCaught = 1 << 6,
   };
 
   Instrumentation();
@@ -217,6 +228,14 @@
     return have_dex_pc_listeners_;
   }
 
+  bool HasFieldReadListeners() const {
+    return have_field_read_listeners_;
+  }
+
+  bool HasFieldWriteListeners() const {
+    return have_field_write_listeners_;
+  }
+
   bool IsActive() const {
     return have_dex_pc_listeners_ || have_method_entry_listeners_ || have_method_exit_listeners_ ||
         have_exception_caught_listeners_ || have_method_unwind_listeners_;
@@ -256,6 +275,26 @@
     }
   }
 
+  // Inform listeners that we read a field (only supported by the interpreter).
+  void FieldReadEvent(Thread* thread, mirror::Object* this_object,
+                      mirror::ArtMethod* method, uint32_t dex_pc,
+                      mirror::ArtField* field) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (UNLIKELY(HasFieldReadListeners())) {
+      FieldReadEventImpl(thread, this_object, method, dex_pc, field);
+    }
+  }
+
+  // Inform listeners that we write a field (only supported by the interpreter).
+  void FieldWriteEvent(Thread* thread, mirror::Object* this_object,
+                       mirror::ArtMethod* method, uint32_t dex_pc,
+                       mirror::ArtField* field, const JValue& field_value) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (UNLIKELY(HasFieldWriteListeners())) {
+      FieldWriteEventImpl(thread, this_object, method, dex_pc, field, field_value);
+    }
+  }
+
   // Inform listeners that an exception was caught.
   void ExceptionCaughtEvent(Thread* thread, const ThrowLocation& throw_location,
                             mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
@@ -313,6 +352,14 @@
   void DexPcMovedEventImpl(Thread* thread, mirror::Object* this_object,
                            mirror::ArtMethod* method, uint32_t dex_pc) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void FieldReadEventImpl(Thread* thread, mirror::Object* this_object,
+                           mirror::ArtMethod* method, uint32_t dex_pc,
+                           mirror::ArtField* field) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void FieldWriteEventImpl(Thread* thread, mirror::Object* this_object,
+                           mirror::ArtMethod* method, uint32_t dex_pc,
+                           mirror::ArtField* field, const JValue& field_value) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Have we hijacked ArtMethod::code_ so that it calls instrumentation/interpreter code?
   bool instrumentation_stubs_installed_;
@@ -345,6 +392,14 @@
   // instrumentation_lock_.
   bool have_dex_pc_listeners_;
 
+  // Do we have any listeners for field read events? Short-cut to avoid taking the
+  // instrumentation_lock_.
+  bool have_field_read_listeners_;
+
+  // Do we have any listeners for field write events? Short-cut to avoid taking the
+  // instrumentation_lock_.
+  bool have_field_write_listeners_;
+
   // Do we have any exception caught listeners? Short-cut to avoid taking the instrumentation_lock_.
   bool have_exception_caught_listeners_;
 
@@ -353,6 +408,8 @@
   std::list<InstrumentationListener*> method_exit_listeners_ GUARDED_BY(Locks::mutator_lock_);
   std::list<InstrumentationListener*> method_unwind_listeners_ GUARDED_BY(Locks::mutator_lock_);
   std::list<InstrumentationListener*> dex_pc_listeners_ GUARDED_BY(Locks::mutator_lock_);
+  std::list<InstrumentationListener*> field_read_listeners_ GUARDED_BY(Locks::mutator_lock_);
+  std::list<InstrumentationListener*> field_write_listeners_ GUARDED_BY(Locks::mutator_lock_);
   std::list<InstrumentationListener*> exception_caught_listeners_ GUARDED_BY(Locks::mutator_lock_);
 
   // The set of methods being deoptimized (by the debugger) which must be executed with interpreter
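
With the event kinds rewritten as explicit 1 << n values, several events can be combined with bitwise-or in a single registration call. A sketch, assuming the existing AddListener(listener, events) entry point; WatchFields() is an illustrative name.

    // Sketch: register one listener for both field events at once.
    void WatchFields(Instrumentation* instrumentation, InstrumentationListener* listener) {
      uint32_t events = Instrumentation::kFieldRead | Instrumentation::kFieldWritten;
      instrumentation->AddListener(listener, events);
    }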
diff --git a/runtime/jdwp/jdwp.h b/runtime/jdwp/jdwp.h
index 66ebb96..1477324 100644
--- a/runtime/jdwp/jdwp.h
+++ b/runtime/jdwp/jdwp.h
@@ -197,6 +197,17 @@
      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
+   * A field of interest has been accessed or modified. This is used for field access and field
+   * modification events.
+   *
+   * "fieldValue" is non-null for field modification events only.
+   * "is_modification" is true for field modification, false for field access.
+   */
+  bool PostFieldEvent(const JdwpLocation* pLoc, RefTypeId typeId, FieldId fieldId,
+                      ObjectId thisPtr, const JValue* fieldValue, bool is_modification)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  /*
    * An exception has been thrown.
    *
    * Pass in a zeroed-out "*pCatchLoc" if the exception wasn't caught.
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 9b3ea2e..adc1074 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -121,26 +121,14 @@
   /* nothing for StepOnly -- handled differently */
 };
 
-/*
- * Dump an event to the log file.
- */
-static void dumpEvent(const JdwpEvent* pEvent) {
-  LOG(INFO) << StringPrintf("Event id=0x%4x %p (prev=%p next=%p):", pEvent->requestId, pEvent, pEvent->prev, pEvent->next);
-  LOG(INFO) << "  kind=" << pEvent->eventKind << " susp=" << pEvent->suspend_policy << " modCount=" << pEvent->modCount;
-
-  for (int i = 0; i < pEvent->modCount; i++) {
-    const JdwpEventMod* pMod = &pEvent->mods[i];
-    LOG(INFO) << "  " << pMod->modKind;
-    /* TODO - show details */
-  }
-}
-
 static bool NeedsFullDeoptimization(JdwpEventKind eventKind) {
   switch (eventKind) {
       case EK_METHOD_ENTRY:
       case EK_METHOD_EXIT:
       case EK_METHOD_EXIT_WITH_RETURN_VALUE:
       case EK_SINGLE_STEP:
+      case EK_FIELD_ACCESS:
+      case EK_FIELD_MODIFICATION:
         return true;
       default:
         return false;
@@ -177,9 +165,6 @@
       if (status != ERR_NONE) {
         return status;
       }
-    } else if (pMod->modKind == MK_FIELD_ONLY) {
-      /* should be for EK_FIELD_ACCESS or EK_FIELD_MODIFICATION */
-      dumpEvent(pEvent);  /* TODO - need for field watches */
     }
   }
   if (NeedsFullDeoptimization(pEvent->eventKind)) {
@@ -248,7 +233,16 @@
       Dbg::UnconfigureStep(pMod->step.threadId);
     }
   }
-  if (NeedsFullDeoptimization(pEvent->eventKind)) {
+  if (pEvent->eventKind == EK_SINGLE_STEP) {
+    // Special case for single-steps where we want to avoid the slow deoptimize/undeoptimize
+    // cycle between each single-step. In an IDE, this would happen each time the user clicks
+    // the "single-step" button. Here we delay the full undeoptimization to the next resume
+    // (VM.Resume or ThreadReference.Resume) or the end of the debugging session (VM.Dispose or
+    // runtime shutdown).
+    // Therefore, in a single-stepping sequence, only the first single-step triggers a full
+    // deoptimization and only the last single-step triggers a full undeoptimization.
+    Dbg::DelayFullUndeoptimization();
+  } else if (NeedsFullDeoptimization(pEvent->eventKind)) {
     CHECK_EQ(req.kind, DeoptimizationRequest::kNothing);
     CHECK(req.method == nullptr);
     req.kind = DeoptimizationRequest::kFullUndeoptimization;
@@ -422,6 +416,9 @@
     case MK_COUNT:
       CHECK_GT(pMod->count.count, 0);
       pMod->count.count--;
+      if (pMod->count.count > 0) {
+        return false;
+      }
       break;
     case MK_CONDITIONAL:
       CHECK(false);  // should not be getting these
@@ -843,6 +840,86 @@
   return match_count != 0;
 }
 
+bool JdwpState::PostFieldEvent(const JdwpLocation* pLoc, RefTypeId typeId, FieldId fieldId,
+                               ObjectId thisPtr, const JValue* fieldValue, bool is_modification) {
+  ModBasket basket;
+  basket.pLoc = pLoc;
+  basket.classId = pLoc->class_id;
+  basket.thisPtr = thisPtr;
+  basket.threadId = Dbg::GetThreadSelfId();
+  basket.className = Dbg::GetClassName(pLoc->class_id);
+  basket.field = fieldId;
+
+  if (InvokeInProgress()) {
+    VLOG(jdwp) << "Not posting field event during invoke";
+    return false;
+  }
+
+  // Get field's reference type tag.
+  JDWP::JdwpTypeTag type_tag;
+  uint32_t class_status;  // unused here.
+  JdwpError error = Dbg::GetClassInfo(typeId, &type_tag, &class_status, NULL);
+  if (error != ERR_NONE) {
+    return false;
+  }
+
+  // Get instance type tag.
+  uint8_t tag;
+  error = Dbg::GetObjectTag(thisPtr, tag);
+  if (error != ERR_NONE) {
+    return false;
+  }
+
+  int match_count = 0;
+  ExpandBuf* pReq = NULL;
+  JdwpSuspendPolicy suspend_policy = SP_NONE;
+  {
+    MutexLock mu(Thread::Current(), event_list_lock_);
+    JdwpEvent** match_list = AllocMatchList(event_list_size_);
+
+    if (is_modification) {
+      FindMatchingEvents(EK_FIELD_MODIFICATION, &basket, match_list, &match_count);
+    } else {
+      FindMatchingEvents(EK_FIELD_ACCESS, &basket, match_list, &match_count);
+    }
+    if (match_count != 0) {
+      VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total) "
+                 << basket.className << "." << Dbg::GetMethodName(pLoc->method_id)
+                 << StringPrintf(" thread=%#" PRIx64 "  dex_pc=%#" PRIx64 ")",
+                                 basket.threadId, pLoc->dex_pc);
+
+      suspend_policy = scanSuspendPolicy(match_list, match_count);
+      VLOG(jdwp) << "  suspend_policy=" << suspend_policy;
+
+      pReq = eventPrep();
+      expandBufAdd1(pReq, suspend_policy);
+      expandBufAdd4BE(pReq, match_count);
+
+      for (int i = 0; i < match_count; i++) {
+        expandBufAdd1(pReq, match_list[i]->eventKind);
+        expandBufAdd4BE(pReq, match_list[i]->requestId);
+        expandBufAdd8BE(pReq, basket.threadId);
+        expandBufAddLocation(pReq, *pLoc);
+        expandBufAdd1(pReq, type_tag);
+        expandBufAddRefTypeId(pReq, typeId);
+        expandBufAddFieldId(pReq, fieldId);
+        expandBufAdd1(pReq, tag);
+        expandBufAddObjectId(pReq, thisPtr);
+        if (is_modification) {
+          Dbg::OutputFieldValue(fieldId, fieldValue, pReq);
+        }
+      }
+    }
+
+    CleanupMatchList(match_list, match_count);
+  }
+
+  Dbg::ManageDeoptimization();
+
+  SendRequestAndPossiblySuspend(pReq, suspend_policy, basket.threadId);
+  return match_count != 0;
+}
+
 /*
  * A thread is starting or stopping.
  *
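
The MK_COUNT change above makes the count modifier suppress matches until its countdown reaches zero, instead of merely decrementing. The behavior in isolation (illustrative names; assert stands in for the runtime's CHECK macros):

    #include <cassert>

    // A count of N suppresses the first N-1 matching events and fires on the Nth.
    bool CountModifierMatches(int& count) {
      assert(count > 0);
      --count;
      return count == 0;
    }
    // With count initialized to 3, successive events yield false, false, true.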
diff --git a/runtime/jdwp/jdwp_handler.cc b/runtime/jdwp/jdwp_handler.cc
index c2a2b54..8ef375b 100644
--- a/runtime/jdwp/jdwp_handler.cc
+++ b/runtime/jdwp/jdwp_handler.cc
@@ -291,6 +291,7 @@
  */
 static JdwpError VM_Resume(JdwpState*, Request&, ExpandBuf*)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  Dbg::ProcessDelayedFullUndeoptimizations();
   Dbg::ResumeVM();
   return ERR_NONE;
 }
@@ -372,7 +373,7 @@
   expandBufAdd1(reply, false);   // canAddMethod
   expandBufAdd1(reply, false);   // canUnrestrictedlyRedefineClasses
   expandBufAdd1(reply, false);   // canPopFrames
-  expandBufAdd1(reply, false);   // canUseInstanceFilters
+  expandBufAdd1(reply, true);    // canUseInstanceFilters
   expandBufAdd1(reply, false);   // canGetSourceDebugExtension
   expandBufAdd1(reply, false);   // canRequestVMDeathEvent
   expandBufAdd1(reply, false);   // canSetDefaultStratum
@@ -980,6 +981,8 @@
     return ERR_NONE;
   }
 
+  Dbg::ProcessDelayedFullUndeoptimizations();
+
   Dbg::ResumeThread(thread_id);
   return ERR_NONE;
 }
diff --git a/runtime/jdwp/jdwp_main.cc b/runtime/jdwp/jdwp_main.cc
index 5fc0228..f480256 100644
--- a/runtime/jdwp/jdwp_main.cc
+++ b/runtime/jdwp/jdwp_main.cc
@@ -237,55 +237,41 @@
   Locks::mutator_lock_->AssertNotHeld(self);
   UniquePtr<JdwpState> state(new JdwpState(options));
   switch (options->transport) {
-  case kJdwpTransportSocket:
-    InitSocketTransport(state.get(), options);
-    break;
+    case kJdwpTransportSocket:
+      InitSocketTransport(state.get(), options);
+      break;
 #ifdef HAVE_ANDROID_OS
-  case kJdwpTransportAndroidAdb:
-    InitAdbTransport(state.get(), options);
-    break;
+    case kJdwpTransportAndroidAdb:
+      InitAdbTransport(state.get(), options);
+      break;
 #endif
-  default:
-    LOG(FATAL) << "Unknown transport: " << options->transport;
+    default:
+      LOG(FATAL) << "Unknown transport: " << options->transport;
   }
 
-  if (!options->suspend) {
+  {
     /*
      * Grab a mutex before starting the thread.  This ensures they
      * won't signal the cond var before we're waiting.
      */
     MutexLock thread_start_locker(self, state->thread_start_lock_);
+
     /*
      * We have bound to a port, or are trying to connect outbound to a
      * debugger.  Create the JDWP thread and let it continue the mission.
      */
-    CHECK_PTHREAD_CALL(pthread_create, (&state->pthread_, NULL, StartJdwpThread, state.get()), "JDWP thread");
+    CHECK_PTHREAD_CALL(pthread_create, (&state->pthread_, nullptr, StartJdwpThread, state.get()),
+                       "JDWP thread");
 
     /*
      * Wait until the thread finishes basic initialization.
-     * TODO: cond vars should be waited upon in a loop
      */
-    state->thread_start_cond_.Wait(self);
-  } else {
-    {
-      /*
-       * Grab a mutex before starting the thread.  This ensures they
-       * won't signal the cond var before we're waiting.
-       */
-      MutexLock thread_start_locker(self, state->thread_start_lock_);
-      /*
-       * We have bound to a port, or are trying to connect outbound to a
-       * debugger.  Create the JDWP thread and let it continue the mission.
-       */
-      CHECK_PTHREAD_CALL(pthread_create, (&state->pthread_, NULL, StartJdwpThread, state.get()), "JDWP thread");
-
-      /*
-       * Wait until the thread finishes basic initialization.
-       * TODO: cond vars should be waited upon in a loop
-       */
+    while (!state->debug_thread_started_) {
       state->thread_start_cond_.Wait(self);
     }
+  }
 
+  if (options->suspend) {
     /*
      * For suspend=y, wait for the debugger to connect to us or for us to
      * connect to the debugger.
@@ -332,6 +318,8 @@
     CHECK(event_list_ == NULL);
   }
 
+  Dbg::ProcessDelayedFullUndeoptimizations();
+
   /*
    * Should not have one of these in progress.  If the debugger went away
    * mid-request, though, we could see this.
@@ -481,11 +469,8 @@
     /* process requests until the debugger drops */
     bool first = true;
     while (!Dbg::IsDisposed()) {
-      {
-        // sanity check -- shouldn't happen?
-        MutexLock mu(thread_, *Locks::thread_suspend_count_lock_);
-        CHECK_EQ(thread_->GetState(), kWaitingInMainDebuggerLoop);
-      }
+      // sanity check -- shouldn't happen?
+      CHECK_EQ(thread_->GetState(), kWaitingInMainDebuggerLoop);
 
       if (!netState->ProcessIncoming()) {
         /* blocking read */
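
The Create() rewrite above also resolves the old "cond vars should be waited upon in a loop" TODO by re-checking debug_thread_started_ around the Wait(). The same pattern with standard-library types; m, cv and started are illustrative stand-ins for thread_start_lock_, thread_start_cond_ and debug_thread_started_.

    #include <condition_variable>
    #include <mutex>

    std::mutex m;
    std::condition_variable cv;
    bool started = false;  // the predicate, set by the newly created thread under m

    void WaitForThreadStart() {
      std::unique_lock<std::mutex> lock(m);
      cv.wait(lock, [] { return started; });  // loops internally: immune to spurious wakeups
    }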
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index a6db387..b195dea 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -51,13 +51,14 @@
       OFFSET_OF_OBJECT_MEMBER(Object, klass_), new_klass, false);
 }
 
-inline LockWord Object::GetLockWord() {
-  return LockWord(GetField32(OFFSET_OF_OBJECT_MEMBER(Object, monitor_), true));
+inline LockWord Object::GetLockWord(bool as_volatile) {
+  return LockWord(GetField32(OFFSET_OF_OBJECT_MEMBER(Object, monitor_), as_volatile));
 }
 
-inline void Object::SetLockWord(LockWord new_val) {
+inline void Object::SetLockWord(LockWord new_val, bool as_volatile) {
   // Force use of non-transactional mode and do not check.
-  SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(Object, monitor_), new_val.GetValue(), true);
+  SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(Object, monitor_), new_val.GetValue(),
+                           as_volatile);
 }
 
 inline bool Object::CasLockWord(LockWord old_val, LockWord new_val) {
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index d9155f5..766bbc9 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -39,6 +39,32 @@
 namespace art {
 namespace mirror {
 
+class CopyReferenceFieldsWithReadBarrierVisitor {
+ public:
+  explicit CopyReferenceFieldsWithReadBarrierVisitor(Object* dest_obj)
+      : dest_obj_(dest_obj) {}
+
+  void operator()(Object* obj, MemberOffset offset, bool /* is_static */) const
+      ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // GetFieldObject() contains a RB.
+    Object* ref = obj->GetFieldObject<Object>(offset, false);
+    // No WB here as a large object space does not have card table
+    // coverage. Instead, cards will be marked separately.
+    dest_obj_->SetFieldObjectWithoutWriteBarrier<false, false>(offset, ref, false);
+  }
+
+  void operator()(mirror::Class* klass, mirror::Reference* ref) const
+      ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // Copy java.lang.ref.Reference.referent which isn't visited in
+    // Object::VisitReferences().
+    DCHECK(klass->IsReferenceClass());
+    this->operator()(ref, mirror::Reference::ReferentOffset(), false);
+  }
+
+ private:
+  Object* const dest_obj_;
+};
+
 static Object* CopyObject(Thread* self, mirror::Object* dest, mirror::Object* src, size_t num_bytes)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Copy instance data.  We assume memcpy copies by words.
@@ -47,6 +73,13 @@
   byte* dst_bytes = reinterpret_cast<byte*>(dest);
   size_t offset = sizeof(Object);
   memcpy(dst_bytes + offset, src_bytes + offset, num_bytes - offset);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // We need a RB here. After the memcpy that covers the whole
+    // object above, copy references fields one by one again with a
+    // RB. TODO: Optimize this later?
+    CopyReferenceFieldsWithReadBarrierVisitor visitor(dest);
+    src->VisitReferences<true>(visitor, visitor);
+  }
   gc::Heap* heap = Runtime::Current()->GetHeap();
   // Perform write barriers on copied object references.
   Class* c = src->GetClass();
@@ -117,7 +150,7 @@
 int32_t Object::IdentityHashCode() const {
   mirror::Object* current_this = const_cast<mirror::Object*>(this);
   while (true) {
-    LockWord lw = current_this->GetLockWord();
+    LockWord lw = current_this->GetLockWord(false);
     switch (lw.GetState()) {
       case LockWord::kUnlocked: {
         // Try to compare and swap in a new hash, if we succeed we will return the hash on the next
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index f652202..fd31dfb 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -100,8 +100,10 @@
     return OFFSET_OF_OBJECT_MEMBER(Object, monitor_);
   }
 
-  LockWord GetLockWord();
-  void SetLockWord(LockWord new_val);
+  // as_volatile can be false if the mutators are suspended. This is an optimization since it
+  // avoids the memory barriers.
+  LockWord GetLockWord(bool as_volatile);
+  void SetLockWord(LockWord new_val, bool as_volatile);
   bool CasLockWord(LockWord old_val, LockWord new_val) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   uint32_t GetLockOwnerThreadId();
 
diff --git a/runtime/mirror/object_array-inl.h b/runtime/mirror/object_array-inl.h
index 8032cc3..e0c14c3 100644
--- a/runtime/mirror/object_array-inl.h
+++ b/runtime/mirror/object_array-inl.h
@@ -128,7 +128,27 @@
   CHECK_EQ(sizeof(HeapReference<T>), sizeof(uint32_t));
   IntArray* dstAsIntArray = reinterpret_cast<IntArray*>(this);
   IntArray* srcAsIntArray = reinterpret_cast<IntArray*>(src);
-  dstAsIntArray->Memmove(dst_pos, srcAsIntArray, src_pos, count);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // TODO: Optimize this later?
+    const bool copy_forward = (src != this) || (dst_pos < src_pos) || (dst_pos - src_pos >= count);
+    if (copy_forward) {
+      // Forward copy.
+      for (int i = 0; i < count; ++i) {
+        // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+        Object* obj = src->GetWithoutChecks(src_pos + i);
+        SetWithoutChecks<false>(dst_pos + i, obj);
+      }
+    } else {
+      // Backward copy.
+      for (int i = count - 1; i >= 0; --i) {
+        // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+        Object* obj = src->GetWithoutChecks(src_pos + i);
+        SetWithoutChecks<false>(dst_pos + i, obj);
+      }
+    }
+  } else {
+    dstAsIntArray->Memmove(dst_pos, srcAsIntArray, src_pos, count);
+  }
   Runtime::Current()->GetHeap()->WriteBarrierArray(this, dst_pos, count);
   if (kIsDebugBuild) {
     for (int i = 0; i < count; ++i) {
@@ -151,7 +171,16 @@
   CHECK_EQ(sizeof(HeapReference<T>), sizeof(uint32_t));
   IntArray* dstAsIntArray = reinterpret_cast<IntArray*>(this);
   IntArray* srcAsIntArray = reinterpret_cast<IntArray*>(src);
-  dstAsIntArray->Memcpy(dst_pos, srcAsIntArray, src_pos, count);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // TODO: Optimize this later?
+    for (int i = 0; i < count; ++i) {
+      // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+      T* obj = src->GetWithoutChecks(src_pos + i);
+      SetWithoutChecks<false>(dst_pos + i, obj);
+    }
+  } else {
+    dstAsIntArray->Memcpy(dst_pos, srcAsIntArray, src_pos, count);
+  }
   Runtime::Current()->GetHeap()->WriteBarrierArray(this, dst_pos, count);
   if (kIsDebugBuild) {
     for (int i = 0; i < count; ++i) {
@@ -176,6 +205,7 @@
   int i = 0;
   for (; i < count; ++i) {
     // The follow get operations force the objects to be verified.
+    // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
     o = src->GetWithoutChecks(src_pos + i);
     if (o == nullptr) {
       // Null is always assignable.
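
The read-barrier path above re-implements the overlap handling that Memmove previously provided. The direction rule in isolation, as a standalone sketch over plain pointers:

    #include <cstddef>

    // Forward copy is safe unless the destination begins strictly inside the
    // source range; in that case copy backward, as memmove does.
    void CopyOverlapAware(int* dst, const int* src, size_t count) {
      const bool copy_forward = (dst <= src) || (dst >= src + count);
      if (copy_forward) {
        for (size_t i = 0; i < count; ++i) dst[i] = src[i];
      } else {
        for (size_t i = count; i > 0; --i) dst[i - 1] = src[i - 1];
      }
    }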
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index bcaf8ec..38b77d1 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -111,7 +111,7 @@
   MutexLock mu(self, monitor_lock_);  // Uncontended mutex acquisition as monitor isn't yet public.
   CHECK(owner_ == nullptr || owner_ == self || owner_->IsSuspended());
   // Propagate the lock state.
-  LockWord lw(obj_->GetLockWord());
+  LockWord lw(obj_->GetLockWord(false));
   switch (lw.GetState()) {
     case LockWord::kThinLocked: {
       CHECK_EQ(owner_->GetThreadId(), lw.ThinLockOwner());
@@ -205,7 +205,7 @@
 void Monitor::Lock(Thread* self) {
   MutexLock mu(self, monitor_lock_);
   while (true) {
-    if (owner_ == NULL) {  // Unowned.
+    if (owner_ == nullptr) {  // Unowned.
       owner_ = self;
       CHECK_EQ(lock_count_, 0);
       // When debugging, save the current monitor holder for future
@@ -223,15 +223,15 @@
     uint64_t wait_start_ms = log_contention ? 0 : MilliTime();
     mirror::ArtMethod* owners_method = locking_method_;
     uint32_t owners_dex_pc = locking_dex_pc_;
+    // Do this before releasing the lock so that we don't get deflated.
+    ++num_waiters_;
     monitor_lock_.Unlock(self);  // Let go of locks in order.
     {
       ScopedThreadStateChange tsc(self, kBlocked);  // Change to blocked and give up mutator_lock_.
       self->SetMonitorEnterObject(obj_);
       MutexLock mu2(self, monitor_lock_);  // Reacquire monitor_lock_ without mutator_lock_ for Wait.
       if (owner_ != NULL) {  // Did the owner_ give the lock up?
-        ++num_waiters_;
         monitor_contenders_.Wait(self);  // Still contended so wait.
-        --num_waiters_;
         // Woken from contention.
         if (log_contention) {
           uint64_t wait_ms = MilliTime() - wait_start_ms;
@@ -252,6 +252,7 @@
       self->SetMonitorEnterObject(nullptr);
     }
     monitor_lock_.Lock(self);  // Reacquire locks in order.
+    --num_waiters_;
   }
 }
 
@@ -431,6 +432,7 @@
    * not order sensitive as we hold the pthread mutex.
    */
   AppendToWaitSet(self);
+  ++num_waiters_;
   int prev_lock_count = lock_count_;
   lock_count_ = 0;
   owner_ = NULL;
@@ -507,6 +509,7 @@
   lock_count_ = prev_lock_count;
   locking_method_ = saved_method;
   locking_dex_pc_ = saved_dex_pc;
+  --num_waiters_;
   RemoveFromWaitSet(self);
 
   if (was_interrupted) {
@@ -571,12 +574,17 @@
 
 bool Monitor::Deflate(Thread* self, mirror::Object* obj) {
   DCHECK(obj != nullptr);
-  LockWord lw(obj->GetLockWord());
+  // Don't need volatile since we only deflate with mutators suspended.
+  LockWord lw(obj->GetLockWord(false));
   // If the lock isn't an inflated monitor, then we don't need to deflate anything.
   if (lw.GetState() == LockWord::kFatLocked) {
     Monitor* monitor = lw.FatLockMonitor();
-    CHECK(monitor != nullptr);
+    DCHECK(monitor != nullptr);
     MutexLock mu(self, monitor->monitor_lock_);
+    // Can't deflate if we have anybody waiting on the CV.
+    if (monitor->num_waiters_ > 0) {
+      return false;
+    }
     Thread* owner = monitor->owner_;
     if (owner != nullptr) {
       // Can't deflate if we are locked and have a hash code.
@@ -587,17 +595,17 @@
       if (monitor->lock_count_ > LockWord::kThinLockMaxCount) {
         return false;
       }
-      // Can't deflate if we have anybody waiting on the CV.
-      if (monitor->num_waiters_ > 0) {
-        return false;
-      }
       // Deflate to a thin lock.
-      obj->SetLockWord(LockWord::FromThinLockId(owner->GetTid(), monitor->lock_count_));
+      obj->SetLockWord(LockWord::FromThinLockId(owner->GetThreadId(), monitor->lock_count_), false);
+      VLOG(monitor) << "Deflated " << obj << " to thin lock " << owner->GetTid() << " / "
+          << monitor->lock_count_;
     } else if (monitor->HasHashCode()) {
-      obj->SetLockWord(LockWord::FromHashCode(monitor->GetHashCode()));
+      obj->SetLockWord(LockWord::FromHashCode(monitor->GetHashCode()), false);
+      VLOG(monitor) << "Deflated " << obj << " to hash monitor " << monitor->GetHashCode();
     } else {
       // No lock and no hash, just put an empty lock word inside the object.
-      obj->SetLockWord(LockWord());
+      obj->SetLockWord(LockWord(), false);
+      VLOG(monitor) << "Deflated" << obj << " to empty lock word";
     }
     // The monitor is deflated, mark the object as nullptr so that we know to delete it during the
     // next GC.
@@ -620,7 +628,7 @@
     VLOG(monitor) << "monitor: thread " << owner->GetThreadId()
                     << " created monitor " << m.get() << " for object " << obj;
     Runtime::Current()->GetMonitorList()->Add(m.release());
-    CHECK_EQ(obj->GetLockWord().GetState(), LockWord::kFatLocked);
+    CHECK_EQ(obj->GetLockWord(true).GetState(), LockWord::kFatLocked);
   }
 }
 
@@ -636,12 +644,12 @@
     // Suspend the owner, inflate. First change to blocked and give up mutator_lock_.
     ScopedThreadStateChange tsc(self, kBlocked);
     self->SetMonitorEnterObject(obj.get());
-    if (lock_word == obj->GetLockWord()) {  // If lock word hasn't changed.
+    if (lock_word == obj->GetLockWord(true)) {  // If lock word hasn't changed.
       bool timed_out;
       Thread* owner = thread_list->SuspendThreadByThreadId(owner_thread_id, false, &timed_out);
       if (owner != nullptr) {
         // We succeeded in suspending the thread, check the lock's status didn't change.
-        lock_word = obj->GetLockWord();
+        lock_word = obj->GetLockWord(true);
         if (lock_word.GetState() == LockWord::kThinLocked &&
             lock_word.ThinLockOwner() == owner_thread_id) {
           // Go ahead and inflate the lock.
@@ -674,7 +682,7 @@
   size_t contention_count = 0;
   SirtRef<mirror::Object> sirt_obj(self, obj);
   while (true) {
-    LockWord lock_word = sirt_obj->GetLockWord();
+    LockWord lock_word = sirt_obj->GetLockWord(true);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0));
@@ -691,7 +699,7 @@
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
             LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count));
-            sirt_obj->SetLockWord(thin_locked);
+            sirt_obj->SetLockWord(thin_locked, true);
             return sirt_obj.get();  // Success!
           } else {
             // We'd overflow the recursion count, so inflate the monitor.
@@ -731,13 +739,13 @@
   DCHECK(self != NULL);
   DCHECK(obj != NULL);
   obj = FakeUnlock(obj);
-  LockWord lock_word = obj->GetLockWord();
+  LockWord lock_word = obj->GetLockWord(true);
   SirtRef<mirror::Object> sirt_obj(self, obj);
   switch (lock_word.GetState()) {
     case LockWord::kHashCode:
       // Fall-through.
     case LockWord::kUnlocked:
-      FailedUnlock(sirt_obj.get(), self, NULL, NULL);
+      FailedUnlock(sirt_obj.get(), self, nullptr, nullptr);
       return false;  // Failure.
     case LockWord::kThinLocked: {
       uint32_t thread_id = self->GetThreadId();
@@ -746,16 +754,16 @@
         // TODO: there's a race here with the owner dying while we unlock.
         Thread* owner =
             Runtime::Current()->GetThreadList()->FindThreadByThreadId(lock_word.ThinLockOwner());
-        FailedUnlock(sirt_obj.get(), self, owner, NULL);
+        FailedUnlock(sirt_obj.get(), self, owner, nullptr);
         return false;  // Failure.
       } else {
         // We own the lock, decrease the recursion count.
         if (lock_word.ThinLockCount() != 0) {
           uint32_t new_count = lock_word.ThinLockCount() - 1;
           LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count));
-          sirt_obj->SetLockWord(thin_locked);
+          sirt_obj->SetLockWord(thin_locked, true);
         } else {
-          sirt_obj->SetLockWord(LockWord());
+          sirt_obj->SetLockWord(LockWord(), true);
         }
         return true;  // Success!
       }
@@ -776,10 +784,9 @@
  */
 void Monitor::Wait(Thread* self, mirror::Object *obj, int64_t ms, int32_t ns,
                    bool interruptShouldThrow, ThreadState why) {
-  DCHECK(self != NULL);
-  DCHECK(obj != NULL);
-
-  LockWord lock_word = obj->GetLockWord();
+  DCHECK(self != nullptr);
+  DCHECK(obj != nullptr);
+  LockWord lock_word = obj->GetLockWord(true);
   switch (lock_word.GetState()) {
     case LockWord::kHashCode:
       // Fall-through.
@@ -795,7 +802,7 @@
       } else {
         // We own the lock, inflate to enqueue ourself on the Monitor.
         Inflate(self, self, obj, 0);
-        lock_word = obj->GetLockWord();
+        lock_word = obj->GetLockWord(true);
       }
       break;
     }
@@ -811,10 +818,9 @@
 }
 
 void Monitor::DoNotify(Thread* self, mirror::Object* obj, bool notify_all) {
-  DCHECK(self != NULL);
-  DCHECK(obj != NULL);
-
-  LockWord lock_word = obj->GetLockWord();
+  DCHECK(self != nullptr);
+  DCHECK(obj != nullptr);
+  LockWord lock_word = obj->GetLockWord(true);
   switch (lock_word.GetState()) {
     case LockWord::kHashCode:
       // Fall-through.
@@ -849,9 +855,8 @@
 }
 
 uint32_t Monitor::GetLockOwnerThreadId(mirror::Object* obj) {
-  DCHECK(obj != NULL);
-
-  LockWord lock_word = obj->GetLockWord();
+  DCHECK(obj != nullptr);
+  LockWord lock_word = obj->GetLockWord(true);
   switch (lock_word.GetState()) {
     case LockWord::kHashCode:
       // Fall-through.
@@ -896,7 +901,7 @@
     if (pretty_object == nullptr) {
       os << wait_message << "an unknown object";
     } else {
-      if ((pretty_object->GetLockWord().GetState() == LockWord::kThinLocked) &&
+      if ((pretty_object->GetLockWord(true).GetState() == LockWord::kThinLocked) &&
           Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
         // Getting the identity hashcode here would result in lock inflation and suspension of the
         // current thread, which isn't safe if this is the only runnable thread.
@@ -1054,7 +1059,7 @@
 }
 
 MonitorList::MonitorList()
-    : allow_new_monitors_(true), monitor_list_lock_("MonitorList lock"),
+    : allow_new_monitors_(true), monitor_list_lock_("MonitorList lock", kMonitorListLock),
       monitor_add_condition_("MonitorList disallow condition", monitor_list_lock_) {
 }
 
@@ -1103,10 +1108,25 @@
   }
 }
 
-MonitorInfo::MonitorInfo(mirror::Object* obj) : owner_(NULL), entry_count_(0) {
-  DCHECK(obj != NULL);
+static mirror::Object* MonitorDeflateCallback(mirror::Object* object, void* arg)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  if (Monitor::Deflate(reinterpret_cast<Thread*>(arg), object)) {
+    DCHECK_NE(object->GetLockWord(true).GetState(), LockWord::kFatLocked);
+    // If we deflated, return nullptr so that the monitor gets removed from the array.
+    return nullptr;
+  }
+  return object;  // Monitor was not deflated.
+}
 
-  LockWord lock_word = obj->GetLockWord();
+void MonitorList::DeflateMonitors() {
+  Thread* self = Thread::Current();
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  SweepMonitorList(MonitorDeflateCallback, reinterpret_cast<Thread*>(self));
+}
+
+MonitorInfo::MonitorInfo(mirror::Object* obj) : owner_(NULL), entry_count_(0) {
+  DCHECK(obj != nullptr);
+  LockWord lock_word = obj->GetLockWord(true);
   switch (lock_word.GetState()) {
     case LockWord::kUnlocked:
       // Fall-through.
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 55504b5..0b80892 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -224,11 +224,17 @@
   void Add(Monitor* m);
 
   void SweepMonitorList(IsMarkedCallback* callback, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void DisallowNewMonitors();
-  void AllowNewMonitors();
+      LOCKS_EXCLUDED(monitor_list_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void DisallowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
+  void AllowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
+  void DeflateMonitors() LOCKS_EXCLUDED(monitor_list_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
+  // During sweeping we may free an object and on a separate thread have an object created using
+  // the newly freed memory. That object may then have its lock-word inflated and a monitor created.
+  // If we allow new monitor registration during sweeping, this monitor may be incorrectly freed
+  // because the object wasn't marked when sweeping began.
   bool allow_new_monitors_ GUARDED_BY(monitor_list_lock_);
   Mutex monitor_list_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   ConditionVariable monitor_add_condition_ GUARDED_BY(monitor_list_lock_);
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index bab0604..6af16f4 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
-#include <unistd.h>
+#include <algorithm>
 #include <fcntl.h>
+#include <set>
+#include <unistd.h>
 
 #include "base/logging.h"
 #include "class_linker.h"
@@ -30,6 +32,7 @@
 #include "mirror/string.h"
 #include "oat.h"
 #include "os.h"
+#include "profiler.h"
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
 #include "ScopedLocalRef.h"
@@ -101,6 +104,7 @@
 
   uint32_t dex_location_checksum;
   uint32_t* dex_location_checksum_pointer = &dex_location_checksum;
+  std::vector<std::string> error_msgs;
   std::string error_msg;
   if (!DexFile::GetChecksum(sourceName.c_str(), dex_location_checksum_pointer, &error_msg)) {
     dex_location_checksum_pointer = NULL;
@@ -110,9 +114,8 @@
   const DexFile* dex_file;
   if (outputName.c_str() == nullptr) {
     // FindOrCreateOatFileForDexLocation can tolerate a missing dex_location_checksum
-    error_msg.clear();
     dex_file = linker->FindDexFileInOatFileFromDexLocation(sourceName.c_str(),
-                                                           dex_location_checksum_pointer, &error_msg);
+                                                           dex_location_checksum_pointer, &error_msgs);
   } else {
     // FindOrCreateOatFileForDexLocation requires the dex_location_checksum
     if (dex_location_checksum_pointer == NULL) {
@@ -122,12 +125,19 @@
       return 0;
     }
     dex_file = linker->FindOrCreateOatFileForDexLocation(sourceName.c_str(), dex_location_checksum,
-                                                         outputName.c_str(), &error_msg);
+                                                         outputName.c_str(), &error_msgs);
   }
   if (dex_file == nullptr) {
     ScopedObjectAccess soa(env);
-    CHECK(!error_msg.empty());
-    ThrowIOException("%s", error_msg.c_str());
+    CHECK(!error_msgs.empty());
+    // The most important message is at the end, so wrap in order: each earlier exception
+    // becomes the cause of the one thrown after it.
+    auto it = error_msgs.begin();
+    auto itEnd = error_msgs.end();
+    for ( ; it != itEnd; ++it) {
+      ThrowWrappedIOException("%s", it->c_str());
+    }
+
     return 0;
   }
   return static_cast<jlong>(reinterpret_cast<uintptr_t>(dex_file));
@@ -230,13 +240,31 @@
   close(fd2);
 }
 
+static double GetDoubleProperty(const char* property, double minValue, double maxValue, double defaultValue) {
+#ifndef HAVE_ANDROID_OS
+  return defaultValue;
+#else
+  char buf[PROP_VALUE_MAX];
+  char* endptr;
+
+  property_get(property, buf, "");
+  double value = strtod(buf, &endptr);
+
+  if (value == 0 && endptr == buf) {
+    value = defaultValue;
+  } else if (value < minValue || value > maxValue) {
+    value = defaultValue;
+  }
+  return value;
+#endif
+}
+
 static jboolean DexFile_isDexOptNeededInternal(JNIEnv* env, jclass, jstring javaFilename,
     jstring javaPkgname, jboolean defer) {
   const bool kVerboseLogging = false;  // Spammy logging.
   const bool kDebugLogging = true;  // Logging useful for debugging.
 
   ScopedUtfChars filename(env, javaFilename);
-
   if ((filename.c_str() == nullptr) || !OS::FileExists(filename.c_str())) {
     LOG(ERROR) << "DexFile_isDexOptNeeded file '" << filename.c_str() << "' does not exist";
     ScopedLocalRef<jclass> fnfe(env, env->FindClass("java/io/FileNotFoundException"));
@@ -282,7 +310,6 @@
     struct stat profstat, prevstat;
     int e1 = stat(profile_file.c_str(), &profstat);
     int e2 = stat(prev_profile_file.c_str(), &prevstat);
-
     if (e1 < 0) {
       // No profile file, need to run dex2oat
       if (kDebugLogging) {
@@ -290,48 +317,47 @@
       }
       return JNI_TRUE;
     }
+
     if (e2 == 0) {
       // There is a previous profile file.  Check if the profile has changed significantly.
-      // Let's use the file size as a proxy for significance.  If the new profile is 10%
-      // different in size than the the old profile then we run dex2oat.
-      double newsize = profstat.st_size;
-      double oldsize = prevstat.st_size;
-      bool need_profile = false;
+      // A change in profile is considered significant if X% (change_thr property) of the top K%
+      // (compile_thr property) samples have changed.
 
-      double ratio = 0;     // If the old file was empty and the new one not
-      if (oldsize > 0 && newsize > 0) {
-        ratio = newsize / oldsize;
-      } else if (oldsize == 0 && newsize > 0) {
-        need_profile = true;
-      } else if (oldsize > 0 && newsize == 0) {
-        // Unlikely to happen, but cover all the bases.
-        need_profile = true;
+      double topKThreshold = GetDoubleProperty("dalvik.vm.profiler.dex2oat.compile_thr", 10.0, 90.0, 90.0);
+      double changeThreshold = GetDoubleProperty("dalvik.vm.profiler.dex2oat.change_thr", 1.0, 90.0, 10.0);
+      double changePercent = 0.0;
+      std::set<std::string> newTopK, oldTopK;
+      bool newOk = ProfileHelper::LoadTopKSamples(newTopK, profile_file, topKThreshold);
+      bool oldOk = ProfileHelper::LoadTopKSamples(oldTopK, prev_profile_file, topKThreshold);
+      if (!newOk || !oldOk) {
+        if (kDebugLogging) {
+          LOG(INFO) << "DexFile_isDexOptNeeded Ignoring invalid profiles: "
+                    << (newOk ?  "" : profile_file) << " " << (oldOk ? "" : prev_profile_file);
+        }
+      } else if (newTopK.empty()) {
+        if (kDebugLogging && kVerboseLogging) {
+          LOG(INFO) << "DexFile_isDexOptNeeded empty profile: " << profile_file;
+        }
+        // If the new topK is empty, we shouldn't optimize, so we leave changePercent at 0.0.
+      } else {
+        std::set<std::string> diff;
+        std::set_difference(newTopK.begin(), newTopK.end(), oldTopK.begin(), oldTopK.end(),
+          std::inserter(diff, diff.end()));
+        // TODO: consider using the usedPercentage instead of the plain diff count.
+        changePercent = 100.0 * static_cast<double>(diff.size()) / static_cast<double>(newTopK.size());
+        if (kDebugLogging && kVerboseLogging) {
+          std::set<std::string>::iterator end = diff.end();
+          for (std::set<std::string>::iterator it = diff.begin(); it != end; it++) {
+            LOG(INFO) << "DexFile_isDexOptNeeded new in topK: " << *it;
+          }
+        }
       }
 
-      double significant_difference = 10.0;
-#ifdef HAVE_ANDROID_OS
-      // Switch off profiler if the dalvik.vm.profiler property has value 0.
-      char buf[PROP_VALUE_MAX];
-      property_get("dalvik.vm.profiler.dex2oat.threshold", buf, "10.0");
-      significant_difference = strtod(buf, nullptr);
-
-      // Something reasonable?
-      if (significant_difference < 1.0 || significant_difference > 90.0) {
-        significant_difference = 10.0;
-      }
-#endif      // The percentage difference that we consider as being significant.
-      double diff_hwm = 1.0 + significant_difference/10.0;
-      double diff_lwm = 1.0 - significant_difference/10.0;
-
-      if (ratio > diff_hwm || ratio < diff_lwm) {
-        need_profile = true;
-      }
-
-      if (need_profile) {
+      if (changePercent > changeThreshold) {
         if (kDebugLogging) {
           LOG(INFO) << "DexFile_isDexOptNeeded size of new profile file " << profile_file <<
-          " is significantly different from old profile file " << prev_profile_file << " (new: " <<
-          newsize << ", old: " << oldsize << ", ratio: " << ratio << ")";
+          " is significantly different from old profile file " << prev_profile_file << " (top "
+          << topKThreshold << "% samples changed in proportion of " << changePercent << "%)";
         }
         if (!defer) {
           CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
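(Aside, not part of the patch: a self-contained sketch, with made-up method names, of the
set_difference-based change computation above; the change percentage is the share of the
new top-K set that was absent from the old one.)

    #include <algorithm>
    #include <cstdio>
    #include <iterator>
    #include <set>
    #include <string>

    int main() {
      std::set<std::string> newTopK = {"A", "B", "C", "D"};
      std::set<std::string> oldTopK = {"A", "B", "X", "Y"};
      // Methods in the new top K that were not in the old one.
      std::set<std::string> diff;
      std::set_difference(newTopK.begin(), newTopK.end(), oldTopK.begin(), oldTopK.end(),
                          std::inserter(diff, diff.end()));
      double changePercent = 100.0 * diff.size() / newTopK.size();
      printf("%.1f%% changed\n", changePercent);  // 50.0% ("C" and "D" are new).
      return 0;
    }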
diff --git a/runtime/object_callbacks.h b/runtime/object_callbacks.h
index 89ee34e..9198c90 100644
--- a/runtime/object_callbacks.h
+++ b/runtime/object_callbacks.h
@@ -56,7 +56,7 @@
     __attribute__((warn_unused_result));
 // A callback for verifying roots.
 typedef void (VerifyRootCallback)(const mirror::Object* root, void* arg, size_t vreg,
-    const StackVisitor* visitor);
+    const StackVisitor* visitor, RootType root_type);
 
 typedef void (MarkHeapReferenceCallback)(mirror::HeapReference<mirror::Object>* ref, void* arg);
 
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 08a674f..084e8f6 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -131,6 +131,7 @@
   heap_min_free_ = gc::Heap::kDefaultMinFree;
   heap_max_free_ = gc::Heap::kDefaultMaxFree;
   heap_target_utilization_ = gc::Heap::kDefaultTargetUtilization;
+  foreground_heap_growth_multiplier_ = gc::Heap::kDefaultHeapGrowthMultiplier;
   heap_growth_limit_ = 0;  // 0 means no growth limit .
   // Default to number of processors minus one since the main GC thread also does work.
   parallel_gc_threads_ = sysconf(_SC_NPROCESSORS_CONF) - 1;
@@ -194,6 +195,7 @@
   profile_duration_s_ = 20;          // Seconds.
   profile_interval_us_ = 500;       // Microseconds.
   profile_backoff_coefficient_ = 2.0;
+  profile_start_immediately_ = true;
   profile_clock_source_ = kDefaultProfilerClockSource;
 
   verify_ = true;
@@ -249,7 +251,7 @@
       // TODO: support -Djava.class.path
       i++;
       if (i == options.size()) {
-        Usage("Missing required class path value for %s", option.c_str());
+        Usage("Missing required class path value for %s\n", option.c_str());
         return false;
       }
       const StringPiece& value = options[i].first;
@@ -277,35 +279,35 @@
     } else if (StartsWith(option, "-Xms")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-Xms")).c_str(), 1024);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       heap_initial_size_ = size;
     } else if (StartsWith(option, "-Xmx")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-Xmx")).c_str(), 1024);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       heap_maximum_size_ = size;
     } else if (StartsWith(option, "-XX:HeapGrowthLimit=")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-XX:HeapGrowthLimit=")).c_str(), 1024);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       heap_growth_limit_ = size;
     } else if (StartsWith(option, "-XX:HeapMinFree=")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-XX:HeapMinFree=")).c_str(), 1024);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       heap_min_free_ = size;
     } else if (StartsWith(option, "-XX:HeapMaxFree=")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-XX:HeapMaxFree=")).c_str(), 1024);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       heap_max_free_ = size;
@@ -313,6 +315,10 @@
       if (!ParseDouble(option, '=', 0.1, 0.9, &heap_target_utilization_)) {
         return false;
       }
+    } else if (StartsWith(option, "-XX:ForegroundHeapGrowthMultiplier=")) {
+      if (!ParseDouble(option, '=', 0.1, 10.0, &foreground_heap_growth_multiplier_)) {
+        return false;
+      }
     } else if (StartsWith(option, "-XX:ParallelGCThreads=")) {
       if (!ParseUnsignedInteger(option, '=', &parallel_gc_threads_)) {
         return false;
@@ -324,7 +330,7 @@
     } else if (StartsWith(option, "-Xss")) {
       size_t size = ParseMemoryOption(option.substr(strlen("-Xss")).c_str(), 1);
       if (size == 0) {
-        Usage("Failed to parse memory option %s", option.c_str());
+        Usage("Failed to parse memory option %s\n", option.c_str());
         return false;
       }
       stack_size_ = size;
@@ -392,7 +398,7 @@
                    (gc_option == "noverifycardtable")) {
           // Ignored for backwards compatibility.
         } else {
-          Usage("Unknown -Xgc option %s", gc_option.c_str());
+          Usage("Unknown -Xgc option %s\n", gc_option.c_str());
           return false;
         }
       }
@@ -405,7 +411,7 @@
       if (collector_type != gc::kCollectorTypeNone) {
         background_collector_type_ = collector_type;
       } else {
-        Usage("Unknown -XX:BackgroundGC option %s", substring.c_str());
+        Usage("Unknown -XX:BackgroundGC option %s\n", substring.c_str());
         return false;
       }
     } else if (option == "-XX:+DisableExplicitGC") {
@@ -437,10 +443,13 @@
         } else if (verbose_options[i] == "threads") {
           gLogVerbosity.threads = true;
         } else {
-          Usage("Unknown -verbose option %s", verbose_options[i].c_str());
+          Usage("Unknown -verbose option %s\n", verbose_options[i].c_str());
           return false;
         }
       }
+    } else if (StartsWith(option, "-verbose-methods:")) {
+      gLogVerbosity.compiler = false;
+      Split(option.substr(strlen("-verbose-methods:")), ',', gVerboseMethods);
     } else if (StartsWith(option, "-Xlockprofthreshold:")) {
       if (!ParseUnsignedInteger(option, ':', &lock_profiling_threshold_)) {
         return false;
@@ -470,7 +479,7 @@
     } else if (option == "abort") {
       const void* hook = options[i].second;
       if (hook == nullptr) {
-        Usage("abort was NULL");
+        Usage("abort was NULL\n");
         return false;
       }
       hook_abort_ = reinterpret_cast<void(*)()>(const_cast<void*>(hook));
@@ -509,6 +518,8 @@
       if (!ParseDouble(option, ':', 1.0, 10.0, &profile_backoff_coefficient_)) {
         return false;
       }
+    } else if (option == "-Xprofile-start-lazy") {
+      profile_start_immediately_ = false;
     } else if (StartsWith(option, "-implicit-checks:")) {
       std::string checks;
       if (!ParseStringAfterChar(option, ':', &checks)) {
@@ -560,14 +571,14 @@
     } else if (option == "-Xcompiler-option") {
       i++;
       if (i == options.size()) {
-        Usage("Missing required compiler option for %s", option.c_str());
+        Usage("Missing required compiler option for %s\n", option.c_str());
         return false;
       }
       compiler_options_.push_back(options[i].first);
     } else if (option == "-Ximage-compiler-option") {
       i++;
       if (i == options.size()) {
-        Usage("Missing required compiler option for %s", option.c_str());
+        Usage("Missing required compiler option for %s\n", option.c_str());
         return false;
       }
       image_compiler_options_.push_back(options[i].first);
@@ -578,13 +589,13 @@
       } else if (verify_mode == "remote" || verify_mode == "all") {
         verify_ = true;
       } else {
-        Usage("Unknown -Xverify option %s", verify_mode.c_str());
+        Usage("Unknown -Xverify option %s\n", verify_mode.c_str());
         return false;
       }
-    } else if (StartsWith(option, "-ea:") ||
-               StartsWith(option, "-da:") ||
-               StartsWith(option, "-enableassertions:") ||
-               StartsWith(option, "-disableassertions:") ||
+    } else if (StartsWith(option, "-ea") ||
+               StartsWith(option, "-da") ||
+               StartsWith(option, "-enableassertions") ||
+               StartsWith(option, "-disableassertions") ||
                (option == "--runtime-arg") ||
                (option == "-esa") ||
                (option == "-dsa") ||
@@ -618,7 +629,7 @@
                StartsWith(option, "-XX:mainThreadStackSize=")) {
       // Ignored for backwards compatibility.
     } else if (!ignore_unrecognized) {
-      Usage("Unrecognized option %s", option.c_str());
+      Usage("Unrecognized option %s\n", option.c_str());
       return false;
     }
   }
@@ -709,6 +720,7 @@
   UsageMessage(stream, "  -XX:HeapMinFree=N\n");
   UsageMessage(stream, "  -XX:HeapMaxFree=N\n");
   UsageMessage(stream, "  -XX:HeapTargetUtilization=doublevalue\n");
+  UsageMessage(stream, "  -XX:ForegroundHeapGrowthMultiplier=doublevalue\n");
   UsageMessage(stream, "  -XX:LowMemoryMode\n");
   UsageMessage(stream, "  -Xprofile:{threadcpuclock,wallclock,dualclock}\n");
   UsageMessage(stream, "\n");
@@ -781,7 +793,7 @@
 bool ParsedOptions::ParseStringAfterChar(const std::string& s, char c, std::string* parsed_value) {
   std::string::size_type colon = s.find(c);
   if (colon == std::string::npos) {
-    Usage("Missing char %c in option %s", c, s.c_str());
+    Usage("Missing char %c in option %s\n", c, s.c_str());
     return false;
   }
   // Add one to remove the char we were trimming until.
@@ -792,14 +804,14 @@
 bool ParsedOptions::ParseInteger(const std::string& s, char after_char, int* parsed_value) {
   std::string::size_type colon = s.find(after_char);
   if (colon == std::string::npos) {
-    Usage("Missing char %c in option %s", after_char, s.c_str());
+    Usage("Missing char %c in option %s\n", after_char, s.c_str());
     return false;
   }
   const char* begin = &s[colon + 1];
   char* end;
   size_t result = strtoul(begin, &end, 10);
   if (begin == end || *end != '\0') {
-    Usage("Failed to parse integer from %s ", s.c_str());
+    Usage("Failed to parse integer from %s\n", s.c_str());
     return false;
   }
   *parsed_value = result;
@@ -813,7 +825,7 @@
     return false;
   }
   if (i < 0) {
-    Usage("Negative value %d passed for unsigned option %s", i, s.c_str());
+    Usage("Negative value %d passed for unsigned option %s\n", i, s.c_str());
     return false;
   }
   *parsed_value = i;
@@ -832,7 +844,7 @@
   // Ensure that we have a value, there was no cruft after it and it satisfies a sensible range.
   const bool sane_val = iss.eof() && (value >= min) && (value <= max);
   if (!sane_val) {
-    Usage("Invalid double value %s for option %s", option.c_str());
+    Usage("Invalid double value %s for option %s\n", substring.c_str(), option.c_str());
     return false;
   }
   *parsed_value = value;
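(Aside, not part of the patch: a standalone sketch of the istringstream pattern that
ParseDouble uses; ParseDoubleAfterChar is a hypothetical helper. The value is accepted only
if the stream consumed the entire substring and the result lies within [min, max].)

    #include <cstdio>
    #include <sstream>
    #include <string>

    static bool ParseDoubleAfterChar(const std::string& option, char after_char,
                                     double min, double max, double* parsed_value) {
      std::string::size_type pos = option.find(after_char);
      if (pos == std::string::npos) {
        return false;
      }
      std::istringstream iss(option.substr(pos + 1));
      double value = 0.0;
      iss >> value;
      // Valid only if parsing succeeded, consumed all input, and is in range.
      if (iss.fail() || !iss.eof() || value < min || value > max) {
        return false;
      }
      *parsed_value = value;
      return true;
    }

    int main() {
      double v = 0.0;
      printf("%d\n", ParseDoubleAfterChar("-XX:HeapTargetUtilization=0.5", '=', 0.1, 0.9, &v));   // 1
      printf("%d\n", ParseDoubleAfterChar("-XX:HeapTargetUtilization=0.5x", '=', 0.1, 0.9, &v));  // 0
      return 0;
    }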
diff --git a/runtime/parsed_options.h b/runtime/parsed_options.h
index 416bc78..770e4ae 100644
--- a/runtime/parsed_options.h
+++ b/runtime/parsed_options.h
@@ -54,6 +54,7 @@
   size_t heap_min_free_;
   size_t heap_max_free_;
   double heap_target_utilization_;
+  double foreground_heap_growth_multiplier_;
   unsigned int parallel_gc_threads_;
   unsigned int conc_gc_threads_;
   gc::CollectorType collector_type_;
@@ -79,6 +80,7 @@
   uint32_t profile_duration_s_;
   uint32_t profile_interval_us_;
   double profile_backoff_coefficient_;
+  bool profile_start_immediately_;
   ProfilerClockSource profile_clock_source_;
   bool verify_;
 
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 4770a54..7b117f4 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -16,6 +16,7 @@
 
 #include "profiler.h"
 
+#include <fstream>
 #include <sys/uio.h>
 #include <sys/file.h>
 
@@ -302,14 +303,12 @@
 
   // Only on target...
 #ifdef HAVE_ANDROID_OS
-  if (!startImmediately) {
-    // Switch off profiler if the dalvik.vm.profiler property has value 0.
-    char buf[PROP_VALUE_MAX];
-    property_get("dalvik.vm.profiler", buf, "0");
-    if (strcmp(buf, "0") == 0) {
-      LOG(INFO) << "Profiler disabled.  To enable setprop dalvik.vm.profiler 1";
-      return;
-    }
+  // Switch off profiler if the dalvik.vm.profiler property has value 0.
+  char buf[PROP_VALUE_MAX];
+  property_get("dalvik.vm.profiler", buf, "0");
+  if (strcmp(buf, "0") == 0) {
+    LOG(INFO) << "Profiler disabled.  To enable setprop dalvik.vm.profiler 1";
+    return;
   }
 #endif
 
@@ -579,5 +578,101 @@
     previous_[methodname] = PreviousValue(count, size);
   }
 }
-}  // namespace art
 
+bool ProfileHelper::LoadProfileMap(ProfileMap& profileMap, const std::string& fileName) {
+  LOG(VERBOSE) << "reading profile file " << fileName;
+  struct stat st;
+  int err = stat(fileName.c_str(), &st);
+  if (err == -1) {
+    LOG(VERBOSE) << "not found";
+    return false;
+  }
+  if (st.st_size == 0) {
+    return false;  // Empty profiles are invalid.
+  }
+  std::ifstream in(fileName.c_str());
+  if (!in) {
+    LOG(VERBOSE) << "profile file " << fileName << " exists but can't be opened";
+    LOG(VERBOSE) << "file owner: " << st.st_uid << ":" << st.st_gid;
+    LOG(VERBOSE) << "me: " << getuid() << ":" << getgid();
+    LOG(VERBOSE) << "file permissions: " << std::oct << st.st_mode;
+    LOG(VERBOSE) << "errno: " << errno;
+    return false;
+  }
+  // The first line contains summary information.
+  std::string line;
+  std::getline(in, line);
+  if (in.eof()) {
+    return false;
+  }
+  std::vector<std::string> summary_info;
+  Split(line, '/', summary_info);
+  if (summary_info.size() != 3) {
+    // Bad summary info.  It should be count/total/bootpath.
+    return false;
+  }
+  // This is the number of hits in all methods.
+  uint32_t total_count = 0;
+  for (int i = 0 ; i < 3; ++i) {
+    total_count += atoi(summary_info[i].c_str());
+  }
+
+  // Now read each line until the end of file.  Each line consists of 3 fields separated by '/'.
+  // Store the info in descending order of use count (most used methods first).
+  typedef std::set<std::pair<int, std::vector<std::string>>> ProfileSet;
+  ProfileSet countSet;
+  while (!in.eof()) {
+    std::getline(in, line);
+    if (in.eof()) {
+      break;
+    }
+    std::vector<std::string> info;
+    Split(line, '/', info);
+    if (info.size() != 3) {
+      // Malformed.
+      break;
+    }
+    int count = atoi(info[1].c_str());
+    countSet.insert(std::make_pair(-count, info));
+  }
+
+  uint32_t curTotalCount = 0;
+  ProfileSet::iterator end = countSet.end();
+  const ProfileData* prevData = nullptr;
+  for (ProfileSet::iterator it = countSet.begin(); it != end ; it++) {
+    const std::string& methodname = it->second[0];
+    uint32_t count = -it->first;
+    uint32_t size = atoi(it->second[2].c_str());
+    double usedPercent = (count * 100.0) / total_count;
+
+    curTotalCount += count;
+    // Methods with the same count should be part of the same top K percentage bucket.
+    double topKPercentage = (prevData != nullptr) && (prevData->GetCount() == count)
+      ? prevData->GetTopKUsedPercentage()
+      : 100 * static_cast<double>(curTotalCount) / static_cast<double>(total_count);
+
+    // Add it to the profile map, and point prevData at the stored copy rather than
+    // the loop-local, which goes out of scope at the end of each iteration.
+    ProfileData curData = ProfileData(methodname, count, size, usedPercent, topKPercentage);
+    profileMap[methodname] = curData;
+    prevData = &profileMap[methodname];
+  }
+  return true;
+}
+
+bool ProfileHelper::LoadTopKSamples(std::set<std::string>& topKSamples, const std::string& fileName,
+                                    double topKPercentage) {
+  ProfileMap profileMap;
+  bool loadOk = LoadProfileMap(profileMap, fileName);
+  if (!loadOk) {
+    return false;
+  }
+  ProfileMap::iterator end = profileMap.end();
+  for (ProfileMap::iterator it = profileMap.begin(); it != end; it++) {
+    if (it->second.GetTopKUsedPercentage() < topKPercentage) {
+      topKSamples.insert(it->first);
+    }
+  }
+  return true;
+}
+
+}  // namespace art
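(Aside, not part of the patch: a small sketch, with made-up counts, of the ordering trick
LoadProfileMap uses. std::set iterates in ascending order, so inserting (-count, info)
pairs yields most-used methods first without a custom comparator.)

    #include <cstdio>
    #include <set>
    #include <string>
    #include <utility>

    int main() {
      std::set<std::pair<int, std::string>> countSet;
      countSet.insert(std::make_pair(-5, "methodA"));   // 5 hits.
      countSet.insert(std::make_pair(-20, "methodB"));  // 20 hits.
      countSet.insert(std::make_pair(-1, "methodC"));   // 1 hit.
      for (const auto& entry : countSet) {
        printf("%s: %d\n", entry.second.c_str(), -entry.first);
      }
      // Prints: methodB: 20, methodA: 5, methodC: 1.
      return 0;
    }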
diff --git a/runtime/profiler.h b/runtime/profiler.h
index b03b170..31fdc79 100644
--- a/runtime/profiler.h
+++ b/runtime/profiler.h
@@ -39,7 +39,6 @@
 }  // namespace mirror
 class Thread;
 
-
 //
 // This class holds all the results for all runs of the profiler.  It also
 // counts the number of null methods (where we can't determine the method) and
@@ -63,7 +62,7 @@
  private:
   uint32_t Hash(mirror::ArtMethod* method);
   static constexpr int kHashSize = 17;
-  Mutex& lock_;         // Reference to the main profiler lock - we don't need two of them.
+  Mutex& lock_;                   // Reference to the main profiler lock - we don't need two of them.
   uint32_t num_samples_;          // Total number of samples taken.
  uint32_t num_null_methods_;     // Number of samples where we don't know the method.
   uint32_t num_boot_methods_;     // Number of samples in the boot path.
@@ -189,6 +188,54 @@
   DISALLOW_COPY_AND_ASSIGN(BackgroundMethodSamplingProfiler);
 };
 
+// TODO: incorporate in ProfileSampleResults
+
+// Profile data.  This is generated from previous runs of the program and stored
+// in a file.  It is used to determine whether to compile a particular method or not.
+class ProfileData {
+ public:
+  ProfileData() : count_(0), method_size_(0), usedPercent_(0) {}
+  ProfileData(const std::string& method_name, uint32_t count, uint32_t method_size,
+    double usedPercent, double topKUsedPercentage) :
+    method_name_(method_name), count_(count), method_size_(method_size),
+    usedPercent_(usedPercent), topKUsedPercentage_(topKUsedPercentage) {
+    // TODO: currently method_size_ and count_ are unused.
+    UNUSED(method_size_);
+    UNUSED(count_);
+  }
+
+  bool IsAbove(double v) const { return usedPercent_ >= v; }
+  double GetUsedPercent() const { return usedPercent_; }
+  uint32_t GetCount() const { return count_; }
+  double GetTopKUsedPercentage() const { return topKUsedPercentage_; }
+
+ private:
+  std::string method_name_;    // Method name.
+  uint32_t count_;             // Number of times it has been called.
+  uint32_t method_size_;       // Size of the method in dex instructions.
+  double usedPercent_;         // Percentage of the total sample count attributed to this method.
+  double topKUsedPercentage_;  // Cumulative usage percentage of the top-K group of used
+                               // methods that this method belongs to.
+};
+
+// Profile data is stored in a map, indexed by the full method name.
+typedef std::map<std::string, ProfileData> ProfileMap;
+
+class ProfileHelper {
+ private:
+  ProfileHelper();
+
+ public:
+  // Read the profile data from the given file and calculate the usage percentage for each method.
+  // Returns false if there was no profile file or it was malformed.
+  static bool LoadProfileMap(ProfileMap& profileMap, const std::string& fileName);
+
+  // Read the profile data from the given file and compute the group of methods that
+  // comprises the top topKPercentage of the total used methods.
+  static bool LoadTopKSamples(std::set<std::string>& topKMethods, const std::string& fileName,
+                              double topKPercentage);
+};
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_PROFILER_H_
diff --git a/runtime/quick/inline_method_analyser.cc b/runtime/quick/inline_method_analyser.cc
index a9072d8..8bd8dba 100644
--- a/runtime/quick/inline_method_analyser.cc
+++ b/runtime/quick/inline_method_analyser.cc
@@ -135,6 +135,12 @@
   }
 }
 
+bool InlineMethodAnalyser::IsSyntheticAccessor(MethodReference ref) {
+  const DexFile::MethodId& method_id = ref.dex_file->GetMethodId(ref.dex_method_index);
+  const char* method_name = ref.dex_file->GetMethodName(method_id);
+  return strncmp(method_name, "access$", strlen("access$")) == 0;
+}
+
 bool InlineMethodAnalyser::AnalyseReturnMethod(const DexFile::CodeItem* code_item,
                                                InlineMethod* result) {
   const Instruction* return_instruction = Instruction::At(code_item->insns_);
@@ -218,13 +224,24 @@
   uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
   DCHECK_GE(object_reg, arg_start);
   DCHECK_LT(object_reg, code_item->registers_size_);
+  uint32_t object_arg = object_reg - arg_start;
+
   DCHECK_LT(opcode == Instruction::IGET_WIDE ? dst_reg + 1 : dst_reg, code_item->registers_size_);
   if (dst_reg != return_reg) {
     return false;  // Not returning the value retrieved by IGET?
   }
 
-  if ((verifier->GetAccessFlags() & kAccStatic) != 0 || object_reg != arg_start) {
-    // TODO: Support inlining IGET on other register than "this".
+  if ((verifier->GetAccessFlags() & kAccStatic) != 0u || object_arg != 0u) {
+    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!IsSyntheticAccessor(verifier->GetMethodReference())) {
+      return false;
+    }
+  }
+
+  // InlineIGetIPutData::object_arg is only 4 bits wide.
+  static constexpr uint16_t kMaxObjectArg = 15u;
+  if (object_arg > kMaxObjectArg) {
     return false;
   }
 
@@ -236,10 +253,10 @@
     result->opcode = kInlineOpIGet;
     result->flags = kInlineSpecial;
     data->op_variant = IGetVariant(opcode);
-    data->object_arg = object_reg - arg_start;  // Allow IGET on any register, not just "this".
-    data->src_arg = 0;
-    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0;
-    data->reserved = 0;
+    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0u ? 1u : 0u;
+    data->object_arg = object_arg;  // Allow IGET on any register, not just "this".
+    data->src_arg = 0u;
+    data->return_arg_plus1 = 0u;
   }
   return true;
 }
@@ -253,26 +270,45 @@
 
   const Instruction* return_instruction = instruction->Next();
   Instruction::Code return_opcode = return_instruction->Opcode();
+  uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
+  uint16_t return_arg_plus1 = 0u;
   if (return_opcode != Instruction::RETURN_VOID) {
-    // TODO: Support returning an argument.
-    // This is needed by builder classes and generated accessor setters.
-    //    builder.setX(value): iput value, this, fieldX; return-object this;
-    //    object.access$nnn(value): iput value, this, fieldX; return value;
-    // Use InlineIGetIPutData::reserved to hold the information.
-    return false;
+    if (return_opcode != Instruction::RETURN &&
+        return_opcode != Instruction::RETURN_OBJECT &&
+        return_opcode != Instruction::RETURN_WIDE) {
+      return false;
+    }
+    // Returning an argument.
+    uint32_t return_reg = return_instruction->VRegA_11x();
+    DCHECK_GE(return_reg, arg_start);
+    DCHECK_LT(return_opcode == Instruction::RETURN_WIDE ? return_reg + 1u : return_reg,
+              code_item->registers_size_);
+    return_arg_plus1 = return_reg - arg_start + 1u;
   }
 
   uint32_t src_reg = instruction->VRegA_22c();
   uint32_t object_reg = instruction->VRegB_22c();
   uint32_t field_idx = instruction->VRegC_22c();
-  uint32_t arg_start = code_item->registers_size_ - code_item->ins_size_;
   DCHECK_GE(object_reg, arg_start);
   DCHECK_LT(object_reg, code_item->registers_size_);
   DCHECK_GE(src_reg, arg_start);
   DCHECK_LT(opcode == Instruction::IPUT_WIDE ? src_reg + 1 : src_reg, code_item->registers_size_);
+  uint32_t object_arg = object_reg - arg_start;
+  uint32_t src_arg = src_reg - arg_start;
 
-  if ((verifier->GetAccessFlags() & kAccStatic) != 0 || object_reg != arg_start) {
-    // TODO: Support inlining IPUT on other register than "this".
+  if ((verifier->GetAccessFlags() & kAccStatic) != 0u || object_arg != 0u) {
+    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+    // Allow synthetic accessors. We don't care about losing their stack frame in NPE.
+    if (!IsSyntheticAccessor(verifier->GetMethodReference())) {
+      return false;
+    }
+  }
+
+  // InlineIGetIPutData::object_arg/src_arg/return_arg_plus1 are each only 4 bits wide.
+  static constexpr uint16_t kMaxObjectArg = 15u;
+  static constexpr uint16_t kMaxSrcArg = 15u;
+  static constexpr uint16_t kMaxReturnArgPlus1 = 15u;
+  if (object_arg > kMaxObjectArg || src_arg > kMaxSrcArg || return_arg_plus1 > kMaxReturnArgPlus1) {
     return false;
   }
 
@@ -284,10 +320,10 @@
     result->opcode = kInlineOpIPut;
     result->flags = kInlineSpecial;
     data->op_variant = IPutVariant(opcode);
-    data->object_arg = object_reg - arg_start;  // Allow IPUT on any register, not just "this".
-    data->src_arg = src_reg - arg_start;
-    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0;
-    data->reserved = 0;
+    data->method_is_static = (verifier->GetAccessFlags() & kAccStatic) != 0u ? 1u : 0u;
+    data->object_arg = object_arg;  // Allow IPUT on any register, not just "this".
+    data->src_arg = src_arg;
+    data->return_arg_plus1 = return_arg_plus1;
   }
   return true;
 }
diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h
index 8e1a408..ddee89b 100644
--- a/runtime/quick/inline_method_analyser.h
+++ b/runtime/quick/inline_method_analyser.h
@@ -21,6 +21,7 @@
 #include "base/mutex.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "method_reference.h"
 
 /*
  * NOTE: This code is part of the quick compiler. It lives in the runtime
@@ -98,10 +99,10 @@
   // opcode-Instruction::IPUT for IPUTs. This is because the runtime
   // doesn't know the OpSize enumeration.
   uint16_t op_variant : 3;
+  uint16_t method_is_static : 1;
   uint16_t object_arg : 4;
   uint16_t src_arg : 4;  // iput only
-  uint16_t method_is_static : 1;
-  uint16_t reserved : 4;
+  uint16_t return_arg_plus1 : 4;  // iput only, method argument to return + 1, 0 = return void.
   uint16_t field_idx;
   uint32_t is_volatile : 1;
   uint32_t field_offset : 31;
@@ -156,6 +157,9 @@
     return opcode - Instruction::IPUT;
   }
 
+  // Determines whether the method is a synthetic accessor (method name starts with "access$").
+  static bool IsSyntheticAccessor(MethodReference ref);
+
  private:
   static bool AnalyseReturnMethod(const DexFile::CodeItem* code_item, InlineMethod* result);
   static bool AnalyseConstMethod(const DexFile::CodeItem* code_item, InlineMethod* result);
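(Aside, not part of the patch: a trimmed-down sketch of why the kMax* limits above are 15.
PackedArgs is a mock, not the real InlineIGetIPutData layout; each field is a 4-bit
bitfield, and return_arg_plus1 reserves 0 for "returns void" by storing the argument
index plus one, so the largest representable argument index is 14.)

    #include <cstdint>
    #include <cstdio>

    struct PackedArgs {
      uint16_t object_arg : 4;        // 0..15.
      uint16_t src_arg : 4;           // 0..15.
      uint16_t return_arg_plus1 : 4;  // 0 = return void, else arg index + 1.
    };

    int main() {
      PackedArgs data = {};
      uint32_t return_arg = 3;
      data.return_arg_plus1 = return_arg + 1;  // Must be <= 15, hence return_arg <= 14.
      if (data.return_arg_plus1 == 0) {
        printf("returns void\n");
      } else {
        printf("returns argument %u\n", data.return_arg_plus1 - 1u);  // "returns argument 3".
      }
      return 0;
    }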
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index edc3b33..611ce0b 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -121,6 +121,7 @@
       profile_duration_s_(0),
       profile_interval_us_(0),
       profile_backoff_coefficient_(0),
+      profile_start_immediately_(true),
       method_trace_(false),
       method_trace_file_size_(0),
       instrumentation_(),
@@ -187,7 +188,7 @@
 }
 
 struct AbortState {
-  void Dump(std::ostream& os) {
+  void Dump(std::ostream& os) NO_THREAD_SAFETY_ANALYSIS {
     if (gAborting > 1) {
       os << "Runtime aborting --- recursively, so no thread-specific detail!\n";
       return;
@@ -199,26 +200,33 @@
       return;
     }
     Thread* self = Thread::Current();
-    if (self == NULL) {
+    if (self == nullptr) {
       os << "(Aborting thread was not attached to runtime!)\n";
     } else {
-      // TODO: we're aborting and the ScopedObjectAccess may attempt to acquire the mutator_lock_
-      //       which may block indefinitely if there's a misbehaving thread holding it exclusively.
-      //       The code below should be made robust to this.
-      ScopedObjectAccess soa(self);
       os << "Aborting thread:\n";
-      self->Dump(os);
-      if (self->IsExceptionPending()) {
-        ThrowLocation throw_location;
-        mirror::Throwable* exception = self->GetException(&throw_location);
-        os << "Pending exception " << PrettyTypeOf(exception)
-            << " thrown by '" << throw_location.Dump() << "'\n"
-            << exception->Dump();
+      if (Locks::mutator_lock_->IsExclusiveHeld(self) || Locks::mutator_lock_->IsSharedHeld(self)) {
+        DumpThread(os, self);
+      } else {
+        if (Locks::mutator_lock_->SharedTryLock(self)) {
+          DumpThread(os, self);
+          Locks::mutator_lock_->SharedUnlock(self);
+        }
       }
     }
     DumpAllThreads(os, self);
   }
 
+  void DumpThread(std::ostream& os, Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    self->Dump(os);
+    if (self->IsExceptionPending()) {
+      ThrowLocation throw_location;
+      mirror::Throwable* exception = self->GetException(&throw_location);
+      os << "Pending exception " << PrettyTypeOf(exception)
+          << " thrown by '" << throw_location.Dump() << "'\n"
+          << exception->Dump();
+    }
+  }
+
   void DumpAllThreads(std::ostream& os, Thread* self) NO_THREAD_SAFETY_ANALYSIS {
     bool tll_already_held = Locks::thread_list_lock_->IsExclusiveHeld(self);
     bool ml_already_held = Locks::mutator_lock_->IsSharedHeld(self);
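(Aside, not part of the patch: the held-or-try-lock pattern used in AbortState::Dump above,
sketched with standard-library primitives as an analogy; this is not ART's Locks API.)

    #include <cstdio>
    #include <shared_mutex>

    std::shared_mutex mutator_lock;

    void DumpThreadState(bool lock_already_held) {
      if (lock_already_held) {
        printf("dumping (lock already held)\n");
      } else if (mutator_lock.try_lock_shared()) {
        // Only dump if the lock can be taken without blocking; while aborting,
        // a misbehaving thread may hold it exclusively forever.
        printf("dumping (acquired shared lock)\n");
        mutator_lock.unlock_shared();
      } else {
        printf("skipping dump to avoid deadlock\n");
      }
    }

    int main() {
      DumpThreadState(false);  // Prints "dumping (acquired shared lock)".
      return 0;
    }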
@@ -391,7 +399,7 @@
     if (fd >= 0) {
       close(fd);
     }
-    StartProfiler(profile_output_filename_.c_str(), "", true);
+    StartProfiler(profile_output_filename_.c_str(), "");
   }
 
   return true;
@@ -556,6 +564,7 @@
                        options->heap_min_free_,
                        options->heap_max_free_,
                        options->heap_target_utilization_,
+                       options->foreground_heap_growth_multiplier_,
                        options->heap_maximum_size_,
                        options->image_,
                        options->collector_type_,
@@ -616,6 +625,7 @@
   profile_duration_s_ = options->profile_duration_s_;
   profile_interval_us_ = options->profile_interval_us_;
   profile_backoff_coefficient_ = options->profile_backoff_coefficient_;
+  profile_start_immediately_ = options->profile_start_immediately_;
   profile_ = options->profile_;
   profile_output_filename_ = options->profile_output_filename_;
   // TODO: move this to just be an Trace::Start argument
@@ -1143,10 +1153,9 @@
   method_verifiers_.erase(it);
 }
 
-void Runtime::StartProfiler(const char* appDir, const char* procName, bool startImmediately) {
+void Runtime::StartProfiler(const char* appDir, const char* procName) {
   BackgroundMethodSamplingProfiler::Start(profile_period_s_, profile_duration_s_, appDir,
-      procName, profile_interval_us_,
-      profile_backoff_coefficient_, startImmediately);
+      procName, profile_interval_us_, profile_backoff_coefficient_, profile_start_immediately_);
 }
 
 // Transaction support.
@@ -1221,6 +1230,10 @@
 
 void Runtime::AddCurrentRuntimeFeaturesAsDex2OatArguments(std::vector<std::string>* argv)
     const {
+  if (GetInstrumentation()->InterpretOnly()) {
+    argv->push_back("--compiler-filter=interpret-only");
+  }
+
   argv->push_back("--runtime-arg");
   std::string checkstr = "-implicit-checks";
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e94072c..1ee0b1a 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -359,6 +359,10 @@
   bool InitZygote();
   void DidForkFromZygote();
 
+  const instrumentation::Instrumentation* GetInstrumentation() const {
+    return &instrumentation_;
+  }
+
   instrumentation::Instrumentation* GetInstrumentation() {
     return &instrumentation_;
   }
@@ -374,7 +378,7 @@
   const std::vector<const DexFile*>& GetCompileTimeClassPath(jobject class_loader);
   void SetCompileTimeClassPath(jobject class_loader, std::vector<const DexFile*>& class_path);
 
-  void StartProfiler(const char* appDir, const char* procName, bool startImmediately = false);
+  void StartProfiler(const char* appDir, const char* procName);
   void UpdateProfilerState(int state);
 
   // Transaction support.
@@ -542,6 +546,8 @@
   uint32_t profile_duration_s_;         // Run profile for n seconds.
   uint32_t profile_interval_us_;        // Microseconds between samples.
   double profile_backoff_coefficient_;  // Coefficient to exponential backoff.
+  bool profile_start_immediately_;      // Whether the profile should start upon app
+                                        // startup or be delayed by some random offset.
 
   bool method_trace_;
   std::string method_trace_file_;
diff --git a/runtime/runtime_linux.cc b/runtime/runtime_linux.cc
index da1b2ca..960d332 100644
--- a/runtime/runtime_linux.cc
+++ b/runtime/runtime_linux.cc
@@ -33,7 +33,7 @@
 
 struct Backtrace {
   void Dump(std::ostream& os) {
-    DumpNativeStack(os, GetTid(), "\t", true);
+    DumpNativeStack(os, GetTid(), "\t");
   }
 };
 
diff --git a/runtime/stack_indirect_reference_table.h b/runtime/stack_indirect_reference_table.h
index 6049e06..b113129 100644
--- a/runtime/stack_indirect_reference_table.h
+++ b/runtime/stack_indirect_reference_table.h
@@ -44,6 +44,10 @@
     return number_of_references_;
   }
 
+  // We have versions of the following with and without an explicit pointer size. The first two
+  // are used at runtime, so OFFSETOF_MEMBER computes the right offsets automatically. The last
+  // one takes the pointer size explicitly so that offsets can be computed correctly when
+  // cross-compiling.
+
   // Returns the size of a StackIndirectReferenceTable containing num_references sirts.
   static size_t SizeOf(uint32_t num_references) {
     size_t header_size = OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
@@ -60,7 +64,7 @@
   // Get the size of the SIRT for the number of entries, with padding added for potential alignment.
   static size_t GetAlignedSirtSizeTarget(size_t pointer_size, uint32_t num_references) {
     // Assume that the layout is packed.
-    size_t header_size = pointer_size + sizeof(uint32_t);
+    size_t header_size = pointer_size + sizeof(number_of_references_);
     // This assumes there is no layout change between 32 and 64b.
     size_t data_size = sizeof(StackReference<mirror::Object>) * num_references;
     size_t sirt_size = header_size + data_size;
@@ -109,18 +113,18 @@
   }
 
   // Offset of link within SIRT, used by generated code
-  static size_t LinkOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, link_);
+  static size_t LinkOffset(size_t pointer_size) {
+    return 0;
   }
 
   // Offset of length within SIRT, used by generated code
-  static uint32_t NumberOfReferencesOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, number_of_references_);
+  static size_t NumberOfReferencesOffset(size_t pointer_size) {
+    return pointer_size;
   }
 
  // Offset of the references array within SIRT, used by generated code
-  static size_t ReferencesOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
+  static size_t ReferencesOffset(size_t pointer_size) {
+    return pointer_size + sizeof(number_of_references_);
   }
 
  private:
diff --git a/runtime/stack_indirect_reference_table_test.cc b/runtime/stack_indirect_reference_table_test.cc
new file mode 100644
index 0000000..72ef6b6
--- /dev/null
+++ b/runtime/stack_indirect_reference_table_test.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stack_indirect_reference_table.h"
+#include "gtest/gtest.h"
+
+namespace art {
+
+// Test the offsets computed for members of StackIndirectReferenceTable. Because of cross-compiling
+// it is impossible to use OFFSETOF_MEMBER, so we do some reasonable computations ourselves. This
+// test checks whether we do the right thing.
+TEST(StackIndirectReferenceTableTest, Offsets) {
+  // As the members of StackIndirectReferenceTable are private, we cannot use OFFSETOF_MEMBER
+  // here. So do the inverse: set some data, and access it through pointers created from the offsets.
+
+  StackIndirectReferenceTable test_table(reinterpret_cast<mirror::Object*>(0x1234));
+  test_table.SetLink(reinterpret_cast<StackIndirectReferenceTable*>(0x5678));
+  test_table.SetNumberOfReferences(0x9ABC);
+
+  byte* table_base_ptr = reinterpret_cast<byte*>(&test_table);
+
+  {
+    uintptr_t* link_ptr = reinterpret_cast<uintptr_t*>(table_base_ptr +
+        StackIndirectReferenceTable::LinkOffset(kPointerSize));
+    EXPECT_EQ(*link_ptr, static_cast<size_t>(0x5678));
+  }
+
+  {
+    uint32_t* num_ptr = reinterpret_cast<uint32_t*>(table_base_ptr +
+        StackIndirectReferenceTable::NumberOfReferencesOffset(kPointerSize));
+    EXPECT_EQ(*num_ptr, static_cast<size_t>(0x9ABC));
+  }
+
+  {
+    // Assume sizeof(StackReference<mirror::Object>) == sizeof(uint32_t)
+    // TODO: How can we make this assumption-less but still access directly and fully?
+    EXPECT_EQ(sizeof(StackReference<mirror::Object>), sizeof(uint32_t));
+
+    uint32_t* ref_ptr = reinterpret_cast<uint32_t*>(table_base_ptr +
+        StackIndirectReferenceTable::ReferencesOffset(kPointerSize));
+    EXPECT_EQ(*ref_ptr, static_cast<uint32_t>(0x1234));
+  }
+}
+
+}  // namespace art
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 5a2410a..3408dd3 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -711,7 +711,9 @@
   bool is_daemon = false;
   Thread* self = Thread::Current();
 
-  if (self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
+  // Don't do this if we are aborting since the GC may have all the threads suspended. This will
+  // cause ScopedObjectAccessUnchecked to deadlock.
+  if (gAborting == 0 && self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
     ScopedObjectAccessUnchecked soa(self);
     priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)
         ->GetInt(thread->tlsPtr_.opeer);
@@ -874,7 +876,7 @@
     if (o == nullptr) {
       os << "an unknown object";
     } else {
-      if ((o->GetLockWord().GetState() == LockWord::kThinLocked) &&
+      if ((o->GetLockWord(false).GetState() == LockWord::kThinLocked) &&
           Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
         // Getting the identity hashcode here would result in lock inflation and suspension of the
         // current thread, which isn't safe if this is the only runnable thread.
@@ -937,7 +939,7 @@
     if (dump_for_abort || ShouldShowNativeStack(this)) {
       DumpKernelStack(os, GetTid(), "  kernel: ", false);
       SirtRef<mirror::ArtMethod> method_ref(Thread::Current(), GetCurrentMethod(nullptr));
-      DumpNativeStack(os, GetTid(), "  native: ", false, method_ref.get());
+      DumpNativeStack(os, GetTid(), "  native: ", method_ref.get());
     }
     DumpJavaStack(os);
   } else {
@@ -1092,7 +1094,7 @@
     if (lock != nullptr) {
       SirtRef<mirror::Object> sirt_obj(self, lock);
       ObjectLock<mirror::Object> locker(self, &sirt_obj);
-      locker.Notify();
+      locker.NotifyAll();
     }
   }
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 7de9433..6f93566 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -78,7 +78,7 @@
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
     os << "DUMPING THREAD " << thread->GetTid() << "\n";
-    DumpNativeStack(os, thread->GetTid(), "\t", true);
+    DumpNativeStack(os, thread->GetTid(), "\t");
     os << "\n";
   }
 }
@@ -99,7 +99,7 @@
   // TODO: Reenable this when the native code in system_server can handle it.
   // Currently "adb shell kill -3 `pid system_server`" will cause it to exit.
   if (false) {
-    DumpNativeStack(os, tid, "  native: ", false);
+    DumpNativeStack(os, tid, "  native: ");
   }
   os << "\n";
 }
@@ -617,7 +617,7 @@
   DCHECK(pReq != NULL);
   if (pReq->invoke_needed) {
     // Clear this before signaling.
-    pReq->invoke_needed = false;
+    pReq->Clear();
 
     VLOG(jdwp) << "invoke complete, signaling";
     MutexLock mu(self, pReq->lock);
@@ -823,9 +823,9 @@
 };
 
 static void VerifyRootWrapperCallback(mirror::Object** root, void* arg, uint32_t /*thread_id*/,
-                                      RootType /*root_type*/) {
+                                      RootType root_type) {
   VerifyRootWrapperArg* wrapperArg = reinterpret_cast<VerifyRootWrapperArg*>(arg);
-  wrapperArg->callback_(*root, wrapperArg->arg_, 0, NULL);
+  wrapperArg->callback_(*root, wrapperArg->arg_, 0, NULL, root_type);
 }
 
 void ThreadList::VerifyRoots(VerifyRootCallback* callback, void* arg) const {
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 1f24478..b85eb7e 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -549,6 +549,21 @@
   LOG(ERROR) << "Unexpected dex PC event in tracing " << PrettyMethod(method) << " " << new_dex_pc;
 };
 
+void Trace::FieldRead(Thread* /*thread*/, mirror::Object* this_object,
+                       mirror::ArtMethod* method, uint32_t dex_pc, mirror::ArtField* field)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  // We're not recorded to listen to this kind of event, so complain.
+  LOG(ERROR) << "Unexpected field read event in tracing " << PrettyMethod(method) << " " << dex_pc;
+}
+
+void Trace::FieldWritten(Thread* /*thread*/, mirror::Object* this_object,
+                          mirror::ArtMethod* method, uint32_t dex_pc, mirror::ArtField* field,
+                          const JValue& field_value)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  // We're not registered to listen to this kind of event, so complain.
+  LOG(ERROR) << "Unexpected field write event in tracing " << PrettyMethod(method) << " " << dex_pc;
+}
+
 void Trace::MethodEntered(Thread* thread, mirror::Object* this_object,
                           mirror::ArtMethod* method, uint32_t dex_pc) {
   uint32_t thread_clock_diff = 0;
diff --git a/runtime/trace.h b/runtime/trace.h
index 1af1283..bf4995a 100644
--- a/runtime/trace.h
+++ b/runtime/trace.h
@@ -32,6 +32,7 @@
 namespace art {
 
 namespace mirror {
+  class ArtField;
   class ArtMethod;
 }  // namespace mirror
 class Thread;
@@ -54,7 +55,7 @@
   kSampleProfilingActive,
 };
 
-class Trace : public instrumentation::InstrumentationListener {
+class Trace FINAL : public instrumentation::InstrumentationListener {
  public:
   enum TraceFlag {
     kTraceCountAllocs = 1,
@@ -78,23 +79,31 @@
   void CompareAndUpdateStackTrace(Thread* thread, std::vector<mirror::ArtMethod*>* stack_trace)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  virtual void MethodEntered(Thread* thread, mirror::Object* this_object,
-                             mirror::ArtMethod* method, uint32_t dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  virtual void MethodExited(Thread* thread, mirror::Object* this_object,
-                            mirror::ArtMethod* method, uint32_t dex_pc,
-                            const JValue& return_value)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  virtual void MethodUnwind(Thread* thread, mirror::Object* this_object,
-                            mirror::ArtMethod* method, uint32_t dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  virtual void DexPcMoved(Thread* thread, mirror::Object* this_object,
-                          mirror::ArtMethod* method, uint32_t new_dex_pc)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  virtual void ExceptionCaught(Thread* thread, const ThrowLocation& throw_location,
-                               mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
-                               mirror::Throwable* exception_object)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  // InstrumentationListener implementation.
+  void MethodEntered(Thread* thread, mirror::Object* this_object,
+                     mirror::ArtMethod* method, uint32_t dex_pc)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void MethodExited(Thread* thread, mirror::Object* this_object,
+                    mirror::ArtMethod* method, uint32_t dex_pc,
+                    const JValue& return_value)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void MethodUnwind(Thread* thread, mirror::Object* this_object,
+                    mirror::ArtMethod* method, uint32_t dex_pc)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void DexPcMoved(Thread* thread, mirror::Object* this_object,
+                  mirror::ArtMethod* method, uint32_t new_dex_pc)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void FieldRead(Thread* thread, mirror::Object* this_object,
+                 mirror::ArtMethod* method, uint32_t dex_pc, mirror::ArtField* field)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void FieldWritten(Thread* thread, mirror::Object* this_object,
+                    mirror::ArtMethod* method, uint32_t dex_pc, mirror::ArtField* field,
+                    const JValue& field_value)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+  void ExceptionCaught(Thread* thread, const ThrowLocation& throw_location,
+                       mirror::ArtMethod* catch_method, uint32_t catch_dex_pc,
+                       mirror::Throwable* exception_object)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
 
   // Reuse an old stack trace if it exists, otherwise allocate a new one.
   static std::vector<mirror::ArtMethod*>* AllocStackTrace();
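(Aside, not part of the patch: a standalone sketch of why the switch from plain virtual to
the OVERRIDE/FINAL macros matters, using the C++11 keywords the macros expand to.)

    struct Listener {
      virtual ~Listener() {}
      virtual void MethodEntered(int dex_pc) {}
    };

    struct Tracer final : Listener {
      // OK: the signature matches the base class exactly.
      void MethodEntered(int dex_pc) override {}
      // Uncommenting this is a compile error: no base method with this signature.
      // A plain 'virtual' would instead silently declare a brand-new method.
      // void MethodEntered(long dex_pc) override {}
    };

    int main() {
      Tracer t;
      return 0;
    }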
diff --git a/runtime/transaction_test.cc b/runtime/transaction_test.cc
index 76b6f27..1dc2da0 100644
--- a/runtime/transaction_test.cc
+++ b/runtime/transaction_test.cc
@@ -55,18 +55,18 @@
 
   // Lock object's monitor outside the transaction.
   sirt_obj->MonitorEnter(soa.Self());
-  uint32_t old_lock_word = sirt_obj->GetLockWord().GetValue();
+  uint32_t old_lock_word = sirt_obj->GetLockWord(false).GetValue();
 
   Transaction transaction;
   Runtime::Current()->EnterTransactionMode(&transaction);
   // Unlock object's monitor inside the transaction.
   sirt_obj->MonitorExit(soa.Self());
-  uint32_t new_lock_word = sirt_obj->GetLockWord().GetValue();
+  uint32_t new_lock_word = sirt_obj->GetLockWord(false).GetValue();
   Runtime::Current()->ExitTransactionMode();
 
   // Aborting transaction must not clear the Object::class field.
   transaction.Abort();
-  uint32_t aborted_lock_word = sirt_obj->GetLockWord().GetValue();
+  uint32_t aborted_lock_word = sirt_obj->GetLockWord(false).GetValue();
   EXPECT_NE(old_lock_word, new_lock_word);
   EXPECT_EQ(aborted_lock_word, new_lock_word);
 }
diff --git a/runtime/utils.cc b/runtime/utils.cc
index afbcbb7..c4d1a78 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -1041,20 +1041,7 @@
   return "";
 }
 
-static std::string CleanMapName(const backtrace_map_t* map) {
-  if (map == NULL || map->name.empty()) {
-    return "???";
-  }
-  // Turn "/usr/local/google/home/enh/clean-dalvik-dev/out/host/linux-x86/lib/libartd.so"
-  // into "libartd.so".
-  size_t last_slash = map->name.rfind('/');
-  if (last_slash == std::string::npos) {
-    return map->name;
-  }
-  return map->name.substr(last_slash + 1);
-}
-
-void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix, bool include_count,
+void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix,
     mirror::ArtMethod* current_method) {
   // We may be called from contexts where current_method is not null, so we must assert this.
   if (current_method != nullptr) {
@@ -1072,27 +1059,34 @@
   for (Backtrace::const_iterator it = backtrace->begin();
        it != backtrace->end(); ++it) {
     // We produce output like this:
-    // ]    #00 unwind_backtrace_thread+536 [0x55d75bb8] (libbacktrace.so)
-    os << prefix;
-    if (include_count) {
-      os << StringPrintf("#%02zu ", it->num);
-    }
-    if (!it->func_name.empty()) {
-      os << it->func_name;
+    // ]    #00 pc 000075bb8  /system/lib/libc.so (unwind_backtrace_thread+536)
+    // In order for parsing tools to continue to function, the stack dump
+    // format must at least adhere to this format:
+    //  #XX pc <RELATIVE_ADDR>  <FULL_PATH_TO_SHARED_LIBRARY> ...
+    // The parsers require a single space before and after pc, and two spaces
+    // after the <RELATIVE_ADDR>. There can be any prefix data before the
+    // #XX. <RELATIVE_ADDR> has to be a hex number but with no 0x prefix.
+    os << prefix << StringPrintf("#%02zu pc ", it->num);
+    if (!it->map) {
+      os << StringPrintf("%08" PRIxPTR "  ???", it->pc);
     } else {
-      if (current_method != nullptr && current_method->IsWithinQuickCode(it->pc)) {
+      os << StringPrintf("%08" PRIxPTR "  ", it->pc - it->map->start)
+         << it->map->name << " (";
+      if (!it->func_name.empty()) {
+        os << it->func_name;
+        if (it->func_offset != 0) {
+          os << "+" << it->func_offset;
+        }
+      } else if (current_method != nullptr && current_method->IsWithinQuickCode(it->pc)) {
         const void* start_of_code = current_method->GetEntryPointFromQuickCompiledCode();
         os << JniLongName(current_method) << "+"
            << (it->pc - reinterpret_cast<uintptr_t>(start_of_code));
       } else {
         os << "???";
       }
+      os << ")";
     }
-    if (it->func_offset != 0) {
-      os << "+" << it->func_offset;
-    }
-    os << StringPrintf(" [%p]", reinterpret_cast<void*>(it->pc));
-    os << " (" << CleanMapName(it->map) << ")\n";
+    os << "\n";
   }
 }
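(Aside, not part of the patch: a sketch, with made-up frame values, that emits one line in
the parser-mandated format above: a single space before and after "pc", two spaces after
the relative address, and no 0x prefix.)

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
      size_t num = 0;
      uintptr_t pc = 0x55d75bb8;
      uintptr_t map_start = 0x55d00000;
      // ]    #00 pc 00075bb8  /system/lib/libbacktrace.so (unwind_backtrace_thread+536)
      printf("]    #%02zu pc %08" PRIxPTR "  %s (%s+%d)\n",
             num, pc - map_start, "/system/lib/libbacktrace.so",
             "unwind_backtrace_thread", 536);
      return 0;
    }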
 
diff --git a/runtime/utils.h b/runtime/utils.h
index dbc3ab7..6ab1013 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -62,12 +62,20 @@
   return IsAligned<n>(reinterpret_cast<const uintptr_t>(x));
 }
 
+template<typename T>
+static inline bool IsAlignedParam(T x, int n) {
+  return (x & (n - 1)) == 0;
+}
+
 #define CHECK_ALIGNED(value, alignment) \
   CHECK(::art::IsAligned<alignment>(value)) << reinterpret_cast<const void*>(value)
 
 #define DCHECK_ALIGNED(value, alignment) \
   DCHECK(::art::IsAligned<alignment>(value)) << reinterpret_cast<const void*>(value)
 
+#define DCHECK_ALIGNED_PARAM(value, alignment) \
+  DCHECK(::art::IsAlignedParam(value, alignment)) << reinterpret_cast<const void*>(value)
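(Aside, not part of the patch: a quick sketch of IsAlignedParam's bit trick. The test
(x & (n - 1)) == 0 is only meaningful when n is a power of two, which is the intended use.)

    #include <cstdio>

    template<typename T>
    static inline bool IsAlignedParam(T x, int n) {
      return (x & (n - 1)) == 0;
    }

    int main() {
      printf("%d\n", IsAlignedParam(16, 8));  // 1: 16 is 8-byte aligned.
      printf("%d\n", IsAlignedParam(12, 8));  // 0: 12 is not.
      // n = 6 (not a power of two) would test the wrong bits entirely.
      return 0;
    }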
+
 // Check whether an N-bit two's-complement representation can hold value.
 static inline bool IsInt(int N, word value) {
   CHECK_LT(0, N);
@@ -374,7 +382,7 @@
 
 // Dumps the native stack for thread 'tid' to 'os'.
 void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix = "",
-    bool include_count = true, mirror::ArtMethod* current_method = nullptr)
+    mirror::ArtMethod* current_method = nullptr)
     NO_THREAD_SAFETY_ANALYSIS;
 
 // Dumps the kernel stack for thread 'tid' to 'os'. Note that this is only available on linux-x86.
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 21e3e44..535c76d 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -361,7 +361,7 @@
   SirtRef<mirror::DexCache> dex_cache(self, mh.GetDexCache());
   SirtRef<mirror::ClassLoader> class_loader(self, mh.GetClassLoader());
   MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader, &mh.GetClassDef(),
-                          mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), false,
+                          mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), true,
                           true);
   return verifier.FindAccessedFieldAtDexPc(dex_pc);
 }
@@ -375,11 +375,11 @@
   // got what we wanted.
   bool success = Verify();
   if (!success) {
-    return NULL;
+    return nullptr;
   }
   RegisterLine* register_line = reg_table_.GetLine(dex_pc);
   if (register_line == NULL) {
-    return NULL;
+    return nullptr;
   }
   const Instruction* inst = Instruction::At(code_item_->insns_ + dex_pc);
   return GetQuickFieldAccess(inst, register_line);
@@ -392,7 +392,7 @@
   SirtRef<mirror::DexCache> dex_cache(self, mh.GetDexCache());
   SirtRef<mirror::ClassLoader> class_loader(self, mh.GetClassLoader());
   MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader, &mh.GetClassDef(),
-                          mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), false,
+                          mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), true,
                           true);
   return verifier.FindInvokedMethodAtDexPc(dex_pc);
 }
@@ -3118,34 +3118,14 @@
   DCHECK(inst->Opcode() == Instruction::INVOKE_VIRTUAL_QUICK ||
          inst->Opcode() == Instruction::INVOKE_VIRTUAL_RANGE_QUICK);
   const RegType& actual_arg_type = reg_line->GetInvocationThis(inst, is_range);
-  if (actual_arg_type.IsConflict()) {  // GetInvocationThis failed.
-    return NULL;
-  } else if (actual_arg_type.IsZero()) {  // Invoke on "null" instance: we can't go further.
-    return NULL;
+  if (!actual_arg_type.HasClass()) {
+    VLOG(verifier) << "Failed to get mirror::Class* from '" << actual_arg_type << "'";
+    return nullptr;
   }
-  mirror::Class* this_class = NULL;
-  if (!actual_arg_type.IsUnresolvedTypes()) {
-    this_class = actual_arg_type.GetClass();
-  } else {
-    const std::string& descriptor(actual_arg_type.GetDescriptor());
-    Thread* self = Thread::Current();
-    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-    this_class = class_linker->FindClass(self, descriptor.c_str(), *class_loader_);
-    if (this_class == NULL) {
-      Thread* self = Thread::Current();
-      self->ClearException();
-      // Look for a system class
-      SirtRef<mirror::ClassLoader> null_class_loader(self, nullptr);
-      this_class = class_linker->FindClass(self, descriptor.c_str(), null_class_loader);
-    }
-  }
-  if (this_class == NULL) {
-    return NULL;
-  }
-  mirror::ObjectArray<mirror::ArtMethod>* vtable = this_class->GetVTable();
-  CHECK(vtable != NULL);
+  mirror::ObjectArray<mirror::ArtMethod>* vtable = actual_arg_type.GetClass()->GetVTable();
+  CHECK(vtable != nullptr);
   uint16_t vtable_index = is_range ? inst->VRegB_3rc() : inst->VRegB_35c();
-  CHECK(vtable_index < vtable->GetLength());
+  CHECK_LT(static_cast<int32_t>(vtable_index), vtable->GetLength());
   mirror::ArtMethod* res_method = vtable->Get(vtable_index);
   CHECK(!Thread::Current()->IsExceptionPending());
   return res_method;
@@ -3633,12 +3613,12 @@
   if (klass->GetSuperClass() != NULL) {
     return FindInstanceFieldWithOffset(klass->GetSuperClass(), field_offset);
   } else {
-    return NULL;
+    VLOG(verifier) << "Failed to find instance field at offset '" << field_offset
+        << "' from '" << PrettyDescriptor(klass) << "'";
+    return nullptr;
   }
 }
 
-// Returns the access field of a quick field access (iget/iput-quick) or NULL
-// if it cannot be found.
 mirror::ArtField* MethodVerifier::GetQuickFieldAccess(const Instruction* inst,
                                                       RegisterLine* reg_line) {
   DCHECK(inst->Opcode() == Instruction::IGET_QUICK ||
@@ -3648,29 +3628,12 @@
          inst->Opcode() == Instruction::IPUT_WIDE_QUICK ||
          inst->Opcode() == Instruction::IPUT_OBJECT_QUICK);
   const RegType& object_type = reg_line->GetRegisterType(inst->VRegB_22c());
-  mirror::Class* object_class = NULL;
-  if (!object_type.IsUnresolvedTypes()) {
-    object_class = object_type.GetClass();
-  } else {
-    // We need to resolve the class from its descriptor.
-    const std::string& descriptor(object_type.GetDescriptor());
-    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-    Thread* self = Thread::Current();
-    object_class = class_linker->FindClass(self, descriptor.c_str(), *class_loader_);
-    if (object_class == NULL) {
-      self->ClearException();
-      // Look for a system class
-      SirtRef<mirror::ClassLoader> null_class_loader(self, nullptr);
-      object_class = class_linker->FindClass(self, descriptor.c_str(), null_class_loader);
-    }
-  }
-  if (object_class == NULL) {
-    // Failed to get the Class* from reg type.
-    LOG(WARNING) << "Failed to get Class* from " << object_type;
-    return NULL;
+  if (!object_type.HasClass()) {
+    VLOG(verifier) << "Failed to get mirror::Class* from '" << object_type << "'";
+    return nullptr;
   }
   uint32_t field_offset = static_cast<uint32_t>(inst->VRegC_22c());
-  return FindInstanceFieldWithOffset(object_class, field_offset);
+  return FindInstanceFieldWithOffset(object_type.GetClass(), field_offset);
 }
 
 void MethodVerifier::VerifyIGetQuick(const Instruction* inst, const RegType& insn_type,
diff --git a/runtime/verifier/reg_type_cache.cc b/runtime/verifier/reg_type_cache.cc
index 9dd57b8..111e867 100644
--- a/runtime/verifier/reg_type_cache.cc
+++ b/runtime/verifier/reg_type_cache.cc
@@ -156,15 +156,6 @@
   return klass;
 }
 
-void RegTypeCache::ClearException() {
-  if (can_load_classes_) {
-    DCHECK(Thread::Current()->IsExceptionPending());
-    Thread::Current()->ClearException();
-  } else {
-    DCHECK(!Thread::Current()->IsExceptionPending());
-  }
-}
-
 const RegType& RegTypeCache::From(mirror::ClassLoader* loader, const char* descriptor,
                                   bool precise) {
   // Try looking up the class in the cache first.
@@ -199,7 +190,12 @@
   } else {  // Class not resolved.
     // We tried loading the class and failed, this might get an exception raised
     // so we want to clear it before we go on.
-    ClearException();
+    if (can_load_classes_) {
+      DCHECK(Thread::Current()->IsExceptionPending());
+      Thread::Current()->ClearException();
+    } else {
+      DCHECK(!Thread::Current()->IsExceptionPending());
+    }
     if (IsValidDescriptor(descriptor)) {
       RegType* entry = new UnresolvedReferenceType(descriptor, entries_.size());
       entries_.push_back(entry);
@@ -238,6 +234,14 @@
   }
 }
 
+RegTypeCache::RegTypeCache(bool can_load_classes) : can_load_classes_(can_load_classes) {
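+  // Loading classes may suspend the calling thread, so in debug builds check up
+  // front that suspension is allowed rather than failing at the first class load.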
+  if (kIsDebugBuild && can_load_classes) {
+    Thread::Current()->AssertThreadSuspensionIsAllowable();
+  }
+  entries_.reserve(64);
+  FillPrimitiveAndSmallConstantTypes();
+}
+
 RegTypeCache::~RegTypeCache() {
   CHECK_LE(primitive_count_, entries_.size());
   // Delete only the non primitive types.
diff --git a/runtime/verifier/reg_type_cache.h b/runtime/verifier/reg_type_cache.h
index 4cc7e61..70d5f07 100644
--- a/runtime/verifier/reg_type_cache.h
+++ b/runtime/verifier/reg_type_cache.h
@@ -38,10 +38,7 @@
 
 class RegTypeCache {
  public:
-  explicit RegTypeCache(bool can_load_classes) : can_load_classes_(can_load_classes) {
-    entries_.reserve(64);
-    FillPrimitiveAndSmallConstantTypes();
-  }
+  explicit RegTypeCache(bool can_load_classes);
   ~RegTypeCache();
   static void Init() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (!RegTypeCache::primitive_initialized_) {
@@ -152,7 +149,6 @@
   void FillPrimitiveAndSmallConstantTypes() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Class* ResolveClass(const char* descriptor, mirror::ClassLoader* loader)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ClearException();
   bool MatchDescriptor(size_t idx, const char* descriptor, bool precise)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   const ConstantType& FromCat1NonSmallConstant(int32_t value, bool precise)
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 86a03ab..55ecf69 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -97,6 +97,7 @@
   }
 
   static int start;
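+  // Read the negative start index out of an array so it is not a compile-time
+  // constant and the inlined String.indexOf must handle it at runtime.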
+  private static int[] negIndex = { -100000 };
   public static void test_String_indexOf() {
     String str0 = "";
     String str1 = "/";
@@ -125,6 +126,7 @@
     Assert.assertEquals(str0.indexOf('a',0), -1);
     Assert.assertEquals(str0.indexOf('a',-1), -1);
     Assert.assertEquals(str1.indexOf('/',++start), -1);
+    Assert.assertEquals(str1.indexOf('a',negIndex[0]), -1);
     Assert.assertEquals(str3.indexOf('a',0), 0);
     Assert.assertEquals(str3.indexOf('a',1), -1);
     Assert.assertEquals(str3.indexOf('a',1234), -1);
diff --git a/test/083-compiler-regressions/expected.txt b/test/083-compiler-regressions/expected.txt
index c1e30bc..db50300 100644
--- a/test/083-compiler-regressions/expected.txt
+++ b/test/083-compiler-regressions/expected.txt
@@ -21,6 +21,10 @@
 wideGetterSetterTest passes
 wideIdentityTest passes
 returnConstantTest passes
+setterTestWithReturnArgIgnoreReturn passes
+setterTestWithReturnArgUseReturn passes
+wideSetterTestWithReturnArgIgnoreReturn passes
+wideSetterTestWithReturnArgUseReturn passes
 LVNTests.testNPE1 passes
 LVNTests.testNPE2 passes
 longDivTest passes
diff --git a/test/083-compiler-regressions/src/Main.java b/test/083-compiler-regressions/src/Main.java
index 586ff87..d32c037 100644
--- a/test/083-compiler-regressions/src/Main.java
+++ b/test/083-compiler-regressions/src/Main.java
@@ -43,6 +43,10 @@
         wideGetterSetterTest();
         wideIdentityTest();
         returnConstantTest();
+        setterTestWithReturnArgIgnoreReturn();
+        setterTestWithReturnArgUseReturn();
+        wideSetterTestWithReturnArgIgnoreReturn();
+        wideSetterTestWithReturnArgUseReturn();
         LVNTests.testNPE1();
         LVNTests.testNPE2();
         ZeroTests.longDivTest();
@@ -179,6 +183,576 @@
         }
     }
 
+    static void setterTestWithReturnArgIgnoreReturn() {
+        Foo foo = new Foo();
+        int sum = foo.getBar0();
+        sum += foo.getBar0();
+        foo.setBar1ReturnThis(sum);
+        sum += foo.getBar0();
+        foo.setBar2ReturnThis(1,sum);
+        sum += foo.getBar0();
+        foo.setBar3ReturnThis(1,2,sum);
+        sum += foo.getBar0();
+        foo.setBar4ReturnThis(1,2,3,sum);
+        sum += foo.getBar0();
+        foo.setBar5ReturnThis(1,2,3,4,sum);
+        sum += foo.getBar0();
+        foo.setBar1ReturnBarArg(sum);
+        sum += foo.getBar0();
+        foo.setBar2ReturnBarArg(1,sum);
+        sum += foo.getBar0();
+        foo.setBar3ReturnBarArg(1,2,sum);
+        sum += foo.getBar0();
+        foo.setBar4ReturnBarArg(1,2,3,sum);
+        sum += foo.getBar0();
+        foo.setBar5ReturnBarArg(1,2,3,4,sum);
+        sum += foo.getBar0();
+        foo.setBar2ReturnDummyArg1(1,sum);
+        sum += foo.getBar0();
+        foo.setBar3ReturnDummyArg2(1,2,sum);
+        sum += foo.getBar0();
+        foo.setBar4ReturnDummyArg3(1,2,3,sum);
+        sum += foo.getBar0();
+        foo.setBar5ReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.getBar0();
+        Foo nullFoo = Foo.getNullFoo();
+        try {
+            nullFoo.setBar1ReturnThis(sum);
+        } catch(NullPointerException npe) {
+            sum += 404;
+        }
+        try {
+            nullFoo.setBar2ReturnThis(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 2 * 404;
+        }
+        try {
+            nullFoo.setBar3ReturnThis(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 3 * 404;
+        }
+        try {
+            nullFoo.setBar4ReturnThis(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 4 * 404;
+        }
+        try {
+            nullFoo.setBar5ReturnThis(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 5 * 404;
+        }
+        try {
+            nullFoo.setBar1ReturnBarArg(sum);
+        } catch(NullPointerException npe) {
+            sum += 6 * 404;
+        }
+        try {
+            nullFoo.setBar2ReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 7 * 404;
+        }
+        try {
+            nullFoo.setBar3ReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 8 * 404;
+        }
+        try {
+            nullFoo.setBar4ReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 9 * 404;
+        }
+        try {
+            nullFoo.setBar5ReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 10 * 404;
+        }
+        try {
+            nullFoo.setBar2ReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 11 * 404;
+        }
+        try {
+            nullFoo.setBar3ReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 12 * 404;
+        }
+        try {
+            nullFoo.setBar4ReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 13 * 404;
+        }
+        try {
+            nullFoo.setBar5ReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 14 * 404;
+        }
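+        // Two initial reads of bar give 1234 * 2; each of the 14 setter/getter
+        // pairs then doubles the sum, hence 1234 << 15. The NPE paths add
+        // 404 * (1 + 2 + ... + 14) = 404 * (15 * 14 / 2).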
+        int expected = (1234 << 15) + 404 * (15 * 14 / 2);
+        if (sum == expected) {
+            System.out.println("setterTestWithReturnArgIgnoreReturn passes");
+        }
+        else {
+            System.out.println("setterTestWithReturnArgIgnoreReturn fails: " + sum +
+                               " (expecting " + expected + ")");
+        }
+    }
+
+    static void setterTestWithReturnArgUseReturn() {
+        Foo foo = new Foo();
+        int sum = foo.getBar0();
+        int sumDummy = 0;
+        sum += foo.getBar0();
+        Foo foo2 = foo.setBar1ReturnThis(sum);
+        sum += foo2.getBar0();
+        foo = foo2.setBar2ReturnThis(1,sum);
+        sum += foo.getBar0();
+        foo2 = foo.setBar3ReturnThis(1,2,sum);
+        sum += foo2.getBar0();
+        foo = foo2.setBar4ReturnThis(1,2,3,sum);
+        sum += foo.getBar0();
+        foo = foo.setBar5ReturnThis(1,2,3,4,sum);
+        sum += foo.getBar0();
+        sum += foo.setBar1ReturnBarArg(sum);
+        sum += foo.getBar0();
+        sum += foo.setBar2ReturnBarArg(1,sum);
+        sum += foo.getBar0();
+        sum += foo.setBar3ReturnBarArg(1,2,sum);
+        sum += foo.getBar0();
+        sum += foo.setBar4ReturnBarArg(1,2,3,sum);
+        sum += foo.getBar0();
+        sum += foo.setBar5ReturnBarArg(1,2,3,4,sum);
+        sum += foo.getBar0();
+        sumDummy += foo.setBar2ReturnDummyArg1(1,sum);
+        sum += foo.getBar0();
+        sumDummy += foo.setBar3ReturnDummyArg2(1,2,sum);
+        sum += foo.getBar0();
+        sumDummy += foo.setBar4ReturnDummyArg3(1,2,3,sum);
+        sum += foo.getBar0();
+        sumDummy += foo.setBar5ReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.getBar0();
+        Foo nullFoo = Foo.getNullFoo();
+        try {
+            foo = nullFoo.setBar1ReturnThis(sum);
+        } catch(NullPointerException npe) {
+            sum += 404;
+        }
+        try {
+            foo = nullFoo.setBar2ReturnThis(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 2 * 404;
+        }
+        try {
+            foo = nullFoo.setBar3ReturnThis(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 3 * 404;
+        }
+        try {
+            foo = nullFoo.setBar4ReturnThis(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 4 * 404;
+        }
+        try {
+            foo = nullFoo.setBar5ReturnThis(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 5 * 404;
+        }
+        try {
+            sum += nullFoo.setBar1ReturnBarArg(sum);
+        } catch(NullPointerException npe) {
+            sum += 6 * 404;
+        }
+        try {
+            sum += nullFoo.setBar2ReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 7 * 404;
+        }
+        try {
+            sum += nullFoo.setBar3ReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 8 * 404;
+        }
+        try {
+            sum += nullFoo.setBar4ReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 9 * 404;
+        }
+        try {
+            sum += nullFoo.setBar5ReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 10 * 404;
+        }
+        try {
+            sumDummy += nullFoo.setBar2ReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 11 * 404;
+        }
+        try {
+            sumDummy += nullFoo.setBar3ReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 12 * 404;
+        }
+        try {
+            sumDummy += nullFoo.setBar4ReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 13 * 404;
+        }
+        try {
+            sumDummy += nullFoo.setBar5ReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 14 * 404;
+        }
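+        // The five ReturnThis and four ReturnDummy pairs each double the sum
+        // (with the two initial reads: 1234 << 10); the five ReturnBarArg pairs
+        // each triple it. The NPE paths add 404 * (15 * 14 / 2).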
+        int expected = (1234 << 10) * 3 * 3 * 3 * 3 * 3 + 404 * (15 * 14 / 2);
+        int expectedDummy = 5 * 4 / 2;
+        if (sum == expected && sumDummy == expectedDummy) {
+            System.out.println("setterTestWithReturnArgUseReturn passes");
+        }
+        else {
+            System.out.println("setterTestWithReturnArgUseReturn fails: " + sum +
+                               " (expecting " + expected + "), sumDummy = " + sumDummy +
+                               " (expecting " + expectedDummy + ")");
+        }
+    }
+
+    static void wideSetterTestWithReturnArgIgnoreReturn() {
+        Foo foo = new Foo();
+        long sum = foo.wideGetBar0();
+        sum += foo.wideGetBar0();
+        foo.wideSetBar1ReturnThis(sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar2ReturnThis(1,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar3ReturnThis(1,2,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar4ReturnThis(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar5ReturnThis(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar1ReturnBarArg(sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar2ReturnBarArg(1,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar3ReturnBarArg(1,2,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar4ReturnBarArg(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar5ReturnBarArg(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar2iReturnBarArg(1,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar3iReturnBarArg(1,2,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar4iReturnBarArg(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar5iReturnBarArg(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar2ReturnDummyArg1(1,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar3ReturnDummyArg2(1,2,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar4ReturnDummyArg3(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar5ReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar2iReturnDummyArg1(1,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar3iReturnDummyArg2(1,2,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar4iReturnDummyArg3(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo.wideSetBar5iReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        Foo nullFoo = Foo.getNullFoo();
+        try {
+            nullFoo.wideSetBar1ReturnThis(sum);
+        } catch(NullPointerException npe) {
+            sum += 404;
+        }
+        try {
+            nullFoo.wideSetBar2ReturnThis(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 2 * 404;
+        }
+        try {
+            nullFoo.wideSetBar3ReturnThis(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 3 * 404;
+        }
+        try {
+            nullFoo.wideSetBar4ReturnThis(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 4 * 404;
+        }
+        try {
+            nullFoo.wideSetBar5ReturnThis(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 5 * 404;
+        }
+        try {
+            nullFoo.wideSetBar1ReturnBarArg(sum);
+        } catch(NullPointerException npe) {
+            sum += 6 * 404;
+        }
+        try {
+            nullFoo.wideSetBar2ReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 7 * 404;
+        }
+        try {
+            nullFoo.wideSetBar3ReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 8 * 404;
+        }
+        try {
+            nullFoo.wideSetBar4ReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 9 * 404;
+        }
+        try {
+            nullFoo.wideSetBar5ReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 10 * 404;
+        }
+        try {
+            nullFoo.wideSetBar2iReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 11 * 404;
+        }
+        try {
+            nullFoo.wideSetBar3iReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 12 * 404;
+        }
+        try {
+            nullFoo.wideSetBar4iReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 13 * 404;
+        }
+        try {
+            nullFoo.wideSetBar5iReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 14 * 404;
+        }
+        try {
+            nullFoo.wideSetBar2ReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 15 * 404;
+        }
+        try {
+            nullFoo.wideSetBar3ReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 16 * 404;
+        }
+        try {
+            nullFoo.wideSetBar4ReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 17 * 404;
+        }
+        try {
+            nullFoo.wideSetBar5ReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 18 * 404;
+        }
+        try {
+            nullFoo.wideSetBar2iReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 19 * 404;
+        }
+        try {
+            nullFoo.wideSetBar3iReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 20 * 404;
+        }
+        try {
+            nullFoo.wideSetBar4iReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 21 * 404;
+        }
+        try {
+            nullFoo.wideSetBar5iReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 22 * 404;
+        }
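+        // Two initial reads give 1234 * 2; each of the 22 setter/getter pairs
+        // doubles the sum, hence 1234L << 23. The NPE paths add 404 * (23 * 22 / 2).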
+        long expected = (1234L << 23) + 404 * (23 * 22 / 2);
+        if (sum == expected) {
+            System.out.println("wideSetterTestWithReturnArgIgnoreReturn passes");
+        }
+        else {
+            System.out.println("wideSetterTestWithReturnArgIgnoreReturn fails: " + sum +
+                               " (expecting " + expected + ")");
+        }
+    }
+
+    static void wideSetterTestWithReturnArgUseReturn() {
+        Foo foo = new Foo();
+        long sum = foo.wideGetBar0();
+        long sumDummy = 0;
+        sum += foo.wideGetBar0();
+        Foo foo2 = foo.wideSetBar1ReturnThis(sum);
+        sum += foo2.wideGetBar0();
+        foo = foo2.wideSetBar2ReturnThis(1,sum);
+        sum += foo.wideGetBar0();
+        foo2 = foo.wideSetBar3ReturnThis(1,2,sum);
+        sum += foo2.wideGetBar0();
+        foo = foo2.wideSetBar4ReturnThis(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        foo = foo.wideSetBar5ReturnThis(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar1ReturnBarArg(sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar2ReturnBarArg(1,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar3ReturnBarArg(1,2,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar4ReturnBarArg(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar5ReturnBarArg(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar2iReturnBarArg(1,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar3iReturnBarArg(1,2,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar4iReturnBarArg(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        sum += foo.wideSetBar5iReturnBarArg(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar2ReturnDummyArg1(1,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar3ReturnDummyArg2(1,2,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar4ReturnDummyArg3(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar5ReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar2iReturnDummyArg1(1,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar3iReturnDummyArg2(1,2,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar4iReturnDummyArg3(1,2,3,sum);
+        sum += foo.wideGetBar0();
+        sumDummy += foo.wideSetBar5iReturnDummyArg4(1,2,3,4,sum);
+        sum += foo.wideGetBar0();
+        Foo nullFoo = Foo.getNullFoo();
+        try {
+            foo = nullFoo.wideSetBar1ReturnThis(sum);
+        } catch(NullPointerException npe) {
+            sum += 404;
+        }
+        try {
+            foo = nullFoo.wideSetBar2ReturnThis(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 2 * 404;
+        }
+        try {
+            foo = nullFoo.wideSetBar3ReturnThis(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 3 * 404;
+        }
+        try {
+            foo = nullFoo.wideSetBar4ReturnThis(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 4 * 404;
+        }
+        try {
+            foo = nullFoo.wideSetBar5ReturnThis(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 5 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar1ReturnBarArg(sum);
+        } catch(NullPointerException npe) {
+            sum += 6 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar2ReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 7 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar3ReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 8 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar4ReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 9 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar5ReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 10 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar2iReturnBarArg(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 11 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar3iReturnBarArg(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 12 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar4iReturnBarArg(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 13 * 404;
+        }
+        try {
+            sum += nullFoo.wideSetBar5iReturnBarArg(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 14 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar2ReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 15 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar3ReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 16 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar4ReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 17 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar5ReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 18 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar2iReturnDummyArg1(1, sum);
+        } catch(NullPointerException npe) {
+            sum += 19 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar3iReturnDummyArg2(1, 2, sum);
+        } catch(NullPointerException npe) {
+            sum += 20 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar4iReturnDummyArg3(1, 2, 3, sum);
+        } catch(NullPointerException npe) {
+            sum += 21 * 404;
+        }
+        try {
+            sumDummy += nullFoo.wideSetBar5iReturnDummyArg4(1, 2, 3, 4, sum);
+        } catch(NullPointerException npe) {
+            sum += 22 * 404;
+        }
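+        // The initial reads plus the 13 doubling pairs (ReturnThis, ReturnDummy)
+        // give 1234L << 14; the nine ReturnBarArg pairs each triple the sum. The
+        // NPE paths add 404 * (23 * 22 / 2); each dummy group contributes 1+2+3+4.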
+        long expected = (1234L << 14) * 3 * 3 * 3 * 3 * 3 * 3 * 3 * 3 * 3 + 404 * (23 * 22 / 2);
+        long expectedDummy = 2 * (5 * 4 / 2);
+        if (sum == expected && sumDummy == expectedDummy) {
+            System.out.println("wideSetterTestWithReturnArgUseReturn passes");
+        }
+        else {
+            System.out.println("wideSetterTestWithReturnArgUseReturn fails: " + sum +
+                               " (expecting " + expected + "), sumDummy = " + sumDummy +
+                               " (expecting " + expectedDummy + ")");
+        }
+    }
+
     static void mulBy1Test() {
         long res;
         long j = 1;
@@ -8645,6 +9219,12 @@
     private int bar = 1234;
     private long lbar = 1234;
 
+    public static Foo getNullFoo() {
+      // Make this a bit complicated so that it's not inlined.
+      Foo foo = new Foo();
+      return (barBar(foo) != 0) ? null : foo;
+    }
+
     // Looks similar to a direct method, make sure we're null checking
     static int barBar(Foo foo) {
         return foo.bar;
@@ -8786,6 +9366,166 @@
     public long wideIdent5(int a6, int a5, int a4, int a3, int a2, long a1) {
         return a1;
     }
+    public Foo setBar1ReturnThis(int a1) {
+        bar = a1;
+        return this;
+    }
+    public Foo setBar2ReturnThis(int a1, int a2) {
+        bar = a2;
+        return this;
+    }
+    public Foo setBar3ReturnThis(int a1, int a2, int a3) {
+        bar = a3;
+        return this;
+    }
+    public Foo setBar4ReturnThis(int a1, int a2, int a3, int a4) {
+        bar = a4;
+        return this;
+    }
+    public Foo setBar5ReturnThis(int a1, int a2, int a3, int a4, int a5) {
+        bar = a5;
+        return this;
+    }
+    public Foo wideSetBar1ReturnThis(long a1) {
+        lbar = a1;
+        return this;
+    }
+    public Foo wideSetBar2ReturnThis(long a1, long a2) {
+        lbar = a2;
+        return this;
+    }
+    public Foo wideSetBar3ReturnThis(long a1, long a2, long a3) {
+        lbar = a3;
+        return this;
+    }
+    public Foo wideSetBar4ReturnThis(long a1, long a2, long a3, long a4) {
+        lbar = a4;
+        return this;
+    }
+    public Foo wideSetBar5ReturnThis(long a1, long a2, long a3, long a4, long a5) {
+        lbar = a5;
+        return this;
+    }
+    public Foo wideSetBar2iReturnThis(int a1, long a2) {
+        lbar = a2;
+        return this;
+    }
+    public Foo wideSetBar3iReturnThis(int a1, int a2, long a3) {
+        lbar = a3;
+        return this;
+    }
+    public Foo wideSetBar4iReturnThis(int a1, int a2, int a3, long a4) {
+        lbar = a4;
+        return this;
+    }
+    public Foo wideSetBar5iReturnThis(int a1, int a2, int a3, int a4, long a5) {
+        lbar = a5;
+        return this;
+    }
+    public int setBar1ReturnBarArg(int a1) {
+        bar = a1;
+        return a1;
+    }
+    public int setBar2ReturnBarArg(int a1, int a2) {
+        bar = a2;
+        return a2;
+    }
+    public int setBar3ReturnBarArg(int a1, int a2, int a3) {
+        bar = a3;
+        return a3;
+    }
+    public int setBar4ReturnBarArg(int a1, int a2, int a3, int a4) {
+        bar = a4;
+        return a4;
+    }
+    public int setBar5ReturnBarArg(int a1, int a2, int a3, int a4, int a5) {
+        bar = a5;
+        return a5;
+    }
+    public long wideSetBar1ReturnBarArg(long a1) {
+        lbar = a1;
+        return a1;
+    }
+    public long wideSetBar2ReturnBarArg(long a1, long a2) {
+        lbar = a2;
+        return a2;
+    }
+    public long wideSetBar3ReturnBarArg(long a1, long a2, long a3) {
+        lbar = a3;
+        return a3;
+    }
+    public long wideSetBar4ReturnBarArg(long a1, long a2, long a3, long a4) {
+        lbar = a4;
+        return a4;
+    }
+    public long wideSetBar5ReturnBarArg(long a1, long a2, long a3, long a4, long a5) {
+        lbar = a5;
+        return a5;
+    }
+    public long wideSetBar2iReturnBarArg(int a1, long a2) {
+        lbar = a2;
+        return a2;
+    }
+    public long wideSetBar3iReturnBarArg(int a1, int a2, long a3) {
+        lbar = a3;
+        return a3;
+    }
+    public long wideSetBar4iReturnBarArg(int a1, int a2, int a3, long a4) {
+        lbar = a4;
+        return a4;
+    }
+    public long wideSetBar5iReturnBarArg(int a1, int a2, int a3, int a4, long a5) {
+        lbar = a5;
+        return a5;
+    }
+    public int setBar2ReturnDummyArg1(int a1, int a2) {
+        bar = a2;
+        return a1;
+    }
+    public int setBar3ReturnDummyArg2(int a1, int a2, int a3) {
+        bar = a3;
+        return a2;
+    }
+    public int setBar4ReturnDummyArg3(int a1, int a2, int a3, int a4) {
+        bar = a4;
+        return a3;
+    }
+    public int setBar5ReturnDummyArg4(int a1, int a2, int a3, int a4, int a5) {
+        bar = a5;
+        return a4;
+    }
+    public long wideSetBar2ReturnDummyArg1(long a1, long a2) {
+        lbar = a2;
+        return a1;
+    }
+    public long wideSetBar3ReturnDummyArg2(long a1, long a2, long a3) {
+        lbar = a3;
+        return a2;
+    }
+    public long wideSetBar4ReturnDummyArg3(long a1, long a2, long a3, long a4) {
+        lbar = a4;
+        return a3;
+    }
+    public long wideSetBar5ReturnDummyArg4(long a1, long a2, long a3, long a4, long a5) {
+        lbar = a5;
+        return a4;
+    }
+    public int wideSetBar2iReturnDummyArg1(int a1, long a2) {
+        lbar = a2;
+        return a1;
+    }
+    public int wideSetBar3iReturnDummyArg2(int a1, int a2, long a3) {
+        lbar = a3;
+        return a2;
+    }
+    public int wideSetBar4iReturnDummyArg3(int a1, int a2, int a3, long a4) {
+        lbar = a4;
+        return a3;
+    }
+    public int wideSetBar5iReturnDummyArg4(int a1, int a2, int a3, int a4, long a5) {
+        lbar = a5;
+        return a4;
+    }
 }
 
 class LVNTests {
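For reference, a minimal sketch (illustrative names, not part of the patch) of the
shape these regressions pin down: a setter that also returns a value must keep its
implicit receiver null check when the compiler inlines it, whether or not the
caller consumes the result.

    // Hypothetical reduction of the tested pattern.
    class Holder {
        private int bar;
        Holder setBarReturnThis(int v) {
            bar = v;
            return this;  // the caller may ignore this return value
        }
    }

    public class SetterNpeSketch {
        public static void main(String[] args) {
            Holder h = null;
            try {
                h.setBarReturnThis(1);  // must throw even if the setter is inlined
                System.out.println("missing NullPointerException");
            } catch (NullPointerException expected) {
                System.out.println("NullPointerException thrown as required");
            }
        }
    }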
diff --git a/test/401-optimizing-compiler/expected.txt b/test/401-optimizing-compiler/expected.txt
index 268da55..97492a4 100644
--- a/test/401-optimizing-compiler/expected.txt
+++ b/test/401-optimizing-compiler/expected.txt
@@ -4,3 +4,10 @@
 In static method with 7 args 1 2 3 4 5 6 7
 Forced GC
 java.lang.Error: Error
+Forced GC
+In static method with object arg class java.lang.Object
+Forced GC
+Forced GC
+Forced GC
+Forced GC
+Forced GC
diff --git a/test/401-optimizing-compiler/src/Main.java b/test/401-optimizing-compiler/src/Main.java
index 4031ff1..e5706a5 100644
--- a/test/401-optimizing-compiler/src/Main.java
+++ b/test/401-optimizing-compiler/src/Main.java
@@ -26,6 +26,86 @@
       error = e;
     }
     System.out.println(error);
+
+    $opt$TestInvokeNew();
+
+    int result = $opt$TestInvokeIntParameter(42);
+    if (result != 42) {
+      throw new Error("Different value returned: " + result);
+    }
+
+    $opt$TestInvokeObjectParameter(new Object());
+
+    Object a = new Object();
+    Object b = $opt$TestInvokeObjectParameter(a);
+    if (a != b) {
+      throw new Error("Different object returned " + a + " " + b);
+    }
+
+    result = $opt$TestInvokeWith2Parameters(10, 9);
+    if (result != 1) {
+      throw new Error("Unexpected result: " + result);
+    }
+
+    result = $opt$TestInvokeWith3Parameters(10, 9, 1);
+    if (result != 0) {
+      throw new Error("Unexpected result: " + result);
+    }
+
+    result = $opt$TestInvokeWith5Parameters(10000, 1000, 100, 10, 1);
+    if (result != 8889) {
+      throw new Error("Unexpected result: " + result);
+    }
+
+    result = $opt$TestInvokeWith7Parameters(100, 6, 5, 4, 3, 2, 1);
+    if (result != 79) {
+      throw new Error("Unexpected result: " + result);
+    }
+
+    Main m = new Main();
+    if (m.$opt$TestThisParameter(m) != m) {
+      throw new Error("Unexpected value returned");
+    }
+
+    if (m.$opt$TestOtherParameter(new Main()) == m) {
+      throw new Error("Unexpected value returned");
+    }
+  }
+
+  static int $opt$TestInvokeIntParameter(int param) {
+    return param;
+  }
+
+  static Object $opt$TestInvokeObjectParameter(Object a) {
+    forceGCStaticMethod();
+    return a;
+  }
+
+  static int $opt$TestInvokeWith2Parameters(int a, int b) {
+    return a - b;
+  }
+
+  static int $opt$TestInvokeWith3Parameters(int a, int b, int c) {
+    return a - b - c;
+  }
+
+  static int $opt$TestInvokeWith5Parameters(int a, int b, int c, int d, int e) {
+    return a - b - c - d - e;
+  }
+
+  static int $opt$TestInvokeWith7Parameters(int a, int b, int c, int d, int e, int f, int g) {
+    return a - b - c - d - e - f - g;
+  }
+
+  Object $opt$TestThisParameter(Object other) {
+    forceGCStaticMethod();
+    return other;
+  }
+
+  Object $opt$TestOtherParameter(Object other) {
+    forceGCStaticMethod();
+    return other;
   }
 
   public static void $opt$TestInvokeStatic() {
@@ -37,6 +117,13 @@
     throwStaticMethod();
   }
 
+  public static void $opt$TestInvokeNew() {
+    Object o = new Object();
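+    // Collect while 'o' is reachable only through this frame; the call below
+    // then checks that the reference stayed live across the collections.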
+    forceGCStaticMethod();
+    printStaticMethodWithObjectArg(o);
+    forceGCStaticMethod();
+  }
+
   public static void printStaticMethod() {
     System.out.println("In static method");
   }
@@ -55,6 +142,10 @@
         + a + " " + b + " " + c + " " + d + " " + e + " " + f + " " + g);
   }
 
+  public static void printStaticMethodWithObjectArg(Object a) {
+    System.out.println("In static method with object arg " + a.getClass());
+  }
+
   public static void forceGCStaticMethod() {
     Runtime.getRuntime().gc();
     Runtime.getRuntime().gc();
diff --git a/test/402-optimizing-control-flow/expected.txt b/test/402-optimizing-control-flow/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/402-optimizing-control-flow/expected.txt
diff --git a/test/402-optimizing-control-flow/info.txt b/test/402-optimizing-control-flow/info.txt
new file mode 100644
index 0000000..37d9458
--- /dev/null
+++ b/test/402-optimizing-control-flow/info.txt
@@ -0,0 +1 @@
+Tests for control flow instructions in the optimizing compiler.
diff --git a/test/402-optimizing-control-flow/src/Main.java b/test/402-optimizing-control-flow/src/Main.java
new file mode 100644
index 0000000..3339ef4
--- /dev/null
+++ b/test/402-optimizing-control-flow/src/Main.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Note that $opt$ is a marker for the optimizing compiler to ensure
+// it does compile the method.
+
+public class Main {
+
+  public static void expectEquals(int expected, int value) {
+    if (expected != value) {
+      throw new Error("Expected: " + expected + ", found: " + value);
+    }
+  }
+
+  public static void main(String[] args) {
+    int result = $opt$testIfEq1(42);
+    expectEquals(42, result);
+
+    result = $opt$testIfEq2(42);
+    expectEquals(7, result);
+
+    result = $opt$testWhileLoop(42);
+    expectEquals(45, result);
+
+    result = $opt$testDoWhileLoop(42);
+    expectEquals(45, result);
+
+    result = $opt$testForLoop(42);
+    expectEquals(44, result);
+  }
+
+  static int $opt$testIfEq1(int a) {
+    if (a + 1 == 43) {
+      return 42;
+    } else {
+      return 7;
+    }
+  }
+
+  static int $opt$testIfEq2(int a) {
+    if (a + 1 == 41) {
+      return 42;
+    } else {
+      return 7;
+    }
+  }
+
+  static int $opt$testWhileLoop(int a) {
+    while (a++ != 44) {}
+    return a;
+  }
+
+  static int $opt$testDoWhileLoop(int a) {
+    do {
+    } while (a++ != 44);
+    return a;
+  }
+
+  static int $opt$testForLoop(int a) {
+    for (; a != 44; a++) {}
+    return a;
+  }
+}
diff --git a/test/403-optimizing-long/expected.txt b/test/403-optimizing-long/expected.txt
new file mode 100644
index 0000000..dff83cf
--- /dev/null
+++ b/test/403-optimizing-long/expected.txt
@@ -0,0 +1 @@
+Long: 42
diff --git a/test/403-optimizing-long/info.txt b/test/403-optimizing-long/info.txt
new file mode 100644
index 0000000..dc2d668
--- /dev/null
+++ b/test/403-optimizing-long/info.txt
@@ -0,0 +1 @@
+Tests long support in the optimizing compiler.
diff --git a/test/403-optimizing-long/src/Main.java b/test/403-optimizing-long/src/Main.java
new file mode 100644
index 0000000..21af4e1
--- /dev/null
+++ b/test/403-optimizing-long/src/Main.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Note that $opt$ is a marker for the optimizing compiler to ensure
+// it does compile the method.
+
+public class Main {
+  public static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void main(String[] args) {
+    long l = $opt$ReturnLong();
+    expectEquals(42, l);
+    System.out.println("Long: " + l);
+
+    l = $opt$TakeOneLong1(42);
+    expectEquals(42, l);
+
+    l = $opt$TakeOneLong2(0, 42);
+    expectEquals(42, l);
+
+    l = $opt$TakeOneLong3(0, 1, 42);
+    expectEquals(42, l);
+
+    l = $opt$TakeOneLong4(0, 1, 2, 42);
+    expectEquals(42, l);
+
+    l = $opt$AddTwoLongs(42, 41);
+    expectEquals(83, l);
+
+    l = $opt$SubTwoLongs(42, 41);
+    expectEquals(1, l);
+
+    l = $opt$MakeCallsWithLongs1();
+    expectEquals(57, l);
+
+    l = $opt$MakeCallsWithLongs2();
+    expectEquals(900000000006L, l);
+
+    l = $opt$SubTwoLongs(-600000000006L, -200000000002L);
+    expectEquals(-400000000004L, l);
+
+    l = $opt$AddTwoLongs(-600000000006L, -200000000002L);
+    expectEquals(-800000000008L, l);
+  }
+
+  static long $opt$MakeCallsWithLongs1() {
+    long l = $opt$SubTwoLongs(-600000000006L, -200000000002L);
+    expectEquals(-400000000004L, l);
+
+    l = $opt$AddTwoLongs(-600000000006L, -200000000002L);
+    expectEquals(-800000000008L, l);
+
+    return $opt$ReturnLong() + $opt$TakeOneLong1(1) + $opt$TakeOneLong2(0, 2)
+        + $opt$TakeOneLong3(0, 0, 3) + $opt$TakeOneLong4(0, 0, 0, 4)
+        // Test invoke-range.
+        + $opt$TakeOneLong5(0, 0, 0, 0, 5);
+  }
+
+  static long $opt$MakeCallsWithLongs2() {
+    return $opt$AddThreeLongs(400000000003L, 200000000002L, 300000000001L);
+  }
+
+  static long $opt$ReturnLong() {
+    return 42;
+  }
+
+  static long $opt$TakeOneLong1(long l) {
+    return l;
+  }
+
+  static long $opt$TakeOneLong2(int a, long l) {
+    return l;
+  }
+
+  static long $opt$TakeOneLong3(int a, int b, long l) {
+    return l;
+  }
+
+  static long $opt$TakeOneLong4(int a, int b, int c, long l) {
+    return l;
+  }
+
+  static long $opt$TakeOneLong5(int a, int b, int c, int d, long l) {
+    return l;
+  }
+
+  static long $opt$AddTwoLongs(long a, long b) {
+    return a + b;
+  }
+
+  static long $opt$AddThreeLongs(long a, long b, long c) {
+    return a + b + c;
+  }
+
+  static long $opt$SubTwoLongs(long a, long b) {
+    return a - b;
+  }
+}
diff --git a/test/Android.mk b/test/Android.mk
index 5879039..08a925c 100644
--- a/test/Android.mk
+++ b/test/Android.mk
@@ -57,6 +57,8 @@
 #	StackWalk2 \
 
 ART_TEST_TARGET_DEX_FILES :=
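+# Per-instruction-set copies of the list (suffixed like the phony test targets),
+# so each target architecture tracks its own dex file dependencies.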
+ART_TEST_TARGET_DEX_FILES$(ART_PHONY_TEST_TARGET_SUFFIX) :=
+ART_TEST_TARGET_DEX_FILES$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) :=
 ART_TEST_HOST_DEX_FILES :=
 
 # $(1): module prefix
@@ -76,13 +78,17 @@
     LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common.mk
     LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk
     include $(BUILD_JAVA_LIBRARY)
+
     ART_TEST_TARGET_DEX_FILES += $$(LOCAL_INSTALLED_MODULE)
+    ART_TEST_TARGET_DEX_FILES$(ART_PHONY_TEST_TARGET_SUFFIX) += $$(LOCAL_INSTALLED_MODULE)
 
     ifdef TARGET_2ND_ARCH
+      ART_TEST_TARGET_DEX_FILES$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) += $(4)/$(1)-$(2).jar
+
       # TODO: make this a simple copy
-$(4)/$(1)-$(2).jar: $(3)/$(1)-$(2).jar
+$(4)/$(1)-$(2).jar: $(3)/$(1)-$(2).jar $(4)
 	cp $$< $(4)/
-    endif
+    endif
   endif
 
   ifeq ($(ART_BUILD_HOST),true)
@@ -102,9 +108,22 @@
 $(foreach dir,$(TEST_DEX_DIRECTORIES), $(eval $(call build-art-test-dex,art-test-dex,$(dir),$(ART_NATIVETEST_OUT),$(2ND_ART_NATIVETEST_OUT))))
 $(foreach dir,$(TEST_OAT_DIRECTORIES), $(eval $(call build-art-test-dex,oat-test-dex,$(dir),$(ART_TEST_OUT),$(2ND_ART_TEST_OUT))))
 
+# Rules to explicitly create 2nd-arch test directories, as we use a "cp" for them
+# instead of BUILD_JAVA_LIBRARY
+ifneq ($(2ND_ART_NATIVETEST_OUT),)
+$(2ND_ART_NATIVETEST_OUT):
+	$(hide) mkdir -p $@
+endif
+
+ifneq ($(2ND_ART_TEST_OUT),)
+$(2ND_ART_TEST_OUT):
+	$(hide) mkdir -p $@
+endif
+
 ########################################################################
 
-ART_TEST_TARGET_OAT_TARGETS :=
+ART_TEST_TARGET_OAT_TARGETS$(ART_PHONY_TEST_TARGET_SUFFIX) :=
+ART_TEST_TARGET_OAT_TARGETS$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) :=
 ART_TEST_HOST_OAT_DEFAULT_TARGETS :=
 ART_TEST_HOST_OAT_INTERPRETER_TARGETS :=
 
@@ -160,7 +179,10 @@
 .PHONY: test-art-oat-$(1)
 test-art-oat-$(1): test-art-host-oat-$(1) test-art-target-oat-$(1)
 
-ART_TEST_TARGET_OAT_TARGETS += test-art-target-oat-$(1)
+ART_TEST_TARGET_OAT_TARGETS$(ART_PHONY_TEST_TARGET_SUFFIX) += test-art-target-oat-$(1)$(ART_PHONY_TEST_TARGET_SUFFIX)
+ifdef TARGET_2ND_ARCH
+  ART_TEST_TARGET_OAT_TARGETS$(2ND_ART_PHONY_TEST_TARGET_SUFFIX) += test-art-target-oat-$(1)$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)
+endif
 ART_TEST_HOST_OAT_DEFAULT_TARGETS += test-art-host-oat-default-$(1)
 ART_TEST_HOST_OAT_INTERPRETER_TARGETS += test-art-host-oat-interpreter-$(1)
 endef
@@ -188,7 +210,7 @@
 # Expand all tests.
 TEST_ART_RUN_TESTS := $(wildcard $(LOCAL_PATH)/[0-9]*)
 TEST_ART_RUN_TESTS := $(subst $(LOCAL_PATH)/,, $(TEST_ART_RUN_TESTS))
-TEST_ART_TIMING_SENSITIVE_RUN_TESTS := 055-enum-performance
+TEST_ART_TIMING_SENSITIVE_RUN_TESTS := 053-wait-some 055-enum-performance
 ifdef dist_goal # disable timing sensitive tests on "dist" builds.
   $(foreach test, $(TEST_ART_TIMING_SENSITIVE_RUN_TESTS), \
     $(info Skipping $(test)) \
diff --git a/test/MyClassNatives/MyClassNatives.java b/test/MyClassNatives/MyClassNatives.java
index 6e7a426..b5e0204 100644
--- a/test/MyClassNatives/MyClassNatives.java
+++ b/test/MyClassNatives/MyClassNatives.java
@@ -79,4 +79,17 @@
         Object o240, Object o241, Object o242, Object o243, Object o244, Object o245, Object o246, Object o247,
         Object o248, Object o249, Object o250, Object o251, Object o252, Object o253);
 
+    native void withoutImplementation();
+
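+    // More arguments than fit in argument registers, so some are passed on the
+    // stack; these check that JNI stubs marshal stack arguments correctly.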
+    native static void stackArgsIntsFirst(int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+        int i8, int i9, int i10, float f1, float f2, float f3, float f4, float f5, float f6,
+        float f7, float f8, float f9, float f10);
+
+    native static void stackArgsFloatsFirst(float f1, float f2, float f3, float f4, float f5,
+        float f6, float f7, float f8, float f9, float f10, int i1, int i2, int i3, int i4, int i5,
+        int i6, int i7, int i8, int i9, int i10);
+
+    native static void stackArgsMixed(int i1, float f1, int i2, float f2, int i3, float f3, int i4,
+        float f4, int i5, float f5, int i6, float f6, int i7, float f7, int i8, float f8, int i9,
+        float f9, int i10, float f10);
 }
diff --git a/test/run-test b/test/run-test
index 58de980..6e59641 100755
--- a/test/run-test
+++ b/test/run-test
@@ -68,6 +68,7 @@
 runtime="art"
 usage="no"
 build_only="no"
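+# Suffix for the boot image location; "--64" sets it to "64" so the image under
+# /data/art-test64 is used.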
+suffix64=""
 
 while true; do
     if [ "x$1" = "x--host" ]; then
@@ -154,6 +155,7 @@
         shift
     elif [ "x$1" = "x--64" ]; then
         run_args="${run_args} --64"
+        suffix64="64"
         shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
@@ -187,7 +189,7 @@
         fi
         run_args="${run_args} --boot -Ximage:${ANDROID_HOST_OUT}/framework/core.art"
     else
-        run_args="${run_args} --boot -Ximage:/data/art-test/core.art"
+        run_args="${run_args} --boot -Ximage:/data/art-test${suffix64}/core.art"
     fi
 fi