Merge "Revert "Revert "Revert "Revert "Make dex2dex return a CompiledMethod after quickening."""""
diff --git a/Android.mk b/Android.mk
index c01464a..49b61bb 100644
--- a/Android.mk
+++ b/Android.mk
@@ -427,6 +427,7 @@
 	adb shell setprop dalvik.vm.dex2oat-filter \"\"
 	adb shell setprop dalvik.vm.image-dex2oat-filter \"\"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
+	adb shell setprop dalvik.vm.usejit false
 	adb shell start
 
 .PHONY: use-artd-full
@@ -437,16 +438,18 @@
 	adb shell setprop dalvik.vm.dex2oat-filter \"\"
 	adb shell setprop dalvik.vm.image-dex2oat-filter \"\"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libartd.so
+	adb shell setprop dalvik.vm.usejit false
 	adb shell start
 
-.PHONY: use-art-verify-at-runtime
-use-art-verify-at-runtime:
+.PHONY: use-art-jit
+use-art-jit:
 	adb root
 	adb wait-for-device shell stop
 	adb shell rm -rf $(ART_TARGET_DALVIK_CACHE_DIR)/*
 	adb shell setprop dalvik.vm.dex2oat-filter "verify-at-runtime"
 	adb shell setprop dalvik.vm.image-dex2oat-filter "verify-at-runtime"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
+	adb shell setprop dalvik.vm.usejit true
 	adb shell start
 
 .PHONY: use-art-interpret-only
@@ -457,6 +460,7 @@
 	adb shell setprop dalvik.vm.dex2oat-filter "interpret-only"
 	adb shell setprop dalvik.vm.image-dex2oat-filter "interpret-only"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
+	adb shell setprop dalvik.vm.usejit false
 	adb shell start
 
 .PHONY: use-artd-interpret-only
@@ -467,6 +471,7 @@
 	adb shell setprop dalvik.vm.dex2oat-filter "interpret-only"
 	adb shell setprop dalvik.vm.image-dex2oat-filter "interpret-only"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libartd.so
+	adb shell setprop dalvik.vm.usejit false
 	adb shell start
 
 .PHONY: use-art-verify-none
@@ -477,6 +482,7 @@
 	adb shell setprop dalvik.vm.dex2oat-filter "verify-none"
 	adb shell setprop dalvik.vm.image-dex2oat-filter "verify-none"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
+	adb shell setprop dalvik.vm.usejit false
 	adb shell start
 
 ########################################################################
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index ee0cb09..5d4feb8 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -60,6 +60,11 @@
 $(info Enabling ART_BUILD_HOST_STATIC)
 endif
 
+ifeq ($(ART_TEST_DEBUG_GC),true)
+  ART_DEFAULT_GC_TYPE := SS
+  ART_USE_TLAB := true
+endif
+
 #
 # Used to enable JIT
 #
@@ -266,10 +271,10 @@
   # Larger frame-size for host clang builds today
   ifneq ($(ART_COVERAGE),true)
     ifneq ($(NATIVE_COVERAGE),true)
-      ifndef SANITIZE_HOST
-        art_host_non_debug_cflags += -Wframe-larger-than=2700
-      endif
-      ifndef SANITIZE_TARGET
+      art_host_non_debug_cflags += -Wframe-larger-than=2700
+      ifdef SANITIZE_TARGET
+        art_target_non_debug_cflags += -Wframe-larger-than=5450
+      else
         art_target_non_debug_cflags += -Wframe-larger-than=1728
       endif
     endif
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 62d8a69..1523383 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -18,7 +18,7 @@
 #define ART_COMPILER_IMAGE_WRITER_H_
 
 #include <stdint.h>
-#include <valgrind.h>
+#include "base/memory_tool.h"
 
 #include <cstddef>
 #include <memory>
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index a057a4c..b4aa286 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -242,8 +242,10 @@
   };
 
   constexpr uint32_t max_positive_disp = 16 * MB - 2u + 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(method1_code, method1_patches,
-                                            kNopCode, ArrayRef<const LinkerPatch>(),
+  bool thunk_in_gap = Create2MethodsWithGap(method1_code,
+                                            ArrayRef<const LinkerPatch>(method1_patches),
+                                            kNopCode,
+                                            ArrayRef<const LinkerPatch>(),
                                             bl_offset_in_method1 + max_positive_disp);
   ASSERT_FALSE(thunk_in_gap);  // There should be no thunk.
 
@@ -262,8 +264,10 @@
   };
 
   constexpr uint32_t just_over_max_negative_disp = 16 * MB - 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
-                                            method3_code, method3_patches,
+  bool thunk_in_gap = Create2MethodsWithGap(kNopCode,
+                                            ArrayRef<const LinkerPatch>(),
+                                            method3_code,
+                                            ArrayRef<const LinkerPatch>(method3_patches),
                                             just_over_max_negative_disp - bl_offset_in_method3);
   ASSERT_FALSE(thunk_in_gap);  // There should be no thunk.
 
@@ -282,8 +286,10 @@
   };
 
   constexpr uint32_t just_over_max_positive_disp = 16 * MB + 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(method1_code, method1_patches,
-                                            kNopCode, ArrayRef<const LinkerPatch>(),
+  bool thunk_in_gap = Create2MethodsWithGap(method1_code,
+                                            ArrayRef<const LinkerPatch>(method1_patches),
+                                            kNopCode,
+                                            ArrayRef<const LinkerPatch>(),
                                             bl_offset_in_method1 + just_over_max_positive_disp);
   ASSERT_TRUE(thunk_in_gap);
 
@@ -311,8 +317,10 @@
   };
 
   constexpr uint32_t just_over_max_negative_disp = 16 * MB + 2 - 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
-                                            method3_code, method3_patches,
+  bool thunk_in_gap = Create2MethodsWithGap(kNopCode,
+                                            ArrayRef<const LinkerPatch>(),
+                                            method3_code,
+                                            ArrayRef<const LinkerPatch>(method3_patches),
                                             just_over_max_negative_disp - bl_offset_in_method3);
   ASSERT_FALSE(thunk_in_gap);  // There should be a thunk but it should be after the method2.
 
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
index 21f9367..1bad8a9 100644
--- a/compiler/linker/arm64/relative_patcher_arm64_test.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -396,8 +396,10 @@
   };
 
   constexpr uint32_t max_positive_disp = 128 * MB - 4u;
-  uint32_t last_method_idx = Create2MethodsWithGap(method1_code, method1_patches,
-                                                   kNopCode, ArrayRef<const LinkerPatch>(),
+  uint32_t last_method_idx = Create2MethodsWithGap(method1_code,
+                                                   ArrayRef<const LinkerPatch>(method1_patches),
+                                                   kNopCode,
+                                                   ArrayRef<const LinkerPatch>(),
                                                    bl_offset_in_method1 + max_positive_disp);
   ASSERT_EQ(expected_last_method_idx, last_method_idx);
 
@@ -420,8 +422,10 @@
   };
 
   constexpr uint32_t max_negative_disp = 128 * MB;
-  uint32_t last_method_idx = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
-                                                   last_method_code, last_method_patches,
+  uint32_t last_method_idx = Create2MethodsWithGap(kNopCode,
+                                                   ArrayRef<const LinkerPatch>(),
+                                                   last_method_code,
+                                                   ArrayRef<const LinkerPatch>(last_method_patches),
                                                    max_negative_disp - bl_offset_in_last_method);
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t last_method_offset = GetMethodOffset(last_method_idx);
@@ -445,7 +449,10 @@
 
   constexpr uint32_t just_over_max_positive_disp = 128 * MB;
   uint32_t last_method_idx = Create2MethodsWithGap(
-      method1_code, method1_patches, kNopCode, ArrayRef<const LinkerPatch>(),
+      method1_code,
+      ArrayRef<const LinkerPatch>(method1_patches),
+      kNopCode,
+      ArrayRef<const LinkerPatch>(),
       bl_offset_in_method1 + just_over_max_positive_disp);
   ASSERT_EQ(expected_last_method_idx, last_method_idx);
 
@@ -474,7 +481,8 @@
 
   constexpr uint32_t just_over_max_negative_disp = 128 * MB + 4;
   uint32_t last_method_idx = Create2MethodsWithGap(
-      kNopCode, ArrayRef<const LinkerPatch>(), last_method_code, last_method_patches,
+      kNopCode, ArrayRef<const LinkerPatch>(), last_method_code,
+      ArrayRef<const LinkerPatch>(last_method_patches),
       just_over_max_negative_disp - bl_offset_in_last_method);
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t last_method_offset = GetMethodOffset(last_method_idx);
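
Reviewer note: the repeated wrapping of `method*_patches` in `ArrayRef<const LinkerPatch>(...)` throughout these linker tests is forced by the `compiler/utils/array_ref.h` change later in this patch, which marks ArrayRef's converting constructors `explicit`. A minimal sketch of the call-site effect, using a simplified stand-in class (an illustration only, not the real art::ArrayRef):

    #include <cstddef>

    // Simplified stand-in for art::ArrayRef, just to show the call-site impact
    // of making the array constructor explicit.
    template <typename T>
    class ArrayRef {
     public:
      template <size_t size>
      explicit constexpr ArrayRef(T (&array)[size]) : array_(array), size_(size) {}

     private:
      T* array_;
      size_t size_;
    };

    void ExpectPatches(ArrayRef<const int> patches) { (void)patches; }

    int main() {
      const int method1_patches[] = {1, 2, 3};
      // ExpectPatches(method1_patches);                    // no longer compiles: constructor is explicit
      ExpectPatches(ArrayRef<const int>(method1_patches));  // call sites must wrap, as in the tests above
      return 0;
    }
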
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 069c9e1..11de4ee 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -77,10 +77,9 @@
     case kCondLE: return le;
     case kCondGT: return gt;
     case kCondGE: return ge;
-    default:
-      LOG(FATAL) << "Unknown if condition";
   }
-  return nv;  // Unreachable.
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
 }
 
 Location ARM64ReturnLocation(Primitive::Type return_type) {
@@ -1653,6 +1652,11 @@
   GenerateClassInitializationCheck(slow_path, InputRegisterAt(check, 0));
 }
 
+static bool IsFloatingPointZeroConstant(HInstruction* instruction) {
+  return (instruction->IsFloatConstant() && (instruction->AsFloatConstant()->GetValue() == 0.0f))
+      || (instruction->IsDoubleConstant() && (instruction->AsDoubleConstant()->GetValue() == 0.0));
+}
+
 void LocationsBuilderARM64::VisitCompare(HCompare* compare) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(compare, LocationSummary::kNoCall);
@@ -1667,13 +1671,10 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      HInstruction* right = compare->InputAt(1);
-      if ((right->IsFloatConstant() && (right->AsFloatConstant()->GetValue() == 0.0f)) ||
-          (right->IsDoubleConstant() && (right->AsDoubleConstant()->GetValue() == 0.0))) {
-        locations->SetInAt(1, Location::ConstantLocation(right->AsConstant()));
-      } else {
-        locations->SetInAt(1, Location::RequiresFpuRegister());
-      }
+      locations->SetInAt(1,
+                         IsFloatingPointZeroConstant(compare->InputAt(1))
+                             ? Location::ConstantLocation(compare->InputAt(1)->AsConstant())
+                             : Location::RequiresFpuRegister());
       locations->SetOut(Location::RequiresRegister());
       break;
     }
@@ -1704,12 +1705,8 @@
       Register result = OutputRegister(compare);
       FPRegister left = InputFPRegisterAt(compare, 0);
       if (compare->GetLocations()->InAt(1).IsConstant()) {
-        if (kIsDebugBuild) {
-          HInstruction* right = compare->GetLocations()->InAt(1).GetConstant();
-          DCHECK((right->IsFloatConstant() && (right->AsFloatConstant()->GetValue() == 0.0f)) ||
-                  (right->IsDoubleConstant() && (right->AsDoubleConstant()->GetValue() == 0.0)));
-        }
-        // 0.0 is the only immediate that can be encoded directly in a FCMP instruction.
+        DCHECK(IsFloatingPointZeroConstant(compare->GetLocations()->InAt(1).GetConstant()));
+        // 0.0 is the only immediate that can be encoded directly in an FCMP instruction.
         __ Fcmp(left, 0.0);
       } else {
         __ Fcmp(left, InputFPRegisterAt(compare, 1));
@@ -1729,8 +1726,19 @@
 
 void LocationsBuilderARM64::VisitCondition(HCondition* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, ARM64EncodableConstantOrRegister(instruction->InputAt(1), instruction));
+
+  if (Primitive::IsFloatingPointType(instruction->InputAt(0)->GetType())) {
+    locations->SetInAt(0, Location::RequiresFpuRegister());
+    locations->SetInAt(1,
+                       IsFloatingPointZeroConstant(instruction->InputAt(1))
+                           ? Location::ConstantLocation(instruction->InputAt(1)->AsConstant())
+                           : Location::RequiresFpuRegister());
+  } else {
+    // Integer cases.
+    locations->SetInAt(0, Location::RequiresRegister());
+    locations->SetInAt(1, ARM64EncodableConstantOrRegister(instruction->InputAt(1), instruction));
+  }
+
   if (instruction->NeedsMaterialization()) {
     locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
   }
@@ -1742,13 +1750,34 @@
   }
 
   LocationSummary* locations = instruction->GetLocations();
-  Register lhs = InputRegisterAt(instruction, 0);
-  Operand rhs = InputOperandAt(instruction, 1);
   Register res = RegisterFrom(locations->Out(), instruction->GetType());
-  Condition cond = ARM64Condition(instruction->GetCondition());
+  IfCondition if_cond = instruction->GetCondition();
+  Condition arm64_cond = ARM64Condition(if_cond);
 
-  __ Cmp(lhs, rhs);
-  __ Cset(res, cond);
+  if (Primitive::IsFloatingPointType(instruction->InputAt(0)->GetType())) {
+    FPRegister lhs = InputFPRegisterAt(instruction, 0);
+    if (locations->InAt(1).IsConstant()) {
+      DCHECK(IsFloatingPointZeroConstant(locations->InAt(1).GetConstant()));
+      // 0.0 is the only immediate that can be encoded directly in an FCMP instruction.
+      __ Fcmp(lhs, 0.0);
+    } else {
+      __ Fcmp(lhs, InputFPRegisterAt(instruction, 1));
+    }
+    __ Cset(res, arm64_cond);
+    if (instruction->IsFPConditionTrueIfNaN()) {
+      // res = IsUnordered(arm64_cond) ? 1 : res  <=>  res = IsNotUnordered(arm64_cond) ? res : 1
+      __ Csel(res, res, Operand(1), vc);  // VC for "not unordered".
+    } else if (instruction->IsFPConditionFalseIfNaN()) {
+      // res = IsUnordered(arm64_cond) ? 0 : res  <=>  res = IsNotUnordered(arm64_cond) ? res : 0
+      __ Csel(res, res, Operand(0), vc);  // VC for "not unordered".
+    }
+  } else {
+    // Integer cases.
+    Register lhs = InputRegisterAt(instruction, 0);
+    Operand rhs = InputOperandAt(instruction, 1);
+    __ Cmp(lhs, rhs);
+    __ Cset(res, arm64_cond);
+  }
 }
 
 #define FOR_EACH_CONDITION_INSTRUCTION(M)                                                \
@@ -2080,33 +2109,58 @@
   } else {
     // The condition instruction has not been materialized, use its inputs as
     // the comparison and its condition as the branch condition.
-    Register lhs = InputRegisterAt(condition, 0);
-    Operand rhs = InputOperandAt(condition, 1);
-    Condition arm64_cond = ARM64Condition(condition->GetCondition());
-    if ((arm64_cond != gt && arm64_cond != le) && rhs.IsImmediate() && (rhs.immediate() == 0)) {
-      switch (arm64_cond) {
-        case eq:
-          __ Cbz(lhs, true_target);
-          break;
-        case ne:
-          __ Cbnz(lhs, true_target);
-          break;
-        case lt:
-          // Test the sign bit and branch accordingly.
-          __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
-          break;
-        case ge:
-          // Test the sign bit and branch accordingly.
-          __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
-          break;
-        default:
-          // Without the `static_cast` the compiler throws an error for
-          // `-Werror=sign-promo`.
-          LOG(FATAL) << "Unexpected condition: " << static_cast<int>(arm64_cond);
+    Primitive::Type type =
+        cond->IsCondition() ? cond->InputAt(0)->GetType() : Primitive::kPrimInt;
+
+    if (Primitive::IsFloatingPointType(type)) {
+      // FP compares don't like null false_targets.
+      if (false_target == nullptr) {
+        false_target = codegen_->GetLabelOf(instruction->AsIf()->IfFalseSuccessor());
       }
+      FPRegister lhs = InputFPRegisterAt(condition, 0);
+      if (condition->GetLocations()->InAt(1).IsConstant()) {
+        DCHECK(IsFloatingPointZeroConstant(condition->GetLocations()->InAt(1).GetConstant()));
+        // 0.0 is the only immediate that can be encoded directly in an FCMP instruction.
+        __ Fcmp(lhs, 0.0);
+      } else {
+        __ Fcmp(lhs, InputFPRegisterAt(condition, 1));
+      }
+      if (condition->IsFPConditionTrueIfNaN()) {
+        __ B(vs, true_target);  // VS for unordered.
+      } else if (condition->IsFPConditionFalseIfNaN()) {
+        __ B(vs, false_target);  // VS for unordered.
+      }
+      __ B(ARM64Condition(condition->GetCondition()), true_target);
     } else {
-      __ Cmp(lhs, rhs);
-      __ B(arm64_cond, true_target);
+      // Integer cases.
+      Register lhs = InputRegisterAt(condition, 0);
+      Operand rhs = InputOperandAt(condition, 1);
+      Condition arm64_cond = ARM64Condition(condition->GetCondition());
+      if ((arm64_cond != gt && arm64_cond != le) && rhs.IsImmediate() && (rhs.immediate() == 0)) {
+        switch (arm64_cond) {
+          case eq:
+            __ Cbz(lhs, true_target);
+            break;
+          case ne:
+            __ Cbnz(lhs, true_target);
+            break;
+          case lt:
+            // Test the sign bit and branch accordingly.
+            __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
+            break;
+          case ge:
+            // Test the sign bit and branch accordingly.
+            __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
+            break;
+          default:
+            // Without the `static_cast` the compiler throws an error for
+            // `-Werror=sign-promo`.
+            LOG(FATAL) << "Unexpected condition: " << static_cast<int>(arm64_cond);
+        }
+      } else {
+        __ Cmp(lhs, rhs);
+        __ B(arm64_cond, true_target);
+      }
     }
   }
   if (false_target != nullptr) {
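
Reviewer note: the new floating-point paths above hinge on how unordered (NaN) comparisons must behave. A minimal standalone illustration (not ART code) of why the `IsFPConditionTrueIfNaN` / `IsFPConditionFalseIfNaN` fix-ups are needed after `Fcmp`/`Cset`: on unordered inputs some conditions would otherwise materialize the wrong value, so a `Csel` keyed on the V flag forces the result to the required 0 or 1.

    #include <cmath>
    #include <cstdio>

    int main() {
      double nan = std::nan("");
      // Both a condition and its inverse must be false when an operand is NaN,
      // which a single flag-based Cset cannot deliver for every condition code.
      std::printf("%d %d\n", nan > 0.0, nan <= 0.0);  // prints "0 0"
      return 0;
    }
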
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index c86d797..b30b6c7 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -518,10 +518,10 @@
 void InstructionSimplifierVisitor::VisitCondition(HCondition* condition) {
   // Try to fold an HCompare into this HCondition.
 
-  // This simplification is currently only supported on x86, x86_64 and ARM.
-  // TODO: Implement it for ARM64 and MIPS64.
+  // This simplification is currently supported on x86, x86_64, ARM and ARM64.
+  // TODO: Implement it for MIPS64.
   InstructionSet instruction_set = GetGraph()->GetInstructionSet();
-  if (instruction_set != kX86 && instruction_set != kX86_64 && instruction_set != kThumb2) {
+  if (instruction_set == kMips64) {
     return;
   }
 
diff --git a/compiler/utils/array_ref.h b/compiler/utils/array_ref.h
index ff5a77c..303e0d5 100644
--- a/compiler/utils/array_ref.h
+++ b/compiler/utils/array_ref.h
@@ -62,14 +62,14 @@
   }
 
   template <size_t size>
-  constexpr ArrayRef(T (&array)[size])
+  explicit constexpr ArrayRef(T (&array)[size])
     : array_(array), size_(size) {
   }
 
   template <typename U, size_t size>
-  constexpr ArrayRef(U (&array)[size],
-                     typename std::enable_if<std::is_same<T, const U>::value, tag>::type
-                         t ATTRIBUTE_UNUSED = tag())
+  explicit constexpr ArrayRef(U (&array)[size],
+                              typename std::enable_if<std::is_same<T, const U>::value, tag>::type
+                                  t ATTRIBUTE_UNUSED = tag())
     : array_(array), size_(size) {
   }
 
@@ -83,9 +83,9 @@
   }
 
   template <typename U, typename Alloc>
-  ArrayRef(const std::vector<U, Alloc>& v,
-           typename std::enable_if<std::is_same<T, const U>::value, tag>::type
-               t ATTRIBUTE_UNUSED = tag())
+  explicit ArrayRef(const std::vector<U, Alloc>& v,
+                    typename std::enable_if<std::is_same<T, const U>::value, tag>::type
+                        t ATTRIBUTE_UNUSED = tag())
       : array_(v.data()), size_(v.size()) {
   }
 
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index a4e74d4..74ec2ed 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -18,7 +18,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <valgrind.h>
+#include "base/memory_tool.h"
 
 #include <fstream>
 #include <iostream>
@@ -519,7 +519,7 @@
     // the runtime.
     LogCompletionTime();
 
-    if (kIsDebugBuild || (RUNNING_ON_VALGRIND != 0)) {
+    if (kIsDebugBuild || (RUNNING_ON_MEMORY_TOOL && kMemoryToolDetectsLeaks)) {
       delete runtime_;  // See field declaration for why this is manual.
     }
   }
@@ -2003,7 +2003,7 @@
   // Everything was done, do an explicit exit here to avoid running Runtime destructors that take
   // time (bug 10645725) unless we're a debug build or running on valgrind. Note: The Dex2Oat class
   // should not destruct the runtime in this case.
-  if (!art::kIsDebugBuild && (RUNNING_ON_VALGRIND == 0)) {
+  if (!art::kIsDebugBuild && (RUNNING_ON_MEMORY_TOOL == 0)) {
     exit(result);
   }
   return result;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index a7826a7..068f458 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -325,23 +325,25 @@
      * On success this wrapper will restore arguments and *jump* to the target, leaving the lr
      * pointing back to the original caller.
      */
-.macro INVOKE_TRAMPOLINE c_name, cxx_name
+.macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
-ENTRY \c_name
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME r2, r3  @ save callee saves in case allocation triggers GC
     mov    r2, r9                         @ pass Thread::Current
     mov    r3, sp
-    bl     \cxx_name                      @ (method_idx, this, caller, Thread*, SP)
+    bl     \cxx_name                      @ (method_idx, this, Thread*, SP)
     mov    r12, r1                        @ save Method*->code_
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     cbz    r0, 1f                         @ did we find the target? if not go to exception delivery
     bx     r12                            @ tail call to target
 1:
     DELIVER_PENDING_EXCEPTION
+.endm
+.macro INVOKE_TRAMPOLINE c_name, cxx_name
+ENTRY \c_name
+    INVOKE_TRAMPOLINE_BODY \cxx_name
 END \c_name
 .endm
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -894,7 +896,7 @@
      */
 ENTRY art_quick_imt_conflict_trampoline
     mov    r0, r12
-    b art_quick_invoke_interface_trampoline
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
     .extern artQuickResolutionTrampoline
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 614936b..6d9b44a 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -450,9 +450,8 @@
      *
      * Clobbers xIP0.
      */
-.macro INVOKE_TRAMPOLINE c_name, cxx_name
+.macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
-ENTRY \c_name
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME  // save callee saves in case allocation triggers GC
     // Helper signature is always
     // (method_idx, *this_object, *caller_method, *self, sp)
@@ -466,10 +465,13 @@
     br     xIP0                           // tail call to target
 1:
     DELIVER_PENDING_EXCEPTION
+.endm
+.macro INVOKE_TRAMPOLINE c_name, cxx_name
+ENTRY \c_name
+    INVOKE_TRAMPOLINE_BODY \cxx_name
 END \c_name
 .endm
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -1429,9 +1431,10 @@
      * Called to resolve an imt conflict. xIP1 is a hidden argument that holds the target method's
      * dex method index.
      */
+    .extern artInvokeInterfaceTrampoline
 ENTRY art_quick_imt_conflict_trampoline
     mov    x0, xIP1
-    b art_quick_invoke_interface_trampoline
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
 ENTRY art_quick_resolution_trampoline
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index cc1de43..2819f92 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -459,9 +459,8 @@
      * On success this wrapper will restore arguments and *jump* to the target, leaving the lr
      * pointing back to the original caller.
      */
-.macro INVOKE_TRAMPOLINE c_name, cxx_name
+.macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
-ENTRY \c_name
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME  # save callee saves in case allocation triggers GC
     move  $a2, rSELF                       # pass Thread::Current
     jal   \cxx_name                        # (method_idx, this, Thread*, $sp)
@@ -474,10 +473,13 @@
     nop
 1:
     DELIVER_PENDING_EXCEPTION
+.endm
+.macro INVOKE_TRAMPOLINE c_name, cxx_name
+ENTRY \c_name
+    INVOKE_TRAMPOLINE_BODY \cxx_name
 END \c_name
 .endm
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -1103,9 +1105,8 @@
      * dex method index.
      */
 ENTRY art_quick_imt_conflict_trampoline
-    la      $t9, art_quick_invoke_interface_trampoline
-    jalr    $zero, $t9
     move    $a0, $t0
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
     .extern artQuickResolutionTrampoline
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 37c6c5b..abca70b 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -525,9 +525,8 @@
      * On success this wrapper will restore arguments and *jump* to the target, leaving the ra
      * pointing back to the original caller.
      */
-.macro INVOKE_TRAMPOLINE c_name, cxx_name
+.macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
-ENTRY \c_name
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME  # save callee saves in case allocation triggers GC
     move  $a2, rSELF                       # pass Thread::Current
     jal   \cxx_name                        # (method_idx, this, Thread*, $sp)
@@ -541,10 +540,13 @@
     nop
 1:
     DELIVER_PENDING_EXCEPTION
+.endm
+.macro INVOKE_TRAMPOLINE c_name, cxx_name
+ENTRY \c_name
+    INVOKE_TRAMPOLINE_BODY \cxx_name
 END \c_name
 .endm
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -1369,10 +1371,8 @@
      * dex method index.
      */
 ENTRY art_quick_imt_conflict_trampoline
-    dla     $t9, art_quick_invoke_interface_trampoline
-    .cpreturn
-    jalr    $zero, $t9
     move    $a0, $t0
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
     .extern artQuickResolutionTrampoline
diff --git a/runtime/arch/x86/jni_entrypoints_x86.S b/runtime/arch/x86/jni_entrypoints_x86.S
index 5d27e47..aca5a37 100644
--- a/runtime/arch/x86/jni_entrypoints_x86.S
+++ b/runtime/arch/x86/jni_entrypoints_x86.S
@@ -23,6 +23,7 @@
     subl LITERAL(8), %esp         // align stack
     CFI_ADJUST_CFA_OFFSET(8)
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
     call SYMBOL(artFindNativeMethod)  // (Thread*)
     addl LITERAL(12), %esp        // remove argument & padding
     CFI_ADJUST_CFA_OFFSET(-12)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index c9bc977..7086b5b 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -293,8 +293,7 @@
      * On success this wrapper will restore arguments and *jump* to the target, leaving the lr
      * pointing back to the original caller.
      */
-MACRO2(INVOKE_TRAMPOLINE, c_name, cxx_name)
-    DEFINE_FUNCTION RAW_VAR(c_name, 0)
+MACRO1(INVOKE_TRAMPOLINE_BODY, cxx_name)
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME ebx, ebx
     movl %esp, %edx  // remember SP
 
@@ -304,7 +303,7 @@
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH ecx                      // pass arg2
     PUSH eax                      // pass arg1
-    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
+    call VAR(cxx_name, 0)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
     movl %edx, %edi               // save code pointer in EDI
     addl MACRO_LITERAL(20), %esp  // Pop arguments skip eax
     CFI_ADJUST_CFA_OFFSET(-20)
@@ -334,10 +333,13 @@
     addl MACRO_LITERAL(4), %esp   // Pop code pointer off stack
     CFI_ADJUST_CFA_OFFSET(-4)
     DELIVER_PENDING_EXCEPTION
+END_MACRO
+MACRO2(INVOKE_TRAMPOLINE, c_name, cxx_name)
+    DEFINE_FUNCTION RAW_VAR(c_name, 0)
+    INVOKE_TRAMPOLINE_BODY RAW_VAR(cxx_name, 1)
     END_FUNCTION RAW_VAR(c_name, 0)
 END_MACRO
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -1111,7 +1113,7 @@
     POP eax                       // pop arguments
     POP ecx
     addl LITERAL(4), %esp
-    CFI_ADJUST_CFA_OFFSET(-12)
+    CFI_ADJUST_CFA_OFFSET(-4)
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  ebx, ebx  // save all registers as basis for long jump context
     // Outgoing argument set up
     PUSH eax                      // alignment padding
@@ -1183,8 +1185,8 @@
     PUSH eax
 #else
     pushl MIRROR_OBJECT_CLASS_OFFSET(%edx)  // pass arg2 - type of the value to be stored
-#endif
     CFI_ADJUST_CFA_OFFSET(4)
+#endif
     PUSH ebx                      // pass arg1 - component type of the array
     call SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
     addl LITERAL(16), %esp        // pop arguments
@@ -1415,7 +1417,7 @@
      */
 DEFINE_FUNCTION art_quick_imt_conflict_trampoline
     movd %xmm7, %eax              // get target method index stored in xmm7
-    jmp SYMBOL(art_quick_invoke_interface_trampoline)
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END_FUNCTION art_quick_imt_conflict_trampoline
 
 DEFINE_FUNCTION art_quick_resolution_trampoline
@@ -1429,6 +1431,7 @@
     call SYMBOL(artQuickResolutionTrampoline) // (Method* called, receiver, Thread*, SP)
     movl %eax, %edi               // remember code pointer in EDI
     addl LITERAL(16), %esp        // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
     test %eax, %eax               // if code pointer is null goto deliver pending exception
     jz 1f
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME_AND_JUMP
@@ -1559,6 +1562,7 @@
     PUSH eax                      // Pass Method*.
     call SYMBOL(artInstrumentationMethodEntryFromCode) // (Method*, Object*, Thread*, LR)
     addl LITERAL(28), %esp        // Pop arguments upto saved Method*.
+    CFI_ADJUST_CFA_OFFSET(-28)
     movl 60(%esp), %edi           // Restore edi.
     movl %eax, 60(%esp)           // Place code* over edi, just under return pc.
     movl SYMBOL(art_quick_instrumentation_exit)@GOT(%ebx), %ebx
@@ -1578,11 +1582,13 @@
     movl 52(%esp), %ebp           // Restore ebp.
     movl 56(%esp), %esi           // Restore esi.
     addl LITERAL(60), %esp        // Wind stack back upto code*.
+    CFI_ADJUST_CFA_OFFSET(-60)
     ret                           // Call method (and pop).
 END_FUNCTION art_quick_instrumentation_entry
 
 DEFINE_FUNCTION art_quick_instrumentation_exit
     pushl LITERAL(0)              // Push a fake return PC as there will be none on the stack.
+    CFI_ADJUST_CFA_OFFSET(4)
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx
     mov  %esp, %ecx               // Remember SP
     subl LITERAL(8), %esp         // Save float return value.
@@ -1611,6 +1617,7 @@
     CFI_ADJUST_CFA_OFFSET(-8)
     RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
     addl LITERAL(4), %esp         // Remove fake return pc.
+    CFI_ADJUST_CFA_OFFSET(-4)
     jmp   *%ecx                   // Return.
 END_FUNCTION art_quick_instrumentation_exit
 
@@ -1619,7 +1626,7 @@
      * will long jump to the upcall with a special exception of -1.
      */
 DEFINE_FUNCTION art_quick_deoptimize
-    pushl %ebx                    // Entry point for a jump. Fake that we were called.
+    PUSH ebx                      // Entry point for a jump. Fake that we were called.
 .globl SYMBOL(art_quick_deoptimize_from_compiled_slow_path)  // Entry point for real calls
                                                              // from compiled slow paths.
 SYMBOL(art_quick_deoptimize_from_compiled_slow_path):
@@ -1682,8 +1689,8 @@
 DEFINE_FUNCTION art_nested_signal_return
     SETUP_GOT_NOSAVE ebx            // sets %ebx for call into PLT
     movl LITERAL(1), %ecx
-    pushl %ecx                      // second arg to longjmp (1)
-    pushl %eax                      // first arg to longjmp (jmp_buf)
+    PUSH ecx                        // second arg to longjmp (1)
+    PUSH eax                        // first arg to longjmp (jmp_buf)
     call PLT_SYMBOL(longjmp)
     int3                            // won't get here.
 END_FUNCTION art_nested_signal_return
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 7d86c3a..ad89bca 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -353,8 +353,7 @@
      *
      * Adapted from x86 code.
      */
-MACRO2(INVOKE_TRAMPOLINE, c_name, cxx_name)
-    DEFINE_FUNCTION VAR(c_name, 0)
+MACRO1(INVOKE_TRAMPOLINE_BODY, cxx_name)
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME  // save callee saves in case allocation triggers GC
     // Helper signature is always
     // (method_idx, *this_object, *caller_method, *self, sp)
@@ -362,7 +361,7 @@
     movq %gs:THREAD_SELF_OFFSET, %rdx                      // pass Thread
     movq %rsp, %rcx                                        // pass SP
 
-    call VAR(cxx_name, 1)                   // cxx_name(arg1, arg2, Thread*, SP)
+    call VAR(cxx_name, 0)                   // cxx_name(arg1, arg2, Thread*, SP)
                                                            // save the code pointer
     movq %rax, %rdi
     movq %rdx, %rax
@@ -375,10 +374,13 @@
     jmp *%rax
 1:
     DELIVER_PENDING_EXCEPTION
-    END_FUNCTION VAR(c_name, 0)
+END_MACRO
+MACRO2(INVOKE_TRAMPOLINE, c_name, cxx_name)
+    DEFINE_FUNCTION RAW_VAR(c_name, 0)
+    INVOKE_TRAMPOLINE_BODY RAW_VAR(cxx_name, 1)
+    END_FUNCTION RAW_VAR(c_name, 0)
 END_MACRO
 
-INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline, artInvokeInterfaceTrampoline
 INVOKE_TRAMPOLINE art_quick_invoke_interface_trampoline_with_access_check, artInvokeInterfaceTrampolineWithAccessCheck
 
 INVOKE_TRAMPOLINE art_quick_invoke_static_trampoline_with_access_check, artInvokeStaticTrampolineWithAccessCheck
@@ -1352,7 +1354,7 @@
     int3
 #else
     movq %rax, %rdi
-    jmp art_quick_invoke_interface_trampoline
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 #endif  // __APPLE__
 END_FUNCTION art_quick_imt_conflict_trampoline
 
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 20d75f3..b1d0841 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -89,7 +89,7 @@
             art::Thread::ThinLockIdOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.card_table.
-#define THREAD_CARD_TABLE_OFFSET 128
+#define THREAD_CARD_TABLE_OFFSET 136
 ADD_TEST_EQ(THREAD_CARD_TABLE_OFFSET,
             art::Thread::CardTableOffset<__SIZEOF_POINTER__>().Int32Value())
 
@@ -108,7 +108,7 @@
 ADD_TEST_EQ(THREAD_SELF_OFFSET,
             art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
 
-#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 147 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 150 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
             art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
 #define THREAD_LOCAL_END_OFFSET (THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__)
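
Reviewer note: as a quick check of the arithmetic these constants encode, evaluated under the assumption of an 8-byte pointer (illustration only; the ADD_TEST_EQ checks remain the authoritative validation):

    // Assuming __SIZEOF_POINTER__ == 8:
    static_assert(136 + 150 * 8 == 1336, "THREAD_LOCAL_POS_OFFSET under the new layout");
    static_assert(136 + 150 * 8 + 8 == 1344, "THREAD_LOCAL_END_OFFSET is one pointer beyond");
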
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index 8f2d94b..e5832e1 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -23,11 +23,11 @@
 #include "mem_map.h"
 #include "mutex.h"
 #include "thread-inl.h"
-#include <memcheck/memcheck.h>
+#include "base/memory_tool.h"
 
 namespace art {
 
-static constexpr size_t kValgrindRedZoneBytes = 8;
+static constexpr size_t kMemoryToolRedZoneBytes = 8;
 constexpr size_t Arena::kDefaultSize;
 
 template <bool kCount>
@@ -217,9 +217,9 @@
 }
 
 void ArenaPool::FreeArenaChain(Arena* first) {
-  if (UNLIKELY(RUNNING_ON_VALGRIND > 0)) {
+  if (UNLIKELY(RUNNING_ON_MEMORY_TOOL > 0)) {
     for (Arena* arena = first; arena != nullptr; arena = arena->next_) {
-      VALGRIND_MAKE_MEM_UNDEFINED(arena->memory_, arena->bytes_allocated_);
+      MEMORY_TOOL_MAKE_UNDEFINED(arena->memory_, arena->bytes_allocated_);
     }
   }
   if (first != nullptr) {
@@ -255,7 +255,7 @@
     end_(nullptr),
     ptr_(nullptr),
     arena_head_(nullptr),
-    running_on_valgrind_(RUNNING_ON_VALGRIND > 0) {
+    is_running_on_memory_tool_(RUNNING_ON_MEMORY_TOOL) {
 }
 
 void ArenaAllocator::UpdateBytesAllocated() {
@@ -267,7 +267,7 @@
 }
 
 void* ArenaAllocator::AllocValgrind(size_t bytes, ArenaAllocKind kind) {
-  size_t rounded_bytes = RoundUp(bytes + kValgrindRedZoneBytes, 8);
+  size_t rounded_bytes = RoundUp(bytes + kMemoryToolRedZoneBytes, 8);
   if (UNLIKELY(ptr_ + rounded_bytes > end_)) {
     // Obtain a new block.
     ObtainNewArenaForAllocation(rounded_bytes);
@@ -282,7 +282,7 @@
   for (uint8_t* ptr = ret; ptr < ptr_; ++ptr) {
     CHECK_EQ(*ptr, 0U);
   }
-  VALGRIND_MAKE_MEM_NOACCESS(ret + bytes, rounded_bytes - bytes);
+  MEMORY_TOOL_MAKE_NOACCESS(ret + bytes, rounded_bytes - bytes);
   return ret;
 }
 
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index d9723b5..d977941 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -207,7 +207,7 @@
 
   // Returns zeroed memory.
   void* Alloc(size_t bytes, ArenaAllocKind kind = kArenaAllocMisc) ALWAYS_INLINE {
-    if (UNLIKELY(running_on_valgrind_)) {
+    if (UNLIKELY(is_running_on_memory_tool_)) {
       return AllocValgrind(bytes, kind);
     }
     bytes = RoundUp(bytes, kAlignment);
@@ -280,7 +280,7 @@
   uint8_t* end_;
   uint8_t* ptr_;
   Arena* arena_head_;
-  bool running_on_valgrind_;
+  bool is_running_on_memory_tool_;
 
   template <typename U>
   friend class ArenaAllocatorAdapter;
diff --git a/runtime/base/memory_tool.h b/runtime/base/memory_tool.h
new file mode 100644
index 0000000..31162a3
--- /dev/null
+++ b/runtime/base/memory_tool.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_MEMORY_TOOL_H_
+#define ART_RUNTIME_BASE_MEMORY_TOOL_H_
+
+#include <stddef.h>
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+
+#if __has_feature(address_sanitizer)
+
+#include <sanitizer/asan_interface.h>
+#define ADDRESS_SANITIZER
+#define MEMORY_TOOL_MAKE_NOACCESS(p, s) __asan_poison_memory_region(p, s)
+#define MEMORY_TOOL_MAKE_UNDEFINED(p, s) __asan_unpoison_memory_region(p, s)
+#define MEMORY_TOOL_MAKE_DEFINED(p, s) __asan_unpoison_memory_region(p, s)
+#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
+#define RUNNING_ON_MEMORY_TOOL 1U
+constexpr bool kMemoryToolIsValgrind = false;
+constexpr bool kMemoryToolDetectsLeaks = true;
+constexpr bool kMemoryToolAddsRedzones = true;
+constexpr size_t kMemoryToolStackGuardSizeScale = 2;
+
+#else
+
+#include <valgrind.h>
+#include <memcheck/memcheck.h>
+#define MEMORY_TOOL_MAKE_NOACCESS(p, s) VALGRIND_MAKE_MEM_NOACCESS(p, s)
+#define MEMORY_TOOL_MAKE_UNDEFINED(p, s) VALGRIND_MAKE_MEM_UNDEFINED(p, s)
+#define MEMORY_TOOL_MAKE_DEFINED(p, s) VALGRIND_MAKE_MEM_DEFINED(p, s)
+#define ATTRIBUTE_NO_SANITIZE_ADDRESS
+#define RUNNING_ON_MEMORY_TOOL RUNNING_ON_VALGRIND
+constexpr bool kMemoryToolIsValgrind = true;
+constexpr bool kMemoryToolDetectsLeaks = true;
+constexpr bool kMemoryToolAddsRedzones = true;
+constexpr size_t kMemoryToolStackGuardSizeScale = 1;
+
+#endif
+
+#endif  // ART_RUNTIME_BASE_MEMORY_TOOL_H_
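
Reviewer note: a small usage sketch of the new header (the function and names here are hypothetical; the real consumers are the arena allocators changed below). The point of the abstraction is that the same call site works whether the build uses ASan, runs under Valgrind, or uses no memory tool at all.

    #include "base/memory_tool.h"
    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper mirroring how the arena allocators use these macros.
    void ProtectRedZone(uint8_t* ptr, size_t bytes, size_t rounded_bytes) {
      if (RUNNING_ON_MEMORY_TOOL > 0) {
        // ASan builds always poison the region; the default build only reaches
        // this when the process is actually running under Valgrind.
        MEMORY_TOOL_MAKE_NOACCESS(ptr + bytes, rounded_bytes - bytes);
      }
    }
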
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 87840e7..bd8de87 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -218,6 +218,16 @@
 #endif
 }
 
+inline void MutatorMutex::TransitionFromRunnableToSuspended(Thread* self) {
+  AssertSharedHeld(self);
+  RegisterAsUnlocked(self);
+}
+
+inline void MutatorMutex::TransitionFromSuspendedToRunnable(Thread* self) {
+  RegisterAsLocked(self);
+  AssertSharedHeld(self);
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_BASE_MUTEX_INL_H_
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 5c6065d..e48d170 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -47,7 +47,7 @@
 Mutex* Locks::logging_lock_ = nullptr;
 Mutex* Locks::mem_maps_lock_ = nullptr;
 Mutex* Locks::modify_ldt_lock_ = nullptr;
-ReaderWriterMutex* Locks::mutator_lock_ = nullptr;
+MutatorMutex* Locks::mutator_lock_ = nullptr;
 Mutex* Locks::profiler_lock_ = nullptr;
 Mutex* Locks::reference_processor_lock_ = nullptr;
 Mutex* Locks::reference_queue_cleared_references_lock_ = nullptr;
@@ -738,6 +738,11 @@
   return os;
 }
 
+std::ostream& operator<<(std::ostream& os, const MutatorMutex& mu) {
+  mu.Dump(os);
+  return os;
+}
+
 ConditionVariable::ConditionVariable(const char* name, Mutex& guard)
     : name_(name), guard_(guard) {
 #if ART_USE_FUTEXES
@@ -958,7 +963,7 @@
 
     UPDATE_CURRENT_LOCK_LEVEL(kMutatorLock);
     DCHECK(mutator_lock_ == nullptr);
-    mutator_lock_ = new ReaderWriterMutex("mutator lock", current_lock_level);
+    mutator_lock_ = new MutatorMutex("mutator lock", current_lock_level);
 
     UPDATE_CURRENT_LOCK_LEVEL(kHeapBitmapLock);
     DCHECK(heap_bitmap_lock_ == nullptr);
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 678d55b..f87467a 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -44,6 +44,7 @@
 namespace art {
 
 class LOCKABLE ReaderWriterMutex;
+class LOCKABLE MutatorMutex;
 class ScopedContentionRecorder;
 class Thread;
 
@@ -138,6 +139,7 @@
 
   virtual bool IsMutex() const { return false; }
   virtual bool IsReaderWriterMutex() const { return false; }
+  virtual bool IsMutatorMutex() const { return false; }
 
   virtual void Dump(std::ostream& os) const = 0;
 
@@ -385,6 +387,36 @@
   DISALLOW_COPY_AND_ASSIGN(ReaderWriterMutex);
 };
 
+// MutatorMutex is a special kind of ReaderWriterMutex created specifically for the
+// Locks::mutator_lock_ mutex. The behaviour is identical to the ReaderWriterMutex except that
+// thread state changes also play a part in lock ownership. The mutator_lock_ will not be truly
+// held by any mutator threads. However, a thread in the kRunnable state is considered to have
+// shared ownership of the mutator lock and therefore transitions in and out of the kRunnable
+// state have associated implications on lock ownership. Extra methods to handle the state
+// transitions have been added to the interface but are only accessible to the methods dealing
+// with state transitions. The thread state and flags attributes are used to ensure thread state
+// transitions are consistent with the permitted behaviour of the mutex.
+//
+// *) The most important consequence of this behaviour is that all threads must be in one of the
+// suspended states before exclusive ownership of the mutator mutex is sought.
+//
+std::ostream& operator<<(std::ostream& os, const MutatorMutex& mu);
+class LOCKABLE MutatorMutex : public ReaderWriterMutex {
+ public:
+  explicit MutatorMutex(const char* name, LockLevel level = kDefaultMutexLevel)
+    : ReaderWriterMutex(name, level) {}
+  ~MutatorMutex() {}
+
+  virtual bool IsMutatorMutex() const { return true; }
+
+ private:
+  friend class Thread;
+  void TransitionFromRunnableToSuspended(Thread* self) UNLOCK_FUNCTION() ALWAYS_INLINE;
+  void TransitionFromSuspendedToRunnable(Thread* self) SHARED_LOCK_FUNCTION() ALWAYS_INLINE;
+
+  DISALLOW_COPY_AND_ASSIGN(MutatorMutex);
+};
+
 // ConditionVariables allow threads to queue and sleep. Threads may then be resumed individually
 // (Signal) or all at once (Broadcast).
 class ConditionVariable {
@@ -495,35 +527,28 @@
   // Guards allocation entrypoint instrumenting.
   static Mutex* instrument_entrypoints_lock_;
 
-  // The mutator_lock_ is used to allow mutators to execute in a shared (reader) mode or to block
-  // mutators by having an exclusive (writer) owner. In normal execution each mutator thread holds
-  // a share on the mutator_lock_. The garbage collector may also execute with shared access but
-  // at times requires exclusive access to the heap (not to be confused with the heap meta-data
-  // guarded by the heap_lock_ below). When the garbage collector requires exclusive access it asks
-  // the mutators to suspend themselves which also involves usage of the thread_suspend_count_lock_
-  // to cover weaknesses in using ReaderWriterMutexes with ConditionVariables. We use a condition
-  // variable to wait upon in the suspension logic as releasing and then re-acquiring a share on
-  // the mutator lock doesn't necessarily allow the exclusive user (e.g the garbage collector)
-  // chance to acquire the lock.
+  // A barrier is used to synchronize the GC/Debugger thread with the mutator threads. When the
+  // GC/Debugger thread wants to suspend all mutator threads, it waits for every mutator thread to
+  // pass a barrier. Threads that are already suspended have their barrier passed on their behalf by
+  // the GC/Debugger thread; runnable threads pass the barrier when they transition to the suspended
+  // state. The GC/Debugger thread is woken up once all mutator threads have passed the barrier.
   //
   // Thread suspension:
-  // Shared users                                  | Exclusive user
-  // (holding mutator lock and in kRunnable state) |   .. running ..
+  // mutator thread                                | GC/Debugger
+  //   .. running ..                               |   .. running ..
   //   .. running ..                               | Request thread suspension by:
   //   .. running ..                               |   - acquiring thread_suspend_count_lock_
   //   .. running ..                               |   - incrementing Thread::suspend_count_ on
   //   .. running ..                               |     all mutator threads
   //   .. running ..                               |   - releasing thread_suspend_count_lock_
-  //   .. running ..                               | Block trying to acquire exclusive mutator lock
+  //   .. running ..                               | Block wait for all threads to pass a barrier
   // Poll Thread::suspend_count_ and enter full    |   .. blocked ..
   // suspend code.                                 |   .. blocked ..
-  // Change state to kSuspended                    |   .. blocked ..
-  // x: Release share on mutator_lock_             | Carry out exclusive access
-  // Acquire thread_suspend_count_lock_            |   .. exclusive ..
-  // while Thread::suspend_count_ > 0              |   .. exclusive ..
-  //   - wait on Thread::resume_cond_              |   .. exclusive ..
-  //     (releases thread_suspend_count_lock_)     |   .. exclusive ..
-  //   .. waiting ..                               | Release mutator_lock_
+  // Change state to kSuspended (pass the barrier) | Wake up when all threads pass the barrier
+  // x: Acquire thread_suspend_count_lock_         |   .. running ..
+  // while Thread::suspend_count_ > 0              |   .. running ..
+  //   - wait on Thread::resume_cond_              |   .. running ..
+  //     (releases thread_suspend_count_lock_)     |   .. running ..
   //   .. waiting ..                               | Request thread resumption by:
   //   .. waiting ..                               |   - acquiring thread_suspend_count_lock_
   //   .. waiting ..                               |   - decrementing Thread::suspend_count_ on
@@ -531,29 +556,13 @@
   //   .. waiting ..                               |   - notifying on Thread::resume_cond_
   //    - re-acquire thread_suspend_count_lock_    |   - releasing thread_suspend_count_lock_
   // Release thread_suspend_count_lock_            |  .. running ..
-  // Acquire share on mutator_lock_                |  .. running ..
-  //  - This could block but the thread still      |  .. running ..
-  //    has a state of kSuspended and so this      |  .. running ..
-  //    isn't an issue.                            |  .. running ..
-  // Acquire thread_suspend_count_lock_            |  .. running ..
-  //  - we poll here as we're transitioning into   |  .. running ..
-  //    kRunnable and an individual thread suspend |  .. running ..
-  //    request (e.g for debugging) won't try      |  .. running ..
-  //    to acquire the mutator lock (which would   |  .. running ..
-  //    block as we hold the mutator lock). This   |  .. running ..
-  //    poll ensures that if the suspender thought |  .. running ..
-  //    we were suspended by incrementing our      |  .. running ..
-  //    Thread::suspend_count_ and then reading    |  .. running ..
-  //    our state we go back to waiting on         |  .. running ..
-  //    Thread::resume_cond_.                      |  .. running ..
-  // can_go_runnable = Thread::suspend_count_ == 0 |  .. running ..
-  // Release thread_suspend_count_lock_            |  .. running ..
-  // if can_go_runnable                            |  .. running ..
-  //   Change state to kRunnable                   |  .. running ..
-  // else                                          |  .. running ..
-  //   Goto x                                      |  .. running ..
+  // Change to kRunnable                           |  .. running ..
+  //  - this uses a CAS operation to ensure the    |  .. running ..
+  //    suspend request flag isn't raised as the   |  .. running ..
+  //    state is changed                           |  .. running ..
+  //  - if the CAS operation fails then goto x     |  .. running ..
   //  .. running ..                                |  .. running ..
-  static ReaderWriterMutex* mutator_lock_ ACQUIRED_AFTER(instrument_entrypoints_lock_);
+  static MutatorMutex* mutator_lock_ ACQUIRED_AFTER(instrument_entrypoints_lock_);
 
   // Allow reader-writer mutual exclusion on the mark and live bitmaps of the heap.
   static ReaderWriterMutex* heap_bitmap_lock_ ACQUIRED_AFTER(mutator_lock_);
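
Reviewer note: a rough mental model of the new MutatorMutex (an illustrative toy, not the ART implementation): being in the kRunnable state corresponds to holding a reader share, and suspending every mutator corresponds to taking the writer side, which is why the transition hooks above are private and reachable only from Thread.

    // Toy model only: approximates MutatorMutex semantics with std::shared_mutex.
    #include <shared_mutex>

    static std::shared_mutex mutator_lock_model;

    void TransitionToRunnable()  { mutator_lock_model.lock_shared(); }    // like TransitionFromSuspendedToRunnable
    void TransitionToSuspended() { mutator_lock_model.unlock_shared(); }  // like TransitionFromRunnableToSuspended
    void SuspendAllForGc()       { mutator_lock_model.lock(); }           // exclusive access: all mutators suspended
    void ResumeAllAfterGc()      { mutator_lock_model.unlock(); }
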
diff --git a/runtime/base/scoped_arena_allocator.cc b/runtime/base/scoped_arena_allocator.cc
index 4a7be38..d823edd 100644
--- a/runtime/base/scoped_arena_allocator.cc
+++ b/runtime/base/scoped_arena_allocator.cc
@@ -17,11 +17,11 @@
 #include "scoped_arena_allocator.h"
 
 #include "arena_allocator.h"
-#include <memcheck/memcheck.h>
+#include "base/memory_tool.h"
 
 namespace art {
 
-static constexpr size_t kValgrindRedZoneBytes = 8;
+static constexpr size_t kMemoryToolRedZoneBytes = 8;
 
 ArenaStack::ArenaStack(ArenaPool* arena_pool)
   : DebugStackRefCounter(),
@@ -30,7 +30,7 @@
     top_arena_(nullptr),
     top_ptr_(nullptr),
     top_end_(nullptr),
-    running_on_valgrind_(RUNNING_ON_VALGRIND > 0) {
+    is_running_on_memory_tool_(RUNNING_ON_MEMORY_TOOL > 0) {
 }
 
 ArenaStack::~ArenaStack() {
@@ -92,7 +92,7 @@
 }
 
 void* ArenaStack::AllocValgrind(size_t bytes, ArenaAllocKind kind) {
-  size_t rounded_bytes = RoundUp(bytes + kValgrindRedZoneBytes, 8);
+  size_t rounded_bytes = RoundUp(bytes + kMemoryToolRedZoneBytes, 8);
   uint8_t* ptr = top_ptr_;
   if (UNLIKELY(static_cast<size_t>(top_end_ - ptr) < rounded_bytes)) {
     ptr = AllocateFromNextArena(rounded_bytes);
@@ -100,8 +100,8 @@
   }
   CurrentStats()->RecordAlloc(bytes, kind);
   top_ptr_ = ptr + rounded_bytes;
-  VALGRIND_MAKE_MEM_UNDEFINED(ptr, bytes);
-  VALGRIND_MAKE_MEM_NOACCESS(ptr + bytes, rounded_bytes - bytes);
+  MEMORY_TOOL_MAKE_UNDEFINED(ptr, bytes);
+  MEMORY_TOOL_MAKE_NOACCESS(ptr + bytes, rounded_bytes - bytes);
   return ptr;
 }
 
diff --git a/runtime/base/scoped_arena_allocator.h b/runtime/base/scoped_arena_allocator.h
index bbedeac..ca514e4 100644
--- a/runtime/base/scoped_arena_allocator.h
+++ b/runtime/base/scoped_arena_allocator.h
@@ -64,7 +64,7 @@
 
   // Private - access via ScopedArenaAllocator or ScopedArenaAllocatorAdapter.
   void* Alloc(size_t bytes, ArenaAllocKind kind) ALWAYS_INLINE {
-    if (UNLIKELY(running_on_valgrind_)) {
+    if (UNLIKELY(is_running_on_memory_tool_)) {
       return AllocValgrind(bytes, kind);
     }
     size_t rounded_bytes = RoundUp(bytes, 8);
@@ -88,7 +88,7 @@
   uint8_t* top_ptr_;
   uint8_t* top_end_;
 
-  const bool running_on_valgrind_;
+  const bool is_running_on_memory_tool_;
 
   friend class ScopedArenaAllocator;
   template <typename T>
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index 90b8fdb..eec4983 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -794,13 +794,13 @@
   }
 
   const DexFile::TryItem* try_items = DexFile::GetTryItems(*code_item, 0);
-  ptr_ = DexFile::GetCatchHandlerData(*code_item, 0);
-  uint32_t handlers_size = DecodeUnsignedLeb128(&ptr_);
-
   if (!CheckListSize(try_items, try_items_size, sizeof(DexFile::TryItem), "try_items size")) {
     return false;
   }
 
+  ptr_ = DexFile::GetCatchHandlerData(*code_item, 0);
+  uint32_t handlers_size = DecodeUnsignedLeb128(&ptr_);
+
   if (UNLIKELY((handlers_size == 0) || (handlers_size >= 65536))) {
     ErrorStringPrintf("Invalid handlers_size: %ud", handlers_size);
     return false;
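
Reviewer note: the reordering above is a bounds-safety fix. The catch handler data lives after the try_items array, so its pointer must not be derived and decoded until the try_items size has been validated. A generic sketch of the pattern (hypothetical helper, not the verifier's actual code):

    #include <cstddef>
    #include <cstdint>

    struct TryItem { uint32_t start_addr; uint16_t insn_count; uint16_t handler_off; };

    // Hypothetical sketch: validate try_items_size before deriving the pointer
    // to the data that follows the try_items array.
    const uint8_t* CatchHandlerDataAfterTries(const uint8_t* begin, size_t file_size,
                                              size_t tries_offset, size_t try_items_size) {
      if (tries_offset > file_size ||
          try_items_size > (file_size - tries_offset) / sizeof(TryItem)) {
        return nullptr;  // reject before forming an out-of-bounds pointer
      }
      return begin + tries_offset + try_items_size * sizeof(TryItem);
    }
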
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 0a5ebfa..c05c935 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -116,7 +116,7 @@
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, last_no_thread_suspension_cause, checkpoint_functions,
                         sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, checkpoint_functions, interpreter_entrypoints,
-                        sizeof(void*) * 3);
+                        sizeof(void*) * 6);
 
     // Skip across the entrypoints structures.
 
@@ -133,7 +133,8 @@
                         sizeof(void*) * kLockLevelCount);
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, nested_signal_state, flip_function, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, flip_function, method_verifier, sizeof(void*));
-    EXPECT_OFFSET_DIFF(Thread, tlsPtr_.method_verifier, Thread, wait_mutex_, sizeof(void*),
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, method_verifier, thread_local_mark_stack, sizeof(void*));
+    EXPECT_OFFSET_DIFF(Thread, tlsPtr_.thread_local_mark_stack, Thread, wait_mutex_, sizeof(void*),
                        thread_tlsptr_end);
   }
 
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ac716ea..93f32e8 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -156,6 +156,10 @@
     return Size() == 0;
   }
 
+  bool IsFull() const {
+    return Size() == growth_limit_;
+  }
+
   size_t Size() const {
     DCHECK_LE(front_index_.LoadRelaxed(), back_index_.LoadRelaxed());
     return back_index_.LoadRelaxed() - front_index_.LoadRelaxed();
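Note: the new IsFull() is the natural counterpart to IsEmpty(): growth_limit_ is the stack's capacity, so callers can test it before PushBack(). A hedged sketch of the intended call pattern (PushOrSwap and SwapInFreshStack are hypothetical; AtomicStack, IsFull and PushBack are the APIs the concurrent copying collector uses later in this change):

void PushOrSwap(accounting::AtomicStack<mirror::Object>* stack, mirror::Object* ref) {
  if (UNLIKELY(stack->IsFull())) {
    // SwapInFreshStack is a hypothetical helper: swap in a fresh stack (e.g. from a pool)
    // before pushing, as ConcurrentCopying::PushOntoMarkStack does for thread-local stacks.
    stack = SwapInFreshStack(stack);
  }
  stack->PushBack(ref);
}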
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index cd3f910..009254b 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -21,16 +21,11 @@
 #include "base/stl_util.h"
 #include "bitmap-inl.h"
 #include "card_table-inl.h"
-#include "heap_bitmap.h"
 #include "gc/accounting/space_bitmap-inl.h"
-#include "gc/collector/mark_sweep.h"
-#include "gc/collector/mark_sweep-inl.h"
 #include "gc/heap.h"
-#include "gc/space/space.h"
 #include "gc/space/image_space.h"
+#include "gc/space/space.h"
 #include "mirror/object-inl.h"
-#include "mirror/class-inl.h"
-#include "mirror/object_array-inl.h"
 #include "space_bitmap-inl.h"
 #include "thread.h"
 
@@ -95,11 +90,11 @@
 
 class ModUnionUpdateObjectReferencesVisitor {
  public:
-  ModUnionUpdateObjectReferencesVisitor(MarkHeapReferenceCallback* callback, void* arg,
+  ModUnionUpdateObjectReferencesVisitor(MarkObjectVisitor* visitor,
                                         space::ContinuousSpace* from_space,
                                         space::ContinuousSpace* immune_space,
                                         bool* contains_reference_to_other_space)
-    : callback_(callback), arg_(arg), from_space_(from_space), immune_space_(immune_space),
+    : visitor_(visitor), from_space_(from_space), immune_space_(immune_space),
       contains_reference_to_other_space_(contains_reference_to_other_space) {
   }
 
@@ -111,13 +106,12 @@
     mirror::Object* ref = obj_ptr->AsMirrorPtr();
     if (ref != nullptr && !from_space_->HasAddress(ref) && !immune_space_->HasAddress(ref)) {
       *contains_reference_to_other_space_ = true;
-      callback_(obj_ptr, arg_);
+      visitor_->MarkHeapReference(obj_ptr);
     }
   }
 
  private:
-  MarkHeapReferenceCallback* const callback_;
-  void* const arg_;
+  MarkObjectVisitor* const visitor_;
   // Space which we are scanning
   space::ContinuousSpace* const from_space_;
   space::ContinuousSpace* const immune_space_;
@@ -129,25 +123,24 @@
  public:
   // Immune space is any other space which we don't care about references to. Currently this is
   // the image space in the case of the zygote mod union table.
-  ModUnionScanImageRootVisitor(MarkHeapReferenceCallback* callback, void* arg,
+  ModUnionScanImageRootVisitor(MarkObjectVisitor* visitor,
                                space::ContinuousSpace* from_space,
                                space::ContinuousSpace* immune_space,
                                bool* contains_reference_to_other_space)
-      : callback_(callback), arg_(arg), from_space_(from_space), immune_space_(immune_space),
+      : visitor_(visitor), from_space_(from_space), immune_space_(immune_space),
         contains_reference_to_other_space_(contains_reference_to_other_space) {}
 
   void operator()(Object* root) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(root != nullptr);
-    ModUnionUpdateObjectReferencesVisitor ref_visitor(callback_, arg_, from_space_, immune_space_,
+    ModUnionUpdateObjectReferencesVisitor ref_visitor(visitor_, from_space_, immune_space_,
                                                       contains_reference_to_other_space_);
     root->VisitReferences<kMovingClasses>(ref_visitor, VoidFunctor());
   }
 
  private:
-  MarkHeapReferenceCallback* const callback_;
-  void* const arg_;
+  MarkObjectVisitor* const visitor_;
   // Space which we are scanning
   space::ContinuousSpace* const from_space_;
   space::ContinuousSpace* const immune_space_;
@@ -305,8 +298,7 @@
   }
 }
 
-void ModUnionTableReferenceCache::UpdateAndMarkReferences(MarkHeapReferenceCallback* callback,
-                                                          void* arg) {
+void ModUnionTableReferenceCache::UpdateAndMarkReferences(MarkObjectVisitor* visitor) {
   CardTable* card_table = heap_->GetCardTable();
 
   std::vector<mirror::HeapReference<Object>*> cards_references;
@@ -338,7 +330,7 @@
   size_t count = 0;
   for (const auto& ref : references_) {
     for (mirror::HeapReference<Object>* obj_ptr : ref.second) {
-      callback(obj_ptr, arg);
+      visitor->MarkHeapReference(obj_ptr);
     }
     count += ref.second.size();
   }
@@ -362,9 +354,9 @@
 
 class CardBitVisitor {
  public:
-  CardBitVisitor(MarkHeapReferenceCallback* callback, void* arg, space::ContinuousSpace* space,
+  CardBitVisitor(MarkObjectVisitor* visitor, space::ContinuousSpace* space,
                  space::ContinuousSpace* immune_space, ModUnionTable::CardBitmap* card_bitmap)
-      : callback_(callback), arg_(arg), space_(space), immune_space_(immune_space),
+      : visitor_(visitor), space_(space), immune_space_(immune_space),
         bitmap_(space->GetLiveBitmap()), card_bitmap_(card_bitmap) {
     DCHECK(immune_space_ != nullptr);
   }
@@ -374,7 +366,7 @@
     DCHECK(space_->HasAddress(reinterpret_cast<mirror::Object*>(start)))
         << start << " " << *space_;
     bool reference_to_other_space = false;
-    ModUnionScanImageRootVisitor scan_visitor(callback_, arg_, space_, immune_space_,
+    ModUnionScanImageRootVisitor scan_visitor(visitor_, space_, immune_space_,
                                               &reference_to_other_space);
     bitmap_->VisitMarkedRange(start, start + CardTable::kCardSize, scan_visitor);
     if (!reference_to_other_space) {
@@ -384,8 +376,7 @@
   }
 
  private:
-  MarkHeapReferenceCallback* const callback_;
-  void* const arg_;
+  MarkObjectVisitor* const visitor_;
   space::ContinuousSpace* const space_;
   space::ContinuousSpace* const immune_space_;
   ContinuousSpaceBitmap* const bitmap_;
@@ -400,15 +391,14 @@
 }
 
 // Mark all references to the alloc space(s).
-void ModUnionTableCardCache::UpdateAndMarkReferences(MarkHeapReferenceCallback* callback,
-                                                     void* arg) {
+void ModUnionTableCardCache::UpdateAndMarkReferences(MarkObjectVisitor* visitor) {
   auto* image_space = heap_->GetImageSpace();
   // If we don't have an image space, just pass in space_ as the immune space. Pass in the same
   // space_ instead of image_space to avoid a null check in ModUnionUpdateObjectReferencesVisitor.
-  CardBitVisitor visitor(callback, arg, space_, image_space != nullptr ? image_space : space_,
+  CardBitVisitor bit_visitor(visitor, space_, image_space != nullptr ? image_space : space_,
       card_bitmap_.get());
   card_bitmap_->VisitSetBits(
-      0, RoundUp(space_->Size(), CardTable::kCardSize) / CardTable::kCardSize, visitor);
+      0, RoundUp(space_->Size(), CardTable::kCardSize) / CardTable::kCardSize, bit_visitor);
 }
 
 void ModUnionTableCardCache::Dump(std::ostream& os) {
diff --git a/runtime/gc/accounting/mod_union_table.h b/runtime/gc/accounting/mod_union_table.h
index 2e232ca..520cc1c 100644
--- a/runtime/gc/accounting/mod_union_table.h
+++ b/runtime/gc/accounting/mod_union_table.h
@@ -76,7 +76,7 @@
   // Update the mod-union table using data stored by ClearCards. There may be multiple ClearCards
   // before a call to update, for example, back-to-back sticky GCs. Also mark references to other
   // spaces which are stored in the mod-union table.
-  virtual void UpdateAndMarkReferences(MarkHeapReferenceCallback* callback, void* arg) = 0;
+  virtual void UpdateAndMarkReferences(MarkObjectVisitor* visitor) = 0;
 
   // Verification, sanity checks that we don't have clean cards which conflict with our cached data
   // for said cards. Exclusive lock is required since verify sometimes uses
@@ -117,7 +117,7 @@
   void ClearCards() OVERRIDE;
 
   // Update table based on cleared cards and mark all references to the other spaces.
-  void UpdateAndMarkReferences(MarkHeapReferenceCallback* callback, void* arg) OVERRIDE
+  void UpdateAndMarkReferences(MarkObjectVisitor* visitor) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -157,7 +157,7 @@
   virtual void ClearCards() OVERRIDE;
 
   // Mark all references to the alloc space(s).
-  virtual void UpdateAndMarkReferences(MarkHeapReferenceCallback* callback, void* arg) OVERRIDE
+  virtual void UpdateAndMarkReferences(MarkObjectVisitor* visitor) OVERRIDE
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/runtime/gc/accounting/mod_union_table_test.cc b/runtime/gc/accounting/mod_union_table_test.cc
index 363b76a..aad8a25 100644
--- a/runtime/gc/accounting/mod_union_table_test.cc
+++ b/runtime/gc/accounting/mod_union_table_test.cc
@@ -93,12 +93,24 @@
 };
 
 // Collect visited objects into container.
-static void CollectVisitedCallback(mirror::HeapReference<mirror::Object>* ref, void* arg)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  DCHECK(ref != nullptr);
-  DCHECK(arg != nullptr);
-  reinterpret_cast<std::set<mirror::Object*>*>(arg)->insert(ref->AsMirrorPtr());
-}
+class CollectVisitedVisitor : public MarkObjectVisitor {
+ public:
+  explicit CollectVisitedVisitor(std::set<mirror::Object*>* out) : out_(out) {}
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* ref) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    DCHECK(ref != nullptr);
+    MarkObject(ref->AsMirrorPtr());
+  }
+  virtual mirror::Object* MarkObject(mirror::Object* obj) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    DCHECK(obj != nullptr);
+    out_->insert(obj);
+    return obj;
+  }
+
+ private:
+  std::set<mirror::Object*>* const out_;
+};
 
 // A mod union table that only holds references to a specified target space.
 class ModUnionTableRefCacheToSpace : public ModUnionTableReferenceCache {
@@ -199,7 +211,8 @@
   obj2->Set(3, other_space_ref2);
   table->ClearCards();
   std::set<mirror::Object*> visited_before;
-  table->UpdateAndMarkReferences(&CollectVisitedCallback, &visited_before);
+  CollectVisitedVisitor collector_before(&visited_before);
+  table->UpdateAndMarkReferences(&collector_before);
   // Check that we visited all the references in other spaces only.
   ASSERT_GE(visited_before.size(), 2u);
   ASSERT_TRUE(visited_before.find(other_space_ref1) != visited_before.end());
@@ -230,7 +243,8 @@
   }
   // Visit again and make sure the cards got cleared back to their sane state.
   std::set<mirror::Object*> visited_after;
-  table->UpdateAndMarkReferences(&CollectVisitedCallback, &visited_after);
+  CollectVisitedVisitor collector_after(&visited_after);
+  table->UpdateAndMarkReferences(&collector_after);
   // Check that we visited a superset after.
   for (auto* obj : visited_before) {
     ASSERT_TRUE(visited_after.find(obj) != visited_after.end()) << obj;
diff --git a/runtime/gc/accounting/remembered_set.cc b/runtime/gc/accounting/remembered_set.cc
index eeb385e..23ab8df 100644
--- a/runtime/gc/accounting/remembered_set.cc
+++ b/runtime/gc/accounting/remembered_set.cc
@@ -61,11 +61,10 @@
 
 class RememberedSetReferenceVisitor {
  public:
-  RememberedSetReferenceVisitor(MarkHeapReferenceCallback* callback,
-                                DelayReferenceReferentCallback* ref_callback,
-                                space::ContinuousSpace* target_space,
-                                bool* const contains_reference_to_target_space, void* arg)
-      : callback_(callback), ref_callback_(ref_callback), target_space_(target_space), arg_(arg),
+  RememberedSetReferenceVisitor(space::ContinuousSpace* target_space,
+                                bool* const contains_reference_to_target_space,
+                                collector::GarbageCollector* collector)
+      : collector_(collector), target_space_(target_space),
         contains_reference_to_target_space_(contains_reference_to_target_space) {}
 
   void operator()(mirror::Object* obj, MemberOffset offset, bool /* is_static */) const
@@ -74,7 +73,7 @@
     mirror::HeapReference<mirror::Object>* ref_ptr = obj->GetFieldObjectReferenceAddr(offset);
     if (target_space_->HasAddress(ref_ptr->AsMirrorPtr())) {
       *contains_reference_to_target_space_ = true;
-      callback_(ref_ptr, arg_);
+      collector_->MarkHeapReference(ref_ptr);
       DCHECK(!target_space_->HasAddress(ref_ptr->AsMirrorPtr()));
     }
   }
@@ -84,49 +83,43 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     if (target_space_->HasAddress(ref->GetReferent())) {
       *contains_reference_to_target_space_ = true;
-      ref_callback_(klass, ref, arg_);
+      collector_->DelayReferenceReferent(klass, ref);
     }
   }
 
  private:
-  MarkHeapReferenceCallback* const callback_;
-  DelayReferenceReferentCallback* const ref_callback_;
+  collector::GarbageCollector* const collector_;
   space::ContinuousSpace* const target_space_;
-  void* const arg_;
   bool* const contains_reference_to_target_space_;
 };
 
 class RememberedSetObjectVisitor {
  public:
-  RememberedSetObjectVisitor(MarkHeapReferenceCallback* callback,
-                             DelayReferenceReferentCallback* ref_callback,
-                             space::ContinuousSpace* target_space,
-                             bool* const contains_reference_to_target_space, void* arg)
-      : callback_(callback), ref_callback_(ref_callback), target_space_(target_space), arg_(arg),
+  RememberedSetObjectVisitor(space::ContinuousSpace* target_space,
+                             bool* const contains_reference_to_target_space,
+                             collector::GarbageCollector* collector)
+      : collector_(collector), target_space_(target_space),
         contains_reference_to_target_space_(contains_reference_to_target_space) {}
 
   void operator()(mirror::Object* obj) const EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    RememberedSetReferenceVisitor visitor(callback_, ref_callback_, target_space_,
-                                          contains_reference_to_target_space_, arg_);
+    RememberedSetReferenceVisitor visitor(target_space_, contains_reference_to_target_space_,
+                                          collector_);
     obj->VisitReferences<kMovingClasses>(visitor, visitor);
   }
 
  private:
-  MarkHeapReferenceCallback* const callback_;
-  DelayReferenceReferentCallback* const ref_callback_;
+  collector::GarbageCollector* const collector_;
   space::ContinuousSpace* const target_space_;
-  void* const arg_;
   bool* const contains_reference_to_target_space_;
 };
 
-void RememberedSet::UpdateAndMarkReferences(MarkHeapReferenceCallback* callback,
-                                            DelayReferenceReferentCallback* ref_callback,
-                                            space::ContinuousSpace* target_space, void* arg) {
+void RememberedSet::UpdateAndMarkReferences(space::ContinuousSpace* target_space,
+                                            collector::GarbageCollector* collector) {
   CardTable* card_table = heap_->GetCardTable();
   bool contains_reference_to_target_space = false;
-  RememberedSetObjectVisitor obj_visitor(callback, ref_callback, target_space,
-                                         &contains_reference_to_target_space, arg);
+  RememberedSetObjectVisitor obj_visitor(target_space, &contains_reference_to_target_space,
+                                         collector);
   ContinuousSpaceBitmap* bitmap = space_->GetLiveBitmap();
   CardSet remove_card_set;
   for (uint8_t* const card_addr : dirty_cards_) {
diff --git a/runtime/gc/accounting/remembered_set.h b/runtime/gc/accounting/remembered_set.h
index c51e26d..affe863 100644
--- a/runtime/gc/accounting/remembered_set.h
+++ b/runtime/gc/accounting/remembered_set.h
@@ -29,6 +29,7 @@
 namespace gc {
 
 namespace collector {
+  class GarbageCollector;
   class MarkSweep;
 }  // namespace collector
 namespace space {
@@ -53,9 +54,8 @@
   void ClearCards();
 
   // Mark through all references to the target space.
-  void UpdateAndMarkReferences(MarkHeapReferenceCallback* callback,
-                               DelayReferenceReferentCallback* ref_callback,
-                               space::ContinuousSpace* target_space, void* arg)
+  void UpdateAndMarkReferences(space::ContinuousSpace* target_space,
+                               collector::GarbageCollector* collector)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
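Note: with the new signature a collector passes itself in place of the old callback/arg pairs, since GarbageCollector supplies MarkHeapReference() and DelayReferenceReferent(). A sketch of the simplified call site (the collector class and method are hypothetical; UpdateAndMarkReferences, GetHeap() and FindRememberedSetFromSpace() are existing ART APIs):

void MyCopyingCollector::UpdateRememberedSets(space::ContinuousSpace* target_space) {
  for (const auto& space : GetHeap()->GetContinuousSpaces()) {
    accounting::RememberedSet* rem_set = GetHeap()->FindRememberedSetFromSpace(space);
    if (rem_set != nullptr) {
      // The collector itself provides both marking and reference-delaying behavior.
      rem_set->UpdateAndMarkReferences(target_space, this);
    }
  }
}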
 
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index 35faff3..e0661b6 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -30,6 +30,7 @@
 namespace art {
 
 namespace mirror {
+  class Class;
   class Object;
 }  // namespace mirror
 class MemMap;
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index 6537ed2..3108b7c 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -110,23 +110,24 @@
   }
 }
 
-static inline void SweepClassObject(AllocRecord* record, IsMarkedCallback* callback, void* arg)
+static inline void SweepClassObject(AllocRecord* record, IsMarkedVisitor* visitor)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
     EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
   GcRoot<mirror::Class>& klass = record->GetClassGcRoot();
   // This does not need a read barrier because this is called by GC.
   mirror::Object* old_object = klass.Read<kWithoutReadBarrier>();
-  // The class object can become null if we implement class unloading.
-  // In that case we might still want to keep the class name string (not implemented).
-  mirror::Object* new_object = UNLIKELY(old_object == nullptr) ?
-      nullptr : callback(old_object, arg);
-  if (UNLIKELY(old_object != new_object)) {
-    mirror::Class* new_klass = UNLIKELY(new_object == nullptr) ? nullptr : new_object->AsClass();
-    klass = GcRoot<mirror::Class>(new_klass);
+  if (old_object != nullptr) {
+    // The class object can become null if we implement class unloading.
+    // In that case we might still want to keep the class name string (not implemented).
+    mirror::Object* new_object = visitor->IsMarked(old_object);
+    DCHECK(new_object != nullptr);
+    if (UNLIKELY(old_object != new_object)) {
+      klass = GcRoot<mirror::Class>(new_object->AsClass());
+    }
   }
 }
 
-void AllocRecordObjectMap::SweepAllocationRecords(IsMarkedCallback* callback, void* arg) {
+void AllocRecordObjectMap::SweepAllocationRecords(IsMarkedVisitor* visitor) {
   VLOG(heap) << "Start SweepAllocationRecords()";
   size_t count_deleted = 0, count_moved = 0, count = 0;
   // Only the first (size - recent_record_max_) number of records can be deleted.
@@ -141,11 +142,11 @@
     // This does not need a read barrier because this is called by GC.
     mirror::Object* old_object = it->first.Read<kWithoutReadBarrier>();
     AllocRecord* record = it->second;
-    mirror::Object* new_object = old_object == nullptr ? nullptr : callback(old_object, arg);
+    mirror::Object* new_object = old_object == nullptr ? nullptr : visitor->IsMarked(old_object);
     if (new_object == nullptr) {
       if (count > delete_bound) {
         it->first = GcRoot<mirror::Object>(nullptr);
-        SweepClassObject(record, callback, arg);
+        SweepClassObject(record, visitor);
         ++it;
       } else {
         delete record;
@@ -157,7 +158,7 @@
         it->first = GcRoot<mirror::Object>(new_object);
         ++count_moved;
       }
-      SweepClassObject(record, callback, arg);
+      SweepClassObject(record, visitor);
       ++it;
     }
   }
@@ -265,7 +266,8 @@
   }
 
   // Wait for GC's sweeping to complete and allow new records
-  while (UNLIKELY(!records->allow_new_record_)) {
+  while (UNLIKELY((!kUseReadBarrier && !records->allow_new_record_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     records->new_record_condition_.WaitHoldingLocks(self);
   }
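Note: allocation-record sweeping is likewise driven by an IsMarkedVisitor now: IsMarked() must return null for a dead object and the (possibly forwarded) object otherwise. A minimal hypothetical visitor, analogous to the CollectVisitedVisitor added to the mod-union table test in this change, just to illustrate the contract:

class KeepEverythingVisitor : public IsMarkedVisitor {
 public:
  mirror::Object* IsMarked(mirror::Object* obj) OVERRIDE
      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
    // Contract: nullptr means "dead, drop the record"; a non-null return is the object's
    // current (possibly moved) address. This toy visitor keeps everything in place.
    return obj;
  }
};
// Usage, with alloc_tracker_lock_ held as SweepAllocationRecords() requires:
//   KeepEverythingVisitor visitor;
//   records->SweepAllocationRecords(&visitor);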
 
diff --git a/runtime/gc/allocation_record.h b/runtime/gc/allocation_record.h
index 06721c8..933363b 100644
--- a/runtime/gc/allocation_record.h
+++ b/runtime/gc/allocation_record.h
@@ -261,7 +261,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
 
-  void SweepAllocationRecords(IsMarkedCallback* callback, void* arg)
+  void SweepAllocationRecords(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
 
diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h
index bba92a1..25fdd7c 100644
--- a/runtime/gc/allocator/rosalloc-inl.h
+++ b/runtime/gc/allocator/rosalloc-inl.h
@@ -24,7 +24,7 @@
 namespace allocator {
 
 inline ALWAYS_INLINE bool RosAlloc::ShouldCheckZeroMemory() {
-  return kCheckZeroMemory && !running_on_valgrind_;
+  return kCheckZeroMemory && !is_running_on_memory_tool_;
 }
 
 template<bool kThreadSafe>
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 49c7fda..bd10f7b 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -16,8 +16,9 @@
 
 #include "rosalloc.h"
 
+#include "base/memory_tool.h"
 #include "base/mutex-inl.h"
-#include "gc/space/valgrind_settings.h"
+#include "gc/space/memory_tool_settings.h"
 #include "mem_map.h"
 #include "mirror/class-inl.h"
 #include "mirror/object.h"
@@ -50,7 +51,7 @@
     reinterpret_cast<RosAlloc::Run*>(dedicated_full_run_storage_);
 
 RosAlloc::RosAlloc(void* base, size_t capacity, size_t max_capacity,
-                   PageReleaseMode page_release_mode, bool running_on_valgrind,
+                   PageReleaseMode page_release_mode, bool running_on_memory_tool,
                    size_t page_release_size_threshold)
     : base_(reinterpret_cast<uint8_t*>(base)), footprint_(capacity),
       capacity_(capacity), max_capacity_(max_capacity),
@@ -58,7 +59,7 @@
       bulk_free_lock_("rosalloc bulk free lock", kRosAllocBulkFreeLock),
       page_release_mode_(page_release_mode),
       page_release_size_threshold_(page_release_size_threshold),
-      running_on_valgrind_(running_on_valgrind) {
+      is_running_on_memory_tool_(running_on_memory_tool) {
   DCHECK_EQ(RoundUp(capacity, kPageSize), capacity);
   DCHECK_EQ(RoundUp(max_capacity, kPageSize), max_capacity);
   CHECK_LE(capacity, max_capacity);
@@ -110,6 +111,9 @@
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     delete size_bracket_locks_[i];
   }
+  if (is_running_on_memory_tool_) {
+    MEMORY_TOOL_MAKE_DEFINED(base_, capacity_);
+  }
 }
 
 void* RosAlloc::AllocPages(Thread* self, size_t num_pages, uint8_t page_map_type) {
@@ -1897,8 +1901,8 @@
     MutexLock lock_mu(self, lock_);
     size_t pm_end = page_map_size_;
     size_t i = 0;
-    size_t valgrind_modifier =  running_on_valgrind_ ?
-        2 * ::art::gc::space::kDefaultValgrindRedZoneBytes :  // Redzones before and after.
+    size_t memory_tool_modifier =  is_running_on_memory_tool_ ?
+        2 * ::art::gc::space::kDefaultMemoryToolRedZoneBytes :  // Redzones before and after.
         0;
     while (i < pm_end) {
       uint8_t pm = page_map_[i];
@@ -1938,15 +1942,15 @@
             idx++;
           }
           uint8_t* start = base_ + i * kPageSize;
-          if (running_on_valgrind_) {
-            start += ::art::gc::space::kDefaultValgrindRedZoneBytes;
+          if (is_running_on_memory_tool_) {
+            start += ::art::gc::space::kDefaultMemoryToolRedZoneBytes;
           }
           mirror::Object* obj = reinterpret_cast<mirror::Object*>(start);
           size_t obj_size = obj->SizeOf();
-          CHECK_GT(obj_size + valgrind_modifier, kLargeSizeThreshold)
+          CHECK_GT(obj_size + memory_tool_modifier, kLargeSizeThreshold)
               << "A rosalloc large object size must be > " << kLargeSizeThreshold;
-          CHECK_EQ(num_pages, RoundUp(obj_size + valgrind_modifier, kPageSize) / kPageSize)
-              << "A rosalloc large object size " << obj_size + valgrind_modifier
+          CHECK_EQ(num_pages, RoundUp(obj_size + memory_tool_modifier, kPageSize) / kPageSize)
+              << "A rosalloc large object size " << obj_size + memory_tool_modifier
               << " does not match the page map table " << (num_pages * kPageSize)
               << std::endl << DumpPageMap();
           i += num_pages;
@@ -2011,11 +2015,11 @@
   }
   // Call Verify() here for the lock order.
   for (auto& run : runs) {
-    run->Verify(self, this, running_on_valgrind_);
+    run->Verify(self, this, is_running_on_memory_tool_);
   }
 }
 
-void RosAlloc::Run::Verify(Thread* self, RosAlloc* rosalloc, bool running_on_valgrind) {
+void RosAlloc::Run::Verify(Thread* self, RosAlloc* rosalloc, bool running_on_memory_tool) {
   DCHECK_EQ(magic_num_, kMagicNum) << "Bad magic number : " << Dump();
   const size_t idx = size_bracket_idx_;
   CHECK_LT(idx, kNumOfSizeBrackets) << "Out of range size bracket index : " << Dump();
@@ -2098,8 +2102,8 @@
   }
   // Check each slot.
   size_t slots = 0;
-  size_t valgrind_modifier = running_on_valgrind ?
-      2 * ::art::gc::space::kDefaultValgrindRedZoneBytes :
+  size_t memory_tool_modifier = running_on_memory_tool ?
+      2 * ::art::gc::space::kDefaultMemoryToolRedZoneBytes :
       0U;
   for (size_t v = 0; v < num_vec; v++, slots += 32) {
     DCHECK_GE(num_slots, slots) << "Out of bounds";
@@ -2113,16 +2117,16 @@
       bool is_thread_local_freed = IsThreadLocal() && ((thread_local_free_vec >> i) & 0x1) != 0;
       if (is_allocated && !is_thread_local_freed) {
         uint8_t* slot_addr = slot_base + (slots + i) * bracket_size;
-        if (running_on_valgrind) {
-          slot_addr += ::art::gc::space::kDefaultValgrindRedZoneBytes;
+        if (running_on_memory_tool) {
+          slot_addr += ::art::gc::space::kDefaultMemoryToolRedZoneBytes;
         }
         mirror::Object* obj = reinterpret_cast<mirror::Object*>(slot_addr);
         size_t obj_size = obj->SizeOf();
-        CHECK_LE(obj_size + valgrind_modifier, kLargeSizeThreshold)
+        CHECK_LE(obj_size + memory_tool_modifier, kLargeSizeThreshold)
             << "A run slot contains a large object " << Dump();
-        CHECK_EQ(SizeToIndex(obj_size + valgrind_modifier), idx)
+        CHECK_EQ(SizeToIndex(obj_size + memory_tool_modifier), idx)
             << PrettyTypeOf(obj) << " "
-            << "obj_size=" << obj_size << "(" << obj_size + valgrind_modifier << "), idx=" << idx
+            << "obj_size=" << obj_size << "(" << obj_size + memory_tool_modifier << "), idx=" << idx
             << " A run slot contains an object with wrong size " << Dump();
       }
     }
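Note: as the slot checks above encode, under a memory tool each object is bracketed by red zones on both sides: the user-visible object starts kDefaultMemoryToolRedZoneBytes past the slot, and the size-bracket/page accounting must add twice the red zone. A tiny illustrative sketch of that arithmetic (kRedZone stands in for kDefaultMemoryToolRedZoneBytes; nothing here is new API):

// Slot layout under a memory tool:  [ red zone ][ user object ][ red zone ]
constexpr size_t kRedZone = 8;
inline uint8_t* UserAddress(uint8_t* slot_addr) { return slot_addr + kRedZone; }
inline size_t AccountedSize(size_t obj_size) { return obj_size + 2 * kRedZone; }
// The CHECK_LE/SizeToIndex assertions above verify AccountedSize(obj->SizeOf()) against the
// run's size bracket, and the large-object path compares it with kLargeSizeThreshold.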
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 0fcfe72..c356a39 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -253,7 +253,7 @@
     // Dump the run metadata for debugging.
     std::string Dump();
     // Verify for debugging.
-    void Verify(Thread* self, RosAlloc* rosalloc, bool running_on_valgrind)
+    void Verify(Thread* self, RosAlloc* rosalloc, bool running_on_memory_tool)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_);
 
@@ -503,7 +503,7 @@
   const size_t page_release_size_threshold_;
 
   // Whether this allocator is running under Valgrind.
-  bool running_on_valgrind_;
+  bool is_running_on_memory_tool_;
 
   // The base address of the memory region that's managed by this allocator.
   uint8_t* Begin() { return base_; }
@@ -561,7 +561,7 @@
  public:
   RosAlloc(void* base, size_t capacity, size_t max_capacity,
            PageReleaseMode page_release_mode,
-           bool running_on_valgrind,
+           bool running_on_memory_tool,
            size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
   ~RosAlloc();
 
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index ccf5154..b5d5c34 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -17,6 +17,7 @@
 #include "concurrent_copying.h"
 
 #include "art_field-inl.h"
+#include "base/stl_util.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/reference_processor.h"
@@ -38,17 +39,22 @@
     : GarbageCollector(heap,
                        name_prefix + (name_prefix.empty() ? "" : " ") +
                        "concurrent copying + mark sweep"),
-      region_space_(nullptr), gc_barrier_(new Barrier(0)), mark_queue_(2 * MB),
+      region_space_(nullptr), gc_barrier_(new Barrier(0)),
+      gc_mark_stack_(accounting::ObjectStack::Create("concurrent copying gc mark stack",
+                                                     2 * MB, 2 * MB)),
+      mark_stack_lock_("concurrent copying mark stack lock", kMarkSweepMarkStackLock),
+      thread_running_gc_(nullptr),
       is_marking_(false), is_active_(false), is_asserting_to_space_invariant_(false),
-      heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0),
+      heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0), mark_stack_mode_(kMarkStackModeOff),
+      weak_ref_access_enabled_(true),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
       rb_table_(heap_->GetReadBarrierTable()),
       force_evacuate_all_(false) {
   static_assert(space::RegionSpace::kRegionSize == accounting::ReadBarrierTable::kRegionSize,
                 "The region space size and the read barrier table region size must match");
   cc_heap_bitmap_.reset(new accounting::HeapBitmap(heap));
+  Thread* self = Thread::Current();
   {
-    Thread* self = Thread::Current();
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     // Cache this so that we won't have to lock heap_bitmap_lock_ in
     // Mark() which could cause a nested lock on heap_bitmap_lock_
@@ -56,9 +62,25 @@
     // (class_linker_lock_ and heap_bitmap_lock_).
     heap_mark_bitmap_ = heap->GetMarkBitmap();
   }
+  {
+    MutexLock mu(self, mark_stack_lock_);
+    for (size_t i = 0; i < kMarkStackPoolSize; ++i) {
+      accounting::AtomicStack<mirror::Object>* mark_stack =
+          accounting::AtomicStack<mirror::Object>::Create(
+              "thread local mark stack", kMarkStackSize, kMarkStackSize);
+      pooled_mark_stacks_.push_back(mark_stack);
+    }
+  }
+}
+
+void ConcurrentCopying::MarkHeapReference(
+    mirror::HeapReference<mirror::Object>* from_ref ATTRIBUTE_UNUSED) {
+  // Unused, usually called from mod union tables.
+  UNIMPLEMENTED(FATAL);
 }
 
 ConcurrentCopying::~ConcurrentCopying() {
+  STLDeleteElements(&pooled_mark_stacks_);
 }
 
 void ConcurrentCopying::RunPhases() {
@@ -66,6 +88,7 @@
   CHECK(!is_active_);
   is_active_ = true;
   Thread* self = Thread::Current();
+  thread_running_gc_ = self;
   Locks::mutator_lock_->AssertNotHeld(self);
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
@@ -80,7 +103,7 @@
   if (kEnableNoFromSpaceRefsVerification || kIsDebugBuild) {
     TimingLogger::ScopedTiming split("(Paused)VerifyNoFromSpaceReferences", GetTimings());
     ScopedPause pause(this);
-    CheckEmptyMarkQueue();
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
       LOG(INFO) << "Verifying no from-space refs";
     }
@@ -88,7 +111,7 @@
     if (kVerboseMode) {
       LOG(INFO) << "Done verifying no from-space refs";
     }
-    CheckEmptyMarkQueue();
+    CheckEmptyMarkStack();
   }
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
@@ -97,6 +120,7 @@
   FinishPhase();
   CHECK(is_active_);
   is_active_ = false;
+  thread_running_gc_ = nullptr;
 }
 
 void ConcurrentCopying::BindBitmaps() {
@@ -133,7 +157,7 @@
     LOG(INFO) << "Region-space : " << reinterpret_cast<void*>(region_space_->Begin()) << "-"
               << reinterpret_cast<void*>(region_space_->Limit());
   }
-  CHECK(mark_queue_.IsEmpty());
+  CheckEmptyMarkStack();
   immune_region_.Reset();
   bytes_moved_.StoreRelaxed(0);
   objects_moved_.StoreRelaxed(0);
@@ -210,6 +234,7 @@
       cc->from_space_num_bytes_at_first_pause_ = cc->region_space_->GetBytesAllocated();
     }
     cc->is_marking_ = true;
+    cc->mark_stack_mode_.StoreRelaxed(ConcurrentCopying::kMarkStackModeThreadLocal);
     if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) {
       CHECK(Runtime::Current()->IsAotCompiler());
       TimingLogger::ScopedTiming split2("(Paused)VisitTransactionRoots", cc->GetTimings());
@@ -284,12 +309,12 @@
     } else {
       // Newly marked. Set the gray bit and push it onto the mark stack.
       CHECK(!kUseBakerReadBarrier || obj->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
-      collector_->PushOntoMarkStack<true>(obj);
+      collector_->PushOntoMarkStack(obj);
     }
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 class EmptyCheckpoint : public Closure {
@@ -320,6 +345,7 @@
   if (kVerboseMode) {
     LOG(INFO) << "GC MarkingPhase";
   }
+  CHECK(weak_ref_access_enabled_);
   {
     // Mark the image root. The WB-based collectors do not need to
     // scan the image objects from roots by relying on the card table,
@@ -371,37 +397,47 @@
   Thread* self = Thread::Current();
   {
     TimingLogger::ScopedTiming split6("ProcessMarkStack", GetTimings());
-    // Process the mark stack and issue an empty check point. If the
-    // mark stack is still empty after the check point, we're
-    // done. Otherwise, repeat.
+    // We transition through three mark stack modes (thread-local, shared, GC-exclusive). The
+    // primary reasons are that we need a checkpoint to process thread-local mark stacks, but once
+    // weak ref accesses are disabled we can no longer use a checkpoint (running threads may be
+    // blocked at WaitHoldingLocks, causing a deadlock), and that once we reach the point where we
+    // process weak references we can avoid taking a lock when accessing the GC mark stack, which
+    // makes mark stack processing more efficient.
+
+    // Process the mark stack once in the thread-local stack mode. This marks most of the live
+    // objects, aside from those reached only via weak ref accesses with read barriers
+    // (Reference::GetReferent() and system weaks), which may happen concurrently while we are
+    // processing the mark stack and may newly mark/gray objects and push refs onto the mark stack.
     ProcessMarkStack();
-    size_t count = 0;
-    while (!ProcessMarkStack()) {
-      ++count;
-      if (kVerboseMode) {
-        LOG(INFO) << "Issue an empty check point. " << count;
-      }
-      IssueEmptyCheckpoint();
-    }
-    // Need to ensure the mark stack is empty before reference
-    // processing to get rid of non-reference gray objects.
-    CheckEmptyMarkQueue();
-    // Enable the GetReference slow path and disallow access to the system weaks.
-    GetHeap()->GetReferenceProcessor()->EnableSlowPath();
-    Runtime::Current()->DisallowNewSystemWeaks();
-    QuasiAtomic::ThreadFenceForConstructor();
-    // Lock-unlock the system weak locks so that there's no thread in
-    // the middle of accessing system weaks.
-    Runtime::Current()->EnsureNewSystemWeaksDisallowed();
-    // Note: Do not issue a checkpoint from here to the
-    // SweepSystemWeaks call or else a deadlock due to
-    // WaitHoldingLocks() would occur.
+    // Switch to the shared mark stack mode. That is, revoke and process thread-local mark stacks
+    // for the last time before transitioning to the shared mark stack mode, which would process new
+    // refs that may have been concurrently pushed onto the mark stack during the ProcessMarkStack()
+    // call above. At the same time, disable weak ref accesses using a per-thread flag. It's
+    // important to do these together in a single checkpoint so that we can ensure, without a race,
+    // that mutators won't newly gray objects and push new refs onto the mark stack due to weak ref
+    // accesses, and that mutators safely transition to the shared mark stack mode (without leaving
+    // unprocessed refs on the thread-local mark stacks). This is why we use a per-thread weak ref
+    // access flag, Thread::tls32_.weak_ref_access_enabled_, instead of the global ones.
+    SwitchToSharedMarkStackMode();
+    CHECK(!self->GetWeakRefAccessEnabled());
+    // Now that weak ref accesses are disabled, once we exhaust the shared mark stack again here
+    // (which may be non-empty if there were refs found on thread-local mark stacks during the above
+    // SwitchToSharedMarkStackMode() call), we won't have new refs to process, that is, mutators
+    // (via read barriers) have no way to produce any more refs to process. Marking converges once
+    // before we process weak refs below.
+    ProcessMarkStack();
+    CheckEmptyMarkStack();
+    // Switch to the GC exclusive mark stack mode so that we can process the mark stack without a
+    // lock from this point on.
+    SwitchToGcExclusiveMarkStackMode();
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
-      LOG(INFO) << "Enabled the ref proc slow path & disabled access to system weaks.";
       LOG(INFO) << "ProcessReferences";
     }
-    ProcessReferences(self, true);
-    CheckEmptyMarkQueue();
+    // Process weak references. This may produce new refs to process and have them processed via
+    // ProcessMarkStack (in the GC exclusive mark stack mode).
+    ProcessReferences(self);
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks";
     }
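Note: to summarize the flow described by the comments in MarkingPhase(), the collector walks the mark stack through a fixed sequence of modes; a compact illustrative outline (the enum constants are the ones this change introduces in ConcurrentCopying, and the descriptions paraphrase MarkingPhase() rather than adding any API):

// kMarkStackModeThreadLocal : mutators push onto per-thread stacks; the GC drains them via
//                             checkpoints (RevokeThreadLocalMarkStacks / ProcessMarkStack).
// kMarkStackModeShared      : weak ref access is disabled; all pushes go to gc_mark_stack_
//                             under mark_stack_lock_.
// kMarkStackModeGcExclusive : only thread_running_gc_ touches gc_mark_stack_, no lock needed;
//                             weak references and system weaks are processed in this mode.
// kMarkStackModeOff         : marking is finished; further pushes are disallowed.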
@@ -409,33 +445,52 @@
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks done";
     }
-    // Because hash_set::Erase() can call the hash function for
-    // arbitrary elements in the weak intern table in
-    // InternTable::Table::SweepWeaks(), the above SweepSystemWeaks()
-    // call may have marked some objects (strings) alive. So process
-    // the mark stack here once again.
+    // Process the mark stack here one last time because the above SweepSystemWeaks() call may have
+    // marked some objects (strings) alive, since hash_set::Erase() can call the hash function for
+    // arbitrary elements in the weak intern table in InternTable::Table::SweepWeaks().
     ProcessMarkStack();
-    CheckEmptyMarkQueue();
-    if (kVerboseMode) {
-      LOG(INFO) << "AllowNewSystemWeaks";
-    }
-    Runtime::Current()->AllowNewSystemWeaks();
+    CheckEmptyMarkStack();
+    // Re-enable weak ref accesses.
+    ReenableWeakRefAccess(self);
+    // Issue an empty checkpoint to ensure no threads are still in the middle of a read barrier
+    // which may have a from-space ref cached in a local variable.
     IssueEmptyCheckpoint();
-    // Disable marking.
+    // Marking is done. Disable marking.
     if (kUseTableLookupReadBarrier) {
       heap_->rb_table_->ClearAll();
       DCHECK(heap_->rb_table_->IsAllCleared());
     }
-    is_mark_queue_push_disallowed_.StoreSequentiallyConsistent(1);
-    is_marking_ = false;
-    CheckEmptyMarkQueue();
+    is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(1);
+    is_marking_ = false;  // This disables the read barrier/marking of weak roots.
+    mark_stack_mode_.StoreSequentiallyConsistent(kMarkStackModeOff);
+    CheckEmptyMarkStack();
   }
 
+  CHECK(weak_ref_access_enabled_);
   if (kVerboseMode) {
     LOG(INFO) << "GC end of MarkingPhase";
   }
 }
 
+void ConcurrentCopying::ReenableWeakRefAccess(Thread* self) {
+  if (kVerboseMode) {
+    LOG(INFO) << "ReenableWeakRefAccess";
+  }
+  weak_ref_access_enabled_.StoreRelaxed(true);  // This is for new threads.
+  QuasiAtomic::ThreadFenceForConstructor();
+  // Iterate over all threads (we don't need to, and can't, use a checkpoint here) and re-enable
+  // weak ref access.
+  {
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+    for (Thread* thread : thread_list) {
+      thread->SetWeakRefAccessEnabled(true);
+    }
+  }
+  // Unblock blocking threads.
+  GetHeap()->GetReferenceProcessor()->BroadcastForSlowPath(self);
+  Runtime::Current()->BroadcastForNewSystemWeaks();
+}
+
 void ConcurrentCopying::IssueEmptyCheckpoint() {
   Thread* self = Thread::Current();
   EmptyCheckpoint check_point(this);
@@ -456,18 +511,61 @@
   Locks::mutator_lock_->SharedLock(self);
 }
 
-mirror::Object* ConcurrentCopying::PopOffMarkStack() {
-  return mark_queue_.Dequeue();
-}
-
-template<bool kThreadSafe>
 void ConcurrentCopying::PushOntoMarkStack(mirror::Object* to_ref) {
-  CHECK_EQ(is_mark_queue_push_disallowed_.LoadRelaxed(), 0)
+  CHECK_EQ(is_mark_stack_push_disallowed_.LoadRelaxed(), 0)
       << " " << to_ref << " " << PrettyTypeOf(to_ref);
-  if (kThreadSafe) {
-    CHECK(mark_queue_.Enqueue(to_ref)) << "Mark queue overflow";
+  Thread* self = Thread::Current();  // TODO: pass self as an argument from call sites?
+  CHECK(thread_running_gc_ != nullptr);
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    if (self == thread_running_gc_) {
+      // If GC-running thread, use the GC mark stack instead of a thread-local mark stack.
+      CHECK(self->GetThreadLocalMarkStack() == nullptr);
+      CHECK(!gc_mark_stack_->IsFull());
+      gc_mark_stack_->PushBack(to_ref);
+    } else {
+      // Otherwise, use a thread-local mark stack.
+      accounting::AtomicStack<mirror::Object>* tl_mark_stack = self->GetThreadLocalMarkStack();
+      if (UNLIKELY(tl_mark_stack == nullptr || tl_mark_stack->IsFull())) {
+        MutexLock mu(self, mark_stack_lock_);
+        // Get a new thread local mark stack.
+        accounting::AtomicStack<mirror::Object>* new_tl_mark_stack;
+        if (!pooled_mark_stacks_.empty()) {
+          // Use a pooled mark stack.
+          new_tl_mark_stack = pooled_mark_stacks_.back();
+          pooled_mark_stacks_.pop_back();
+        } else {
+          // None pooled. Create a new one.
+          new_tl_mark_stack =
+              accounting::AtomicStack<mirror::Object>::Create(
+                  "thread local mark stack", 4 * KB, 4 * KB);
+        }
+        DCHECK(new_tl_mark_stack != nullptr);
+        DCHECK(new_tl_mark_stack->IsEmpty());
+        new_tl_mark_stack->PushBack(to_ref);
+        self->SetThreadLocalMarkStack(new_tl_mark_stack);
+        if (tl_mark_stack != nullptr) {
+          // Store the old full stack into a vector.
+          revoked_mark_stacks_.push_back(tl_mark_stack);
+        }
+      } else {
+        tl_mark_stack->PushBack(to_ref);
+      }
+    }
+  } else if (mark_stack_mode == kMarkStackModeShared) {
+    // Access the shared GC mark stack with a lock.
+    MutexLock mu(self, mark_stack_lock_);
+    CHECK(!gc_mark_stack_->IsFull());
+    gc_mark_stack_->PushBack(to_ref);
   } else {
-    CHECK(mark_queue_.EnqueueThreadUnsafe(to_ref)) << "Mark queue overflow";
+    CHECK_EQ(static_cast<uint32_t>(mark_stack_mode),
+             static_cast<uint32_t>(kMarkStackModeGcExclusive));
+    CHECK(self == thread_running_gc_)
+        << "Only GC-running thread should access the mark stack "
+        << "in the GC exclusive mark stack mode";
+    // Access the GC mark stack without a lock.
+    CHECK(!gc_mark_stack_->IsFull());
+    gc_mark_stack_->PushBack(to_ref);
   }
 }
 
@@ -552,7 +650,7 @@
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 class ConcurrentCopyingVerifyNoFromSpaceRefsObjectVisitor {
@@ -640,16 +738,9 @@
     }
     collector_->AssertToSpaceInvariant(nullptr, MemberOffset(0), ref);
   }
-  static void RootCallback(mirror::Object** root, void *arg, const RootInfo& /*root_info*/)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ConcurrentCopying* collector = reinterpret_cast<ConcurrentCopying*>(arg);
-    ConcurrentCopyingAssertToSpaceInvariantRefsVisitor visitor(collector);
-    DCHECK(root != nullptr);
-    visitor(*root);
-  }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 class ConcurrentCopyingAssertToSpaceInvariantFieldVisitor {
@@ -670,7 +761,7 @@
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 class ConcurrentCopyingAssertToSpaceInvariantObjectVisitor {
@@ -693,93 +784,310 @@
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
-bool ConcurrentCopying::ProcessMarkStack() {
+class RevokeThreadLocalMarkStackCheckpoint : public Closure {
+ public:
+  explicit RevokeThreadLocalMarkStackCheckpoint(ConcurrentCopying* concurrent_copying,
+                                                bool disable_weak_ref_access)
+      : concurrent_copying_(concurrent_copying),
+        disable_weak_ref_access_(disable_weak_ref_access) {
+  }
+
+  virtual void Run(Thread* thread) OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
+    // Revoke thread local mark stacks.
+    accounting::AtomicStack<mirror::Object>* tl_mark_stack = thread->GetThreadLocalMarkStack();
+    if (tl_mark_stack != nullptr) {
+      MutexLock mu(self, concurrent_copying_->mark_stack_lock_);
+      concurrent_copying_->revoked_mark_stacks_.push_back(tl_mark_stack);
+      thread->SetThreadLocalMarkStack(nullptr);
+    }
+    // Disable weak ref access.
+    if (disable_weak_ref_access_) {
+      thread->SetWeakRefAccessEnabled(false);
+    }
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      concurrent_copying_->GetBarrier().Pass(self);
+    }
+  }
+
+ private:
+  ConcurrentCopying* const concurrent_copying_;
+  const bool disable_weak_ref_access_;
+};
+
+void ConcurrentCopying::RevokeThreadLocalMarkStacks(bool disable_weak_ref_access) {
+  Thread* self = Thread::Current();
+  RevokeThreadLocalMarkStackCheckpoint check_point(this, disable_weak_ref_access);
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+  gc_barrier_->Init(self, 0);
+  size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+  // If there are no threads to wait for, which implies that all the checkpoint functions have
+  // finished, then there is no need to release the mutator lock.
+  if (barrier_count == 0) {
+    return;
+  }
+  Locks::mutator_lock_->SharedUnlock(self);
+  {
+    ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+    gc_barrier_->Increment(self, barrier_count);
+  }
+  Locks::mutator_lock_->SharedLock(self);
+}
+
+void ConcurrentCopying::RevokeThreadLocalMarkStack(Thread* thread) {
+  Thread* self = Thread::Current();
+  CHECK_EQ(self, thread);
+  accounting::AtomicStack<mirror::Object>* tl_mark_stack = thread->GetThreadLocalMarkStack();
+  if (tl_mark_stack != nullptr) {
+    CHECK(is_marking_);
+    MutexLock mu(self, mark_stack_lock_);
+    revoked_mark_stacks_.push_back(tl_mark_stack);
+    thread->SetThreadLocalMarkStack(nullptr);
+  }
+}
+
+void ConcurrentCopying::ProcessMarkStack() {
   if (kVerboseMode) {
     LOG(INFO) << "ProcessMarkStack. ";
   }
-  size_t count = 0;
-  mirror::Object* to_ref;
-  while ((to_ref = PopOffMarkStack()) != nullptr) {
-    ++count;
-    DCHECK(!region_space_->IsInFromSpace(to_ref));
-    if (kUseBakerReadBarrier) {
-      DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
-          << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
-          << " is_marked=" << IsMarked(to_ref);
+  bool empty_prev = false;
+  while (true) {
+    bool empty = ProcessMarkStackOnce();
+    if (empty_prev && empty) {
+      // Saw empty mark stack for a second time, done.
+      break;
     }
-    // Scan ref fields.
-    Scan(to_ref);
-    // Mark the gray ref as white or black.
-    if (kUseBakerReadBarrier) {
-      DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
-          << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
-          << " is_marked=" << IsMarked(to_ref);
-    }
-    if (to_ref->GetClass<kVerifyNone, kWithoutReadBarrier>()->IsTypeOfReferenceClass() &&
-        to_ref->AsReference()->GetReferent<kWithoutReadBarrier>() != nullptr &&
-        !IsInToSpace(to_ref->AsReference()->GetReferent<kWithoutReadBarrier>())) {
-      // Leave References gray so that GetReferent() will trigger RB.
-      CHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
-    } else {
-#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
-      if (kUseBakerReadBarrier) {
-        if (region_space_->IsInToSpace(to_ref)) {
-          // If to-space, change from gray to white.
-          bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                             ReadBarrier::WhitePtr());
-          CHECK(success) << "Must succeed as we won the race.";
-          CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr());
-        } else {
-          // If non-moving space/unevac from space, change from gray
-          // to black. We can't change gray to white because it's not
-          // safe to use CAS if two threads change values in opposite
-          // directions (A->B and B->A). So, we change it to black to
-          // indicate non-moving objects that have been marked
-          // through. Note we'd need to change from black to white
-          // later (concurrently).
-          bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                             ReadBarrier::BlackPtr());
-          CHECK(success) << "Must succeed as we won the race.";
-          CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::BlackPtr());
-        }
-      }
-#else
-      DCHECK(!kUseBakerReadBarrier);
-#endif
-    }
-    if (ReadBarrier::kEnableToSpaceInvariantChecks || kIsDebugBuild) {
-      ConcurrentCopyingAssertToSpaceInvariantObjectVisitor visitor(this);
-      visitor(to_ref);
-    }
+    empty_prev = empty;
   }
+}
+
+bool ConcurrentCopying::ProcessMarkStackOnce() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK(self == thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  size_t count = 0;
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    // Process the thread-local mark stacks and the GC mark stack.
+    count += ProcessThreadLocalMarkStacks(false);
+    while (!gc_mark_stack_->IsEmpty()) {
+      mirror::Object* to_ref = gc_mark_stack_->PopBack();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    gc_mark_stack_->Reset();
+  } else if (mark_stack_mode == kMarkStackModeShared) {
+    // Process the shared GC mark stack with a lock.
+    {
+      MutexLock mu(self, mark_stack_lock_);
+      CHECK(revoked_mark_stacks_.empty());
+    }
+    while (true) {
+      std::vector<mirror::Object*> refs;
+      {
+        // Copy refs with lock. Note the number of refs should be small.
+        MutexLock mu(self, mark_stack_lock_);
+        if (gc_mark_stack_->IsEmpty()) {
+          break;
+        }
+        for (StackReference<mirror::Object>* p = gc_mark_stack_->Begin();
+             p != gc_mark_stack_->End(); ++p) {
+          refs.push_back(p->AsMirrorPtr());
+        }
+        gc_mark_stack_->Reset();
+      }
+      for (mirror::Object* ref : refs) {
+        ProcessMarkStackRef(ref);
+        ++count;
+      }
+    }
+  } else {
+    CHECK_EQ(static_cast<uint32_t>(mark_stack_mode),
+             static_cast<uint32_t>(kMarkStackModeGcExclusive));
+    {
+      MutexLock mu(self, mark_stack_lock_);
+      CHECK(revoked_mark_stacks_.empty());
+    }
+    // Process the GC mark stack in the exclusive mode. No need to take the lock.
+    while (!gc_mark_stack_->IsEmpty()) {
+      mirror::Object* to_ref = gc_mark_stack_->PopBack();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    gc_mark_stack_->Reset();
+  }
+
   // Return true if the stack was empty.
   return count == 0;
 }
 
-void ConcurrentCopying::CheckEmptyMarkQueue() {
-  if (!mark_queue_.IsEmpty()) {
-    while (!mark_queue_.IsEmpty()) {
-      mirror::Object* obj = mark_queue_.Dequeue();
-      if (kUseBakerReadBarrier) {
-        mirror::Object* rb_ptr = obj->GetReadBarrierPointer();
-        LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj) << " rb_ptr=" << rb_ptr
-                  << " is_marked=" << IsMarked(obj);
+size_t ConcurrentCopying::ProcessThreadLocalMarkStacks(bool disable_weak_ref_access) {
+  // Run a checkpoint to collect all thread local mark stacks and iterate over them all.
+  RevokeThreadLocalMarkStacks(disable_weak_ref_access);
+  size_t count = 0;
+  std::vector<accounting::AtomicStack<mirror::Object>*> mark_stacks;
+  {
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    // Make a copy of the mark stack vector.
+    mark_stacks = revoked_mark_stacks_;
+    revoked_mark_stacks_.clear();
+  }
+  for (accounting::AtomicStack<mirror::Object>* mark_stack : mark_stacks) {
+    for (StackReference<mirror::Object>* p = mark_stack->Begin(); p != mark_stack->End(); ++p) {
+      mirror::Object* to_ref = p->AsMirrorPtr();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    {
+      MutexLock mu(Thread::Current(), mark_stack_lock_);
+      if (pooled_mark_stacks_.size() >= kMarkStackPoolSize) {
+        // The pool has enough. Delete it.
+        delete mark_stack;
       } else {
-        LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj)
-                  << " is_marked=" << IsMarked(obj);
+        // Otherwise, put it into the pool for later reuse.
+        mark_stack->Reset();
+        pooled_mark_stacks_.push_back(mark_stack);
       }
     }
-    LOG(FATAL) << "mark queue is not empty";
+  }
+  return count;
+}
+
+void ConcurrentCopying::ProcessMarkStackRef(mirror::Object* to_ref) {
+  DCHECK(!region_space_->IsInFromSpace(to_ref));
+  if (kUseBakerReadBarrier) {
+    DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
+        << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
+        << " is_marked=" << IsMarked(to_ref);
+  }
+  // Scan ref fields.
+  Scan(to_ref);
+  // Mark the gray ref as white or black.
+  if (kUseBakerReadBarrier) {
+    DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
+        << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
+        << " is_marked=" << IsMarked(to_ref);
+  }
+  if (to_ref->GetClass<kVerifyNone, kWithoutReadBarrier>()->IsTypeOfReferenceClass() &&
+      to_ref->AsReference()->GetReferent<kWithoutReadBarrier>() != nullptr &&
+      !IsInToSpace(to_ref->AsReference()->GetReferent<kWithoutReadBarrier>())) {
+    // Leave References gray so that GetReferent() will trigger RB.
+    CHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
+  } else {
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+    if (kUseBakerReadBarrier) {
+      if (region_space_->IsInToSpace(to_ref)) {
+        // If to-space, change from gray to white.
+        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
+                                                           ReadBarrier::WhitePtr());
+        CHECK(success) << "Must succeed as we won the race.";
+        CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr());
+      } else {
+        // If non-moving space/unevac from space, change from gray
+        // to black. We can't change gray to white because it's not
+        // safe to use CAS if two threads change values in opposite
+        // directions (A->B and B->A). So, we change it to black to
+        // indicate non-moving objects that have been marked
+        // through. Note we'd need to change from black to white
+        // later (concurrently).
+        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
+                                                           ReadBarrier::BlackPtr());
+        CHECK(success) << "Must succeed as we won the race.";
+        CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::BlackPtr());
+      }
+    }
+#else
+    DCHECK(!kUseBakerReadBarrier);
+#endif
+  }
+  if (ReadBarrier::kEnableToSpaceInvariantChecks || kIsDebugBuild) {
+    ConcurrentCopyingAssertToSpaceInvariantObjectVisitor visitor(this);
+    visitor(to_ref);
+  }
+}
+
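ProcessMarkStackRef() retires a scanned gray object to white when it is in the to-space and to black otherwise, so two threads never CAS the same state word in opposite directions. A small sketch of that one-way transition, with stand-in types:

#include <atomic>
#include <cassert>

// Stand-in read-barrier states (white/gray/black) and object.
enum class RbState { kWhite, kGray, kBlack };

struct Obj {
  std::atomic<RbState> rb_state{RbState::kGray};
  bool in_to_space = false;
};

// After scanning a gray object, retire it: gray->white in the to-space,
// gray->black elsewhere, so concurrent threads never race A->B against B->A.
inline void RetireAfterScan(Obj* obj) {
  RbState expected = RbState::kGray;
  RbState target = obj->in_to_space ? RbState::kWhite : RbState::kBlack;
  bool success = obj->rb_state.compare_exchange_strong(expected, target);
  assert(success && "only the scanning thread retires a gray object");
  (void)success;
}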
+void ConcurrentCopying::SwitchToSharedMarkStackMode() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
+           static_cast<uint32_t>(kMarkStackModeThreadLocal));
+  mark_stack_mode_.StoreRelaxed(kMarkStackModeShared);
+  CHECK(weak_ref_access_enabled_.LoadRelaxed());
+  weak_ref_access_enabled_.StoreRelaxed(false);
+  QuasiAtomic::ThreadFenceForConstructor();
+  // Process the thread local mark stacks one last time after switching to the shared mark stack
+  // mode and disabling weak ref accesses.
+  ProcessThreadLocalMarkStacks(true);
+  if (kVerboseMode) {
+    LOG(INFO) << "Switched to shared mark stack mode and disabled weak ref access";
+  }
+}
+
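The ordering in SwitchToSharedMarkStackMode() is deliberate: publish the new mode and the cleared weak-ref-access flag, fence, then drain the thread-local stacks one final time. A simplified sketch of that sequence, assuming a generic drain hook:

#include <atomic>

// Simplified stand-ins for mark_stack_mode_ and weak_ref_access_enabled_.
std::atomic<int> mark_stack_mode{0};     // 0 = thread-local, 1 = shared.
std::atomic<bool> weak_ref_access{true};

void SwitchToShared(void (*drain_thread_local_stacks)(bool disable_weak_refs)) {
  // Publish the new mode and the disabled weak-ref access first...
  mark_stack_mode.store(1, std::memory_order_relaxed);
  weak_ref_access.store(false, std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_seq_cst);  // ...then fence...
  // ...and drain the thread-local stacks one last time so nothing pushed
  // under the old mode is left behind.
  drain_thread_local_stacks(true);
}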
+void ConcurrentCopying::SwitchToGcExclusiveMarkStackMode() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
+           static_cast<uint32_t>(kMarkStackModeShared));
+  mark_stack_mode_.StoreRelaxed(kMarkStackModeGcExclusive);
+  QuasiAtomic::ThreadFenceForConstructor();
+  if (kVerboseMode) {
+    LOG(INFO) << "Switched to GC exclusive mark stack mode";
+  }
+}
+
+void ConcurrentCopying::CheckEmptyMarkStack() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    // Thread-local mark stack mode.
+    RevokeThreadLocalMarkStacks(false);
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    if (!revoked_mark_stacks_.empty()) {
+      for (accounting::AtomicStack<mirror::Object>* mark_stack : revoked_mark_stacks_) {
+        while (!mark_stack->IsEmpty()) {
+          mirror::Object* obj = mark_stack->PopBack();
+          if (kUseBakerReadBarrier) {
+            mirror::Object* rb_ptr = obj->GetReadBarrierPointer();
+            LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj) << " rb_ptr=" << rb_ptr
+                      << " is_marked=" << IsMarked(obj);
+          } else {
+            LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj)
+                      << " is_marked=" << IsMarked(obj);
+          }
+        }
+      }
+      LOG(FATAL) << "mark stack is not empty";
+    }
+  } else {
+    // Shared, GC-exclusive, or off.
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    CHECK(gc_mark_stack_->IsEmpty());
+    CHECK(revoked_mark_stacks_.empty());
   }
 }
 
 void ConcurrentCopying::SweepSystemWeaks(Thread* self) {
   TimingLogger::ScopedTiming split("SweepSystemWeaks", GetTimings());
   ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  Runtime::Current()->SweepSystemWeaks(IsMarkedCallback, this);
+  Runtime::Current()->SweepSystemWeaks(this);
 }
 
 void ConcurrentCopying::Sweep(bool swap_bitmaps) {
@@ -792,7 +1100,7 @@
     heap_->MarkAllocStackAsLive(live_stack);
     live_stack->Reset();
   }
-  CHECK(mark_queue_.IsEmpty());
+  CheckEmptyMarkStack();
   TimingLogger::ScopedTiming split("Sweep", GetTimings());
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsContinuousMemMapAllocSpace()) {
@@ -888,8 +1196,8 @@
     }
     IssueEmptyCheckpoint();
     // Disable the check.
-    is_mark_queue_push_disallowed_.StoreSequentiallyConsistent(0);
-    CheckEmptyMarkQueue();
+    is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(0);
+    CheckEmptyMarkStack();
   }
 
   {
@@ -956,6 +1264,8 @@
     region_space_bitmap_ = nullptr;
   }
 
+  CheckEmptyMarkStack();
+
   if (kVerboseMode) {
     LOG(INFO) << "GC end of ReclaimPhase";
   }
@@ -982,7 +1292,7 @@
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 // Compute how much live objects are left in regions.
@@ -1479,7 +1789,7 @@
       }
       DCHECK(GetFwdPtr(from_ref) == to_ref);
       CHECK_NE(to_ref->GetLockWord(false).GetState(), LockWord::kForwardingAddress);
-      PushOntoMarkStack<true>(to_ref);
+      PushOntoMarkStack(to_ref);
       return to_ref;
     } else {
       // The CAS failed. It may have lost the race or may have failed
@@ -1612,7 +1922,7 @@
       if (kUseBakerReadBarrier) {
         DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
       }
-      PushOntoMarkStack<true>(to_ref);
+      PushOntoMarkStack(to_ref);
     }
   } else {
     // from_ref is in a non-moving space.
@@ -1639,7 +1949,7 @@
         if (kUseBakerReadBarrier) {
           DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
         }
-        PushOntoMarkStack<true>(to_ref);
+        PushOntoMarkStack(to_ref);
       }
     } else {
       // Use the mark bitmap.
@@ -1695,7 +2005,7 @@
             if (kUseBakerReadBarrier) {
               DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
             }
-            PushOntoMarkStack<true>(to_ref);
+            PushOntoMarkStack(to_ref);
           }
         }
       }
@@ -1705,9 +2015,11 @@
 }
 
 void ConcurrentCopying::FinishPhase() {
+  {
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    CHECK_EQ(pooled_mark_stacks_.size(), kMarkStackPoolSize);
+  }
   region_space_ = nullptr;
-  CHECK(mark_queue_.IsEmpty());
-  mark_queue_.Clear();
   {
     MutexLock mu(Thread::Current(), skipped_blocks_lock_);
     skipped_blocks_map_.clear();
@@ -1716,14 +2028,9 @@
   heap_->ClearMarkedObjects();
 }
 
-mirror::Object* ConcurrentCopying::IsMarkedCallback(mirror::Object* from_ref, void* arg) {
-  return reinterpret_cast<ConcurrentCopying*>(arg)->IsMarked(from_ref);
-}
-
-bool ConcurrentCopying::IsHeapReferenceMarkedCallback(
-    mirror::HeapReference<mirror::Object>* field, void* arg) {
+bool ConcurrentCopying::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) {
   mirror::Object* from_ref = field->AsMirrorPtr();
-  mirror::Object* to_ref = reinterpret_cast<ConcurrentCopying*>(arg)->IsMarked(from_ref);
+  mirror::Object* to_ref = IsMarked(from_ref);
   if (to_ref == nullptr) {
     return false;
   }
@@ -1735,25 +2042,20 @@
   return true;
 }
 
-mirror::Object* ConcurrentCopying::MarkCallback(mirror::Object* from_ref, void* arg) {
-  return reinterpret_cast<ConcurrentCopying*>(arg)->Mark(from_ref);
-}
-
-void ConcurrentCopying::ProcessMarkStackCallback(void* arg) {
-  reinterpret_cast<ConcurrentCopying*>(arg)->ProcessMarkStack();
+mirror::Object* ConcurrentCopying::MarkObject(mirror::Object* from_ref) {
+  return Mark(from_ref);
 }
 
 void ConcurrentCopying::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) {
-  heap_->GetReferenceProcessor()->DelayReferenceReferent(
-      klass, reference, &IsHeapReferenceMarkedCallback, this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference, this);
 }
 
-void ConcurrentCopying::ProcessReferences(Thread* self, bool concurrent) {
+void ConcurrentCopying::ProcessReferences(Thread* self) {
   TimingLogger::ScopedTiming split("ProcessReferences", GetTimings());
+  // We don't really need to lock the heap bitmap lock as we use CAS to mark in bitmaps.
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->GetReferenceProcessor()->ProcessReferences(
-      concurrent, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
-      &IsHeapReferenceMarkedCallback, &MarkCallback, &ProcessMarkStackCallback, this);
+      true /*concurrent*/, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 void ConcurrentCopying::RevokeAllThreadLocalBuffers() {
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index b1897b8..4f92ea0 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -49,89 +49,6 @@
 
 namespace collector {
 
-// Concurrent queue. Used as the mark stack. TODO: use a concurrent
-// stack for locality.
-class MarkQueue {
- public:
-  explicit MarkQueue(size_t size) : size_(size) {
-    CHECK(IsPowerOfTwo(size_));
-    buf_.reset(new Atomic<mirror::Object*>[size_]);
-    CHECK(buf_.get() != nullptr);
-    Clear();
-  }
-
-  ALWAYS_INLINE Atomic<mirror::Object*>* GetSlotAddr(size_t index) {
-    return &(buf_.get()[index & (size_ - 1)]);
-  }
-
-  // Multiple-proceducer enqueue.
-  bool Enqueue(mirror::Object* to_ref) {
-    size_t t;
-    do {
-      t = tail_.LoadRelaxed();
-      size_t h = head_.LoadSequentiallyConsistent();
-      if (t + size_ == h) {
-        // It's full.
-        return false;
-      }
-    } while (!tail_.CompareExchangeWeakSequentiallyConsistent(t, t + 1));
-    // We got a slot but its content has not been filled yet at this point.
-    GetSlotAddr(t)->StoreSequentiallyConsistent(to_ref);
-    return true;
-  }
-
-  // Thread-unsafe.
-  bool EnqueueThreadUnsafe(mirror::Object* to_ref) {
-    size_t t = tail_.LoadRelaxed();
-    size_t h = head_.LoadRelaxed();
-    if (t + size_ == h) {
-      // It's full.
-      return false;
-    }
-    GetSlotAddr(t)->StoreRelaxed(to_ref);
-    tail_.StoreRelaxed(t + 1);
-    return true;
-  }
-
-  // Single-consumer dequeue.
-  mirror::Object* Dequeue() {
-    size_t h = head_.LoadRelaxed();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    if (h == t) {
-      // it's empty.
-      return nullptr;
-    }
-    Atomic<mirror::Object*>* slot = GetSlotAddr(h);
-    mirror::Object* ref = slot->LoadSequentiallyConsistent();
-    while (ref == nullptr) {
-      // Wait until the slot content becomes visible.
-      ref = slot->LoadSequentiallyConsistent();
-    }
-    slot->StoreRelaxed(nullptr);
-    head_.StoreSequentiallyConsistent(h + 1);
-    return ref;
-  }
-
-  bool IsEmpty() {
-    size_t h = head_.LoadSequentiallyConsistent();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    return h == t;
-  }
-
-  void Clear() {
-    head_.StoreRelaxed(0);
-    tail_.StoreRelaxed(0);
-    memset(buf_.get(), 0, size_ * sizeof(Atomic<mirror::Object*>));
-  }
-
- private:
-  Atomic<size_t> head_;
-  Atomic<size_t> tail_;
-
-  size_t size_;
-  std::unique_ptr<Atomic<mirror::Object*>[]> buf_;
-};
-
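The MarkQueue removed above addressed its slots with a bit mask, which is why its constructor checked that the capacity is a power of two. A tiny sketch of that indexing trick:

#include <cassert>
#include <cstddef>

// The mask only implements "index % capacity" when capacity is a power of
// two, hence the CHECK(IsPowerOfTwo(size_)) in the deleted constructor.
inline std::size_t SlotIndex(std::size_t index, std::size_t capacity) {
  assert(capacity != 0 && (capacity & (capacity - 1)) == 0);
  return index & (capacity - 1);
}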
 class ConcurrentCopying : public GarbageCollector {
  public:
   // TODO: disable these flags for production use.
@@ -185,10 +102,12 @@
   Barrier& GetBarrier() {
     return *gc_barrier_;
   }
+  bool IsWeakRefAccessEnabled() {
+    return weak_ref_access_enabled_.LoadRelaxed();
+  }
+  void RevokeThreadLocalMarkStack(Thread* thread) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
-  mirror::Object* PopOffMarkStack();
-  template<bool kThreadSafe>
   void PushOntoMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* Copy(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void Scan(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -202,20 +121,25 @@
   void VerifyNoFromSpaceReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   accounting::ObjectStack* GetAllocationStack();
   accounting::ObjectStack* GetLiveStack();
-  bool ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
+  void ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool ProcessMarkStackOnce() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessMarkStackRef(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  size_t ProcessThreadLocalMarkStacks(bool disable_weak_ref_access)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ProcessReferences(Thread* self, bool concurrent)
+  void RevokeThreadLocalMarkStacks(bool disable_weak_ref_access)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  mirror::Object* IsMarked(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static mirror::Object* MarkCallback(mirror::Object* from_ref, void* arg)
+  void SwitchToSharedMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SwitchToGcExclusiveMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static mirror::Object* IsMarkedCallback(mirror::Object* from_ref, void* arg)
+  void ProcessReferences(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual mirror::Object* MarkObject(mirror::Object* from_ref) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static bool IsHeapReferenceMarkedCallback(
-      mirror::HeapReference<mirror::Object>* field, void* arg)
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* from_ref) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void ProcessMarkStackCallback(void* arg)
+  virtual mirror::Object* IsMarked(mirror::Object* from_ref) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void SweepSystemWeaks(Thread* self)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
@@ -229,7 +153,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* AllocateInSkippedBlock(size_t alloc_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void CheckEmptyMarkQueue() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void CheckEmptyMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void IssueEmptyCheckpoint() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
@@ -242,10 +166,19 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AssertToSpaceInvariantInNonMovingSpace(mirror::Object* obj, mirror::Object* ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ReenableWeakRefAccess(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
-  MarkQueue mark_queue_;
+  std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+  Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  std::vector<accounting::ObjectStack*> revoked_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  static constexpr size_t kMarkStackSize = kPageSize;
+  static constexpr size_t kMarkStackPoolSize = 256;
+  std::vector<accounting::ObjectStack*> pooled_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  Thread* thread_running_gc_;
   bool is_marking_;                       // True while marking is ongoing.
   bool is_active_;                        // True while the collection is ongoing.
   bool is_asserting_to_space_invariant_;  // True while asserting the to-space invariant.
@@ -258,7 +191,18 @@
   size_t live_stack_freeze_size_;
   size_t from_space_num_objects_at_first_pause_;
   size_t from_space_num_bytes_at_first_pause_;
-  Atomic<int> is_mark_queue_push_disallowed_;
+  Atomic<int> is_mark_stack_push_disallowed_;
+  enum MarkStackMode {
+    kMarkStackModeOff = 0,      // Mark stack is off.
+    kMarkStackModeThreadLocal,  // All threads except for the GC-running thread push refs onto
+                                // thread-local mark stacks. The GC-running thread pushes onto and
+                                // pops off the GC mark stack without a lock.
+    kMarkStackModeShared,       // All threads share the GC mark stack with a lock.
+    kMarkStackModeGcExclusive   // The GC-running thread pushes onto and pops from the GC mark stack
+                                // without a lock. Other threads won't access the mark stack.
+  };
+  Atomic<MarkStackMode> mark_stack_mode_;
+  Atomic<bool> weak_ref_access_enabled_;
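The enum comments above spell out who may push where in each mode. A hypothetical dispatch that follows those comments, with simplified globals standing in for the collector's atomic members (the real mode field is read with relaxed atomic loads):

#include <mutex>
#include <vector>

enum class Mode { kOff, kThreadLocal, kShared, kGcExclusive };

Mode g_mode = Mode::kThreadLocal;    // Stands in for mark_stack_mode_.
std::mutex g_shared_lock;            // Stands in for mark_stack_lock_.
std::vector<void*> g_shared_stack;   // Stands in for gc_mark_stack_.

void PushRef(void* ref, bool is_gc_thread) {
  thread_local std::vector<void*> local_stack;  // A thread-local mark stack.
  switch (g_mode) {
    case Mode::kThreadLocal:
      // Mutators push onto their own stack; only the GC thread uses the
      // shared stack, so it needs no lock in this mode.
      (is_gc_thread ? g_shared_stack : local_stack).push_back(ref);
      break;
    case Mode::kShared: {
      std::lock_guard<std::mutex> lock(g_shared_lock);  // Everyone shares one stack.
      g_shared_stack.push_back(ref);
      break;
    }
    case Mode::kGcExclusive:
      g_shared_stack.push_back(ref);  // Only the GC thread is running now.
      break;
    case Mode::kOff:
      break;  // Pushes are disallowed once marking has finished.
  }
}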
 
   // How many objects and bytes we moved. Used for accounting.
   Atomic<size_t> bytes_moved_;
@@ -284,6 +228,7 @@
   friend class ThreadFlipVisitor;
   friend class FlipCallback;
   friend class ConcurrentCopyingComputeUnevacFromSpaceLiveRatioVisitor;
+  friend class RevokeThreadLocalMarkStackCheckpoint;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(ConcurrentCopying);
 };
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 9b76d1a..e10bef4 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -17,6 +17,9 @@
 #ifndef ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_
 #define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_
 
+#include <stdint.h>
+#include <vector>
+
 #include "base/histogram.h"
 #include "base/mutex.h"
 #include "base/timing_logger.h"
@@ -24,10 +27,16 @@
 #include "gc/gc_cause.h"
 #include "gc_root.h"
 #include "gc_type.h"
-#include <stdint.h>
-#include <vector>
+#include "object_callbacks.h"
 
 namespace art {
+
+namespace mirror {
+class Class;
+class Object;
+class Reference;
+}  // namespace mirror
+
 namespace gc {
 
 class Heap;
@@ -113,7 +122,7 @@
   DISALLOW_COPY_AND_ASSIGN(Iteration);
 };
 
-class GarbageCollector : public RootVisitor {
+class GarbageCollector : public RootVisitor, public IsMarkedVisitor, public MarkObjectVisitor {
  public:
   class SCOPED_LOCKABLE ScopedPause {
    public:
@@ -172,6 +181,22 @@
   void RecordFreeLOS(const ObjectBytePair& freed);
   void DumpPerformanceInfo(std::ostream& os) LOCKS_EXCLUDED(pause_histogram_lock_);
 
+  // Helper functions for querying whether objects are marked. These are used for
+  // reading system weaks and processing references.
+  virtual mirror::Object* IsMarked(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+  // Used by reference processor.
+  virtual void ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+  // Force mark an object.
+  virtual mirror::Object* MarkObject(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+  virtual void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+
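Making GarbageCollector an IsMarkedVisitor and MarkObjectVisitor is what allows the rest of this change to drop the static callback + void* argument pairs. A hedged before/after sketch of that migration, with simplified names:

struct Object;  // Stand-in for mirror::Object.

struct IsMarkedVisitorSketch {
  virtual ~IsMarkedVisitorSketch() {}
  virtual Object* IsMarked(Object* obj) = 0;
};

// Old style: a free function plus an opaque argument.
// Object* IsMarkedCallback(Object* obj, void* arg) {
//   return reinterpret_cast<Collector*>(arg)->IsMarked(obj);
// }
// runtime->SweepSystemWeaks(IsMarkedCallback, collector);

// New style: the collector is the visitor, so the cast disappears.
void SweepWeakSlotSketch(IsMarkedVisitorSketch* visitor, Object** weak_slot) {
  // A weak reference is cleared when its target is no longer marked.
  if (*weak_slot != nullptr && visitor->IsMarked(*weak_slot) == nullptr) {
    *weak_slot = nullptr;
  }
}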
  protected:
   // Run all of the GC phases.
   virtual void RunPhases() = 0;
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 3c247cd..65e6b40 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -21,34 +21,19 @@
 #include "base/timing_logger.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/mod_union_table.h"
-#include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "gc/reference_processor.h"
-#include "gc/space/bump_pointer_space.h"
 #include "gc/space/bump_pointer_space-inl.h"
-#include "gc/space/image_space.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
-#include "indirect_reference_table.h"
-#include "intern_table.h"
-#include "jni_internal.h"
-#include "mark_sweep-inl.h"
-#include "monitor.h"
 #include "mirror/class-inl.h"
-#include "mirror/class_loader.h"
-#include "mirror/dex_cache.h"
-#include "mirror/reference-inl.h"
 #include "mirror/object-inl.h"
-#include "mirror/object_array.h"
-#include "mirror/object_array-inl.h"
 #include "runtime.h"
 #include "stack.h"
 #include "thread-inl.h"
 #include "thread_list.h"
 
-using ::art::mirror::Object;
-
 namespace art {
 namespace gc {
 namespace collector {
@@ -67,7 +52,7 @@
 
 MarkCompact::MarkCompact(Heap* heap, const std::string& name_prefix)
     : GarbageCollector(heap, name_prefix + (name_prefix.empty() ? "" : " ") + "mark compact"),
-      space_(nullptr), collector_name_(name_) {
+      space_(nullptr), collector_name_(name_), updating_references_(false) {
 }
 
 void MarkCompact::RunPhases() {
@@ -107,7 +92,7 @@
   void operator()(mirror::Object* obj) const EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_,
                                                                       Locks::heap_bitmap_lock_) {
     DCHECK_ALIGNED(obj, space::BumpPointerSpace::kAlignment);
-    DCHECK(collector_->IsMarked(obj));
+    DCHECK(collector_->IsMarked(obj) != nullptr);
     collector_->ForwardObject(obj);
   }
 
@@ -141,8 +126,7 @@
 void MarkCompact::ProcessReferences(Thread* self) {
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   heap_->GetReferenceProcessor()->ProcessReferences(
-      false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
-      &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this);
+      false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 class BitmapSetSlowPathVisitor {
@@ -156,29 +140,29 @@
   }
 };
 
-inline void MarkCompact::MarkObject(mirror::Object* obj) {
+inline mirror::Object* MarkCompact::MarkObject(mirror::Object* obj) {
   if (obj == nullptr) {
-    return;
+    return obj;
   }
   if (kUseBakerOrBrooksReadBarrier) {
     // Verify all the objects have the correct forward pointer installed.
     obj->AssertReadBarrierPointer();
   }
-  if (immune_region_.ContainsObject(obj)) {
-    return;
-  }
-  if (objects_before_forwarding_->HasAddress(obj)) {
-    if (!objects_before_forwarding_->Set(obj)) {
-      MarkStackPush(obj);  // This object was not previously marked.
-    }
-  } else {
-    DCHECK(!space_->HasAddress(obj));
-    BitmapSetSlowPathVisitor visitor;
-    if (!mark_bitmap_->Set(obj, visitor)) {
-      // This object was not previously marked.
-      MarkStackPush(obj);
+  if (!immune_region_.ContainsObject(obj)) {
+    if (objects_before_forwarding_->HasAddress(obj)) {
+      if (!objects_before_forwarding_->Set(obj)) {
+        MarkStackPush(obj);  // This object was not previously marked.
+      }
+    } else {
+      DCHECK(!space_->HasAddress(obj));
+      BitmapSetSlowPathVisitor visitor;
+      if (!mark_bitmap_->Set(obj, visitor)) {
+        // This object was not previously marked.
+        MarkStackPush(obj);
+      }
     }
   }
+  return obj;
 }
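MarkObject() above relies on the bitmap Set() call returning the previous value of the bit, so only newly marked objects are pushed for scanning. A minimal sketch of that idiom; both helper declarations are assumptions, not real runtime APIs:

// Assumed primitives: SetMarkBit() returns the previous value of the bit,
// as the bitmaps above do.
bool SetMarkBit(void* obj);
void PushOnMarkStack(void* obj);

inline void MarkSketch(void* obj) {
  if (obj == nullptr) {
    return;
  }
  if (!SetMarkBit(obj)) {
    // Bit was clear, so this is the first visit: queue it for scanning.
    PushOnMarkStack(obj);
  }
}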
 
 void MarkCompact::MarkingPhase() {
@@ -240,7 +224,7 @@
         TimingLogger::ScopedTiming t2(
             space->IsZygoteSpace() ? "UpdateAndMarkZygoteModUnionTable" :
                                      "UpdateAndMarkImageModUnionTable", GetTimings());
-        table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this);
+        table->UpdateAndMarkReferences(this);
       }
     }
   }
@@ -272,7 +256,7 @@
 }
 
 void MarkCompact::ResizeMarkStack(size_t new_size) {
-  std::vector<StackReference<Object>> temp(mark_stack_->Begin(), mark_stack_->End());
+  std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(), mark_stack_->End());
   CHECK_LE(mark_stack_->Size(), new_size);
   mark_stack_->Resize(new_size);
   for (auto& obj : temp) {
@@ -280,7 +264,7 @@
   }
 }
 
-inline void MarkCompact::MarkStackPush(Object* obj) {
+inline void MarkCompact::MarkStackPush(mirror::Object* obj) {
   if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) {
     ResizeMarkStack(mark_stack_->Capacity() * 2);
   }
@@ -288,23 +272,12 @@
   mark_stack_->PushBack(obj);
 }
 
-void MarkCompact::ProcessMarkStackCallback(void* arg) {
-  reinterpret_cast<MarkCompact*>(arg)->ProcessMarkStack();
-}
-
-mirror::Object* MarkCompact::MarkObjectCallback(mirror::Object* root, void* arg) {
-  reinterpret_cast<MarkCompact*>(arg)->MarkObject(root);
-  return root;
-}
-
-void MarkCompact::MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* obj_ptr,
-                                            void* arg) {
-  reinterpret_cast<MarkCompact*>(arg)->MarkObject(obj_ptr->AsMirrorPtr());
-}
-
-void MarkCompact::DelayReferenceReferentCallback(mirror::Class* klass, mirror::Reference* ref,
-                                                 void* arg) {
-  reinterpret_cast<MarkCompact*>(arg)->DelayReferenceReferent(klass, ref);
+void MarkCompact::MarkHeapReference(mirror::HeapReference<mirror::Object>* obj_ptr) {
+  if (updating_references_) {
+    UpdateHeapReference(obj_ptr);
+  } else {
+    MarkObject(obj_ptr->AsMirrorPtr());
+  }
 }
 
 void MarkCompact::VisitRoots(
@@ -373,6 +346,7 @@
 
 void MarkCompact::UpdateReferences() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  updating_references_ = true;
   Runtime* runtime = Runtime::Current();
   // Update roots.
   UpdateRootVisitor update_root_visitor(this);
@@ -387,7 +361,7 @@
           space->IsZygoteSpace() ? "UpdateZygoteModUnionTableReferences" :
                                    "UpdateImageModUnionTableReferences",
                                    GetTimings());
-      table->UpdateAndMarkReferences(&UpdateHeapReferenceCallback, this);
+      table->UpdateAndMarkReferences(this);
     } else {
       // No mod union table, so we need to scan the space using bitmap visit.
       // Scan the space using bitmap visit.
@@ -403,14 +377,15 @@
   CHECK(!kMovingClasses)
       << "Didn't update large object classes since they are assumed to not move.";
   // Update the system weaks, these should already have been swept.
-  runtime->SweepSystemWeaks(&MarkedForwardingAddressCallback, this);
+  runtime->SweepSystemWeaks(this);
   // Update the objects in the bump pointer space last, these objects don't have a bitmap.
   UpdateObjectReferencesVisitor visitor(this);
   objects_before_forwarding_->VisitMarkedRange(reinterpret_cast<uintptr_t>(space_->Begin()),
                                                reinterpret_cast<uintptr_t>(space_->End()),
                                                visitor);
   // Update the reference processor cleared list.
-  heap_->GetReferenceProcessor()->UpdateRoots(&MarkedForwardingAddressCallback, this);
+  heap_->GetReferenceProcessor()->UpdateRoots(this);
+  updating_references_ = false;
 }
 
 void MarkCompact::Compact() {
@@ -436,10 +411,6 @@
   Runtime::Current()->VisitRoots(this);
 }
 
-mirror::Object* MarkCompact::MarkedForwardingAddressCallback(mirror::Object* obj, void* arg) {
-  return reinterpret_cast<MarkCompact*>(arg)->GetMarkedForwardAddress(obj);
-}
-
 inline void MarkCompact::UpdateHeapReference(mirror::HeapReference<mirror::Object>* reference) {
   mirror::Object* obj = reference->AsMirrorPtr();
   if (obj != nullptr) {
@@ -451,17 +422,12 @@
   }
 }
 
-void MarkCompact::UpdateHeapReferenceCallback(mirror::HeapReference<mirror::Object>* reference,
-                                              void* arg) {
-  reinterpret_cast<MarkCompact*>(arg)->UpdateHeapReference(reference);
-}
-
 class UpdateReferenceVisitor {
  public:
   explicit UpdateReferenceVisitor(MarkCompact* collector) : collector_(collector) {
   }
 
-  void operator()(Object* obj, MemberOffset offset, bool /*is_static*/) const
+  void operator()(mirror::Object* obj, MemberOffset offset, bool /*is_static*/) const
       ALWAYS_INLINE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     collector_->UpdateHeapReference(obj->GetFieldObjectReferenceAddr<kVerifyNone>(offset));
   }
@@ -481,7 +447,7 @@
   obj->VisitReferences<kMovingClasses>(visitor, visitor);
 }
 
-inline mirror::Object* MarkCompact::GetMarkedForwardAddress(mirror::Object* obj) const {
+inline mirror::Object* MarkCompact::GetMarkedForwardAddress(mirror::Object* obj) {
   DCHECK(obj != nullptr);
   if (objects_before_forwarding_->HasAddress(obj)) {
     DCHECK(objects_before_forwarding_->Test(obj));
@@ -491,33 +457,30 @@
     return ret;
   }
   DCHECK(!space_->HasAddress(obj));
-  DCHECK(IsMarked(obj));
   return obj;
 }
 
-inline bool MarkCompact::IsMarked(const Object* object) const {
+mirror::Object* MarkCompact::IsMarked(mirror::Object* object) {
   if (immune_region_.ContainsObject(object)) {
-    return true;
+    return object;
+  }
+  if (updating_references_) {
+    return GetMarkedForwardAddress(object);
   }
   if (objects_before_forwarding_->HasAddress(object)) {
-    return objects_before_forwarding_->Test(object);
+    return objects_before_forwarding_->Test(object) ? object : nullptr;
   }
-  return mark_bitmap_->Test(object);
+  return mark_bitmap_->Test(object) ? object : nullptr;
 }
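IsMarked() now returns a pointer rather than a bool, so while updating_references_ is set the same query doubles as a forwarding-address lookup. A hypothetical caller showing the three possible answers:

struct Obj;  // Stand-in for mirror::Object.

// Assumed: returns null if dead, the forwarding address while references are
// being updated, otherwise the object itself.
Obj* IsMarkedSketch(Obj* ref);

void HandleWeakSlot(Obj** slot) {
  Obj* ref = *slot;
  if (ref == nullptr) {
    return;
  }
  Obj* marked = IsMarkedSketch(ref);
  if (marked == nullptr) {
    *slot = nullptr;   // Dead: clear the weak reference.
  } else if (marked != ref) {
    *slot = marked;    // Alive but moved: install the forwarding address.
  }
  // Otherwise alive in place: nothing to do.
}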
 
-mirror::Object* MarkCompact::IsMarkedCallback(mirror::Object* object, void* arg) {
-  return reinterpret_cast<MarkCompact*>(arg)->IsMarked(object) ? object : nullptr;
-}
-
-bool MarkCompact::HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* ref_ptr,
-                                              void* arg) {
+bool MarkCompact::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* ref_ptr) {
   // Side effect free since we call this before ever moving objects.
-  return reinterpret_cast<MarkCompact*>(arg)->IsMarked(ref_ptr->AsMirrorPtr());
+  return IsMarked(ref_ptr->AsMirrorPtr()) != nullptr;
 }
 
 void MarkCompact::SweepSystemWeaks() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->SweepSystemWeaks(IsMarkedCallback, this);
+  Runtime::Current()->SweepSystemWeaks(this);
 }
 
 bool MarkCompact::ShouldSweepSpace(space::ContinuousSpace* space) const {
@@ -592,8 +555,7 @@
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
 // marked, put it on the appropriate list in the heap for later processing.
 void MarkCompact::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) {
-  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference,
-                                                         &HeapReferenceMarkedCallback, this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference, this);
 }
 
 class MarkCompactMarkObjectVisitor {
@@ -601,7 +563,7 @@
   explicit MarkCompactMarkObjectVisitor(MarkCompact* collector) : collector_(collector) {
   }
 
-  void operator()(Object* obj, MemberOffset offset, bool /*is_static*/) const ALWAYS_INLINE
+  void operator()(mirror::Object* obj, MemberOffset offset, bool /*is_static*/) const ALWAYS_INLINE
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     // Object was already verified when we scanned it.
     collector_->MarkObject(obj->GetFieldObject<mirror::Object, kVerifyNone>(offset));
@@ -618,7 +580,7 @@
 };
 
 // Visit all of the references of an object and update.
-void MarkCompact::ScanObject(Object* obj) {
+void MarkCompact::ScanObject(mirror::Object* obj) {
   MarkCompactMarkObjectVisitor visitor(this);
   obj->VisitReferences<kMovingClasses>(visitor, visitor);
 }
@@ -627,7 +589,7 @@
 void MarkCompact::ProcessMarkStack() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   while (!mark_stack_->IsEmpty()) {
-    Object* obj = mark_stack_->PopBack();
+    mirror::Object* obj = mark_stack_->PopBack();
     DCHECK(obj != nullptr);
     ScanObject(obj);
   }
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index f59a2cd..89d66b5 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -121,23 +121,6 @@
                           const RootInfo& info)
       OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
-  static mirror::Object* MarkObjectCallback(mirror::Object* root, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static void MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* obj_ptr, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static bool HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* ref_ptr,
-                                          void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static void ProcessMarkStackCallback(void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
-
-  static void DelayReferenceReferentCallback(mirror::Class* klass, mirror::Reference* ref,
-                                             void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
   // Schedules an unmarked object for reference processing.
   void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
@@ -145,11 +128,7 @@
  protected:
   // Returns null if the object is not marked, otherwise returns the forwarding address (same as
   // object for non movable things).
-  mirror::Object* GetMarkedForwardAddress(mirror::Object* object) const
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  static mirror::Object* MarkedForwardingAddressCallback(mirror::Object* object, void* arg)
+  mirror::Object* GetMarkedForwardAddress(mirror::Object* object)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -184,30 +163,27 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
   // Update the references of objects by using the forwarding addresses.
   void UpdateReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
-  static void UpdateRootCallback(mirror::Object** root, void* arg, const RootInfo& /*root_info*/)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
   // Move objects and restore lock words.
   void MoveObjects() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Move a single object to its forward address.
   void MoveObject(mirror::Object* obj, size_t len) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Mark a single object.
-  void MarkObject(mirror::Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
-                                                                Locks::mutator_lock_);
-  bool IsMarked(const mirror::Object* obj) const
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-  static mirror::Object* IsMarkedCallback(mirror::Object* object, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  virtual mirror::Object* MarkObject(mirror::Object* obj) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj_ptr) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+  virtual mirror::Object* IsMarked(mirror::Object* obj) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   void ForwardObject(mirror::Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
                                                                    Locks::mutator_lock_);
   // Update a single heap reference.
   void UpdateHeapReference(mirror::HeapReference<mirror::Object>* reference)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void UpdateHeapReferenceCallback(mirror::HeapReference<mirror::Object>* reference,
-                                          void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Update all of the references of a single object.
   void UpdateObjectReferences(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
@@ -242,6 +218,9 @@
   // Which lock words we need to restore as we are moving objects.
   std::deque<LockWord> lock_words_to_restore_;
 
+  // Whether or not we are currently updating references.
+  bool updating_references_;
+
  private:
   friend class BitmapSetSlowPathVisitor;
   friend class CalculateObjectForwardingAddressVisitor;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 1c9c412..e0d6d6b 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -37,7 +37,6 @@
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "gc/reference_processor.h"
-#include "gc/space/image_space.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
 #include "mark_sweep-inl.h"
@@ -47,8 +46,6 @@
 #include "thread-inl.h"
 #include "thread_list.h"
 
-using ::art::mirror::Object;
-
 namespace art {
 namespace gc {
 namespace collector {
@@ -175,8 +172,7 @@
 void MarkSweep::ProcessReferences(Thread* self) {
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->GetReferenceProcessor()->ProcessReferences(
-      true, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
-      &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this);
+      true, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 void MarkSweep::PausePhase() {
@@ -273,7 +269,7 @@
       TimingLogger::ScopedTiming t(name, GetTimings());
       accounting::ModUnionTable* mod_union_table = heap_->FindModUnionTableFromSpace(space);
       CHECK(mod_union_table != nullptr);
-      mod_union_table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this);
+      mod_union_table->UpdateAndMarkReferences(this);
     }
   }
 }
@@ -333,7 +329,7 @@
     // Someone else acquired the lock and expanded the mark stack before us.
     return;
   }
-  std::vector<StackReference<Object>> temp(mark_stack_->Begin(), mark_stack_->End());
+  std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(), mark_stack_->End());
   CHECK_LE(mark_stack_->Size(), new_size);
   mark_stack_->Resize(new_size);
   for (auto& obj : temp) {
@@ -341,7 +337,7 @@
   }
 }
 
-inline void MarkSweep::MarkObjectNonNullParallel(Object* obj) {
+inline void MarkSweep::MarkObjectNonNullParallel(mirror::Object* obj) {
   DCHECK(obj != nullptr);
   if (MarkObjectParallel(obj)) {
     MutexLock mu(Thread::Current(), mark_stack_lock_);
@@ -353,28 +349,18 @@
   }
 }
 
-mirror::Object* MarkSweep::MarkObjectCallback(mirror::Object* obj, void* arg) {
-  MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->MarkObject(obj);
-  return obj;
-}
-
-void MarkSweep::MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* ref, void* arg) {
-  reinterpret_cast<MarkSweep*>(arg)->MarkObject(ref->AsMirrorPtr());
-}
-
-bool MarkSweep::HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* ref, void* arg) {
-  return reinterpret_cast<MarkSweep*>(arg)->IsMarked(ref->AsMirrorPtr());
+bool MarkSweep::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* ref) {
+  return IsMarked(ref->AsMirrorPtr());
 }
 
 class MarkSweepMarkObjectSlowPath {
  public:
-  explicit MarkSweepMarkObjectSlowPath(MarkSweep* mark_sweep, Object* holder = nullptr,
+  explicit MarkSweepMarkObjectSlowPath(MarkSweep* mark_sweep, mirror::Object* holder = nullptr,
                                        MemberOffset offset = MemberOffset(0))
       : mark_sweep_(mark_sweep), holder_(holder), offset_(offset) {
   }
 
-  void operator()(const Object* obj) const ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS {
+  void operator()(const mirror::Object* obj) const ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS {
     if (kProfileLargeObjects) {
       // TODO: Differentiate between marking and testing somehow.
       ++mark_sweep_->large_object_test_;
@@ -450,7 +436,8 @@
   MemberOffset offset_;
 };
 
-inline void MarkSweep::MarkObjectNonNull(Object* obj, Object* holder, MemberOffset offset) {
+inline void MarkSweep::MarkObjectNonNull(mirror::Object* obj, mirror::Object* holder,
+                                         MemberOffset offset) {
   DCHECK(obj != nullptr);
   if (kUseBakerOrBrooksReadBarrier) {
     // Verify all the objects have the correct pointer installed.
@@ -481,7 +468,7 @@
   }
 }
 
-inline void MarkSweep::PushOnMarkStack(Object* obj) {
+inline void MarkSweep::PushOnMarkStack(mirror::Object* obj) {
   if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) {
     // Lock is not needed but is here anyways to please annotalysis.
     MutexLock mu(Thread::Current(), mark_stack_lock_);
@@ -491,14 +478,14 @@
   mark_stack_->PushBack(obj);
 }
 
-inline bool MarkSweep::MarkObjectParallel(const Object* obj) {
+inline bool MarkSweep::MarkObjectParallel(mirror::Object* obj) {
   DCHECK(obj != nullptr);
   if (kUseBakerOrBrooksReadBarrier) {
     // Verify all the objects have the correct pointer installed.
     obj->AssertReadBarrierPointer();
   }
   if (immune_region_.ContainsObject(obj)) {
-    DCHECK(IsMarked(obj));
+    DCHECK(IsMarked(obj) != nullptr);
     return false;
   }
   // Try to take advantage of locality of references within a space, failing this find the space
@@ -511,8 +498,18 @@
   return !mark_bitmap_->AtomicTestAndSet(obj, visitor);
 }
 
+mirror::Object* MarkSweep::MarkObject(mirror::Object* obj) {
+  MarkObject(obj, nullptr, MemberOffset(0));
+  return obj;
+}
+
+void MarkSweep::MarkHeapReference(mirror::HeapReference<mirror::Object>* ref) {
+  MarkObject(ref->AsMirrorPtr(), nullptr, MemberOffset(0));
+}
+
 // Used to mark objects when processing the mark stack. If an object is null, it is not marked.
-inline void MarkSweep::MarkObject(Object* obj, Object* holder, MemberOffset offset) {
+inline void MarkSweep::MarkObject(mirror::Object* obj, mirror::Object* holder,
+                                  MemberOffset offset) {
   if (obj != nullptr) {
     MarkObjectNonNull(obj, holder, offset);
   } else if (kCountMarkedObjects) {
@@ -526,7 +523,7 @@
 
   void VisitRoot(mirror::Object* root, const RootInfo& info) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
-    CHECK(collector_->IsMarked(root)) << info.ToString();
+    CHECK(collector_->IsMarked(root) != nullptr) << info.ToString();
   }
 
  private:
@@ -599,7 +596,8 @@
   explicit ScanObjectVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE
       : mark_sweep_(mark_sweep) {}
 
-  void operator()(Object* obj) const ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+  void operator()(mirror::Object* obj) const ALWAYS_INLINE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     if (kCheckLocks) {
       Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
@@ -631,7 +629,7 @@
 class MarkStackTask : public Task {
  public:
   MarkStackTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, size_t mark_stack_size,
-                StackReference<Object>* mark_stack)
+                StackReference<mirror::Object>* mark_stack)
       : mark_sweep_(mark_sweep),
         thread_pool_(thread_pool),
         mark_stack_pos_(mark_stack_size) {
@@ -655,7 +653,7 @@
                                        MarkSweep* mark_sweep) ALWAYS_INLINE
             : chunk_task_(chunk_task), mark_sweep_(mark_sweep) {}
 
-    void operator()(Object* obj, MemberOffset offset, bool /* static */) const ALWAYS_INLINE
+    void operator()(mirror::Object* obj, MemberOffset offset, bool /* static */) const ALWAYS_INLINE
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
       mirror::Object* ref = obj->GetFieldObject<mirror::Object>(offset);
       if (ref != nullptr && mark_sweep_->MarkObjectParallel(ref)) {
@@ -681,7 +679,7 @@
         : chunk_task_(chunk_task) {}
 
     // No thread safety analysis since multiple threads will use this visitor.
-    void operator()(Object* obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+    void operator()(mirror::Object* obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
       MarkSweep* const mark_sweep = chunk_task_->mark_sweep_;
       MarkObjectParallelVisitor mark_visitor(chunk_task_, mark_sweep);
@@ -704,11 +702,12 @@
   MarkSweep* const mark_sweep_;
   ThreadPool* const thread_pool_;
   // Thread local mark stack for this task.
-  StackReference<Object> mark_stack_[kMaxSize];
+  StackReference<mirror::Object> mark_stack_[kMaxSize];
   // Mark stack position.
   size_t mark_stack_pos_;
 
-  ALWAYS_INLINE void MarkStackPush(Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  ALWAYS_INLINE void MarkStackPush(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (UNLIKELY(mark_stack_pos_ == kMaxSize)) {
       // Mark stack overflow, give 1/2 the stack to the thread pool as a new work task.
       mark_stack_pos_ /= 2;
@@ -732,12 +731,12 @@
     ScanObjectParallelVisitor visitor(this);
     // TODO: Tune this.
     static const size_t kFifoSize = 4;
-    BoundedFifoPowerOfTwo<Object*, kFifoSize> prefetch_fifo;
+    BoundedFifoPowerOfTwo<mirror::Object*, kFifoSize> prefetch_fifo;
     for (;;) {
-      Object* obj = nullptr;
+      mirror::Object* obj = nullptr;
       if (kUseMarkStackPrefetch) {
         while (mark_stack_pos_ != 0 && prefetch_fifo.size() < kFifoSize) {
-          Object* const mark_stack_obj = mark_stack_[--mark_stack_pos_].AsMirrorPtr();
+          mirror::Object* const mark_stack_obj = mark_stack_[--mark_stack_pos_].AsMirrorPtr();
           DCHECK(mark_stack_obj != nullptr);
           __builtin_prefetch(mark_stack_obj);
           prefetch_fifo.push_back(mark_stack_obj);
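The surrounding loop keeps a small FIFO so each popped object is prefetched a few iterations before it is scanned. A standalone sketch of that prefetch-window pattern (assumes the GCC/Clang __builtin_prefetch intrinsic):

#include <cstddef>
#include <deque>

// Hypothetical prefetch window over a work list; BoundedFifoPowerOfTwo is
// replaced by a plain deque here.
void DrainWithPrefetch(std::deque<void*>* work) {
  std::deque<void*> fifo;
  const std::size_t kFifoSize = 4;
  for (;;) {
    while (!work->empty() && fifo.size() < kFifoSize) {
      void* obj = work->back();
      work->pop_back();
      __builtin_prefetch(obj);   // Warm the cache before the object is scanned.
      fifo.push_back(obj);
    }
    if (fifo.empty()) {
      break;
    }
    void* obj = fifo.front();
    fifo.pop_front();
    // ScanObject(obj) would run here in the real collector.
    (void)obj;
  }
}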
@@ -764,7 +763,7 @@
   CardScanTask(ThreadPool* thread_pool, MarkSweep* mark_sweep,
                accounting::ContinuousSpaceBitmap* bitmap,
                uint8_t* begin, uint8_t* end, uint8_t minimum_age, size_t mark_stack_size,
-               StackReference<Object>* mark_stack_obj, bool clear_card)
+               StackReference<mirror::Object>* mark_stack_obj, bool clear_card)
       : MarkStackTask<false>(thread_pool, mark_sweep, mark_stack_size, mark_stack_obj),
         bitmap_(bitmap),
         begin_(begin),
@@ -815,8 +814,8 @@
     TimingLogger::ScopedTiming t(paused ? "(Paused)ScanGrayObjects" : __FUNCTION__,
         GetTimings());
     // Try to take some of the mark stack since we can pass this off to the worker tasks.
-    StackReference<Object>* mark_stack_begin = mark_stack_->Begin();
-    StackReference<Object>* mark_stack_end = mark_stack_->End();
+    StackReference<mirror::Object>* mark_stack_begin = mark_stack_->Begin();
+    StackReference<mirror::Object>* mark_stack_end = mark_stack_->End();
     const size_t mark_stack_size = mark_stack_end - mark_stack_begin;
     // Estimated number of work tasks we will create.
     const size_t mark_stack_tasks = GetHeap()->GetContinuousSpaces().size() * thread_count;
@@ -988,13 +987,6 @@
   ProcessMarkStack(false);
 }
 
-mirror::Object* MarkSweep::IsMarkedCallback(mirror::Object* object, void* arg) {
-  if (reinterpret_cast<MarkSweep*>(arg)->IsMarked(object)) {
-    return object;
-  }
-  return nullptr;
-}
-
 void MarkSweep::RecursiveMarkDirtyObjects(bool paused, uint8_t minimum_age) {
   ScanGrayObjects(paused, minimum_age);
   ProcessMarkStack(paused);
@@ -1015,16 +1007,23 @@
 void MarkSweep::SweepSystemWeaks(Thread* self) {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  Runtime::Current()->SweepSystemWeaks(IsMarkedCallback, this);
+  Runtime::Current()->SweepSystemWeaks(this);
 }
 
-mirror::Object* MarkSweep::VerifySystemWeakIsLiveCallback(Object* obj, void* arg) {
-  reinterpret_cast<MarkSweep*>(arg)->VerifyIsLive(obj);
-  // We don't actually want to sweep the object, so lets return "marked"
-  return obj;
-}
+class VerifySystemWeakVisitor : public IsMarkedVisitor {
+ public:
+  explicit VerifySystemWeakVisitor(MarkSweep* mark_sweep) : mark_sweep_(mark_sweep) {}
 
-void MarkSweep::VerifyIsLive(const Object* obj) {
+  virtual mirror::Object* IsMarked(mirror::Object* obj) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
+    mark_sweep_->VerifyIsLive(obj);
+    return obj;
+  }
+
+  MarkSweep* const mark_sweep_;
+};
+
+void MarkSweep::VerifyIsLive(const mirror::Object* obj) {
   if (!heap_->GetLiveBitmap()->Test(obj)) {
     // TODO: Consider live stack? Has this code bitrotted?
     CHECK(!heap_->allocation_stack_->Contains(obj))
@@ -1035,7 +1034,8 @@
 void MarkSweep::VerifySystemWeaks() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   // Verify system weaks, uses a special object visitor which returns the input object.
-  Runtime::Current()->SweepSystemWeaks(VerifySystemWeakIsLiveCallback, this);
+  VerifySystemWeakVisitor visitor(this);
+  Runtime::Current()->SweepSystemWeaks(&visitor);
 }
 
 class CheckpointMarkThreadRoots : public Closure, public RootVisitor {
@@ -1122,7 +1122,7 @@
   ObjectBytePair freed;
   ObjectBytePair freed_los;
   // How many objects are left in the array, modified after each space is swept.
-  StackReference<Object>* objects = allocations->Begin();
+  StackReference<mirror::Object>* objects = allocations->Begin();
   size_t count = allocations->Size();
   // Change the order to ensure that the non-moving space is swept last as an optimization.
   std::vector<space::ContinuousSpace*> sweep_spaces;
@@ -1150,9 +1150,9 @@
     if (swap_bitmaps) {
       std::swap(live_bitmap, mark_bitmap);
     }
-    StackReference<Object>* out = objects;
+    StackReference<mirror::Object>* out = objects;
     for (size_t i = 0; i < count; ++i) {
-      Object* const obj = objects[i].AsMirrorPtr();
+      mirror::Object* const obj = objects[i].AsMirrorPtr();
       if (kUseThreadLocalAllocationStack && obj == nullptr) {
         continue;
       }
@@ -1191,7 +1191,7 @@
       std::swap(large_live_objects, large_mark_objects);
     }
     for (size_t i = 0; i < count; ++i) {
-      Object* const obj = objects[i].AsMirrorPtr();
+      mirror::Object* const obj = objects[i].AsMirrorPtr();
       // Handle large objects.
       if (kUseThreadLocalAllocationStack && obj == nullptr) {
         continue;
@@ -1250,16 +1250,15 @@
   if (kCountJavaLangRefs) {
     ++reference_count_;
   }
-  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, ref, &HeapReferenceMarkedCallback,
-                                                         this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, ref, this);
 }
 
-class MarkObjectVisitor {
+class MarkVisitor {
  public:
-  explicit MarkObjectVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE : mark_sweep_(mark_sweep) {
+  explicit MarkVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE : mark_sweep_(mark_sweep) {
   }
 
-  void operator()(Object* obj, MemberOffset offset, bool /* is_static */) const
+  void operator()(mirror::Object* obj, MemberOffset offset, bool /* is_static */) const
       ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     if (kCheckLocks) {
@@ -1275,16 +1274,12 @@
 
 // Scans an object reference.  Determines the type of the reference
 // and dispatches to a specialized scanning routine.
-void MarkSweep::ScanObject(Object* obj) {
-  MarkObjectVisitor mark_visitor(this);
+void MarkSweep::ScanObject(mirror::Object* obj) {
+  MarkVisitor mark_visitor(this);
   DelayReferenceReferentVisitor ref_visitor(this);
   ScanObjectVisit(obj, mark_visitor, ref_visitor);
 }
 
-void MarkSweep::ProcessMarkStackCallback(void* arg) {
-  reinterpret_cast<MarkSweep*>(arg)->ProcessMarkStack(false);
-}
-
 void MarkSweep::ProcessMarkStackParallel(size_t thread_count) {
   Thread* self = Thread::Current();
   ThreadPool* thread_pool = GetHeap()->GetThreadPool();
@@ -1317,12 +1312,12 @@
   } else {
     // TODO: Tune this.
     static const size_t kFifoSize = 4;
-    BoundedFifoPowerOfTwo<Object*, kFifoSize> prefetch_fifo;
+    BoundedFifoPowerOfTwo<mirror::Object*, kFifoSize> prefetch_fifo;
     for (;;) {
-      Object* obj = nullptr;
+      mirror::Object* obj = nullptr;
       if (kUseMarkStackPrefetch) {
         while (!mark_stack_->IsEmpty() && prefetch_fifo.size() < kFifoSize) {
-          Object* mark_stack_obj = mark_stack_->PopBack();
+          mirror::Object* mark_stack_obj = mark_stack_->PopBack();
           DCHECK(mark_stack_obj != nullptr);
           __builtin_prefetch(mark_stack_obj);
           prefetch_fifo.push_back(mark_stack_obj);
@@ -1344,14 +1339,14 @@
   }
 }
 
-inline bool MarkSweep::IsMarked(const Object* object) const {
+inline mirror::Object* MarkSweep::IsMarked(mirror::Object* object) {
   if (immune_region_.ContainsObject(object)) {
-    return true;
+    return object;
   }
   if (current_space_bitmap_->HasAddress(object)) {
-    return current_space_bitmap_->Test(object);
+    return current_space_bitmap_->Test(object) ? object : nullptr;
   }
-  return mark_bitmap_->Test(object);
+  return mark_bitmap_->Test(object) ? object : nullptr;
 }
 
 void MarkSweep::FinishPhase() {
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index d29d87a..c13755c 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -170,18 +170,9 @@
 
   // Verify that an object is live, either in a live bitmap or in the allocation stack.
   void VerifyIsLive(const mirror::Object* obj)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
-  static mirror::Object* MarkObjectCallback(mirror::Object* obj, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  static void MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* ref, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  static bool HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* ref, void* arg)
+  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* ref) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -194,13 +185,14 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void ProcessMarkStackCallback(void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Marks an object.
-  void MarkObject(mirror::Object* obj, mirror::Object* holder = nullptr,
-                  MemberOffset offset = MemberOffset(0))
+  virtual mirror::Object* MarkObject(mirror::Object* obj) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  void MarkObject(mirror::Object* obj, mirror::Object* holder, MemberOffset offset)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* ref) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -214,15 +206,9 @@
 
  protected:
   // Returns the object if it is marked, null otherwise.
-  bool IsMarked(const mirror::Object* object) const
+  virtual mirror::Object* IsMarked(mirror::Object* object) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static mirror::Object* IsMarkedCallback(mirror::Object* object, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  static void VerifyImageRootVisitor(mirror::Object* root, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
   void MarkObjectNonNull(mirror::Object* obj, mirror::Object* holder = nullptr,
                          MemberOffset offset = MemberOffset(0))
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
@@ -233,7 +219,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns true if we need to add obj to a mark stack.
-  bool MarkObjectParallel(const mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
+  bool MarkObjectParallel(mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
 
   // Verify the roots of the heap and print out information related to any invalid roots.
   // Called in MarkObject, so we may not hold the mutator lock.
@@ -258,6 +244,11 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  virtual void ProcessMarkStack() OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    ProcessMarkStack(false);
+  }
+
   // Recursively blackens objects on the mark stack.
   void ProcessMarkStack(bool paused)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 82d02e7..2a9f47a 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -157,8 +157,7 @@
 void SemiSpace::ProcessReferences(Thread* self) {
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->GetReferenceProcessor()->ProcessReferences(
-      false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
-      &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this);
+      false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 void SemiSpace::MarkingPhase() {
@@ -336,7 +335,7 @@
           space->IsZygoteSpace() ? "UpdateAndMarkZygoteModUnionTable" :
                                    "UpdateAndMarkImageModUnionTable",
                                    GetTimings());
-      table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this);
+      table->UpdateAndMarkReferences(this);
       DCHECK(GetHeap()->FindRememberedSetFromSpace(space) == nullptr);
     } else if (collect_from_space_only_ && space->GetLiveBitmap() != nullptr) {
       // If the space has no mod union table (the non-moving space and main spaces when the bump
@@ -351,8 +350,7 @@
       CHECK_EQ(rem_set != nullptr, kUseRememberedSet);
       if (rem_set != nullptr) {
         TimingLogger::ScopedTiming t2("UpdateAndMarkRememberedSet", GetTimings());
-        rem_set->UpdateAndMarkReferences(MarkHeapReferenceCallback, DelayReferenceReferentCallback,
-                                         from_space_, this);
+        rem_set->UpdateAndMarkReferences(from_space_, this);
         if (kIsDebugBuild) {
           // Verify that there are no from-space references that
           // remain in the space, that is, the remembered set (and the
@@ -583,24 +581,14 @@
   return forward_address;
 }
 
-void SemiSpace::ProcessMarkStackCallback(void* arg) {
-  reinterpret_cast<SemiSpace*>(arg)->ProcessMarkStack();
-}
-
-mirror::Object* SemiSpace::MarkObjectCallback(mirror::Object* root, void* arg) {
+mirror::Object* SemiSpace::MarkObject(mirror::Object* root) {
   auto ref = StackReference<mirror::Object>::FromMirrorPtr(root);
-  reinterpret_cast<SemiSpace*>(arg)->MarkObject(&ref);
+  MarkObject(&ref);
   return ref.AsMirrorPtr();
 }
 
-void SemiSpace::MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* obj_ptr,
-                                          void* arg) {
-  reinterpret_cast<SemiSpace*>(arg)->MarkObject(obj_ptr);
-}
-
-void SemiSpace::DelayReferenceReferentCallback(mirror::Class* klass, mirror::Reference* ref,
-                                               void* arg) {
-  reinterpret_cast<SemiSpace*>(arg)->DelayReferenceReferent(klass, ref);
+void SemiSpace::MarkHeapReference(mirror::HeapReference<mirror::Object>* obj_ptr) {
+  MarkObject(obj_ptr);
 }
 
 void SemiSpace::VisitRoots(mirror::Object*** roots, size_t count,
@@ -628,29 +616,9 @@
   Runtime::Current()->VisitRoots(this);
 }
 
-bool SemiSpace::HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* object,
-                                            void* arg) {
-  mirror::Object* obj = object->AsMirrorPtr();
-  mirror::Object* new_obj =
-      reinterpret_cast<SemiSpace*>(arg)->GetMarkedForwardAddress(obj);
-  if (new_obj == nullptr) {
-    return false;
-  }
-  if (new_obj != obj) {
-    // Write barrier is not necessary since it still points to the same object, just at a different
-    // address.
-    object->Assign(new_obj);
-  }
-  return true;
-}
-
-mirror::Object* SemiSpace::MarkedForwardingAddressCallback(mirror::Object* object, void* arg) {
-  return reinterpret_cast<SemiSpace*>(arg)->GetMarkedForwardAddress(object);
-}
-
 void SemiSpace::SweepSystemWeaks() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->SweepSystemWeaks(MarkedForwardingAddressCallback, this);
+  Runtime::Current()->SweepSystemWeaks(this);
 }
 
 bool SemiSpace::ShouldSweepSpace(space::ContinuousSpace* space) const {
@@ -688,8 +656,7 @@
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
 // marked, put it on the appropriate list in the heap for later processing.
 void SemiSpace::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) {
-  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference,
-                                                         &HeapReferenceMarkedCallback, this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference, this);
 }
 
 class SemiSpaceMarkObjectVisitor {
@@ -746,8 +713,7 @@
   }
 }
 
-inline Object* SemiSpace::GetMarkedForwardAddress(mirror::Object* obj) const
-    SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+mirror::Object* SemiSpace::IsMarked(mirror::Object* obj) {
   // All immune objects are assumed marked.
   if (from_space_->HasAddress(obj)) {
     // Returns either the forwarding address or null.
@@ -759,6 +725,20 @@
   return mark_bitmap_->Test(obj) ? obj : nullptr;
 }
 
+bool SemiSpace::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* object) {
+  mirror::Object* obj = object->AsMirrorPtr();
+  mirror::Object* new_obj = IsMarked(obj);
+  if (new_obj == nullptr) {
+    return false;
+  }
+  if (new_obj != obj) {
+    // Write barrier is not necessary since it still points to the same object, just at a different
+    // address.
+    object->Assign(new_obj);
+  }
+  return true;
+}
+
 void SemiSpace::SetToSpace(space::ContinuousMemMapAllocSpace* to_space) {
   DCHECK(to_space != nullptr);
   to_space_ = to_space;
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 3c25f53..6b7ea0d 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -103,6 +103,12 @@
   void MarkObject(mirror::ObjectReference<kPoisonReferences, mirror::Object>* obj_ptr)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
+  virtual mirror::Object* MarkObject(mirror::Object* root) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj_ptr) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+
   void ScanObject(mirror::Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
@@ -140,19 +146,6 @@
                           const RootInfo& info) OVERRIDE
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
-  static mirror::Object* MarkObjectCallback(mirror::Object* root, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static void MarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>* obj_ptr, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
-  static void ProcessMarkStackCallback(void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
-
-  static void DelayReferenceReferentCallback(mirror::Class* klass, mirror::Reference* ref,
-                                             void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
-
   virtual mirror::Object* MarkNonForwardedObject(mirror::Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
@@ -163,15 +156,11 @@
  protected:
   // Returns null if the object is not marked, otherwise returns the forwarding address (same as
   // object for non movable things).
-  mirror::Object* GetMarkedForwardAddress(mirror::Object* object) const
+  virtual mirror::Object* IsMarked(mirror::Object* object) OVERRIDE
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static bool HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* object, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
-  static mirror::Object* MarkedForwardingAddressCallback(mirror::Object* object, void* arg)
+  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* object) OVERRIDE
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
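// Illustrative summary (sketch, not part of the patch; "SketchCollector" is a
// hypothetical subclass and the declarations are assumed to live in namespace
// art::gc). After this change a collector plugs into reference processing by
// overriding these GarbageCollector hooks instead of registering static callbacks:
class SketchCollector : public collector::GarbageCollector {
 public:
  // Returns the surviving (possibly forwarded) object, or nullptr if unmarked.
  virtual mirror::Object* IsMarked(mirror::Object* obj) OVERRIDE;
  // Returns true, updating *ref in place to the forwarded address, if the
  // referent is marked; returns false for white (dead) referents.
  virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* ref) OVERRIDE;
  // Marks obj and returns its (possibly new) address.
  virtual mirror::Object* MarkObject(mirror::Object* obj) OVERRIDE;
  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* ref) OVERRIDE;
  // Recursively blackens objects left on the mark stack by the calls above.
  virtual void ProcessMarkStack() OVERRIDE;
};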
 
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 2e66160..cb750eb 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -92,7 +92,7 @@
   } else if (!kInstrumented && allocator == kAllocatorTypeRosAlloc &&
              (obj = rosalloc_space_->AllocThreadLocal(self, byte_count, &bytes_allocated)) &&
              LIKELY(obj != nullptr)) {
-    DCHECK(!running_on_valgrind_);
+    DCHECK(!is_running_on_memory_tool_);
     obj->SetClass(klass);
     if (kUseBakerOrBrooksReadBarrier) {
       if (kUseBrooksReadBarrier) {
@@ -244,8 +244,8 @@
       break;
     }
     case kAllocatorTypeRosAlloc: {
-      if (kInstrumented && UNLIKELY(running_on_valgrind_)) {
-        // If running on valgrind, we should be using the instrumented path.
+      if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
+        // If running on valgrind or asan, we should be using the instrumented path.
         size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size);
         if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
                                                       max_bytes_tl_bulk_allocated))) {
@@ -254,7 +254,7 @@
         ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
                                      bytes_tl_bulk_allocated);
       } else {
-        DCHECK(!running_on_valgrind_);
+        DCHECK(!is_running_on_memory_tool_);
         size_t max_bytes_tl_bulk_allocated =
             rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size);
         if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
@@ -270,12 +270,12 @@
       break;
     }
     case kAllocatorTypeDlMalloc: {
-      if (kInstrumented && UNLIKELY(running_on_valgrind_)) {
+      if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
         // If running on valgrind or asan, we should be using the instrumented path.
         ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
                                      bytes_tl_bulk_allocated);
       } else {
-        DCHECK(!running_on_valgrind_);
+        DCHECK(!is_running_on_memory_tool_);
         ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
                                                bytes_tl_bulk_allocated);
       }
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 0ae9cdf..795d2a2 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -192,7 +192,7 @@
       total_allocation_time_(0),
       verify_object_mode_(kVerifyObjectModeDisabled),
       disable_moving_gc_count_(0),
-      running_on_valgrind_(Runtime::Current()->RunningOnValgrind()),
+      is_running_on_memory_tool_(Runtime::Current()->IsRunningOnMemoryTool()),
       use_tlab_(use_tlab),
       main_space_backup_(nullptr),
       min_interval_homogeneous_space_compaction_by_oom_(
@@ -518,7 +518,7 @@
   if (gc_stress_mode_) {
     backtrace_lock_ = new Mutex("GC complete lock");
   }
-  if (running_on_valgrind_ || gc_stress_mode_) {
+  if (is_running_on_memory_tool_ || gc_stress_mode_) {
     instrumentation->InstrumentQuickAllocEntryPoints();
   }
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
@@ -2077,9 +2077,12 @@
 // Special compacting collector which uses sub-optimal bin packing to reduce zygote space size.
 class ZygoteCompactingCollector FINAL : public collector::SemiSpace {
  public:
-  explicit ZygoteCompactingCollector(gc::Heap* heap) : SemiSpace(heap, false, "zygote collector"),
-      bin_live_bitmap_(nullptr), bin_mark_bitmap_(nullptr) {
-  }
+  explicit ZygoteCompactingCollector(gc::Heap* heap,
+                                     bool is_running_on_memory_tool)
+      : SemiSpace(heap, false, "zygote collector"),
+        bin_live_bitmap_(nullptr),
+        bin_mark_bitmap_(nullptr),
+        is_running_on_memory_tool_(is_running_on_memory_tool) {}
 
   void BuildBins(space::ContinuousSpace* space) {
     bin_live_bitmap_ = space->GetLiveBitmap();
@@ -2105,6 +2108,7 @@
   accounting::ContinuousSpaceBitmap* bin_live_bitmap_;
   // Mark bitmap of the space which contains the bins.
   accounting::ContinuousSpaceBitmap* bin_mark_bitmap_;
+  const bool is_running_on_memory_tool_;
 
   static void Callback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -2119,6 +2123,9 @@
   }
 
   void AddBin(size_t size, uintptr_t position) {
+    if (is_running_on_memory_tool_) {
+      MEMORY_TOOL_MAKE_DEFINED(reinterpret_cast<void*>(position), size);
+    }
     if (size != 0) {
       bins_.insert(std::make_pair(size, position));
     }
@@ -2212,7 +2219,7 @@
     // Temporarily disable rosalloc verification because the zygote
     // compaction will mess up the rosalloc internal metadata.
     ScopedDisableRosAllocVerification disable_rosalloc_verif(this);
-    ZygoteCompactingCollector zygote_collector(this);
+    ZygoteCompactingCollector zygote_collector(this, is_running_on_memory_tool_);
     zygote_collector.BuildBins(non_moving_space_);
     // Create a new bump pointer space which we will compact into.
     space::BumpPointerSpace target_space("zygote bump space", non_moving_space_->End(),
@@ -3048,8 +3055,13 @@
   }
 }
 
-static void IdentityMarkHeapReferenceCallback(mirror::HeapReference<mirror::Object>*, void*) {
-}
+struct IdentityMarkHeapReferenceVisitor : public MarkObjectVisitor {
+  virtual mirror::Object* MarkObject(mirror::Object* obj) OVERRIDE {
+    return obj;
+  }
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>*) OVERRIDE {
+  }
+};
 
 void Heap::PreGcVerificationPaused(collector::GarbageCollector* gc) {
   Thread* const self = Thread::Current();
@@ -3078,7 +3090,8 @@
     ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
     for (const auto& table_pair : mod_union_tables_) {
       accounting::ModUnionTable* mod_union_table = table_pair.second;
-      mod_union_table->UpdateAndMarkReferences(IdentityMarkHeapReferenceCallback, nullptr);
+      IdentityMarkHeapReferenceVisitor visitor;
+      mod_union_table->UpdateAndMarkReferences(&visitor);
       mod_union_table->Verify();
     }
   }
@@ -3707,11 +3720,11 @@
   }
 }
 
-void Heap::SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const {
+void Heap::SweepAllocationRecords(IsMarkedVisitor* visitor) const {
   if (IsAllocTrackingEnabled()) {
     MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
     if (IsAllocTrackingEnabled()) {
-      GetAllocationRecords()->SweepAllocationRecords(visitor, arg);
+      GetAllocationRecords()->SweepAllocationRecords(visitor);
     }
   }
 }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index d0040f2..ee3d510 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -705,7 +705,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
 
-  void SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const
+  void SweepAllocationRecords(IsMarkedVisitor* visitor) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
 
@@ -1173,7 +1173,7 @@
   collector::MarkCompact* mark_compact_collector_;
   collector::ConcurrentCopying* concurrent_copying_collector_;
 
-  const bool running_on_valgrind_;
+  const bool is_running_on_memory_tool_;
   const bool use_tlab_;
 
   // Pointer to the space which becomes the new main space when we do homogeneous space compaction.
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index 4d51d38..256cdd2 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -17,6 +17,7 @@
 #include "reference_processor.h"
 
 #include "base/time_utils.h"
+#include "collector/garbage_collector.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/reference-inl.h"
@@ -34,7 +35,7 @@
 static constexpr bool kAsyncReferenceQueueAdd = false;
 
 ReferenceProcessor::ReferenceProcessor()
-    : process_references_args_(nullptr, nullptr, nullptr),
+    : collector_(nullptr),
       preserving_references_(false),
       condition_("reference processor condition", *Locks::reference_processor_lock_) ,
       soft_reference_queue_(Locks::reference_queue_soft_references_lock_),
@@ -53,15 +54,27 @@
   condition_.Broadcast(self);
 }
 
+void ReferenceProcessor::BroadcastForSlowPath(Thread* self) {
+  CHECK(kUseReadBarrier);
+  MutexLock mu(self, *Locks::reference_processor_lock_);
+  condition_.Broadcast(self);
+}
+
 mirror::Object* ReferenceProcessor::GetReferent(Thread* self, mirror::Reference* reference) {
-  mirror::Object* const referent = reference->GetReferent();
-  // If the referent is null then it is already cleared, we can just return null since there is no
-  // scenario where it becomes non-null during the reference processing phase.
-  if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) {
-    return referent;
+  if (!kUseReadBarrier || self->GetWeakRefAccessEnabled()) {
+    // Under read barrier / concurrent copying collector, it's not safe to call GetReferent() when
+    // weak ref access is disabled as the call includes a read barrier which may push a ref onto the
+    // mark stack and interfere with termination of marking.
+    mirror::Object* const referent = reference->GetReferent();
+    // If the referent is null then it is already cleared, we can just return null since there is no
+    // scenario where it becomes non-null during the reference processing phase.
+    if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) {
+      return referent;
+    }
   }
   MutexLock mu(self, *Locks::reference_processor_lock_);
-  while (SlowPathEnabled()) {
+  while ((!kUseReadBarrier && SlowPathEnabled()) ||
+         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
     mirror::HeapReference<mirror::Object>* const referent_addr =
         reference->GetReferentReferenceAddr();
     // If the referent became cleared, return it. Don't need barrier since thread roots can't get
@@ -71,16 +84,14 @@
     }
     // Try to see if the referent is already marked by asking the collector. We can return
     // it to the mutator as long as the GC is not preserving references.
-    IsHeapReferenceMarkedCallback* const is_marked_callback =
-        process_references_args_.is_marked_callback_;
-    if (LIKELY(is_marked_callback != nullptr)) {
+    if (LIKELY(collector_ != nullptr)) {
       // If it's null it means not marked, but it could become marked if the referent is reachable
       // by finalizer referents. So we can not return in this case and must block. Otherwise, we
       // can return it to the mutator as long as the GC is not preserving references, in which
       // case only black nodes can be safely returned. If the GC is preserving references, the
       // mutator could take a white field from a grey or white node and move it somewhere else
       // in the heap causing corruption since this field would get swept.
-      if (is_marked_callback(referent_addr, process_references_args_.arg_)) {
+      if (collector_->IsMarkedHeapReference(referent_addr)) {
         if (!preserving_references_ ||
            (LIKELY(!reference->IsFinalizerReferenceInstance()) && !reference->IsEnqueued())) {
           return referent_addr->AsMirrorPtr();
@@ -92,16 +103,6 @@
   return reference->GetReferent();
 }
 
-bool ReferenceProcessor::PreserveSoftReferenceCallback(mirror::HeapReference<mirror::Object>* obj,
-                                                       void* arg) {
-  auto* const args = reinterpret_cast<ProcessReferencesArgs*>(arg);
-  // TODO: Add smarter logic for preserving soft references.
-  mirror::Object* new_obj = args->mark_callback_(obj->AsMirrorPtr(), args->arg_);
-  DCHECK(new_obj != nullptr);
-  obj->Assign(new_obj);
-  return true;
-}
-
 void ReferenceProcessor::StartPreservingReferences(Thread* self) {
   MutexLock mu(self, *Locks::reference_processor_lock_);
   preserving_references_ = true;
@@ -117,18 +118,18 @@
 // Process reference class instances and schedule finalizations.
 void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timings,
                                            bool clear_soft_references,
-                                           IsHeapReferenceMarkedCallback* is_marked_callback,
-                                           MarkObjectCallback* mark_object_callback,
-                                           ProcessMarkStackCallback* process_mark_stack_callback,
-                                           void* arg) {
+                                           collector::GarbageCollector* collector) {
   TimingLogger::ScopedTiming t(concurrent ? __FUNCTION__ : "(Paused)ProcessReferences", timings);
   Thread* self = Thread::Current();
   {
     MutexLock mu(self, *Locks::reference_processor_lock_);
-    process_references_args_.is_marked_callback_ = is_marked_callback;
-    process_references_args_.mark_callback_ = mark_object_callback;
-    process_references_args_.arg_ = arg;
-    CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent";
+    collector_ = collector;
+    if (!kUseReadBarrier) {
+      CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent";
+    } else {
+      // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent == false).
+      CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent);
+    }
   }
   // Unless required to clear soft references with white references, preserve some white referents.
   if (!clear_soft_references) {
@@ -137,16 +138,16 @@
     if (concurrent) {
       StartPreservingReferences(self);
     }
-    soft_reference_queue_.ForwardSoftReferences(&PreserveSoftReferenceCallback,
-                                                &process_references_args_);
-    process_mark_stack_callback(arg);
+    // TODO: Add smarter logic for preserving soft references.
+    soft_reference_queue_.ForwardSoftReferences(collector);
+    collector->ProcessMarkStack();
     if (concurrent) {
       StopPreservingReferences(self);
     }
   }
   // Clear all remaining soft and weak references with white referents.
-  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg);
-  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg);
+  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
+  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
   {
     TimingLogger::ScopedTiming t2(concurrent ? "EnqueueFinalizerReferences" :
         "(Paused)EnqueueFinalizerReferences", timings);
@@ -154,18 +155,17 @@
       StartPreservingReferences(self);
     }
     // Preserve all white objects with finalize methods and schedule them for finalization.
-    finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, is_marked_callback,
-                                                          mark_object_callback, arg);
-    process_mark_stack_callback(arg);
+    finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector);
+    collector->ProcessMarkStack();
     if (concurrent) {
       StopPreservingReferences(self);
     }
   }
   // Clear all finalizer referent reachable soft and weak references with white referents.
-  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg);
-  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg);
+  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
+  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
   // Clear all phantom references with white referents.
-  phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg);
+  phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
   // At this point all reference queues other than the cleared references should be empty.
   DCHECK(soft_reference_queue_.IsEmpty());
   DCHECK(weak_reference_queue_.IsEmpty());
@@ -177,8 +177,8 @@
     // could result in a stale collector_ being used before the reference processing
     // starts since there is a small window of time where slow_path_enabled_ is enabled but the
     // collector isn't yet set.
-    process_references_args_.is_marked_callback_ = nullptr;
-    if (concurrent) {
+    collector_ = nullptr;
+    if (!kUseReadBarrier && concurrent) {
       // Done processing, disable the slow path and broadcast to the waiters.
       DisableSlowPath(self);
     }
@@ -188,13 +188,12 @@
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
 // marked, put it on the appropriate list in the heap for later processing.
 void ReferenceProcessor::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
-                                                IsHeapReferenceMarkedCallback* is_marked_callback,
-                                                void* arg) {
+                                                collector::GarbageCollector* collector) {
   // klass can be the class of the old object if the visitor already updated the class of ref.
   DCHECK(klass != nullptr);
   DCHECK(klass->IsTypeOfReferenceClass());
   mirror::HeapReference<mirror::Object>* referent = ref->GetReferentReferenceAddr();
-  if (referent->AsMirrorPtr() != nullptr && !is_marked_callback(referent, arg)) {
+  if (referent->AsMirrorPtr() != nullptr && !collector->IsMarkedHeapReference(referent)) {
     Thread* self = Thread::Current();
     // TODO: Remove these locks, and use atomic stacks for storing references?
     // We need to check that the references haven't already been enqueued since we can end up
@@ -214,8 +213,8 @@
   }
 }
 
-void ReferenceProcessor::UpdateRoots(IsMarkedCallback* callback, void* arg) {
-  cleared_references_.UpdateRoots(callback, arg);
+void ReferenceProcessor::UpdateRoots(IsMarkedVisitor* visitor) {
+  cleared_references_.UpdateRoots(visitor);
 }
 
 class ClearedReferenceTask : public HeapTask {
@@ -264,7 +263,8 @@
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::reference_processor_lock_);
   // Wait until we are done processing the reference.
-  while (SlowPathEnabled()) {
+  while ((!kUseReadBarrier && SlowPathEnabled()) ||
+         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
     condition_.WaitHoldingLocks(self);
   }
   // At this point, since the sentinel of the reference is live, it is guaranteed to not be
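// Simplified outline of the visitor-based flow in ProcessReferences() above
// (sketch only; locking, timing and the concurrent/paused distinction omitted):
//   1. collector_ = collector;                                  // published for GetReferent()
//   2. soft_reference_queue_.ForwardSoftReferences(collector);
//      collector->ProcessMarkStack();                           // preserve soft referents
//   3. soft/weak ClearWhiteReferences(&cleared_references_, collector);
//   4. finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector);
//      collector->ProcessMarkStack();                           // revive finalizable referents
//   5. soft/weak/phantom ClearWhiteReferences(&cleared_references_, collector);
//   6. collector_ = nullptr; then DisableSlowPath() when concurrent  // unblock GetReferent()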
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
index a44319b..95877d1 100644
--- a/runtime/gc/reference_processor.h
+++ b/runtime/gc/reference_processor.h
@@ -28,6 +28,7 @@
 class TimingLogger;
 
 namespace mirror {
+class Class;
 class FinalizerReference;
 class Object;
 class Reference;
@@ -35,18 +36,18 @@
 
 namespace gc {
 
+namespace collector {
+class GarbageCollector;
+}  // namespace collector
+
 class Heap;
 
 // Used to process java.lang.References concurrently or paused.
 class ReferenceProcessor {
  public:
   explicit ReferenceProcessor();
-  static bool PreserveSoftReferenceCallback(mirror::HeapReference<mirror::Object>* obj, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void ProcessReferences(bool concurrent, TimingLogger* timings, bool clear_soft_references,
-                         IsHeapReferenceMarkedCallback* is_marked_callback,
-                         MarkObjectCallback* mark_object_callback,
-                         ProcessMarkStackCallback* process_mark_stack_callback, void* arg)
+                         gc::collector::GarbageCollector* collector)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       LOCKS_EXCLUDED(Locks::reference_processor_lock_);
@@ -54,14 +55,15 @@
   // Only allow setting this with mutators suspended so that we can avoid using a lock in the
   // GetReferent fast path as an optimization.
   void EnableSlowPath() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForSlowPath(Thread* self);
   // Decode the referent, may block if references are being processed.
   mirror::Object* GetReferent(Thread* self, mirror::Reference* reference)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::reference_processor_lock_);
   void EnqueueClearedReferences(Thread* self) LOCKS_EXCLUDED(Locks::mutator_lock_);
   void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
-                              IsHeapReferenceMarkedCallback* is_marked_callback, void* arg)
+                              collector::GarbageCollector* collector)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void UpdateRoots(IsMarkedCallback* callback, void* arg)
+  void UpdateRoots(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
   // Make a circular list with reference if it is not enqueued. Uses the finalizer queue lock.
   bool MakeCircularListIfUnenqueued(mirror::FinalizerReference* reference)
@@ -70,21 +72,6 @@
                      Locks::reference_queue_finalizer_references_lock_);
 
  private:
-  class ProcessReferencesArgs {
-   public:
-    ProcessReferencesArgs(IsHeapReferenceMarkedCallback* is_marked_callback,
-                          MarkObjectCallback* mark_callback, void* arg)
-        : is_marked_callback_(is_marked_callback), mark_callback_(mark_callback), arg_(arg) {
-    }
-
-    // The is marked callback is null when the args aren't set up.
-    IsHeapReferenceMarkedCallback* is_marked_callback_;
-    MarkObjectCallback* mark_callback_;
-    void* arg_;
-
-   private:
-    DISALLOW_IMPLICIT_CONSTRUCTORS(ProcessReferencesArgs);
-  };
   bool SlowPathEnabled() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Called by ProcessReferences.
   void DisableSlowPath(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(Locks::reference_processor_lock_)
@@ -94,8 +81,9 @@
   // referents.
   void StartPreservingReferences(Thread* self) LOCKS_EXCLUDED(Locks::reference_processor_lock_);
   void StopPreservingReferences(Thread* self) LOCKS_EXCLUDED(Locks::reference_processor_lock_);
-  // Process args, used by the GetReferent to return referents which are already marked.
-  ProcessReferencesArgs process_references_args_ GUARDED_BY(Locks::reference_processor_lock_);
+  // Collector which is clearing references, used by GetReferent to return referents which are
+  // already marked.
+  collector::GarbageCollector* collector_ GUARDED_BY(Locks::reference_processor_lock_);
   // Boolean for whether or not we are preserving references (either soft references or finalizers).
   // If this is true, then we cannot return a referent (see comment in GetReferent).
   bool preserving_references_ GUARDED_BY(Locks::reference_processor_lock_);
diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc
index 4ba3983..f505428 100644
--- a/runtime/gc/reference_queue.cc
+++ b/runtime/gc/reference_queue.cc
@@ -137,12 +137,12 @@
 }
 
 void ReferenceQueue::ClearWhiteReferences(ReferenceQueue* cleared_references,
-                                          IsHeapReferenceMarkedCallback* preserve_callback,
-                                          void* arg) {
+                                          collector::GarbageCollector* collector) {
   while (!IsEmpty()) {
     mirror::Reference* ref = DequeuePendingReference();
     mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr();
-    if (referent_addr->AsMirrorPtr() != nullptr && !preserve_callback(referent_addr, arg)) {
+    if (referent_addr->AsMirrorPtr() != nullptr &&
+        !collector->IsMarkedHeapReference(referent_addr)) {
       // Referent is white, clear it.
       if (Runtime::Current()->IsActiveTransaction()) {
         ref->ClearReferent<true>();
@@ -157,14 +157,13 @@
 }
 
 void ReferenceQueue::EnqueueFinalizerReferences(ReferenceQueue* cleared_references,
-                                                IsHeapReferenceMarkedCallback* is_marked_callback,
-                                                MarkObjectCallback* mark_object_callback,
-                                                void* arg) {
+                                                collector::GarbageCollector* collector) {
   while (!IsEmpty()) {
     mirror::FinalizerReference* ref = DequeuePendingReference()->AsFinalizerReference();
     mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr();
-    if (referent_addr->AsMirrorPtr() != nullptr && !is_marked_callback(referent_addr, arg)) {
-      mirror::Object* forward_address = mark_object_callback(referent_addr->AsMirrorPtr(), arg);
+    if (referent_addr->AsMirrorPtr() != nullptr &&
+        !collector->IsMarkedHeapReference(referent_addr)) {
+      mirror::Object* forward_address = collector->MarkObject(referent_addr->AsMirrorPtr());
       // If the referent is non-null the reference must be queuable.
       DCHECK(ref->IsEnqueuable());
       // Move the updated referent to the zombie field.
@@ -180,8 +179,7 @@
   }
 }
 
-void ReferenceQueue::ForwardSoftReferences(IsHeapReferenceMarkedCallback* preserve_callback,
-                                           void* arg) {
+void ReferenceQueue::ForwardSoftReferences(MarkObjectVisitor* visitor) {
   if (UNLIKELY(IsEmpty())) {
     return;
   }
@@ -190,15 +188,15 @@
   do {
     mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr();
     if (referent_addr->AsMirrorPtr() != nullptr) {
-      UNUSED(preserve_callback(referent_addr, arg));
+      visitor->MarkHeapReference(referent_addr);
     }
     ref = ref->GetPendingNext();
   } while (LIKELY(ref != head));
 }
 
-void ReferenceQueue::UpdateRoots(IsMarkedCallback* callback, void* arg) {
+void ReferenceQueue::UpdateRoots(IsMarkedVisitor* visitor) {
   if (list_ != nullptr) {
-    list_ = down_cast<mirror::Reference*>(callback(list_, arg));
+    list_ = down_cast<mirror::Reference*>(visitor->IsMarked(list_));
   }
 }
 
diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h
index c45be85..7d9ddf6 100644
--- a/runtime/gc/reference_queue.h
+++ b/runtime/gc/reference_queue.h
@@ -36,6 +36,10 @@
 
 namespace gc {
 
+namespace collector {
+class GarbageCollector;
+}  // namespace collector
+
 class Heap;
 
 // Used to temporarily store java.lang.ref.Reference(s) during GC and prior to queueing on the
@@ -65,20 +69,19 @@
   // Enqueues finalizer references with white referents.  White referents are blackened, moved to
   // the zombie field, and the referent field is cleared.
   void EnqueueFinalizerReferences(ReferenceQueue* cleared_references,
-                                  IsHeapReferenceMarkedCallback* is_marked_callback,
-                                  MarkObjectCallback* mark_object_callback, void* arg)
+                                  collector::GarbageCollector* collector)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Walks the reference list marking any references subject to the reference clearing policy.
   // References with a black referent are removed from the list.  References with white referents
   // biased toward saving are blackened and also removed from the list.
-  void ForwardSoftReferences(IsHeapReferenceMarkedCallback* preserve_callback, void* arg)
+  void ForwardSoftReferences(MarkObjectVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Unlink the reference list, clearing reference objects with white referents. Cleared references
   // registered to a reference queue are scheduled for appending by the heap worker thread.
   void ClearWhiteReferences(ReferenceQueue* cleared_references,
-                            IsHeapReferenceMarkedCallback* is_marked_callback, void* arg)
+                            collector::GarbageCollector* collector)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void Dump(std::ostream& os) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -95,7 +98,7 @@
   }
 
   // Visits list_, currently only used for the mark compact GC.
-  void UpdateRoots(IsMarkedCallback* callback, void* arg)
+  void UpdateRoots(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 5237c7b..e1c5b64 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -20,13 +20,13 @@
 #include "gc/accounting/card_table.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
+#include "memory_tool_malloc_space-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "runtime.h"
 #include "thread.h"
 #include "thread_list.h"
 #include "utils.h"
-#include "valgrind_malloc_space-inl.h"
 
 namespace art {
 namespace gc {
@@ -62,8 +62,8 @@
 
   // Everything is set so record in immutable structure and leave
   uint8_t* begin = mem_map->Begin();
-  if (Runtime::Current()->RunningOnValgrind()) {
-    return new ValgrindMallocSpace<DlMallocSpace, kDefaultValgrindRedZoneBytes, true, false>(
+  if (Runtime::Current()->IsRunningOnMemoryTool()) {
+    return new MemoryToolMallocSpace<DlMallocSpace, kDefaultMemoryToolRedZoneBytes, true, false>(
         mem_map, initial_size, name, mspace, begin, end, begin + capacity, growth_limit,
         can_move_objects, starting_size);
   } else {
@@ -152,8 +152,8 @@
                                            void* allocator, uint8_t* begin, uint8_t* end,
                                            uint8_t* limit, size_t growth_limit,
                                            bool can_move_objects) {
-  if (Runtime::Current()->RunningOnValgrind()) {
-    return new ValgrindMallocSpace<DlMallocSpace, kDefaultValgrindRedZoneBytes, true, false>(
+  if (Runtime::Current()->IsRunningOnMemoryTool()) {
+    return new MemoryToolMallocSpace<DlMallocSpace, kDefaultMemoryToolRedZoneBytes, true, false>(
         mem_map, initial_size_, name, allocator, begin, end, limit, growth_limit,
         can_move_objects, starting_size_);
   } else {
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 1f80f1f..ab527a4 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -30,7 +30,7 @@
 namespace space {
 
 // An alloc space is a space where objects may be allocated and garbage collected. Not final as may
-// be overridden by a ValgrindMallocSpace.
+// be overridden by a MemoryToolMallocSpace.
 class DlMallocSpace : public MallocSpace {
  public:
   // Create a DlMallocSpace from an existing mem_map.
@@ -46,27 +46,27 @@
   static DlMallocSpace* Create(const std::string& name, size_t initial_size, size_t growth_limit,
                                size_t capacity, uint8_t* requested_begin, bool can_move_objects);
 
-  // Virtual to allow ValgrindMallocSpace to intercept.
+  // Virtual to allow MemoryToolMallocSpace to intercept.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                           size_t* usable_size,
                                           size_t* bytes_tl_bulk_allocated)
       OVERRIDE LOCKS_EXCLUDED(lock_);
-  // Virtual to allow ValgrindMallocSpace to intercept.
+  // Virtual to allow MemoryToolMallocSpace to intercept.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                 size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE LOCKS_EXCLUDED(lock_) {
     return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size,
                            bytes_tl_bulk_allocated);
   }
-  // Virtual to allow ValgrindMallocSpace to intercept.
+  // Virtual to allow MemoryToolMallocSpace to intercept.
   virtual size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE {
     return AllocationSizeNonvirtual(obj, usable_size);
   }
-  // Virtual to allow ValgrindMallocSpace to intercept.
+  // Virtual to allow MemoryToolMallocSpace to intercept.
   virtual size_t Free(Thread* self, mirror::Object* ptr) OVERRIDE
       LOCKS_EXCLUDED(lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  // Virtual to allow ValgrindMallocSpace to intercept.
+  // Virtual to allow MemoryToolMallocSpace to intercept.
   virtual size_t FreeList(Thread* self, size_t num_ptrs, mirror::Object** ptrs) OVERRIDE
       LOCKS_EXCLUDED(lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 2b567fe..a913e59 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -34,12 +34,12 @@
 namespace gc {
 namespace space {
 
-class ValgrindLargeObjectMapSpace FINAL : public LargeObjectMapSpace {
+class MemoryToolLargeObjectMapSpace FINAL : public LargeObjectMapSpace {
  public:
-  explicit ValgrindLargeObjectMapSpace(const std::string& name) : LargeObjectMapSpace(name) {
+  explicit MemoryToolLargeObjectMapSpace(const std::string& name) : LargeObjectMapSpace(name) {
   }
 
-  ~ValgrindLargeObjectMapSpace() OVERRIDE {
+  ~MemoryToolLargeObjectMapSpace() OVERRIDE {
     // Keep valgrind happy if there are any large objects such as dex cache arrays which aren't
     // freed since they are held live by the class linker.
     MutexLock mu(Thread::Current(), lock_);
@@ -52,13 +52,14 @@
                         size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE {
     mirror::Object* obj =
-        LargeObjectMapSpace::Alloc(self, num_bytes + kValgrindRedZoneBytes * 2, bytes_allocated,
+        LargeObjectMapSpace::Alloc(self, num_bytes + kMemoryToolRedZoneBytes * 2, bytes_allocated,
                                    usable_size, bytes_tl_bulk_allocated);
     mirror::Object* object_without_rdz = reinterpret_cast<mirror::Object*>(
-        reinterpret_cast<uintptr_t>(obj) + kValgrindRedZoneBytes);
-    VALGRIND_MAKE_MEM_NOACCESS(reinterpret_cast<void*>(obj), kValgrindRedZoneBytes);
-    VALGRIND_MAKE_MEM_NOACCESS(reinterpret_cast<uint8_t*>(object_without_rdz) + num_bytes,
-                               kValgrindRedZoneBytes);
+        reinterpret_cast<uintptr_t>(obj) + kMemoryToolRedZoneBytes);
+    MEMORY_TOOL_MAKE_NOACCESS(reinterpret_cast<void*>(obj), kMemoryToolRedZoneBytes);
+    MEMORY_TOOL_MAKE_NOACCESS(
+        reinterpret_cast<uint8_t*>(object_without_rdz) + num_bytes,
+        kMemoryToolRedZoneBytes);
     if (usable_size != nullptr) {
       *usable_size = num_bytes;  // Since we have redzones, shrink the usable size.
     }
@@ -75,7 +76,7 @@
 
   size_t Free(Thread* self, mirror::Object* obj) OVERRIDE {
     mirror::Object* object_with_rdz = ObjectWithRedzone(obj);
-    VALGRIND_MAKE_MEM_UNDEFINED(object_with_rdz, AllocationSize(obj, nullptr));
+    MEMORY_TOOL_MAKE_UNDEFINED(object_with_rdz, AllocationSize(obj, nullptr));
     return LargeObjectMapSpace::Free(self, object_with_rdz);
   }
 
@@ -86,15 +87,15 @@
  private:
   static const mirror::Object* ObjectWithRedzone(const mirror::Object* obj) {
     return reinterpret_cast<const mirror::Object*>(
-        reinterpret_cast<uintptr_t>(obj) - kValgrindRedZoneBytes);
+        reinterpret_cast<uintptr_t>(obj) - kMemoryToolRedZoneBytes);
   }
 
   static mirror::Object* ObjectWithRedzone(mirror::Object* obj) {
     return reinterpret_cast<mirror::Object*>(
-        reinterpret_cast<uintptr_t>(obj) - kValgrindRedZoneBytes);
+        reinterpret_cast<uintptr_t>(obj) - kMemoryToolRedZoneBytes);
   }
 
-  static constexpr size_t kValgrindRedZoneBytes = kPageSize;
+  static constexpr size_t kMemoryToolRedZoneBytes = kPageSize;
 };
 
 void LargeObjectSpace::SwapBitmaps() {
@@ -121,8 +122,8 @@
       lock_("large object map space lock", kAllocSpaceLock) {}
 
 LargeObjectMapSpace* LargeObjectMapSpace::Create(const std::string& name) {
-  if (Runtime::Current()->RunningOnValgrind()) {
-    return new ValgrindLargeObjectMapSpace(name);
+  if (Runtime::Current()->IsRunningOnMemoryTool()) {
+    return new MemoryToolLargeObjectMapSpace(name);
   } else {
     return new LargeObjectMapSpace(name);
   }
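// Layout of a MemoryToolLargeObjectMapSpace allocation (sketch, using the names
// above; in this space kMemoryToolRedZoneBytes == kPageSize):
//
//   obj                     object_without_rdz          object_without_rdz + num_bytes
//    |<-- redzone: NOACCESS -->|<-- user object: num_bytes -->|<-- redzone: NOACCESS -->|
//
// Callers receive object_without_rdz, so an access that strays past either end of
// the object lands in a NOACCESS page and is reported by the memory tool, while
// usable_size is shrunk back to num_bytes to hide the redzones.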
diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h
index 9495864..6c689cd 100644
--- a/runtime/gc/space/malloc_space.h
+++ b/runtime/gc/space/malloc_space.h
@@ -20,6 +20,7 @@
 #include "space.h"
 
 #include <ostream>
+#include "base/memory_tool.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/memory_tool_malloc_space-inl.h
similarity index 72%
rename from runtime/gc/space/valgrind_malloc_space-inl.h
rename to runtime/gc/space/memory_tool_malloc_space-inl.h
index bc329e1..ea8b8aa 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/memory_tool_malloc_space-inl.h
@@ -14,22 +14,20 @@
  * limitations under the License.
  */
 
-#ifndef ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_INL_H_
-#define ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_INL_H_
+#ifndef ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_INL_H_
+#define ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_INL_H_
 
-#include "valgrind_malloc_space.h"
-
-#include <memcheck/memcheck.h>
-
-#include "valgrind_settings.h"
+#include "base/memory_tool.h"
+#include "memory_tool_malloc_space.h"
+#include "memory_tool_settings.h"
 
 namespace art {
 namespace gc {
 namespace space {
 
-namespace valgrind_details {
+namespace memory_tool_details {
 
-template <size_t kValgrindRedZoneBytes, bool kUseObjSizeForUsable>
+template <size_t kMemoryToolRedZoneBytes, bool kUseObjSizeForUsable>
 inline mirror::Object* AdjustForValgrind(void* obj_with_rdz, size_t num_bytes,
                                          size_t bytes_allocated, size_t usable_size,
                                          size_t bytes_tl_bulk_allocated,
@@ -48,26 +46,26 @@
     if (kUseObjSizeForUsable) {
       *usable_size_out = num_bytes;
     } else {
-      *usable_size_out = usable_size - 2 * kValgrindRedZoneBytes;
+      *usable_size_out = usable_size - 2 * kMemoryToolRedZoneBytes;
     }
   }
 
   // Left redzone.
-  VALGRIND_MAKE_MEM_NOACCESS(obj_with_rdz, kValgrindRedZoneBytes);
+  MEMORY_TOOL_MAKE_NOACCESS(obj_with_rdz, kMemoryToolRedZoneBytes);
 
   // Make requested memory readable.
   // (If the allocator assumes memory is zeroed out, we might get UNDEFINED warnings, so make
   //  everything DEFINED initially.)
   mirror::Object* result = reinterpret_cast<mirror::Object*>(
-      reinterpret_cast<uint8_t*>(obj_with_rdz) + kValgrindRedZoneBytes);
-  VALGRIND_MAKE_MEM_DEFINED(result, num_bytes);
+      reinterpret_cast<uint8_t*>(obj_with_rdz) + kMemoryToolRedZoneBytes);
+  MEMORY_TOOL_MAKE_DEFINED(result, num_bytes);
 
   // Right redzone. Assumes that if bytes_allocated > usable_size, then the difference is
   // management data at the upper end, and for simplicity we will not protect that.
   // At the moment, this fits RosAlloc (no management data in a slot, usable_size == alloc_size)
   // and DlMalloc (allocation_size = (usable_size == num_bytes) + 4, 4 is management)
-  VALGRIND_MAKE_MEM_NOACCESS(reinterpret_cast<uint8_t*>(result) + num_bytes,
-                             usable_size - (num_bytes + kValgrindRedZoneBytes));
+  MEMORY_TOOL_MAKE_NOACCESS(reinterpret_cast<uint8_t*>(result) + num_bytes,
+                            usable_size - (num_bytes + kMemoryToolRedZoneBytes));
 
   return result;
 }
@@ -76,15 +74,15 @@
   return obj->SizeOf<kVerifyNone>();
 }
 
-}  // namespace valgrind_details
+}  // namespace memory_tool_details
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
 mirror::Object*
-ValgrindMallocSpace<S,
-                    kValgrindRedZoneBytes,
+MemoryToolMallocSpace<S,
+                    kMemoryToolRedZoneBytes,
                     kAdjustForRedzoneInAllocSize,
                     kUseObjSizeForUsable>::AllocWithGrowth(
     Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
@@ -92,14 +90,14 @@
   size_t bytes_allocated;
   size_t usable_size;
   size_t bytes_tl_bulk_allocated;
-  void* obj_with_rdz = S::AllocWithGrowth(self, num_bytes + 2 * kValgrindRedZoneBytes,
+  void* obj_with_rdz = S::AllocWithGrowth(self, num_bytes + 2 * kMemoryToolRedZoneBytes,
                                           &bytes_allocated, &usable_size,
                                           &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+  return memory_tool_details::AdjustForValgrind<kMemoryToolRedZoneBytes, kUseObjSizeForUsable>(
       obj_with_rdz, num_bytes,
       bytes_allocated, usable_size,
       bytes_tl_bulk_allocated,
@@ -109,11 +107,11 @@
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-mirror::Object* ValgrindMallocSpace<S,
-                                    kValgrindRedZoneBytes,
+mirror::Object* MemoryToolMallocSpace<S,
+                                    kMemoryToolRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::Alloc(
     Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
@@ -121,13 +119,13 @@
   size_t bytes_allocated;
   size_t usable_size;
   size_t bytes_tl_bulk_allocated;
-  void* obj_with_rdz = S::Alloc(self, num_bytes + 2 * kValgrindRedZoneBytes,
+  void* obj_with_rdz = S::Alloc(self, num_bytes + 2 * kMemoryToolRedZoneBytes,
                                 &bytes_allocated, &usable_size, &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
+  return memory_tool_details::AdjustForValgrind<kMemoryToolRedZoneBytes,
                                              kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
                                                                    bytes_allocated, usable_size,
                                                                    bytes_tl_bulk_allocated,
@@ -137,11 +135,11 @@
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-mirror::Object* ValgrindMallocSpace<S,
-                                    kValgrindRedZoneBytes,
+mirror::Object* MemoryToolMallocSpace<S,
+                                    kMemoryToolRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::AllocThreadUnsafe(
     Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
@@ -149,14 +147,14 @@
   size_t bytes_allocated;
   size_t usable_size;
   size_t bytes_tl_bulk_allocated;
-  void* obj_with_rdz = S::AllocThreadUnsafe(self, num_bytes + 2 * kValgrindRedZoneBytes,
+  void* obj_with_rdz = S::AllocThreadUnsafe(self, num_bytes + 2 * kMemoryToolRedZoneBytes,
                                             &bytes_allocated, &usable_size,
                                             &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+  return memory_tool_details::AdjustForValgrind<kMemoryToolRedZoneBytes, kUseObjSizeForUsable>(
       obj_with_rdz, num_bytes,
       bytes_allocated, usable_size,
       bytes_tl_bulk_allocated,
@@ -166,38 +164,39 @@
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-size_t ValgrindMallocSpace<S,
-                           kValgrindRedZoneBytes,
+size_t MemoryToolMallocSpace<S,
+                           kMemoryToolRedZoneBytes,
                            kAdjustForRedzoneInAllocSize,
                            kUseObjSizeForUsable>::AllocationSize(
     mirror::Object* obj, size_t* usable_size) {
   size_t result = S::AllocationSize(reinterpret_cast<mirror::Object*>(
-      reinterpret_cast<uint8_t*>(obj) - (kAdjustForRedzoneInAllocSize ? kValgrindRedZoneBytes : 0)),
+      reinterpret_cast<uint8_t*>(obj) - (kAdjustForRedzoneInAllocSize ? kMemoryToolRedZoneBytes : 0)),
       usable_size);
   if (usable_size != nullptr) {
     if (kUseObjSizeForUsable) {
-      *usable_size = valgrind_details::GetObjSizeNoThreadSafety(obj);
+      *usable_size = memory_tool_details::GetObjSizeNoThreadSafety(obj);
     } else {
-      *usable_size = *usable_size - 2 * kValgrindRedZoneBytes;
+      *usable_size = *usable_size - 2 * kMemoryToolRedZoneBytes;
     }
   }
   return result;
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-size_t ValgrindMallocSpace<S,
-                           kValgrindRedZoneBytes,
+size_t MemoryToolMallocSpace<S,
+                           kMemoryToolRedZoneBytes,
                            kAdjustForRedzoneInAllocSize,
                            kUseObjSizeForUsable>::Free(
     Thread* self, mirror::Object* ptr) {
   void* obj_after_rdz = reinterpret_cast<void*>(ptr);
-  uint8_t* obj_with_rdz = reinterpret_cast<uint8_t*>(obj_after_rdz) - kValgrindRedZoneBytes;
+  uint8_t* obj_with_rdz = reinterpret_cast<uint8_t*>(obj_after_rdz) - kMemoryToolRedZoneBytes;
+
   // Make redzones undefined.
   size_t usable_size;
   size_t allocation_size = AllocationSize(ptr, &usable_size);
@@ -206,20 +205,20 @@
   // Use the obj-size-for-usable flag to determine whether usable_size is the more important one,
   // e.g., whether there's data in the allocation_size (and usable_size can't be trusted).
   if (kUseObjSizeForUsable) {
-    VALGRIND_MAKE_MEM_UNDEFINED(obj_with_rdz, allocation_size);
+    MEMORY_TOOL_MAKE_UNDEFINED(obj_with_rdz, allocation_size);
   } else {
-    VALGRIND_MAKE_MEM_UNDEFINED(obj_with_rdz, usable_size + 2 * kValgrindRedZoneBytes);
+    MEMORY_TOOL_MAKE_UNDEFINED(obj_with_rdz, usable_size + 2 * kMemoryToolRedZoneBytes);
   }
 
   return S::Free(self, reinterpret_cast<mirror::Object*>(obj_with_rdz));
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-size_t ValgrindMallocSpace<S,
-                           kValgrindRedZoneBytes,
+size_t MemoryToolMallocSpace<S,
+                           kMemoryToolRedZoneBytes,
                            kAdjustForRedzoneInAllocSize,
                            kUseObjSizeForUsable>::FreeList(
     Thread* self, size_t num_ptrs, mirror::Object** ptrs) {
@@ -232,32 +231,33 @@
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
 template <typename... Params>
-ValgrindMallocSpace<S,
-                    kValgrindRedZoneBytes,
+MemoryToolMallocSpace<S,
+                    kMemoryToolRedZoneBytes,
                     kAdjustForRedzoneInAllocSize,
-                    kUseObjSizeForUsable>::ValgrindMallocSpace(
+                    kUseObjSizeForUsable>::MemoryToolMallocSpace(
     MemMap* mem_map, size_t initial_size, Params... params) : S(mem_map, initial_size, params...) {
-  VALGRIND_MAKE_MEM_UNDEFINED(mem_map->Begin() + initial_size,
-                              mem_map->Size() - initial_size);
+  MEMORY_TOOL_MAKE_DEFINED(mem_map->Begin(), initial_size);
+  MEMORY_TOOL_MAKE_UNDEFINED(mem_map->Begin() + initial_size,
+                     mem_map->Size() - initial_size);
 }
 
 template <typename S,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-size_t ValgrindMallocSpace<S,
-                           kValgrindRedZoneBytes,
+size_t MemoryToolMallocSpace<S,
+                           kMemoryToolRedZoneBytes,
                            kAdjustForRedzoneInAllocSize,
                            kUseObjSizeForUsable>::MaxBytesBulkAllocatedFor(size_t num_bytes) {
-  return S::MaxBytesBulkAllocatedFor(num_bytes + 2 * kValgrindRedZoneBytes);
+  return S::MaxBytesBulkAllocatedFor(num_bytes + 2 * kMemoryToolRedZoneBytes);
 }
 
 }  // namespace space
 }  // namespace gc
 }  // namespace art
 
-#endif  // ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_INL_H_
+#endif  // ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_INL_H_
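The MEMORY_TOOL_MAKE_* macros used above come from base/memory_tool.h, which is not part of this hunk. Below is a minimal sketch of how such a header can dispatch between AddressSanitizer and Valgrind; it is an illustration under stated assumptions, not the actual ART header. Only the ASAN_*/VALGRIND_* client macros are real APIs, and ASan only distinguishes poisoned from unpoisoned, so "undefined" and "noaccess" collapse to the same poisoning there.

    // Sketch only: an assumed shape for base/memory_tool.h, not the real header.
    #if defined(__has_feature)
    # if __has_feature(address_sanitizer)
    #  define SKETCH_HAVE_ASAN 1
    # endif
    #endif

    #ifdef SKETCH_HAVE_ASAN
    # include <sanitizer/asan_interface.h>
    // ASan has a single "poisoned" state; addressability and definedness collapse.
    # define MEMORY_TOOL_MAKE_NOACCESS(p, s)  ASAN_POISON_MEMORY_REGION(p, s)
    # define MEMORY_TOOL_MAKE_UNDEFINED(p, s) ASAN_POISON_MEMORY_REGION(p, s)
    # define MEMORY_TOOL_MAKE_DEFINED(p, s)   ASAN_UNPOISON_MEMORY_REGION(p, s)
    # define RUNNING_ON_MEMORY_TOOL 1U
    static constexpr bool kMemoryToolIsValgrind = false;
    static constexpr bool kMemoryToolAddsRedzones = true;
    #else
    # include <valgrind/valgrind.h>
    # include <valgrind/memcheck.h>
    // Memcheck distinguishes unaddressable (noaccess) from addressable-but-undefined.
    # define MEMORY_TOOL_MAKE_NOACCESS(p, s)  VALGRIND_MAKE_MEM_NOACCESS(p, s)
    # define MEMORY_TOOL_MAKE_UNDEFINED(p, s) VALGRIND_MAKE_MEM_UNDEFINED(p, s)
    # define MEMORY_TOOL_MAKE_DEFINED(p, s)   VALGRIND_MAKE_MEM_DEFINED(p, s)
    # define RUNNING_ON_MEMORY_TOOL RUNNING_ON_VALGRIND
    static constexpr bool kMemoryToolIsValgrind = true;
    static constexpr bool kMemoryToolAddsRedzones = true;
    #endif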
diff --git a/runtime/gc/space/valgrind_malloc_space.h b/runtime/gc/space/memory_tool_malloc_space.h
similarity index 78%
rename from runtime/gc/space/valgrind_malloc_space.h
rename to runtime/gc/space/memory_tool_malloc_space.h
index a6b010a..64c6f35 100644
--- a/runtime/gc/space/valgrind_malloc_space.h
+++ b/runtime/gc/space/memory_tool_malloc_space.h
@@ -14,24 +14,22 @@
  * limitations under the License.
  */
 
-#ifndef ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_H_
-#define ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_H_
+#ifndef ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_H_
+#define ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_H_
 
 #include "malloc_space.h"
 
-#include <valgrind.h>
-
 namespace art {
 namespace gc {
 namespace space {
 
-// A specialization of DlMallocSpace/RosAllocSpace that places valgrind red zones around
-// allocations.
+// A specialization of DlMallocSpace/RosAllocSpace that places memory tool red
+// zones around allocations.
 template <typename BaseMallocSpaceType,
-          size_t kValgrindRedZoneBytes,
+          size_t kMemoryToolRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
-class ValgrindMallocSpace FINAL : public BaseMallocSpaceType {
+class MemoryToolMallocSpace FINAL : public BaseMallocSpaceType {
  public:
   mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                   size_t* usable_size, size_t* bytes_tl_bulk_allocated)
@@ -57,15 +55,15 @@
   size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE;
 
   template <typename... Params>
-  explicit ValgrindMallocSpace(MemMap* mem_map, size_t initial_size, Params... params);
-  virtual ~ValgrindMallocSpace() {}
+  explicit MemoryToolMallocSpace(MemMap* mem_map, size_t initial_size, Params... params);
+  virtual ~MemoryToolMallocSpace() {}
 
  private:
-  DISALLOW_COPY_AND_ASSIGN(ValgrindMallocSpace);
+  DISALLOW_COPY_AND_ASSIGN(MemoryToolMallocSpace);
 };
 
 }  // namespace space
 }  // namespace gc
 }  // namespace art
 
-#endif  // ART_RUNTIME_GC_SPACE_VALGRIND_MALLOC_SPACE_H_
+#endif  // ART_RUNTIME_GC_SPACE_MEMORY_TOOL_MALLOC_SPACE_H_
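Read together, the template parameters are <base space, red-zone bytes, adjust-alloc-size-for-red-zone, use-object-size-for-usable-size>. For reference, the instantiation this patch creates for RosAlloc (see RosAllocSpace::Create further down) can be spelled as an alias; the alias name itself is illustrative only:

    // Illustrative alias; the patch constructs this instantiation inline when a
    // memory tool is detected at runtime.
    using MemoryToolRosAllocSpace =
        MemoryToolMallocSpace<RosAllocSpace,
                              kDefaultMemoryToolRedZoneBytes,
                              /* kAdjustForRedzoneInAllocSize */ false,
                              /* kUseObjSizeForUsable */ true>;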
diff --git a/runtime/gc/space/valgrind_settings.h b/runtime/gc/space/memory_tool_settings.h
similarity index 80%
rename from runtime/gc/space/valgrind_settings.h
rename to runtime/gc/space/memory_tool_settings.h
index 73da0fd..e9333c8 100644
--- a/runtime/gc/space/valgrind_settings.h
+++ b/runtime/gc/space/memory_tool_settings.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef ART_RUNTIME_GC_SPACE_VALGRIND_SETTINGS_H_
-#define ART_RUNTIME_GC_SPACE_VALGRIND_SETTINGS_H_
+#ifndef ART_RUNTIME_GC_SPACE_MEMORY_TOOL_SETTINGS_H_
+#define ART_RUNTIME_GC_SPACE_MEMORY_TOOL_SETTINGS_H_
 
 namespace art {
 namespace gc {
@@ -23,10 +23,10 @@
 
 // Default number of bytes to use as a red zone (rdz). A red zone of this size will be placed before
 // and after each allocation. 8 bytes provides long/double alignment.
-static constexpr size_t kDefaultValgrindRedZoneBytes = 8;
+static constexpr size_t kDefaultMemoryToolRedZoneBytes = 8;
 
 }  // namespace space
 }  // namespace gc
 }  // namespace art
 
-#endif  // ART_RUNTIME_GC_SPACE_VALGRIND_SETTINGS_H_
+#endif  // ART_RUNTIME_GC_SPACE_MEMORY_TOOL_SETTINGS_H_
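The 8-byte default preserves long/double alignment on both sides of an allocation. To make the wrapper's pointer arithmetic concrete, here is a minimal self-contained sketch of the layout it maintains; the helper names are invented for illustration and are not ART functions:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kRedZoneBytes = 8;  // mirrors kDefaultMemoryToolRedZoneBytes

    // Each allocation of num_bytes is backed by num_bytes + 2 * kRedZoneBytes:
    //   [ red zone | num_bytes usable | red zone ]
    // Callers only ever see a pointer one red zone past the block start, so small
    // under- and overflows land in poisoned memory and are reported by the tool.
    inline void* UsableFromBlock(void* block_with_rdz) {
      return reinterpret_cast<uint8_t*>(block_with_rdz) + kRedZoneBytes;
    }

    // Free path: recover the real block start before handing it back to the allocator.
    inline void* BlockFromUsable(void* usable) {
      return reinterpret_cast<uint8_t*>(usable) - kRedZoneBytes;
    }

    // Usable size reported to callers excludes both red zones.
    inline size_t UsableSizeFromBlockSize(size_t block_size) {
      return block_size - 2 * kRedZoneBytes;
    }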
diff --git a/runtime/gc/space/rosalloc_space-inl.h b/runtime/gc/space/rosalloc_space-inl.h
index f94ec23..8bff2b4 100644
--- a/runtime/gc/space/rosalloc_space-inl.h
+++ b/runtime/gc/space/rosalloc_space-inl.h
@@ -17,10 +17,9 @@
 #ifndef ART_RUNTIME_GC_SPACE_ROSALLOC_SPACE_INL_H_
 #define ART_RUNTIME_GC_SPACE_ROSALLOC_SPACE_INL_H_
 
-#include <valgrind.h>
-
+#include "base/memory_tool.h"
 #include "gc/allocator/rosalloc-inl.h"
-#include "gc/space/valgrind_settings.h"
+#include "gc/space/memory_tool_settings.h"
 #include "rosalloc_space.h"
 #include "thread.h"
 
@@ -28,26 +27,26 @@
 namespace gc {
 namespace space {
 
-template<bool kMaybeRunningOnValgrind>
+template<bool kMaybeIsRunningOnMemoryTool>
 inline size_t RosAllocSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
   // obj is a valid object. Use its class in the header to get the size.
   // Don't use verification since the object may be dead if we are sweeping.
   size_t size = obj->SizeOf<kVerifyNone>();
-  bool running_on_valgrind = false;
-  if (kMaybeRunningOnValgrind) {
-    running_on_valgrind = RUNNING_ON_VALGRIND != 0;
-    if (running_on_valgrind) {
-      size += 2 * kDefaultValgrindRedZoneBytes;
+  bool add_redzones = false;
+  if (kMaybeIsRunningOnMemoryTool) {
+    add_redzones = RUNNING_ON_MEMORY_TOOL ? kMemoryToolAddsRedzones : 0;
+    if (add_redzones) {
+      size += 2 * kDefaultMemoryToolRedZoneBytes;
     }
   } else {
-    DCHECK_EQ(RUNNING_ON_VALGRIND, 0U);
+    DCHECK_EQ(RUNNING_ON_MEMORY_TOOL, 0U);
   }
   size_t size_by_size = rosalloc_->UsableSize(size);
   if (kIsDebugBuild) {
-    // On valgrind, the red zone has an impact...
+    // On memory tool, the red zone has an impact...
     const uint8_t* obj_ptr = reinterpret_cast<const uint8_t*>(obj);
     size_t size_by_ptr = rosalloc_->UsableSize(
-        obj_ptr - (running_on_valgrind ? kDefaultValgrindRedZoneBytes : 0));
+        obj_ptr - (add_redzones ? kDefaultMemoryToolRedZoneBytes : 0));
     if (size_by_size != size_by_ptr) {
       LOG(INFO) << "Found a bad sized obj of size " << size
                 << " at " << std::hex << reinterpret_cast<intptr_t>(obj_ptr) << std::dec
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index bc4414d..1a193c3 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -30,7 +30,7 @@
 #include "thread.h"
 #include "thread_list.h"
 #include "utils.h"
-#include "valgrind_malloc_space-inl.h"
+#include "memory_tool_malloc_space-inl.h"
 
 namespace art {
 namespace gc {
@@ -43,7 +43,7 @@
 static constexpr bool kVerifyFreedBytes = false;
 
 // TODO: Fix
-// template class ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
+// template class MemoryToolMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
 
 RosAllocSpace::RosAllocSpace(MemMap* mem_map, size_t initial_size, const std::string& name,
                              art::gc::allocator::RosAlloc* rosalloc, uint8_t* begin, uint8_t* end,
@@ -61,10 +61,10 @@
                                                bool low_memory_mode, bool can_move_objects) {
   DCHECK(mem_map != nullptr);
 
-  bool running_on_valgrind = Runtime::Current()->RunningOnValgrind();
+  bool running_on_memory_tool = Runtime::Current()->IsRunningOnMemoryTool();
 
   allocator::RosAlloc* rosalloc = CreateRosAlloc(mem_map->Begin(), starting_size, initial_size,
-                                                 capacity, low_memory_mode, running_on_valgrind);
+                                                 capacity, low_memory_mode, running_on_memory_tool);
   if (rosalloc == nullptr) {
     LOG(ERROR) << "Failed to initialize rosalloc for alloc space (" << name << ")";
     return nullptr;
@@ -78,10 +78,10 @@
 
   // Everything is set so record in immutable structure and leave
   uint8_t* begin = mem_map->Begin();
-  // TODO: Fix RosAllocSpace to support valgrind. There is currently some issues with
+  // TODO: Fix RosAllocSpace to support Valgrind/ASan. There are currently some issues with
   // AllocationSize caused by redzones. b/12944686
-  if (running_on_valgrind) {
-    return new ValgrindMallocSpace<RosAllocSpace, kDefaultValgrindRedZoneBytes, false, true>(
+  if (running_on_memory_tool) {
+    return new MemoryToolMallocSpace<RosAllocSpace, kDefaultMemoryToolRedZoneBytes, false, true>(
         mem_map, initial_size, name, rosalloc, begin, end, begin + capacity, growth_limit,
         can_move_objects, starting_size, low_memory_mode);
   } else {
@@ -134,7 +134,7 @@
 allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start,
                                                    size_t initial_size,
                                                    size_t maximum_size, bool low_memory_mode,
-                                                   bool running_on_valgrind) {
+                                                   bool running_on_memory_tool) {
   // clear errno to allow PLOG on error
   errno = 0;
   // create rosalloc using our backing storage starting at begin and
@@ -145,7 +145,7 @@
       low_memory_mode ?
           art::gc::allocator::RosAlloc::kPageReleaseModeAll :
           art::gc::allocator::RosAlloc::kPageReleaseModeSizeAndEnd,
-      running_on_valgrind);
+      running_on_memory_tool);
   if (rosalloc != nullptr) {
     rosalloc->SetFootprintLimit(initial_size);
   } else {
@@ -180,8 +180,8 @@
                                            void* allocator, uint8_t* begin, uint8_t* end,
                                            uint8_t* limit, size_t growth_limit,
                                            bool can_move_objects) {
-  if (Runtime::Current()->RunningOnValgrind()) {
-    return new ValgrindMallocSpace<RosAllocSpace, kDefaultValgrindRedZoneBytes, false, true>(
+  if (Runtime::Current()->IsRunningOnMemoryTool()) {
+    return new MemoryToolMallocSpace<RosAllocSpace, kDefaultMemoryToolRedZoneBytes, false, true>(
         mem_map, initial_size_, name, reinterpret_cast<allocator::RosAlloc*>(allocator), begin, end,
         limit, growth_limit, can_move_objects, starting_size_, low_memory_mode_);
   } else {
@@ -370,7 +370,7 @@
   delete rosalloc_;
   rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_,
                              NonGrowthLimitCapacity(), low_memory_mode_,
-                             Runtime::Current()->RunningOnValgrind());
+                             Runtime::Current()->IsRunningOnMemoryTool());
   SetFootprintLimit(footprint_limit);
 }
 
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index 36268f7..9dc6f31 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -31,7 +31,7 @@
 namespace space {
 
 // An alloc space implemented using a runs-of-slots memory allocator. Not final as may be
-// overridden by a ValgrindMallocSpace.
+// overridden by a MemoryToolMallocSpace.
 class RosAllocSpace : public MallocSpace {
  public:
   // Create a RosAllocSpace with the requested sizes. The requested
@@ -95,7 +95,7 @@
   ALWAYS_INLINE size_t MaxBytesBulkAllocatedForNonvirtual(size_t num_bytes);
 
   // TODO: NO_THREAD_SAFETY_ANALYSIS because SizeOf() requires that mutator_lock is held.
-  template<bool kMaybeRunningOnValgrind>
+  template<bool kMaybeIsRunningOnMemoryTool>
   size_t AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size)
       NO_THREAD_SAFETY_ANALYSIS;
 
@@ -158,11 +158,11 @@
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
                         size_t maximum_size, bool low_memory_mode) OVERRIDE {
     return CreateRosAlloc(base, morecore_start, initial_size, maximum_size, low_memory_mode,
-                          RUNNING_ON_VALGRIND != 0);
+                          RUNNING_ON_MEMORY_TOOL != 0);
   }
   static allocator::RosAlloc* CreateRosAlloc(void* base, size_t morecore_start, size_t initial_size,
                                              size_t maximum_size, bool low_memory_mode,
-                                             bool running_on_valgrind);
+                                             bool running_on_memory_tool);
 
   void InspectAllRosAlloc(void (*callback)(void *start, void *end, size_t num_bytes, void* callback_arg),
                           void* arg, bool do_null_callback_at_end)
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 2a96278..6ea047f 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -19,6 +19,7 @@
 #include <memory>
 
 #include "gc_root-inl.h"
+#include "gc/collector/garbage_collector.h"
 #include "gc/space/image_space.h"
 #include "mirror/dex_cache.h"
 #include "mirror/object_array-inl.h"
@@ -231,13 +232,21 @@
   CHECK(!allow_new_interns_);
 }
 
+void InternTable::BroadcastForNewInterns() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, *Locks::intern_table_lock_);
+  new_intern_condition_.Broadcast(self);
+}
+
 mirror::String* InternTable::Insert(mirror::String* s, bool is_strong) {
   if (s == nullptr) {
     return nullptr;
   }
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::intern_table_lock_);
-  while (UNLIKELY(!allow_new_interns_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_interns_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     new_intern_condition_.WaitHoldingLocks(self);
   }
   // Check the strong table for a match.
@@ -288,9 +297,9 @@
   return LookupWeak(s) == s;
 }
 
-void InternTable::SweepInternTableWeaks(IsMarkedCallback* callback, void* arg) {
+void InternTable::SweepInternTableWeaks(IsMarkedVisitor* visitor) {
   MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
-  weak_interns_.SweepWeaks(callback, arg);
+  weak_interns_.SweepWeaks(visitor);
 }
 
 void InternTable::AddImageInternTable(gc::space::ImageSpace* image_space) {
@@ -393,16 +402,16 @@
   }
 }
 
-void InternTable::Table::SweepWeaks(IsMarkedCallback* callback, void* arg) {
-  SweepWeaks(&pre_zygote_table_, callback, arg);
-  SweepWeaks(&post_zygote_table_, callback, arg);
+void InternTable::Table::SweepWeaks(IsMarkedVisitor* visitor) {
+  SweepWeaks(&pre_zygote_table_, visitor);
+  SweepWeaks(&post_zygote_table_, visitor);
 }
 
-void InternTable::Table::SweepWeaks(UnorderedSet* set, IsMarkedCallback* callback, void* arg) {
+void InternTable::Table::SweepWeaks(UnorderedSet* set, IsMarkedVisitor* visitor) {
   for (auto it = set->begin(), end = set->end(); it != end;) {
     // This does not need a read barrier because this is called by GC.
     mirror::Object* object = it->Read<kWithoutReadBarrier>();
-    mirror::Object* new_object = callback(object, arg);
+    mirror::Object* new_object = visitor->IsMarked(object);
     if (new_object == nullptr) {
       it = set->Erase(it);
     } else {
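The pattern added here (and mirrored below for JNI weak globals and monitors) is a gate: with the concurrent copying collector, a thread that wants to create a new weak reference waits while its weak-ref access is disabled, and the GC wakes all waiters once reference processing finishes. A self-contained sketch of that handshake, using std::mutex and std::condition_variable in place of ART's own Mutex/ConditionVariable:

    #include <condition_variable>
    #include <mutex>

    class WeakRefAccessGate {
     public:
      // Mutator side: block creation of a new weak reference while access is disabled.
      void WaitUntilEnabled() {
        std::unique_lock<std::mutex> lk(lock_);
        cv_.wait(lk, [this] { return enabled_; });
      }
      // GC side: disable before concurrent reference processing starts.
      void Disable() {
        std::lock_guard<std::mutex> lk(lock_);
        enabled_ = false;
      }
      // GC side: re-enable and wake every blocked mutator. This is the role played by
      // BroadcastForNewInterns()/BroadcastForNewWeakGlobals()/BroadcastForNewMonitors().
      void EnableAndBroadcast() {
        {
          std::lock_guard<std::mutex> lk(lock_);
          enabled_ = true;
        }
        cv_.notify_all();
      }
     private:
      std::mutex lock_;
      std::condition_variable cv_;
      bool enabled_ = true;
    };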
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 97ce73c..67a8b34 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -68,7 +68,7 @@
   // Interns a potentially new string in the 'weak' table. (See above.)
   mirror::String* InternWeak(mirror::String* s) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void SweepInternTableWeaks(IsMarkedCallback* callback, void* arg)
+  void SweepInternTableWeaks(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   bool ContainsWeak(mirror::String* s) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -88,6 +88,7 @@
   void DisallowNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewInternsDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Adds all of the resolved image strings from the image space into the intern table. The
   // advantage of doing this is preventing expensive DexFile::FindStringId calls.
@@ -142,7 +143,7 @@
     void VisitRoots(RootVisitor* visitor)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
-    void SweepWeaks(IsMarkedCallback* callback, void* arg)
+    void SweepWeaks(IsMarkedVisitor* visitor)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
     void SwapPostZygoteWithPreZygote() EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
@@ -162,7 +163,7 @@
     typedef HashSet<GcRoot<mirror::String>, GcRootEmptyFn, StringHashEquals, StringHashEquals,
         TrackingAllocator<GcRoot<mirror::String>, kAllocatorTagInternTable>> UnorderedSet;
 
-    void SweepWeaks(UnorderedSet* set, IsMarkedCallback* callback, void* arg)
+    void SweepWeaks(UnorderedSet* set, IsMarkedVisitor* visitor)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
 
diff --git a/runtime/intern_table_test.cc b/runtime/intern_table_test.cc
index 194d0af..c987180 100644
--- a/runtime/intern_table_test.cc
+++ b/runtime/intern_table_test.cc
@@ -60,9 +60,9 @@
   EXPECT_EQ(2U, t.Size());
 }
 
-class TestPredicate {
+class TestPredicate : public IsMarkedVisitor {
  public:
-  bool IsMarked(const mirror::Object* s) const {
+  mirror::Object* IsMarked(mirror::Object* s) OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     bool erased = false;
     for (auto it = expected_.begin(), end = expected_.end(); it != end; ++it) {
       if (*it == s) {
@@ -72,7 +72,7 @@
       }
     }
     EXPECT_TRUE(erased);
-    return false;
+    return nullptr;
   }
 
   void Expect(const mirror::String* s) {
@@ -87,13 +87,6 @@
   mutable std::vector<const mirror::String*> expected_;
 };
 
-mirror::Object* IsMarkedSweepingCallback(mirror::Object* object, void* arg) {
-  if (reinterpret_cast<TestPredicate*>(arg)->IsMarked(object)) {
-    return object;
-  }
-  return nullptr;
-}
-
 TEST_F(InternTableTest, SweepInternTableWeaks) {
   ScopedObjectAccess soa(Thread::Current());
   InternTable t;
@@ -115,7 +108,7 @@
   p.Expect(s1.Get());
   {
     ReaderMutexLock mu(soa.Self(), *Locks::heap_bitmap_lock_);
-    t.SweepInternTableWeaks(IsMarkedSweepingCallback, &p);
+    t.SweepInternTableWeaks(&p);
   }
 
   EXPECT_EQ(2U, t.Size());
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index f1deacf..36e3aa3 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -473,7 +473,8 @@
     return nullptr;
   }
   MutexLock mu(self, weak_globals_lock_);
-  while (UNLIKELY(!allow_new_weak_globals_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_weak_globals_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     weak_globals_add_condition_.WaitHoldingLocks(self);
   }
   IndirectRef ref = weak_globals_.Add(IRT_FIRST_SEGMENT, obj);
@@ -559,6 +560,13 @@
   CHECK(!allow_new_weak_globals_);
 }
 
+void JavaVMExt::BroadcastForNewWeakGlobals() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, weak_globals_lock_);
+  weak_globals_add_condition_.Broadcast(self);
+}
+
 mirror::Object* JavaVMExt::DecodeGlobal(Thread* self, IndirectRef ref) {
   return globals_.SynchronizedGet(self, &globals_lock_, ref);
 }
@@ -570,7 +578,8 @@
 
 mirror::Object* JavaVMExt::DecodeWeakGlobal(Thread* self, IndirectRef ref) {
   MutexLock mu(self, weak_globals_lock_);
-  while (UNLIKELY(!allow_new_weak_globals_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_weak_globals_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     weak_globals_add_condition_.WaitHoldingLocks(self);
   }
   return weak_globals_.Get(ref);
@@ -757,7 +766,7 @@
   return native_method;
 }
 
-void JavaVMExt::SweepJniWeakGlobals(IsMarkedCallback* callback, void* arg) {
+void JavaVMExt::SweepJniWeakGlobals(IsMarkedVisitor* visitor) {
   MutexLock mu(Thread::Current(), weak_globals_lock_);
   Runtime* const runtime = Runtime::Current();
   for (auto* entry : weak_globals_) {
@@ -765,7 +774,7 @@
     if (!entry->IsNull()) {
       // Since this is called by the GC, we don't need a read barrier.
       mirror::Object* obj = entry->Read<kWithoutReadBarrier>();
-      mirror::Object* new_obj = callback(obj, arg);
+      mirror::Object* new_obj = visitor->IsMarked(obj);
       if (new_obj == nullptr) {
         new_obj = runtime->GetClearedJniWeakGlobal();
       }
diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h
index 4fdf45a..97fbbc5 100644
--- a/runtime/java_vm_ext.h
+++ b/runtime/java_vm_ext.h
@@ -108,6 +108,7 @@
   void DisallowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewWeakGlobalsDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   jobject AddGlobalRef(Thread* self, mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -119,7 +120,7 @@
 
   void DeleteWeakGlobalRef(Thread* self, jweak obj);
 
-  void SweepJniWeakGlobals(IsMarkedCallback* callback, void* arg)
+  void SweepJniWeakGlobals(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   mirror::Object* DecodeGlobal(Thread* self, IndirectRef ref)
diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc
index 7e640c6..dbae7f8 100644
--- a/runtime/mem_map.cc
+++ b/runtime/mem_map.cc
@@ -16,6 +16,7 @@
 
 #include "mem_map.h"
 
+#include "base/memory_tool.h"
 #include <backtrace/BacktraceMap.h>
 #include <inttypes.h>
 
@@ -481,6 +482,12 @@
   uint8_t* page_aligned_expected =
       (expected_ptr == nullptr) ? nullptr : (expected_ptr - page_offset);
 
+  size_t redzone_size = 0;
+  if (RUNNING_ON_MEMORY_TOOL && kMemoryToolAddsRedzones && expected_ptr == nullptr) {
+    redzone_size = kPageSize;
+    page_aligned_byte_count += redzone_size;
+  }
+
   uint8_t* actual = reinterpret_cast<uint8_t*>(mmap(page_aligned_expected,
                                               page_aligned_byte_count,
                                               prot,
@@ -503,15 +510,35 @@
   if (!CheckMapRequest(expected_ptr, actual, page_aligned_byte_count, error_msg)) {
     return nullptr;
   }
+  if (redzone_size != 0) {
+    const uint8_t *real_start = actual + page_offset;
+    const uint8_t *real_end = actual + page_offset + byte_count;
+    const uint8_t *mapping_end = actual + page_aligned_byte_count;
+
+    MEMORY_TOOL_MAKE_NOACCESS(actual, real_start - actual);
+    MEMORY_TOOL_MAKE_NOACCESS(real_end, mapping_end - real_end);
+    page_aligned_byte_count -= redzone_size;
+  }
+
   return new MemMap(filename, actual + page_offset, byte_count, actual, page_aligned_byte_count,
-                    prot, reuse);
+                    prot, reuse, redzone_size);
 }
 
 MemMap::~MemMap() {
   if (base_begin_ == nullptr && base_size_ == 0) {
     return;
   }
+
+  // Unlike Valgrind, AddressSanitizer requires that all manually poisoned memory is unpoisoned
+  // before it is returned to the system.
+  if (redzone_size_ != 0) {
+    MEMORY_TOOL_MAKE_UNDEFINED(
+        reinterpret_cast<char*>(base_begin_) + base_size_ - redzone_size_,
+        redzone_size_);
+  }
+
   if (!reuse_) {
+    MEMORY_TOOL_MAKE_UNDEFINED(base_begin_, base_size_);
     int result = munmap(base_begin_, base_size_);
     if (result == -1) {
       PLOG(FATAL) << "munmap failed";
@@ -534,9 +561,9 @@
 }
 
 MemMap::MemMap(const std::string& name, uint8_t* begin, size_t size, void* base_begin,
-               size_t base_size, int prot, bool reuse)
+               size_t base_size, int prot, bool reuse, size_t redzone_size)
     : name_(name), begin_(begin), size_(size), base_begin_(base_begin), base_size_(base_size),
-      prot_(prot), reuse_(reuse) {
+      prot_(prot), reuse_(reuse), redzone_size_(redzone_size) {
   if (size_ == 0) {
     CHECK(begin_ == nullptr);
     CHECK(base_begin_ == nullptr);
@@ -595,6 +622,8 @@
   int flags = MAP_PRIVATE | MAP_ANONYMOUS;
 #endif
 
+
+  MEMORY_TOOL_MAKE_UNDEFINED(tail_base_begin, tail_base_size);
   // Unmap/map the tail region.
   int result = munmap(tail_base_begin, tail_base_size);
   if (result == -1) {
@@ -778,6 +807,10 @@
   CHECK_ALIGNED(new_size, kPageSize);
   CHECK_EQ(base_size_, size_) << "Unsupported";
   CHECK_LE(new_size, base_size_);
+  MEMORY_TOOL_MAKE_UNDEFINED(
+      reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(BaseBegin()) +
+                              new_size),
+      base_size_ - new_size);
   CHECK_EQ(munmap(reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(BaseBegin()) + new_size),
                   base_size_ - new_size), 0) << new_size << " " << base_size_;
   base_size_ = new_size;
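The destructor and SetSize changes encode an AddressSanitizer rule also noted in the patch: poison (shadow) state is keyed by address rather than by mapping, so any manually poisoned region must be unpoisoned before munmap or a later mapping at the same address inherits stale poison. A simplified standalone sketch of that discipline follows (only a trailing guard page, unlike the patch, which also handles a leading remainder; the ASAN_* macros are the real <sanitizer/asan_interface.h> API and compile to no-ops in non-ASan builds):

    #include <sanitizer/asan_interface.h>
    #include <sys/mman.h>
    #include <cstddef>

    // Map payload_bytes plus one guard page and poison only the guard page; any
    // access past the payload then traps under ASan.
    void* MapWithTrailingRedzone(size_t payload_bytes, size_t page_size) {
      size_t total = payload_bytes + page_size;
      void* base = mmap(nullptr, total, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (base == MAP_FAILED) {
        return nullptr;
      }
      ASAN_POISON_MEMORY_REGION(static_cast<char*>(base) + payload_bytes, page_size);
      return base;
    }

    // Unpoison *before* munmap so the shadow state does not leak into whatever
    // mapping reuses this address range later.
    void UnmapWithTrailingRedzone(void* base, size_t payload_bytes, size_t page_size) {
      size_t total = payload_bytes + page_size;
      ASAN_UNPOISON_MEMORY_REGION(base, total);
      munmap(base, total);
    }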
diff --git a/runtime/mem_map.h b/runtime/mem_map.h
index 14387ee..01e29c9 100644
--- a/runtime/mem_map.h
+++ b/runtime/mem_map.h
@@ -153,7 +153,7 @@
 
  private:
   MemMap(const std::string& name, uint8_t* begin, size_t size, void* base_begin, size_t base_size,
-         int prot, bool reuse) LOCKS_EXCLUDED(Locks::mem_maps_lock_);
+         int prot, bool reuse, size_t redzone_size = 0) LOCKS_EXCLUDED(Locks::mem_maps_lock_);
 
   static void DumpMapsLocked(std::ostream& os, bool terse)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mem_maps_lock_);
@@ -175,6 +175,8 @@
   // unmapping.
   const bool reuse_;
 
+  const size_t redzone_size_;
+
 #if USE_ART_LOW_4G_ALLOCATOR
   static uintptr_t next_mem_pos_;   // Next memory location to check for low_4g extent.
 #endif
diff --git a/runtime/mem_map_test.cc b/runtime/mem_map_test.cc
index f635b5d..13bf5b7 100644
--- a/runtime/mem_map_test.cc
+++ b/runtime/mem_map_test.cc
@@ -18,7 +18,7 @@
 
 #include <memory>
 
-#include <valgrind.h>
+#include "base/memory_tool.h"
 
 #include "gtest/gtest.h"
 
@@ -216,7 +216,7 @@
 TEST_F(MemMapTest, MapAnonymousExactAddr32bitHighAddr) {
   CommonInit();
   // This test may not work under valgrind.
-  if (RUNNING_ON_VALGRIND == 0) {
+  if (RUNNING_ON_MEMORY_TOOL == 0) {
     uintptr_t start_addr = ART_BASE_ADDRESS + 0x1000000;
     std::string error_msg;
     std::unique_ptr<MemMap> map(MemMap::MapAnonymous("MapAnonymousExactAddr32bitHighAddr",
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 4be25d6..fd9c1b1 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1146,16 +1146,24 @@
   CHECK(!allow_new_monitors_);
 }
 
+void MonitorList::BroadcastForNewMonitors() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, monitor_list_lock_);
+  monitor_add_condition_.Broadcast(self);
+}
+
 void MonitorList::Add(Monitor* m) {
   Thread* self = Thread::Current();
   MutexLock mu(self, monitor_list_lock_);
-  while (UNLIKELY(!allow_new_monitors_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_monitors_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     monitor_add_condition_.WaitHoldingLocks(self);
   }
   list_.push_front(m);
 }
 
-void MonitorList::SweepMonitorList(IsMarkedCallback* callback, void* arg) {
+void MonitorList::SweepMonitorList(IsMarkedVisitor* visitor) {
   Thread* self = Thread::Current();
   MutexLock mu(self, monitor_list_lock_);
   for (auto it = list_.begin(); it != list_.end(); ) {
@@ -1163,7 +1171,7 @@
     // Disable the read barrier in GetObject() as this is called by GC.
     mirror::Object* obj = m->GetObject<kWithoutReadBarrier>();
     // The object of a monitor can be null if we have deflated it.
-    mirror::Object* new_obj = obj != nullptr ? callback(obj, arg) : nullptr;
+    mirror::Object* new_obj = obj != nullptr ? visitor->IsMarked(obj) : nullptr;
     if (new_obj == nullptr) {
       VLOG(monitor) << "freeing monitor " << m << " belonging to unmarked object "
                     << obj;
@@ -1176,29 +1184,30 @@
   }
 }
 
-struct MonitorDeflateArgs {
-  MonitorDeflateArgs() : self(Thread::Current()), deflate_count(0) {}
-  Thread* const self;
-  size_t deflate_count;
+class MonitorDeflateVisitor : public IsMarkedVisitor {
+ public:
+  MonitorDeflateVisitor() : self_(Thread::Current()), deflate_count_(0) {}
+
+  virtual mirror::Object* IsMarked(mirror::Object* object) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (Monitor::Deflate(self_, object)) {
+      DCHECK_NE(object->GetLockWord(true).GetState(), LockWord::kFatLocked);
+      ++deflate_count_;
+      // If we deflated, return null so that the monitor gets removed from the array.
+      return nullptr;
+    }
+    return object;  // Monitor was not deflated.
+  }
+
+  Thread* const self_;
+  size_t deflate_count_;
 };
 
-static mirror::Object* MonitorDeflateCallback(mirror::Object* object, void* arg)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  MonitorDeflateArgs* args = reinterpret_cast<MonitorDeflateArgs*>(arg);
-  if (Monitor::Deflate(args->self, object)) {
-    DCHECK_NE(object->GetLockWord(true).GetState(), LockWord::kFatLocked);
-    ++args->deflate_count;
-    // If we deflated, return null so that the monitor gets removed from the array.
-    return nullptr;
-  }
-  return object;  // Monitor was not deflated.
-}
-
 size_t MonitorList::DeflateMonitors() {
-  MonitorDeflateArgs args;
-  Locks::mutator_lock_->AssertExclusiveHeld(args.self);
-  SweepMonitorList(MonitorDeflateCallback, &args);
-  return args.deflate_count;
+  MonitorDeflateVisitor visitor;
+  Locks::mutator_lock_->AssertExclusiveHeld(visitor.self_);
+  SweepMonitorList(&visitor);
+  return visitor.deflate_count_;
 }
 
 MonitorInfo::MonitorInfo(mirror::Object* obj) : owner_(nullptr), entry_count_(0) {
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 8f3a91d..09a6cb6 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -287,11 +287,12 @@
 
   void Add(Monitor* m) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void SweepMonitorList(IsMarkedCallback* callback, void* arg)
+  void SweepMonitorList(IsMarkedVisitor* visitor)
       LOCKS_EXCLUDED(monitor_list_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void DisallowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   void AllowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   void EnsureNewMonitorsDisallowed() LOCKS_EXCLUDED(monitor_list_lock_);
+  void BroadcastForNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   // Returns how many monitors were deflated.
   size_t DeflateMonitors() LOCKS_EXCLUDED(monitor_list_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/oat.h b/runtime/oat.h
index 5706c4e..3451d0f 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '5', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '6', '6', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/object_callbacks.h b/runtime/object_callbacks.h
index df34ce7..4d726ec 100644
--- a/runtime/object_callbacks.h
+++ b/runtime/object_callbacks.h
@@ -21,31 +21,30 @@
 
 namespace art {
 namespace mirror {
-  class Class;
   class Object;
   template<class MirrorType> class HeapReference;
-  class Reference;
 }  // namespace mirror
-class StackVisitor;
 
 // A callback for visiting an object in the heap.
 typedef void (ObjectCallback)(mirror::Object* obj, void* arg);
-// A callback used for marking an object, returns the new address of the object if the object moved.
-typedef mirror::Object* (MarkObjectCallback)(mirror::Object* obj, void* arg) WARN_UNUSED;
 
-typedef void (MarkHeapReferenceCallback)(mirror::HeapReference<mirror::Object>* ref, void* arg);
-typedef void (DelayReferenceReferentCallback)(mirror::Class* klass, mirror::Reference* ref,
-    void* arg);
+class IsMarkedVisitor {
+ public:
+  virtual ~IsMarkedVisitor() {}
+  // Return null if an object is not marked, otherwise returns the new address of that object.
+  // May return the same address as the input if the object did not move.
+  virtual mirror::Object* IsMarked(mirror::Object* obj) = 0;
+};
 
-// A callback for testing if an object is marked, returns null if not marked, otherwise the new
-// address the object (if the object didn't move, returns the object input parameter).
-typedef mirror::Object* (IsMarkedCallback)(mirror::Object* object, void* arg) WARN_UNUSED;
-
-// Returns true if the object in the heap reference is marked, if it is marked and has moved the
-// callback updates the heap reference contain the new value.
-typedef bool (IsHeapReferenceMarkedCallback)(mirror::HeapReference<mirror::Object>* object,
-    void* arg) WARN_UNUSED;
-typedef void (ProcessMarkStackCallback)(void* arg);
+class MarkObjectVisitor {
+ public:
+  virtual ~MarkObjectVisitor() {}
+  // Mark an object and return the new address of an object.
+  // May return the same address as the input if the object did not move.
+  virtual mirror::Object* MarkObject(mirror::Object* obj) = 0;
+  // Mark an object and update the value stored in the heap reference if the object moved.
+  virtual void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj) = 0;
+};
 
 }  // namespace art
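The typedef-based callbacks are replaced by small visitor interfaces. Below is a minimal sketch of how a weak-table sweep consumes IsMarkedVisitor; the table and object types are stand-ins, and only the contract (null means the referent is dead, otherwise the possibly-moved address is returned) comes from the header above:

    #include <vector>

    namespace sketch {

    struct Object {};  // stand-in for mirror::Object

    class IsMarkedVisitor {
     public:
      virtual ~IsMarkedVisitor() {}
      virtual Object* IsMarked(Object* obj) = 0;
    };

    // Example visitor: report every object as live and unmoved.
    class KeepEverythingVisitor : public IsMarkedVisitor {
     public:
      Object* IsMarked(Object* obj) override { return obj; }
    };

    // Generic sweep over a weak table: erase dead entries, rewrite moved ones. This is
    // the shape of InternTable::Table::SweepWeaks and JavaVMExt::SweepJniWeakGlobals.
    void SweepWeakTable(std::vector<Object*>* table, IsMarkedVisitor* visitor) {
      for (auto it = table->begin(); it != table->end();) {
        Object* new_obj = visitor->IsMarked(*it);
        if (new_obj == nullptr) {
          it = table->erase(it);
        } else {
          *it = new_obj;
          ++it;
        }
      }
    }

    }  // namespace sketch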
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 884662d..3b0ca9e 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -26,7 +26,7 @@
 #include <cutils/trace.h>
 #include <signal.h>
 #include <sys/syscall.h>
-#include <valgrind.h>
+#include "base/memory_tool.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -177,7 +177,7 @@
       exit_(nullptr),
       abort_(nullptr),
       stats_enabled_(false),
-      running_on_valgrind_(RUNNING_ON_VALGRIND > 0),
+      is_running_on_memory_tool_(RUNNING_ON_MEMORY_TOOL),
       profiler_started_(false),
       instrumentation_(),
       main_thread_group_(nullptr),
@@ -403,11 +403,11 @@
   }
 }
 
-void Runtime::SweepSystemWeaks(IsMarkedCallback* visitor, void* arg) {
-  GetInternTable()->SweepInternTableWeaks(visitor, arg);
-  GetMonitorList()->SweepMonitorList(visitor, arg);
-  GetJavaVM()->SweepJniWeakGlobals(visitor, arg);
-  GetHeap()->SweepAllocationRecords(visitor, arg);
+void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) {
+  GetInternTable()->SweepInternTableWeaks(visitor);
+  GetMonitorList()->SweepMonitorList(visitor);
+  GetJavaVM()->SweepJniWeakGlobals(visitor);
+  GetHeap()->SweepAllocationRecords(visitor);
 }
 
 bool Runtime::Create(const RuntimeOptions& options, bool ignore_unrecognized) {
@@ -938,7 +938,7 @@
     case kMips64:
       implicit_null_checks_ = true;
       // Installing stack protection does not play well with valgrind.
-      implicit_so_checks_ = (RUNNING_ON_VALGRIND == 0);
+      implicit_so_checks_ = !(RUNNING_ON_MEMORY_TOOL && kMemoryToolIsValgrind);
       break;
     default:
       // Keep the defaults.
@@ -1517,6 +1517,13 @@
   java_vm_->EnsureNewWeakGlobalsDisallowed();
 }
 
+void Runtime::BroadcastForNewSystemWeaks() {
+  CHECK(kUseReadBarrier);
+  monitor_list_->BroadcastForNewMonitors();
+  intern_table_->BroadcastForNewInterns();
+  java_vm_->BroadcastForNewWeakGlobals();
+}
+
 void Runtime::SetInstructionSet(InstructionSet instruction_set) {
   instruction_set_ = instruction_set;
   if ((instruction_set_ == kThumb2) || (instruction_set_ == kArm)) {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 6fd1b07..9ee96a3 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -299,6 +299,7 @@
   void DisallowNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewSystemWeaksDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit all the roots. If only_dirty is true then non-dirty roots won't be visited. If
   // clean_dirty is true then dirty roots will be marked as non-dirty after visiting.
@@ -335,7 +336,7 @@
 
   // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the
   // system weak is updated to be the visitor's returned value.
-  void SweepSystemWeaks(IsMarkedCallback* visitor, void* arg)
+  void SweepSystemWeaks(IsMarkedVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Constant roots are the roots which never change after the runtime is initialized, they only
@@ -511,8 +512,8 @@
     return cpu_abilist_;
   }
 
-  bool RunningOnValgrind() const {
-    return running_on_valgrind_;
+  bool IsRunningOnMemoryTool() const {
+    return is_running_on_memory_tool_;
   }
 
   void SetTargetSdkVersion(int32_t version) {
@@ -677,7 +678,7 @@
   bool stats_enabled_;
   RuntimeStats stats_;
 
-  const bool running_on_valgrind_;
+  const bool is_running_on_memory_tool_;
 
   std::string profile_output_filename_;
   ProfilerOptions profiler_options_;
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 5f965f1..39ef68a 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -67,8 +67,10 @@
 }
 
 inline ThreadState Thread::SetState(ThreadState new_state) {
-  // Cannot use this code to change into Runnable as changing to Runnable should fail if
-  // old_state_and_flags.suspend_request is true.
+  // Should only be used to change between suspended states.
+  // Cannot use this code to change into or from Runnable as changing to Runnable should
+  // fail if old_state_and_flags.suspend_request is true and changing from Runnable might
+  // miss passing an active suspend barrier.
   DCHECK_NE(new_state, kRunnable);
   if (kIsDebugBuild && this != Thread::Current()) {
     std::string name;
@@ -78,6 +80,7 @@
   }
   union StateAndFlags old_state_and_flags;
   old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
+  CHECK_NE(old_state_and_flags.as_struct.state, kRunnable);
   tls32_.state_and_flags.as_struct.state = new_state;
   return static_cast<ThreadState>(old_state_and_flags.as_struct.state);
 }
@@ -126,20 +129,34 @@
     new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
 
-    // CAS the value without a memory ordering as that is given by the lock release below.
+    // CAS the value with release memory ordering.
     bool done =
-        tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
+        tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelease(old_state_and_flags.as_int,
                                                                         new_state_and_flags.as_int);
     if (LIKELY(done)) {
       break;
     }
   }
-  // Release share on mutator_lock_.
-  Locks::mutator_lock_->SharedUnlock(this);
+
+  // Change to non-runnable state, thereby appearing suspended to the system.
+  // Mark the release of the share of the mutator_lock_.
+  Locks::mutator_lock_->TransitionFromRunnableToSuspended(this);
+
+  // Once suspended - check the active suspend barrier flag
+  while (true) {
+    uint16_t current_flags = tls32_.state_and_flags.as_struct.flags;
+    if (LIKELY((current_flags & (kCheckpointRequest | kActiveSuspendBarrier)) == 0)) {
+      break;
+    } else if ((current_flags & kActiveSuspendBarrier) != 0) {
+      PassActiveSuspendBarriers(this);
+    } else {
+      // Impossible
+      LOG(FATAL) << "Fatal, thread transited into suspended without running the checkpoint";
+    }
+  }
 }
 
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
-  bool done = false;
   union StateAndFlags old_state_and_flags;
   old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   int16_t old_state = old_state_and_flags.as_struct.state;
@@ -148,7 +165,26 @@
     Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
     old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
-    if (UNLIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0)) {
+    if (LIKELY(old_state_and_flags.as_struct.flags == 0)) {
+      // Optimize for the return from native code case - this is the fast path.
+      // Atomically change from suspended to runnable if no suspend request pending.
+      union StateAndFlags new_state_and_flags;
+      new_state_and_flags.as_int = old_state_and_flags.as_int;
+      new_state_and_flags.as_struct.state = kRunnable;
+      // CAS the value with acquire memory ordering.
+      if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakAcquire(
+                                                 old_state_and_flags.as_int,
+                                                 new_state_and_flags.as_int))) {
+        // Mark the acquisition of a share of the mutator_lock_.
+        Locks::mutator_lock_->TransitionFromSuspendedToRunnable(this);
+        break;
+      }
+    } else if ((old_state_and_flags.as_struct.flags & kActiveSuspendBarrier) != 0) {
+      PassActiveSuspendBarriers(this);
+    } else if ((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0) {
+      // Impossible
+      LOG(FATAL) << "Fatal, wrong checkpoint flag";
+    } else if ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
       // Wait while our suspend count is non-zero.
       MutexLock mu(this, *Locks::thread_suspend_count_lock_);
       old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
@@ -161,32 +197,13 @@
       }
       DCHECK_EQ(GetSuspendCount(), 0);
     }
-    // Re-acquire shared mutator_lock_ access.
-    Locks::mutator_lock_->SharedLock(this);
-    // Atomically change from suspended to runnable if no suspend request pending.
-    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
-    DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
-    if (LIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) == 0)) {
-      union StateAndFlags new_state_and_flags;
-      new_state_and_flags.as_int = old_state_and_flags.as_int;
-      new_state_and_flags.as_struct.state = kRunnable;
-      // CAS the value without a memory ordering as that is given by the lock acquisition above.
-      done =
-          tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
-                                                                          new_state_and_flags.as_int);
-    }
-    if (UNLIKELY(!done)) {
-      // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
-      Locks::mutator_lock_->SharedUnlock(this);
-    } else {
-      // Run the flip function, if set.
-      Closure* flip_func = GetFlipFunction();
-      if (flip_func != nullptr) {
-        flip_func->Run(this);
-      }
-      return static_cast<ThreadState>(old_state);
-    }
   } while (true);
+  // Run the flip function, if set.
+  Closure* flip_func = GetFlipFunction();
+  if (flip_func != nullptr) {
+    flip_func->Run(this);
+  }
+  return static_cast<ThreadState>(old_state);
 }
 
 inline void Thread::VerifyStack() {
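Before this change the CASes were relaxed and the ordering came from the mutator-lock SharedLock/SharedUnlock (see the removed comments above); the new code carries the ordering in the CAS itself, release when leaving Runnable and acquire when entering it. A self-contained sketch of why the orderings are paired this way; the bit layout is a simplified stand-in for ART's StateAndFlags:

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kFlagsMask      = 0x0000ffffu;  // low 16 bits: flags
    constexpr uint32_t kStateRunnable  = 0u << 16;     // high 16 bits: thread state
    constexpr uint32_t kStateSuspended = 1u << 16;

    // Runnable -> suspended: release ordering publishes this thread's prior writes
    // before any other thread can observe it as suspended.
    inline void TransitionToSuspended(std::atomic<uint32_t>* state_and_flags) {
      uint32_t old_word, new_word;
      do {
        old_word = state_and_flags->load(std::memory_order_relaxed);
        new_word = (old_word & kFlagsMask) | kStateSuspended;  // keep flags, swap state
      } while (!state_and_flags->compare_exchange_weak(old_word, new_word,
                                                       std::memory_order_release));
    }

    // Suspended -> runnable fast path: only succeeds when no flags are pending;
    // acquire ordering makes the suspender's/GC's writes visible before managed
    // code resumes on this thread.
    inline bool TryTransitionToRunnable(std::atomic<uint32_t>* state_and_flags) {
      uint32_t old_word = state_and_flags->load(std::memory_order_relaxed);
      if ((old_word & kFlagsMask) != 0u) {
        return false;  // suspend request, checkpoint, or suspend barrier pending
      }
      uint32_t new_word = kStateRunnable;  // flags are already zero
      return state_and_flags->compare_exchange_weak(old_word, new_word,
                                                    std::memory_order_acquire);
    }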
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 6656fe5..cede998 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -35,6 +35,7 @@
 #include "art_field-inl.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
+#include "base/memory_tool.h"
 #include "base/mutex.h"
 #include "base/timing_logger.h"
 #include "base/to_str.h"
@@ -74,6 +75,14 @@
 #include "vmap_table.h"
 #include "well_known_classes.h"
 
+#if ART_USE_FUTEXES
+#include "linux/futex.h"
+#include "sys/syscall.h"
+#ifndef SYS_futex
+#define SYS_futex __NR_futex
+#endif
+#endif  // ART_USE_FUTEXES
+
 namespace art {
 
 bool Thread::is_started_ = false;
@@ -81,6 +90,12 @@
 ConditionVariable* Thread::resume_cond_ = nullptr;
 const size_t Thread::kStackOverflowImplicitCheckSize = GetStackOverflowReservedBytes(kRuntimeISA);
 
+// For implicit overflow checks we reserve an extra piece of memory at the bottom
+// of the stack (lowest memory).  The higher portion of the memory
+// is protected against reads and the lower is available for use while
+// throwing the StackOverflow exception.
+constexpr size_t kStackOverflowProtectedSize = 4 * kMemoryToolStackGuardSizeScale * KB;
+
 static const char* kThreadNameDuringStartup = "<native thread without managed peer>";
 
 void Thread::InitCardTable() {
@@ -351,6 +366,10 @@
 // to make sure the pages for the stack are mapped in before we call mprotect.  We do
 // this by reading every page from the stack bottom (highest address) to the stack top.
 // We then madvise this away.
+
+// AddressSanitizer does not like the part of this function that reads every stack page.
+// Looks a lot like an out-of-bounds access.
+ATTRIBUTE_NO_SANITIZE_ADDRESS
 void Thread::InstallImplicitProtection() {
   uint8_t* pregion = tlsPtr_.stack_begin - kStackOverflowProtectedSize;
   uint8_t* stack_himem = tlsPtr_.stack_end;
@@ -786,7 +805,8 @@
   LOG(FATAL) << ss.str();
 }
 
-void Thread::ModifySuspendCount(Thread* self, int delta, bool for_debugger) {
+bool Thread::ModifySuspendCount(Thread* self, int delta, AtomicInteger* suspend_barrier,
+                                bool for_debugger) {
   if (kIsDebugBuild) {
     DCHECK(delta == -1 || delta == +1 || delta == -tls32_.debug_suspend_count)
           << delta << " " << tls32_.debug_suspend_count << " " << this;
@@ -798,7 +818,24 @@
   }
   if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {
     UnsafeLogFatalForSuspendCount(self, this);
-    return;
+    return false;
+  }
+
+  uint16_t flags = kSuspendRequest;
+  if (delta > 0 && suspend_barrier != nullptr) {
+    uint32_t available_barrier = kMaxSuspendBarriers;
+    for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+      if (tlsPtr_.active_suspend_barriers[i] == nullptr) {
+        available_barrier = i;
+        break;
+      }
+    }
+    if (available_barrier == kMaxSuspendBarriers) {
+      // No barrier spaces available, we can't add another.
+      return false;
+    }
+    tlsPtr_.active_suspend_barriers[available_barrier] = suspend_barrier;
+    flags |= kActiveSuspendBarrier;
   }
 
   tls32_.suspend_count += delta;
@@ -809,9 +846,76 @@
   if (tls32_.suspend_count == 0) {
     AtomicClearFlag(kSuspendRequest);
   } else {
-    AtomicSetFlag(kSuspendRequest);
+    // Two bits might be set simultaneously.
+    tls32_.state_and_flags.as_atomic_int.FetchAndOrSequentiallyConsistent(flags);
     TriggerSuspend();
   }
+  return true;
+}
+
+bool Thread::PassActiveSuspendBarriers(Thread* self) {
+  // Grab the suspend_count lock and copy the current set of
+  // barriers. Then clear the list and the flag. The ModifySuspendCount
+  // function requires the lock so we prevent a race between setting
+  // the kActiveSuspendBarrier flag and clearing it.
+  AtomicInteger* pass_barriers[kMaxSuspendBarriers];
+  {
+    MutexLock mu(self, *Locks::thread_suspend_count_lock_);
+    if (!ReadFlag(kActiveSuspendBarrier)) {
+      // Quick exit test: the barriers have already been claimed - this is
+      // possible because there may be a race to claim them, and it does not
+      // matter who wins.
+      // All of the callers of this function (except the SuspendAllInternal)
+      // will first test the kActiveSuspendBarrier flag without lock. Here
+      // double-check whether the barrier has been passed with the
+      // suspend_count lock.
+      return false;
+    }
+
+    for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+      pass_barriers[i] = tlsPtr_.active_suspend_barriers[i];
+      tlsPtr_.active_suspend_barriers[i] = nullptr;
+    }
+    AtomicClearFlag(kActiveSuspendBarrier);
+  }
+
+  uint32_t barrier_count = 0;
+  for (uint32_t i = 0; i < kMaxSuspendBarriers; i++) {
+    AtomicInteger* pending_threads = pass_barriers[i];
+    if (pending_threads != nullptr) {
+      bool done = false;
+      do {
+        int32_t cur_val = pending_threads->LoadRelaxed();
+        CHECK_GT(cur_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << cur_val;
+        // Reduce value by 1.
+        done = pending_threads->CompareExchangeWeakRelaxed(cur_val, cur_val - 1);
+#if ART_USE_FUTEXES
+        if (done && (cur_val - 1) == 0) {  // Weak CAS may fail spuriously.
+          futex(pending_threads->Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
+        }
+#endif
+      } while (!done);
+      ++barrier_count;
+    }
+  }
+  CHECK_GT(barrier_count, 0U);
+  return true;
+}
+
+void Thread::ClearSuspendBarrier(AtomicInteger* target) {
+  CHECK(ReadFlag(kActiveSuspendBarrier));
+  bool clear_flag = true;
+  for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+    AtomicInteger* ptr = tlsPtr_.active_suspend_barriers[i];
+    if (ptr == target) {
+      tlsPtr_.active_suspend_barriers[i] = nullptr;
+    } else if (ptr != nullptr) {
+      clear_flag = false;
+    }
+  }
+  if (LIKELY(clear_flag)) {
+    AtomicClearFlag(kActiveSuspendBarrier);
+  }
 }
 
 void Thread::RunCheckpointFunction() {
@@ -1316,7 +1420,11 @@
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
     tlsPtr_.checkpoint_functions[i] = nullptr;
   }
+  for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+    tlsPtr_.active_suspend_barriers[i] = nullptr;
+  }
   tlsPtr_.flip_function = nullptr;
+  tlsPtr_.thread_local_mark_stack = nullptr;
   tls32_.suspended_at_suspend_check = false;
 }
 
@@ -1435,6 +1543,9 @@
   {
     ScopedObjectAccess soa(self);
     Runtime::Current()->GetHeap()->RevokeThreadLocalBuffers(this);
+    if (kUseReadBarrier) {
+      Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->RevokeThreadLocalMarkStack(this);
+    }
   }
 }
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 0e71c08..7826e62 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -46,6 +46,9 @@
 namespace art {
 
 namespace gc {
+namespace accounting {
+  template<class T> class AtomicStack;
+}  // namespace accounting
 namespace collector {
   class SemiSpace;
 }  // namespace collector
@@ -98,7 +101,8 @@
 enum ThreadFlag {
   kSuspendRequest   = 1,  // If set implies that suspend_count_ > 0 and the Thread should enter the
                           // safepoint handler.
-  kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
+  kCheckpointRequest = 2,  // Request that the thread do some checkpoint work and then continue.
+  kActiveSuspendBarrier = 4  // Register that at least 1 suspend barrier needs to be passed.
 };
 
 enum class StackedShadowFrameType {
@@ -138,11 +142,6 @@
 
 class Thread {
  public:
-  // For implicit overflow checks we reserve an extra piece of memory at the bottom
-  // of the stack (lowest memory).  The higher portion of the memory
-  // is protected against reads and the lower is available for use while
-  // throwing the StackOverflow exception.
-  static constexpr size_t kStackOverflowProtectedSize = 4 * KB;
   static const size_t kStackOverflowImplicitCheckSize;
 
   // Creates a new native thread corresponding to the given managed peer.
@@ -223,7 +222,7 @@
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
 
-  void ModifySuspendCount(Thread* self, int delta, bool for_debugger)
+  bool ModifySuspendCount(Thread* self, int delta, AtomicInteger* suspend_barrier, bool for_debugger)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
 
   bool RequestCheckpoint(Closure* function)
@@ -232,6 +231,15 @@
   void SetFlipFunction(Closure* function);
   Closure* GetFlipFunction();
 
+  gc::accounting::AtomicStack<mirror::Object>* GetThreadLocalMarkStack() {
+    CHECK(kUseReadBarrier);
+    return tlsPtr_.thread_local_mark_stack;
+  }
+  void SetThreadLocalMarkStack(gc::accounting::AtomicStack<mirror::Object>* stack) {
+    CHECK(kUseReadBarrier);
+    tlsPtr_.thread_local_mark_stack = stack;
+  }
+
   // Called when thread detected that the thread_suspend_count_ was non-zero. Gives up share of
   // mutator_lock_ and waits until it is resumed and thread_suspend_count_ is zero.
   void FullSuspendCheck()
@@ -772,6 +780,16 @@
     tls32_.debug_method_entry_ = false;
   }
 
+  bool GetWeakRefAccessEnabled() const {
+    CHECK(kUseReadBarrier);
+    return tls32_.weak_ref_access_enabled;
+  }
+
+  void SetWeakRefAccessEnabled(bool enabled) {
+    CHECK(kUseReadBarrier);
+    tls32_.weak_ref_access_enabled = enabled;
+  }
+
   // Activates single step control for debugging. The thread takes the
   // ownership of the given SingleStepControl*. It is deleted by a call
   // to DeactivateSingleStepControl or upon thread destruction.
@@ -846,6 +864,12 @@
 
   void RunCheckpointFunction();
 
+  bool PassActiveSuspendBarriers(Thread* self)
+      LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_);
+
+  void ClearSuspendBarrier(AtomicInteger* target)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
+
   bool ReadFlag(ThreadFlag flag) const {
     return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
   }
@@ -964,6 +988,11 @@
   ThreadState SetStateUnsafe(ThreadState new_state) {
     ThreadState old_state = GetState();
     tls32_.state_and_flags.as_struct.state = new_state;
+    // If transitioning to a suspended state, check for a pending suspend barrier to pass.
+    if (UNLIKELY((new_state != kRunnable) &&
+                 (tls32_.state_and_flags.as_struct.flags & kActiveSuspendBarrier))) {
+      PassActiveSuspendBarriers(this);
+    }
     return old_state;
   }
 
@@ -1034,6 +1063,9 @@
   // Maximum number of checkpoint functions.
   static constexpr uint32_t kMaxCheckpoints = 3;
 
+  // Maximum number of suspend barriers.
+  static constexpr uint32_t kMaxSuspendBarriers = 3;
+
   // Has Thread::Startup been called?
   static bool is_started_;
 
@@ -1060,7 +1092,7 @@
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
       deoptimization_return_value_is_reference(false), suspended_at_suspend_check(false),
-      ready_for_debug_invoke(false), debug_method_entry_(false) {
+      ready_for_debug_invoke(false), debug_method_entry_(false), weak_ref_access_enabled(true) {
     }
 
     union StateAndFlags state_and_flags;
@@ -1117,6 +1149,15 @@
     // True if the thread enters a method. This is used to detect method entry
     // event for the debugger.
     bool32_t debug_method_entry_;
+
+    // True if the thread is allowed to access a weak ref (Reference::GetReferent() and system
+    // weaks) and to potentially mark an object alive/gray. This is used for concurrent reference
+    // processing of the CC collector only. This is thread local so that we can enable/disable weak
+    // ref access by using a checkpoint and avoid a race around the time weak ref access gets
+    // disabled and concurrent reference processing begins (if weak ref access is disabled during a
+    // pause, this is not an issue.) Other collectors use Runtime::DisallowNewSystemWeaks() and
+    // ReferenceProcessor::EnableSlowPath().
+    bool32_t weak_ref_access_enabled;
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {
@@ -1238,6 +1279,12 @@
     // Locks::thread_suspend_count_lock_.
     Closure* checkpoint_functions[kMaxCheckpoints];
 
+    // Pending barriers that require passing, or NULL if non-pending. Installation guarded by
+    // Locks::thread_suspend_count_lock_.
+    // They work effectively as art::Barrier, but implemented directly using AtomicInteger and futex
+    // to avoid additional cost of a mutex and a condition variable, as used in art::Barrier.
+    AtomicInteger* active_suspend_barriers[kMaxSuspendBarriers];
+
     // Entrypoint function pointers.
     // TODO: move this to more of a global offset table model to avoid per-thread duplication.
     InterpreterEntryPoints interpreter_entrypoints;
@@ -1268,6 +1315,9 @@
 
     // Current method verifier, used for root marking.
     verifier::MethodVerifier* method_verifier;
+
+    // Thread-local mark stack for the concurrent copying collector.
+    gc::accounting::AtomicStack<mirror::Object>* thread_local_mark_stack;
   } tlsPtr_;
 
   // Guards the 'interrupted_' and 'wait_monitor_' members.
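The tls32_.weak_ref_access_enabled flag above is kept per-thread precisely so the CC collector can flip it with a checkpoint instead of a global flag. Below is a rough sketch of that pattern; it is an illustration only, not the actual collector code: DisableWeakRefAccessClosure and DisableWeakRefAccessOnAllThreads are hypothetical names, and only the Thread/ThreadList/Closure APIs visible in this change are assumed.

#include "thread.h"       // For Thread::SetWeakRefAccessEnabled() (added above).
#include "thread_list.h"  // For ThreadList::RunCheckpoint().

// Hypothetical sketch; assumes ART's Closure interface (virtual void Run(Thread*)), the same
// interface RunCheckpoint() invokes.
class DisableWeakRefAccessClosure : public Closure {
 public:
  void Run(Thread* thread) override {
    // Runs on the target thread itself (or on its behalf while it is suspended), so the write to
    // the thread-local flag cannot race with that thread reading it while runnable.
    thread->SetWeakRefAccessEnabled(false);
  }
};

void DisableWeakRefAccessOnAllThreads(ThreadList* thread_list) {
  DisableWeakRefAccessClosure closure;
  // Include suspended threads: they cannot run the closure themselves, so RunCheckpoint() runs it
  // for them while holding their suspend count (see thread_list.cc below).
  thread_list->RunCheckpoint(&closure, /* include suspended */ true);
}

Newly attaching threads pick up the current state in ThreadList::Register() (see the thread_list.cc hunk below), which is why the flag defaults to true in the tls32_ constructor.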
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 7e8128f..7c40eb7 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -32,6 +32,7 @@
 #include "base/time_utils.h"
 #include "base/timing_logger.h"
 #include "debugger.h"
+#include "gc/collector/concurrent_copying.h"
 #include "jni_internal.h"
 #include "lock_word.h"
 #include "monitor.h"
@@ -40,6 +41,14 @@
 #include "trace.h"
 #include "well_known_classes.h"
 
+#if ART_USE_FUTEXES
+#include "linux/futex.h"
+#include "sys/syscall.h"
+#ifndef SYS_futex
+#define SYS_futex __NR_futex
+#endif
+#endif  // ART_USE_FUTEXES
+
 namespace art {
 
 static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5);
@@ -278,7 +287,7 @@
               // Spurious fail, try again.
               continue;
             }
-            thread->ModifySuspendCount(self, +1, false);
+            thread->ModifySuspendCount(self, +1, nullptr, false);
             suspended_count_modified_threads.push_back(thread);
             break;
           }
@@ -316,7 +325,7 @@
     checkpoint_function->Run(thread);
     {
       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
-      thread->ModifySuspendCount(self, -1, false);
+      thread->ModifySuspendCount(self, -1, nullptr, false);
     }
   }
 
@@ -386,7 +395,7 @@
       if (thread == self) {
         continue;
       }
-      thread->ModifySuspendCount(self, +1, false);
+      thread->ModifySuspendCount(self, +1, nullptr, false);
     }
   }
 
@@ -413,7 +422,7 @@
       thread->SetFlipFunction(thread_flip_visitor);
       if (thread->IsSuspendedAtSuspendCheck()) {
         // The thread will resume right after the broadcast.
-        thread->ModifySuspendCount(self, -1, false);
+        thread->ModifySuspendCount(self, -1, nullptr, false);
         runnable_threads.push_back(thread);
       } else {
         other_threads.push_back(thread);
@@ -439,7 +448,7 @@
   {
     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
     for (const auto& thread : other_threads) {
-      thread->ModifySuspendCount(self, -1, false);
+      thread->ModifySuspendCount(self, -1, nullptr, false);
     }
     Thread::resume_cond_->Broadcast(self);
   }
@@ -458,28 +467,9 @@
   ATRACE_BEGIN("Suspending mutator threads");
   const uint64_t start_time = NanoTime();
 
-  Locks::mutator_lock_->AssertNotHeld(self);
-  Locks::thread_list_lock_->AssertNotHeld(self);
-  Locks::thread_suspend_count_lock_->AssertNotHeld(self);
-  if (kDebugLocking && self != nullptr) {
-    CHECK_NE(self->GetState(), kRunnable);
-  }
-  {
-    MutexLock mu(self, *Locks::thread_list_lock_);
-    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
-    // Update global suspend all state for attaching threads.
-    ++suspend_all_count_;
-    // Increment everybody's suspend count (except our own).
-    for (const auto& thread : list_) {
-      if (thread == self) {
-        continue;
-      }
-      VLOG(threads) << "requesting thread suspend: " << *thread;
-      thread->ModifySuspendCount(self, +1, false);
-    }
-  }
-
-  // Block on the mutator lock until all Runnable threads release their share of access.
+  SuspendAllInternal(self, self);
+  // All threads are known to have suspended (but a thread may still own the mutator lock).
+  // Make sure this thread grabs exclusive access to the mutator lock and its protected data.
 #if HAVE_TIMED_RWLOCK
   while (true) {
     if (Locks::mutator_lock_->ExclusiveLockWithTimeout(self, kThreadSuspendTimeoutMs, 0)) {
@@ -519,6 +509,114 @@
   }
 }
 
+// Ensures all threads running Java code suspend and that those not running Java code don't
+// start. The debugger thread might be set to kRunnable for a short period of time after
+// SuspendAllInternal returns. This is safe because it will be set back to a suspended state
+// before SuspendAll returns.
+void ThreadList::SuspendAllInternal(Thread* self, Thread* ignore1, Thread* ignore2,
+                                    bool debug_suspend) {
+  Locks::mutator_lock_->AssertNotExclusiveHeld(self);
+  Locks::thread_list_lock_->AssertNotHeld(self);
+  Locks::thread_suspend_count_lock_->AssertNotHeld(self);
+  if (kDebugLocking && self != nullptr) {
+    CHECK_NE(self->GetState(), kRunnable);
+  }
+
+  // First request that all threads suspend, then wait for them to suspend before
+  // returning. This suspension scheme also relies on other behaviour:
+  // 1. Threads cannot be deleted while they are suspended or have a suspend-
+  //    request flag set (see Unregister() below).
+  // 2. When threads are created, they are created in a suspended state (actually
+  //    kNative) and will never begin executing Java code without first checking
+  //    the suspend-request flag.
+
+  // The atomic counter for the number of threads that need to pass the barrier.
+  AtomicInteger pending_threads;
+  uint32_t num_ignored = 0;
+  if (ignore1 != nullptr) {
+    ++num_ignored;
+  }
+  if (ignore2 != nullptr && ignore1 != ignore2) {
+    ++num_ignored;
+  }
+  {
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+    // Update global suspend all state for attaching threads.
+    ++suspend_all_count_;
+    if (debug_suspend) {
+      ++debug_suspend_all_count_;
+    }
+    pending_threads.StoreRelaxed(list_.size() - num_ignored);
+    // Increment everybody's suspend count (except those that should be ignored).
+    for (const auto& thread : list_) {
+      if (thread == ignore1 || thread == ignore2) {
+        continue;
+      }
+      VLOG(threads) << "requesting thread suspend: " << *thread;
+      while (true) {
+        if (LIKELY(thread->ModifySuspendCount(self, +1, &pending_threads, debug_suspend))) {
+          break;
+        } else {
+          // Failure means the list of active_suspend_barriers is full; we should release the
+          // thread_suspend_count_lock_ (to avoid deadlock) and wait until the target thread has
+          // executed Thread::PassActiveSuspendBarriers(). Note that we cannot simply wait for
+          // the thread to change to a suspended state, because it might need to run a checkpoint
+          // function before the state change, which also needs thread_suspend_count_lock_.
+
+          // This is very unlikely to happen, since it requires more than kMaxSuspendBarriers
+          // threads to execute SuspendAllInternal() simultaneously while the target thread stays
+          // in kRunnable in the meantime.
+          Locks::thread_suspend_count_lock_->ExclusiveUnlock(self);
+          NanoSleep(100000);
+          Locks::thread_suspend_count_lock_->ExclusiveLock(self);
+        }
+      }
+
+      // Must install the pending_threads counter first, then check thread->IsSuspended() and
+      // clear the counter. Otherwise there's a race with
+      // Thread::TransitionFromRunnableToSuspended() that could cause a thread to miss a call to
+      // PassActiveSuspendBarriers().
+      if (thread->IsSuspended()) {
+        // The thread is already suspended, so it will not pass the barrier itself; clear its
+        // barrier entry and decrement the counter on its behalf.
+        thread->ClearSuspendBarrier(&pending_threads);
+        pending_threads.FetchAndSubSequentiallyConsistent(1);
+      }
+    }
+  }
+
+  // Wait for the barrier to be passed by all runnable threads. This wait
+  // is done with a timeout so that we can detect problems.
+#if ART_USE_FUTEXES
+  timespec wait_timeout;
+  InitTimeSpec(true, CLOCK_MONOTONIC, 10000, 0, &wait_timeout);
+#endif
+  while (true) {
+    int32_t cur_val = pending_threads.LoadRelaxed();
+    if (LIKELY(cur_val > 0)) {
+#if ART_USE_FUTEXES
+      if (futex(pending_threads.Address(), FUTEX_WAIT, cur_val, &wait_timeout, nullptr, 0) != 0) {
+        // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
+        if ((errno != EAGAIN) && (errno != EINTR)) {
+          if (errno == ETIMEDOUT) {
+            LOG(kIsDebugBuild ? FATAL : ERROR) << "Unexpected timeout during suspend all.";
+          } else {
+            PLOG(FATAL) << "futex wait failed for SuspendAllInternal()";
+          }
+        }
+      } else {
+        cur_val = pending_threads.LoadRelaxed();
+        CHECK_EQ(cur_val, 0);
+        break;
+      }
+#else
+      // Spin wait. This is likely to be slow, but ART_USE_FUTEXES is set on most architectures.
+#endif
+    } else {
+      CHECK_EQ(cur_val, 0);
+      break;
+    }
+  }
+}
+
 void ThreadList::ResumeAll() {
   Thread* self = Thread::Current();
 
@@ -549,7 +647,7 @@
       if (thread == self) {
         continue;
       }
-      thread->ModifySuspendCount(self, -1, false);
+      thread->ModifySuspendCount(self, -1, nullptr, false);
     }
 
     // Broadcast a notification to all suspended threads, some or all of
@@ -592,7 +690,7 @@
           << ") thread not within thread list";
       return;
     }
-    thread->ModifySuspendCount(self, -1, for_debugger);
+    thread->ModifySuspendCount(self, -1, nullptr, for_debugger);
   }
 
   {
@@ -644,7 +742,7 @@
           // If we incremented the suspend count but the thread reset its peer, we need to
           // re-decrement it since it is shutting down and may deadlock the runtime in
           // ThreadList::WaitForOtherNonDaemonThreadsToExit.
-          suspended_thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+          suspended_thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
         }
         ThreadSuspendByPeerWarning(self, WARNING, "No such thread for suspend", peer);
         return nullptr;
@@ -667,7 +765,7 @@
           }
           CHECK(suspended_thread == nullptr);
           suspended_thread = thread;
-          suspended_thread->ModifySuspendCount(self, +1, debug_suspension);
+          suspended_thread->ModifySuspendCount(self, +1, nullptr, debug_suspension);
           request_suspension = false;
         } else {
           // If the caller isn't requesting suspension, a suspension should have already occurred.
@@ -696,7 +794,7 @@
           ThreadSuspendByPeerWarning(self, FATAL, "Thread suspension timed out", peer);
           if (suspended_thread != nullptr) {
             CHECK_EQ(suspended_thread, thread);
-            suspended_thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+            suspended_thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
           }
           *timed_out = true;
           return nullptr;
@@ -765,7 +863,7 @@
             // which will allow this thread to be suspended.
             continue;
           }
-          thread->ModifySuspendCount(self, +1, debug_suspension);
+          thread->ModifySuspendCount(self, +1, nullptr, debug_suspension);
           suspended_thread = thread;
         } else {
           CHECK_EQ(suspended_thread, thread);
@@ -794,7 +892,7 @@
         if (total_delay >= MsToNs(kThreadSuspendTimeoutMs)) {
           ThreadSuspendByThreadIdWarning(WARNING, "Thread suspension timed out", thread_id);
           if (suspended_thread != nullptr) {
-            thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+            thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
           }
           *timed_out = true;
           return nullptr;
@@ -831,25 +929,7 @@
 
   VLOG(threads) << *self << " SuspendAllForDebugger starting...";
 
-  {
-    MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
-    {
-      MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
-      // Update global suspend all state for attaching threads.
-      DCHECK_GE(suspend_all_count_, debug_suspend_all_count_);
-      ++suspend_all_count_;
-      ++debug_suspend_all_count_;
-      // Increment everybody's suspend count (except our own).
-      for (const auto& thread : list_) {
-        if (thread == self || thread == debug_thread) {
-          continue;
-        }
-        VLOG(threads) << "requesting thread suspend: " << *thread;
-        thread->ModifySuspendCount(self, +1, true);
-      }
-    }
-  }
-
+  SuspendAllInternal(self, self, debug_thread, true);
   // Block on the mutator lock until all Runnable threads release their share of access then
   // immediately unlock again.
 #if HAVE_TIMED_RWLOCK
@@ -887,7 +967,7 @@
     // to ensure that we're the only one fiddling with the suspend count
     // though.
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
-    self->ModifySuspendCount(self, +1, true);
+    self->ModifySuspendCount(self, +1, nullptr, true);
     CHECK_GT(self->GetSuspendCount(), 0);
 
     VLOG(threads) << *self << " self-suspending (debugger)";
@@ -971,7 +1051,7 @@
           continue;
         }
         VLOG(threads) << "requesting thread resume: " << *thread;
-        thread->ModifySuspendCount(self, -1, true);
+        thread->ModifySuspendCount(self, -1, nullptr, true);
       }
     }
   }
@@ -1000,7 +1080,7 @@
       if (thread == self || thread->GetDebugSuspendCount() == 0) {
         continue;
       }
-      thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), true);
+      thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), nullptr, true);
     }
   }
 
@@ -1053,7 +1133,7 @@
       // daemons.
       CHECK(thread->IsDaemon()) << *thread;
       if (thread != self) {
-        thread->ModifySuspendCount(self, +1, false);
+        thread->ModifySuspendCount(self, +1, nullptr, false);
       }
     }
   }
@@ -1094,13 +1174,19 @@
   // Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While
   // this isn't particularly efficient the suspend counts are most commonly 0 or 1.
   for (int delta = debug_suspend_all_count_; delta > 0; delta--) {
-    self->ModifySuspendCount(self, +1, true);
+    self->ModifySuspendCount(self, +1, nullptr, true);
   }
   for (int delta = suspend_all_count_ - debug_suspend_all_count_; delta > 0; delta--) {
-    self->ModifySuspendCount(self, +1, false);
+    self->ModifySuspendCount(self, +1, nullptr, false);
   }
   CHECK(!Contains(self));
   list_.push_back(self);
+  if (kUseReadBarrier) {
+    // Initialize this according to the state of the CC collector.
+    bool weak_ref_access_enabled =
+        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsWeakRefAccessEnabled();
+    self->SetWeakRefAccessEnabled(weak_ref_access_enabled);
+  }
 }
 
 void ThreadList::Unregister(Thread* self) {
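SuspendAllInternal() above drives the suspend barriers declared in thread.h: the requester publishes a counter of threads that still need to suspend, installs a pointer to it in each target via ModifySuspendCount(), and futex-waits on the counter with a timeout; each target decrements the counter as it reaches a suspended state (Thread::PassActiveSuspendBarriers, hooked into SetStateUnsafe above) and wakes the requester when the count reaches zero. The following is a minimal standalone sketch of that counting-barrier pattern, with hypothetical names (futex_call, PassBarrier, WaitForBarrier) and none of ART's locking, checkpoints, or logging:

#include <atomic>
#include <cerrno>
#include <cstdint>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

// Thin wrapper over the futex syscall; glibc provides no futex() function.
static long futex_call(std::atomic<int32_t>* addr, int op, int32_t val, const timespec* timeout) {
  return syscall(SYS_futex, reinterpret_cast<int32_t*>(addr), op, val, timeout, nullptr, 0);
}

// Target-thread side: called once the thread has reached a suspended state.
void PassBarrier(std::atomic<int32_t>* pending) {
  if (pending->fetch_sub(1, std::memory_order_seq_cst) == 1) {
    // We were the last thread the requester is waiting for; wake it up.
    futex_call(pending, FUTEX_WAKE, 1, nullptr);
  }
}

// Requester side: wait until every target has passed the barrier.
void WaitForBarrier(std::atomic<int32_t>* pending) {
  const timespec timeout = {10, 0};  // Relative 10s timeout so a stuck thread is noticed.
  while (true) {
    int32_t cur = pending->load(std::memory_order_relaxed);
    if (cur <= 0) {
      break;  // All targets have passed the barrier.
    }
    if (futex_call(pending, FUTEX_WAIT, cur, &timeout) != 0 &&
        errno != EAGAIN && errno != EINTR) {
      // ETIMEDOUT or a real error; SuspendAllInternal() logs FATAL/ERROR at this point.
      break;
    }
  }
}

Because the requester itself clears the barrier entry and decrements the counter for threads that are already suspended (the IsSuspended() branch above), the wait loop only ever blocks on threads that are still runnable.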
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 2c1f813..edd1e05 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -155,6 +155,8 @@
 
   bool Contains(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_);
   bool Contains(pid_t tid) EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_);
+  size_t RunCheckpoint(Closure* checkpoint_function, bool include_suspended)
+      LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::thread_suspend_count_lock_);
 
   void DumpUnattachedThreads(std::ostream& os)
       LOCKS_EXCLUDED(Locks::thread_list_lock_);
@@ -166,6 +168,11 @@
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_);
 
+  void SuspendAllInternal(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr,
+                          bool debug_suspend = false)
+      LOCKS_EXCLUDED(Locks::thread_list_lock_,
+                     Locks::thread_suspend_count_lock_);
+
   void AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr)
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_);
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 4923342..194d9fe 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -1094,7 +1094,7 @@
     ArtMethod* current_method, void* ucontext_ptr) {
 #if __linux__
   // b/18119146
-  if (RUNNING_ON_VALGRIND != 0) {
+  if (RUNNING_ON_MEMORY_TOOL != 0) {
     return;
   }
 
diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc
index 66e38b1..f00edff 100644
--- a/runtime/utils_test.cc
+++ b/runtime/utils_test.cc
@@ -26,7 +26,7 @@
 #include "scoped_thread_state_change.h"
 #include "handle_scope-inl.h"
 
-#include <valgrind.h>
+#include "base/memory_tool.h"
 
 namespace art {
 
@@ -358,7 +358,7 @@
     command.push_back("/usr/bin/id");
   }
   std::string error_msg;
-  if (RUNNING_ON_VALGRIND == 0) {
+  if (!(RUNNING_ON_MEMORY_TOOL && kMemoryToolDetectsLeaks)) {
     // Running on valgrind fails due to some memory that leaks in thread alternate signal stacks.
     EXPECT_TRUE(Exec(command, &error_msg));
   }
@@ -372,7 +372,7 @@
   std::vector<std::string> command;
   command.push_back("bogus");
   std::string error_msg;
-  if (RUNNING_ON_VALGRIND == 0) {
+  if (!(RUNNING_ON_MEMORY_TOOL && kMemoryToolDetectsLeaks)) {
     // Running on valgrind fails due to some memory that leaks in thread alternate signal stacks.
     EXPECT_FALSE(Exec(command, &error_msg));
     EXPECT_NE(0U, error_msg.size());
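The valgrind.h includes and RUNNING_ON_VALGRIND checks are replaced with a base/memory_tool.h abstraction so the same call sites cover both Valgrind and sanitizer builds. That header is not part of this change; the sketch below only shows the shape implied by the call sites (RUNNING_ON_MEMORY_TOOL and kMemoryToolDetectsLeaks) and is an assumption, not the actual ART header:

// Hypothetical shape of a memory-tool abstraction header; names taken from the call sites above.
#ifndef SKETCH_MEMORY_TOOL_H_
#define SKETCH_MEMORY_TOOL_H_

#if defined(__SANITIZE_ADDRESS__)
// AddressSanitizer build (GCC-style macro; a Clang build would check
// __has_feature(address_sanitizer)): always "on" a memory tool, and LeakSanitizer reports leaks.
#define RUNNING_ON_MEMORY_TOOL 1U
static constexpr bool kMemoryToolDetectsLeaks = true;
#else
// Otherwise fall back to Valgrind's runtime detection: RUNNING_ON_VALGRIND is zero when the
// process is not running under Valgrind, non-zero otherwise.
#include <valgrind.h>
#define RUNNING_ON_MEMORY_TOOL RUNNING_ON_VALGRIND
static constexpr bool kMemoryToolDetectsLeaks = true;
#endif

#endif  // SKETCH_MEMORY_TOOL_H_

With that shape, the test guards above (!(RUNNING_ON_MEMORY_TOOL && kMemoryToolDetectsLeaks)) skip Exec() under any leak-detecting tool, rather than only under Valgrind.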
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 38973f7..13428ec 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -46,6 +46,10 @@
     $(JILL_JAR)
 endif
 
+ifeq ($(ART_TEST_DEBUG_GC),true)
+  ART_TEST_WITH_STRACE := true
+endif
+
 # Helper to create individual build targets for tests. Must be called with $(eval).
 # $(1): the test number
 define define-build-art-run-test
@@ -615,6 +619,9 @@
   else
     run_test_options += --build-with-javac-dx
   endif
+  ifeq ($(ART_TEST_WITH_STRACE),true)
+    run_test_options += --strace
+  endif
   ifeq ($(ART_TEST_RUN_TEST_ALWAYS_CLEAN),true)
     run_test_options += --always-clean
   endif
diff --git a/test/run-test b/test/run-test
index f5fff09a..bdf680b 100755
--- a/test/run-test
+++ b/test/run-test
@@ -117,6 +117,7 @@
 output="output.txt"
 build_output="build-output.txt"
 cfg_output="graph.cfg"
+strace_output="strace-output.txt"
 lib="libartd.so"
 run_args="--quiet"
 build_args=""
@@ -137,6 +138,7 @@
 basic_verify="false"
 gc_verify="false"
 gc_stress="false"
+strace="false"
 always_clean="no"
 never_clean="no"
 have_dex2oat="yes"
@@ -235,6 +237,10 @@
         run_args="${run_args} --gdb"
         dev_mode="yes"
         shift
+    elif [ "x$1" = "x--strace" ]; then
+        strace="yes"
+        run_args="${run_args} --invoke-with strace --invoke-with -o --invoke-with $tmp_dir/$strace_output"
+        shift
     elif [ "x$1" = "x--zygote" ]; then
         run_args="${run_args} --zygote"
         shift
@@ -750,6 +756,11 @@
         echo '#################### diffs'
         diff --strip-trailing-cr -u "$expected" "$output" | tail -n 2000
         echo '####################'
+        if [ "$strace" = "yes" ]; then
+            echo '#################### strace output'
+            tail -n 2000 "$tmp_dir/$strace_output"
+            echo '####################'
+        fi
         echo ' '
     fi