Merge "Quick compiler: Fix liveness tracking"
diff --git a/Android.mk b/Android.mk
index 01819ee..fe631d9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -449,9 +449,7 @@
 use-art-full:
 	adb root && sleep 3
 	adb shell stop
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell rm -rf $(ART_DALVIK_CACHE_DIR)/*
 	adb shell setprop dalvik.vm.dex2oat-flags ""
 	adb shell setprop dalvik.vm.image-dex2oat-flags ""
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
@@ -461,9 +459,7 @@
 use-art-smart:
 	adb root && sleep 3
 	adb shell stop
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell rm -rf $(ART_DALVIK_CACHE_DIR)/*
 	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=interpret-only"
 	adb shell setprop dalvik.vm.image-dex2oat-flags ""
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
@@ -473,9 +469,7 @@
 use-art-interpret-only:
 	adb root && sleep 3
 	adb shell stop
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell rm -rf $(ART_DALVIK_CACHE_DIR)/*
 	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=interpret-only"
 	adb shell setprop dalvik.vm.image-dex2oat-flags "--compiler-filter=interpret-only"
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
@@ -485,9 +479,7 @@
 use-art-verify-none:
 	adb root && sleep 3
 	adb shell stop
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell rm -rf $(ART_DALVIK_CACHE_DIR)/*
 	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=verify-none"
 	adb shell setprop dalvik.vm.image-dex2oat-flags "--compiler-filter=verify-none"
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
diff --git a/compiler/dex/local_value_numbering.h b/compiler/dex/local_value_numbering.h
index 535b613..6d67afb 100644
--- a/compiler/dex/local_value_numbering.h
+++ b/compiler/dex/local_value_numbering.h
@@ -20,6 +20,7 @@
 #include "compiler_internals.h"
 #include "UniquePtr.h"
 #include "utils/scoped_arena_allocator.h"
+#include "utils/scoped_arena_containers.h"
 
 #define NO_VALUE 0xffff
 #define ARRAY_REF 0xfffe
@@ -75,20 +76,16 @@
   };
 
   // Key is s_reg, value is value name.
-  typedef SafeMap<uint16_t, uint16_t, std::less<uint16_t>,
-      ScopedArenaAllocatorAdapter<std::pair<uint16_t, uint16_t> > > SregValueMap;
+  typedef ScopedArenaSafeMap<uint16_t, uint16_t> SregValueMap;
   // Key is concatenation of opcode, operand1, operand2 and modifier, value is value name.
-  typedef SafeMap<uint64_t, uint16_t, std::less<uint64_t>,
-      ScopedArenaAllocatorAdapter<std::pair<uint64_t, uint16_t> > > ValueMap;
+  typedef ScopedArenaSafeMap<uint64_t, uint16_t> ValueMap;
   // Key represents a memory address, value is generation.
-  typedef SafeMap<MemoryVersionKey, uint16_t, MemoryVersionKeyComparator,
-      ScopedArenaAllocatorAdapter<std::pair<MemoryVersionKey, uint16_t> > > MemoryVersionMap;
+  typedef ScopedArenaSafeMap<MemoryVersionKey, uint16_t, MemoryVersionKeyComparator
+      > MemoryVersionMap;
   // Maps field key to field id for resolved fields.
-  typedef SafeMap<FieldReference, uint32_t, FieldReferenceComparator,
-      ScopedArenaAllocatorAdapter<std::pair<FieldReference, uint16_t> > > FieldIndexMap;
+  typedef ScopedArenaSafeMap<FieldReference, uint32_t, FieldReferenceComparator> FieldIndexMap;
   // A set of value names.
-  typedef std::set<uint16_t, std::less<uint16_t>,
-      ScopedArenaAllocatorAdapter<uint16_t> > ValueNameSet;
+  typedef ScopedArenaSet<uint16_t> ValueNameSet;
 
  public:
   static LocalValueNumbering* Create(CompilationUnit* cu) {
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index 200795e..c3b5a25 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -24,6 +24,7 @@
 #include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "driver/compiler_options.h"
 #include "UniquePtr.h"
+#include "utils/scoped_arena_containers.h"
 
 namespace art {
 
@@ -1205,17 +1206,16 @@
     MethodReferenceComparator devirt_cmp;
   };
 
-  // Map invoke key (see MapEntry) to lowering info index.
-  typedef std::set<MapEntry, MapEntryComparator, ScopedArenaAllocatorAdapter<MapEntry> > InvokeMap;
-
   ScopedArenaAllocator allocator(&cu_->arena_stack);
 
   // All INVOKE instructions take 3 code units and there must also be a RETURN.
   uint32_t max_refs = (current_code_item_->insns_size_in_code_units_ - 1u) / 3u;
 
+  // Map invoke key (see MapEntry) to lowering info index and vice versa.
   // The invoke_map and sequential entries are essentially equivalent to Boost.MultiIndex's
   // multi_index_container with one ordered index and one sequential index.
-  InvokeMap invoke_map(MapEntryComparator(), allocator.Adapter());
+  ScopedArenaSet<MapEntry, MapEntryComparator> invoke_map(MapEntryComparator(),
+                                                          allocator.Adapter());
   const MapEntry** sequential_entries = reinterpret_cast<const MapEntry**>(
       allocator.Alloc(max_refs * sizeof(sequential_entries[0]), kArenaAllocMisc));
 
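The comment above likens invoke_map plus sequential_entries to Boost.MultiIndex. Below is a minimal, self-contained sketch of that pattern with illustrative stand-in names (not part of the patch): an ordered set provides deduplication and sorted lookup, while a vector of pointers into it preserves insertion order. Since std::set iterators remain valid across later insertions, storing pointers to the elements is safe.

    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
      std::set<int> ordered;               // Stand-in for invoke_map (sorted, deduplicated).
      std::vector<const int*> sequential;  // Stand-in for sequential_entries (insertion order).
      const int keys[] = {30, 10, 30, 20};
      for (int key : keys) {
        auto result = ordered.insert(key);
        if (result.second) {               // Only record each distinct key once.
          sequential.push_back(&*result.first);
        }
      }
      for (const int* entry : sequential) {
        std::printf("%d ", *entry);        // Prints "30 10 20": insertion order kept.
      }
      std::printf("\n");
      return 0;
    }
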
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 413b4e0..8e8a593 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -19,6 +19,7 @@
 #include "dataflow_iterator-inl.h"
 #include "dex/quick/dex_file_method_inliner.h"
 #include "dex/quick/dex_file_to_method_inliner_map.h"
+#include "utils/scoped_arena_containers.h"
 
 namespace art {
 
@@ -964,11 +965,9 @@
       }
     };
 
-    typedef std::set<MapEntry, MapEntryComparator, ScopedArenaAllocatorAdapter<MapEntry> >
-        ClassToIndexMap;
-
     ScopedArenaAllocator allocator(&cu_->arena_stack);
-    ClassToIndexMap class_to_index_map(MapEntryComparator(), allocator.Adapter());
+    ScopedArenaSet<MapEntry, MapEntryComparator> class_to_index_map(MapEntryComparator(),
+                                                                    allocator.Adapter());
 
     // First, find all SGET/SPUTs that may need class initialization checks, record INVOKE_STATICs.
     AllNodesIterator iter(this);
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index aab6b46..56f4830 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -86,6 +90,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index f59720b..d0c81d5 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -522,6 +522,21 @@
   return ((lir->opcode == kThumbBUncond) || (lir->opcode == kThumb2BUncond));
 }
 
+bool ArmMir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass ArmMir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On arm, atomic 64-bit load/store requires a core register pair.
+    // Smaller aligned load/store is atomic for both core and fp registers.
+    if (size == k64 || size == kDouble) {
+      return kCoreReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 ArmMir2Lir::ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   // Sanity check - make sure encoding map lines up.
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 1745c18..1afd890 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -952,7 +952,26 @@
   return load;
 }
 
-LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size) {
+LIR* ArmMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) {
+  // Only 64-bit load needs special handling.
+  if (UNLIKELY(size == k64 || size == kDouble)) {
+    DCHECK(!r_dest.IsFloat());  // See RegClassForFieldLoadStore().
+    // If the cpu supports LPAE, aligned LDRD is atomic - fall through to LoadBaseDisp().
+    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
+      // Use LDREXD for the atomic load. (Expect displacement > 0, don't optimize for == 0.)
+      RegStorage r_ptr = AllocTemp();
+      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      LIR* lir = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg());
+      FreeTemp(r_ptr);
+      return lir;
+    }
+  }
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
+LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) {
   // TODO: base this on target.
   if (size == kWord) {
     size = k32;
@@ -1072,6 +1091,42 @@
   return store;
 }
 
+LIR* ArmMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                       OpSize size) {
+  // Only 64-bit store needs special handling.
+  if (UNLIKELY(size == k64 || size == kDouble)) {
+    DCHECK(!r_src.IsFloat());  // See RegClassForFieldLoadStore().
+    // If the cpu supports LPAE, aligned STRD is atomic - fall through to StoreBaseDisp().
+    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
+      // Use STREXD for the atomic store. (Expect displacement > 0, don't optimize for == 0.)
+      RegStorage r_ptr = AllocTemp();
+      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      LIR* fail_target = NewLIR0(kPseudoTargetLabel);
+      // We have only 5 temporary registers available and if r_base, r_src and r_ptr already
+      // take 4, we can't directly allocate 2 more for LDREXD temps. In that case clobber r_ptr
+      // in LDREXD and recalculate it from r_base.
+      RegStorage r_temp = AllocTemp();
+      RegStorage r_temp_high = AllocFreeTemp();  // We may not have another temp.
+      if (r_temp_high.Valid()) {
+        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg());
+        FreeTemp(r_temp_high);
+        FreeTemp(r_temp);
+      } else {
+        // If we don't have another temp, clobber r_ptr in LDREXD and reload it.
+        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg());
+        FreeTemp(r_temp);  // May need the temp for kOpAdd.
+        OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      }
+      LIR* lir = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(),
+                         r_ptr.GetReg());
+      OpCmpImmBranch(kCondNe, r_temp, 0, fail_target);
+      FreeTemp(r_ptr);
+      return lir;
+    }
+  }
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* ArmMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                OpSize size) {
   // TODO: base this on target.
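For readers unfamiliar with the exclusive-monitor instructions used above, here is a hedged sketch of the retry loop that StoreBaseDispVolatile() effectively emits, written as GCC-style ARMv7 inline assembly (modeled on the familiar Linux kernel atomic64 idiom; it builds only for an ARM target and is not part of the patch):

    #include <stdint.h>

    // Atomically store a 64-bit value on ARMv7 without LPAE using LDREXD/STREXD.
    static inline void AtomicStore64(volatile int64_t* addr, int64_t value) {
      int64_t tmp;
      __asm__ __volatile__(
          "1: ldrexd  %0, %H0, [%2]\n"      // Take the exclusive monitor; value unused.
          "   strexd  %0, %3, %H3, [%2]\n"  // %0 := 0 on success, 1 if exclusivity lost.
          "   teq     %0, #0\n"
          "   bne     1b\n"                 // Retry until the exclusive store succeeds.
          : "=&r"(tmp), "=Qo"(*addr)
          : "r"(addr), "r"(value)
          : "cc");
    }

The STREXD status result is the reason the code above allocates r_temp and branches back to fail_target whenever it is non-zero.
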
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 903be10..26084a2 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(A64ThreadOffset offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -86,6 +90,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 7e07e15..e4764eb 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -535,6 +535,20 @@
   return (lir->opcode == kA64B1t);
 }
 
+bool Arm64Mir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass Arm64Mir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On arm64, fp register load/store is atomic only for single bytes.
+    if (size != kSignedByte && size != kUnsignedByte) {
+      return kCoreReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 Arm64Mir2Lir::Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   // Sanity check - make sure encoding map lines up.
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index e46e201..ae17711 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -930,6 +930,13 @@
   return load;
 }
 
+LIR* Arm64Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                        OpSize size) {
+  // LoadBaseDisp() will emit the correct insn for an atomic load on arm64,
+  // assuming r_dest is correctly prepared using RegClassForFieldLoadStore().
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* Arm64Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                                 OpSize size) {
   return LoadBaseDispBody(r_base, displacement, r_dest, size);
@@ -1032,8 +1039,15 @@
   return store;
 }
 
+LIR* Arm64Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                         OpSize size) {
+  // StoreBaseDisp() will emit the correct insn for an atomic store on arm64,
+  // assuming r_src is correctly prepared using RegClassForFieldLoadStore().
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* Arm64Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
-                               OpSize size) {
+                                 OpSize size) {
   return StoreBaseDispBody(r_base, displacement, r_src, size);
 }
 
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 83d5045..732e776 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -500,7 +500,9 @@
                       bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedStaticField(field_info.FastPut(), field_info.IsReferrersClass());
-  if (field_info.FastPut() && !SLOW_FIELD_PATH) {
+  OpSize store_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastPut() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(store_size))) {
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
@@ -550,25 +552,20 @@
       FreeTemp(r_method);
     }
     // rBase now holds static storage base
+    RegisterClass reg_class = RegClassForFieldLoadStore(store_size, field_info.IsVolatile());
     if (is_long_or_double) {
-      RegisterClass register_kind = kAnyReg;
-      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-        // Force long/double volatile stores into SSE registers to avoid tearing.
-        register_kind = kFPReg;
-      }
-      rl_src = LoadValueWide(rl_src, register_kind);
+      rl_src = LoadValueWide(rl_src, reg_class);
     } else {
-      rl_src = LoadValue(rl_src, kAnyReg);
+      rl_src = LoadValue(rl_src, reg_class);
     }
     if (field_info.IsVolatile()) {
       // There might have been a store before this volatile one so insert StoreStore barrier.
       GenMemBarrier(kStoreStore);
-    }
-    OpSize size = LoadStoreOpSize(is_long_or_double, rl_src.ref);
-    StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, size);
-    if (field_info.IsVolatile()) {
+      StoreBaseDispVolatile(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size);
       // A load might follow the volatile store so insert a StoreLoad barrier.
       GenMemBarrier(kStoreLoad);
+    } else {
+      StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size);
     }
     if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
       MarkGCCard(rl_src.reg, r_base);
@@ -588,7 +585,9 @@
                       bool is_long_or_double, bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedStaticField(field_info.FastGet(), field_info.IsReferrersClass());
-  if (field_info.FastGet() && !SLOW_FIELD_PATH) {
+  OpSize load_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastGet() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(load_size))) {
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
@@ -634,23 +633,20 @@
       FreeTemp(r_method);
     }
     // r_base now holds static storage base
-    RegisterClass result_reg_kind = kAnyReg;
-    if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-      // Force long/double volatile loads into SSE registers to avoid tearing.
-      result_reg_kind = kFPReg;
-    }
-    RegLocation rl_result = EvalLoc(rl_dest, result_reg_kind, true);
+    RegisterClass reg_class = RegClassForFieldLoadStore(load_size, field_info.IsVolatile());
+    RegLocation rl_result = EvalLoc(rl_dest, reg_class, true);
 
-    OpSize size = LoadStoreOpSize(is_long_or_double, rl_result.ref);
-    LoadBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_result.reg, size);
-    FreeTemp(r_base);
-
+    int field_offset = field_info.FieldOffset().Int32Value();
     if (field_info.IsVolatile()) {
+      LoadBaseDispVolatile(r_base, field_offset, rl_result.reg, load_size);
       // Without context sensitive analysis, we must issue the most conservative barriers.
       // In this case, either a load or store may follow so we issue both barriers.
       GenMemBarrier(kLoadLoad);
       GenMemBarrier(kLoadStore);
+    } else {
+      LoadBaseDisp(r_base, field_offset, rl_result.reg, load_size);
     }
+    FreeTemp(r_base);
 
     if (is_long_or_double) {
       StoreValueWide(rl_dest, rl_result);
@@ -689,55 +685,29 @@
                       bool is_object) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastGet());
-  if (field_info.FastGet() && !SLOW_FIELD_PATH) {
-    RegLocation rl_result;
-    RegisterClass reg_class = RegClassBySize(size);
+  OpSize load_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastGet() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(load_size))) {
+    RegisterClass reg_class = RegClassForFieldLoadStore(load_size, field_info.IsVolatile());
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     rl_obj = LoadValue(rl_obj, kCoreReg);
+    GenNullCheck(rl_obj.reg, opt_flags);
+    RegLocation rl_result = EvalLoc(rl_dest, reg_class, true);
+    int field_offset = field_info.FieldOffset().Int32Value();
+    if (field_info.IsVolatile()) {
+      LoadBaseDispVolatile(rl_obj.reg, field_offset, rl_result.reg, load_size);
+      MarkPossibleNullPointerException(opt_flags);
+      // Without context sensitive analysis, we must issue the most conservative barriers.
+      // In this case, either a load or store may follow so we issue both barriers.
+      GenMemBarrier(kLoadLoad);
+      GenMemBarrier(kLoadStore);
+    } else {
+      LoadBaseDisp(rl_obj.reg, field_offset, rl_result.reg, load_size);
+      MarkPossibleNullPointerException(opt_flags);
+    }
     if (is_long_or_double) {
-      DCHECK(rl_dest.wide);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        RegisterClass result_reg_kind = kAnyReg;
-        if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-          // Force long/double volatile loads into SSE registers to avoid tearing.
-          result_reg_kind = kFPReg;
-        }
-        rl_result = EvalLoc(rl_dest, result_reg_kind, true);
-        LoadBaseDisp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_result.reg, size);
-        MarkPossibleNullPointerException(opt_flags);
-        if (field_info.IsVolatile()) {
-          // Without context sensitive analysis, we must issue the most conservative barriers.
-          // In this case, either a load or store may follow so we issue both barriers.
-          GenMemBarrier(kLoadLoad);
-          GenMemBarrier(kLoadStore);
-        }
-      } else {
-        RegStorage reg_ptr = AllocTemp();
-        OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, field_info.FieldOffset().Int32Value());
-        rl_result = EvalLoc(rl_dest, reg_class, true);
-        LoadBaseDisp(reg_ptr, 0, rl_result.reg, size);
-        MarkPossibleNullPointerException(opt_flags);
-        if (field_info.IsVolatile()) {
-          // Without context sensitive analysis, we must issue the most conservative barriers.
-          // In this case, either a load or store may follow so we issue both barriers.
-          GenMemBarrier(kLoadLoad);
-          GenMemBarrier(kLoadStore);
-        }
-        FreeTemp(reg_ptr);
-      }
       StoreValueWide(rl_dest, rl_result);
     } else {
-      rl_result = EvalLoc(rl_dest, reg_class, true);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      LoadBaseDisp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_result.reg, k32);
-      MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // Without context sensitive analysis, we must issue the most conservative barriers.
-        // In this case, either a load or store may follow so we issue both barriers.
-        GenMemBarrier(kLoadLoad);
-        GenMemBarrier(kLoadStore);
-      }
       StoreValue(rl_dest, rl_result);
     }
   } else {
@@ -761,47 +731,32 @@
                       bool is_object) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastPut());
-  if (field_info.FastPut() && !SLOW_FIELD_PATH) {
-    RegisterClass reg_class = RegClassBySize(size);
+  OpSize store_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastPut() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(store_size))) {
+    RegisterClass reg_class = RegClassForFieldLoadStore(store_size, field_info.IsVolatile());
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     rl_obj = LoadValue(rl_obj, kCoreReg);
     if (is_long_or_double) {
-      RegisterClass src_reg_kind = kAnyReg;
-      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-        // Force long/double volatile stores into SSE registers to avoid tearing.
-        src_reg_kind = kFPReg;
-      }
-      rl_src = LoadValueWide(rl_src, src_reg_kind);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      RegStorage reg_ptr = AllocTemp();
-      OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, field_info.FieldOffset().Int32Value());
-      if (field_info.IsVolatile()) {
-        // There might have been a store before this volatile one so insert StoreStore barrier.
-        GenMemBarrier(kStoreStore);
-      }
-      StoreBaseDisp(reg_ptr, 0, rl_src.reg, size);
-      MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // A load might follow the volatile store so insert a StoreLoad barrier.
-        GenMemBarrier(kStoreLoad);
-      }
-      FreeTemp(reg_ptr);
+      rl_src = LoadValueWide(rl_src, reg_class);
     } else {
       rl_src = LoadValue(rl_src, reg_class);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      if (field_info.IsVolatile()) {
-        // There might have been a store before this volatile one so insert StoreStore barrier.
-        GenMemBarrier(kStoreStore);
-      }
-      Store32Disp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_src.reg);
+    }
+    GenNullCheck(rl_obj.reg, opt_flags);
+    int field_offset = field_info.FieldOffset().Int32Value();
+    if (field_info.IsVolatile()) {
+      // There might have been a store before this volatile one so insert StoreStore barrier.
+      GenMemBarrier(kStoreStore);
+      StoreBaseDispVolatile(rl_obj.reg, field_offset, rl_src.reg, store_size);
       MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // A load might follow the volatile store so insert a StoreLoad barrier.
-        GenMemBarrier(kStoreLoad);
-      }
-      if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
-        MarkGCCard(rl_src.reg, rl_obj.reg);
-      }
+      // A load might follow the volatile store so insert a StoreLoad barrier.
+      GenMemBarrier(kStoreLoad);
+    } else {
+      StoreBaseDisp(rl_obj.reg, field_offset, rl_src.reg, store_size);
+      MarkPossibleNullPointerException(opt_flags);
+    }
+    if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
+      MarkGCCard(rl_src.reg, rl_obj.reg);
     }
   } else {
     ThreadOffset<4> setter_offset =
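The rewrite above makes the barrier placement uniform across all four field-access paths: a kStoreStore barrier before a volatile store and a kStoreLoad barrier after it, and kLoadLoad plus kLoadStore barriers after a volatile load. A hedged sketch of the same shape in C++11 fence terms (illustrative only; a faithful version would make the access itself a std::atomic operation, which the plain int64_t access here is not):

    #include <atomic>
    #include <cstdint>

    int64_t g_field;  // Stand-in for the Java field's storage.

    void VolatileStoreSketch(int64_t value) {
      std::atomic_thread_fence(std::memory_order_release);  // ~ GenMemBarrier(kStoreStore)
      g_field = value;  // The StoreBaseDispVolatile() part (assumed atomic).
      std::atomic_thread_fence(std::memory_order_seq_cst);  // ~ GenMemBarrier(kStoreLoad)
    }

    int64_t VolatileLoadSketch() {
      int64_t value = g_field;  // The LoadBaseDispVolatile() part (assumed atomic).
      std::atomic_thread_fence(std::memory_order_acquire);  // ~ kLoadLoad + kLoadStore
      return value;
    }
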
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 20fd4b1..90d5a28 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -84,6 +88,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index 8d91aba..570c220 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -555,6 +555,18 @@
   return (lir->opcode == kMipsB);
 }
 
+bool MipsMir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  // No support for 64-bit atomic load/store on mips.
+  return size != k64 && size != kDouble;
+}
+
+RegisterClass MipsMir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  // No support for 64-bit atomic load/store on mips.
+  DCHECK(size != k64 && size != kDouble);
+  // TODO: Verify that both core and fp registers are suitable for smaller sizes.
+  return RegClassBySize(size);
+}
+
 MipsMir2Lir::MipsMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   for (int i = 0; i < kMipsLast; i++) {
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index 8397411..58fbace 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -545,6 +545,12 @@
   return load;
 }
 
+LIR* MipsMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                       OpSize size) {
+  DCHECK(size != k64 && size != kDouble);
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* MipsMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                                OpSize size) {
   // TODO: base this on target.
@@ -640,6 +646,12 @@
   return res;
 }
 
+LIR* MipsMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                        OpSize size) {
+  DCHECK(size != k64 && size != kDouble);
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* MipsMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                 OpSize size) {
   // TODO: base this on target.
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index d10296f..0ffd189 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -128,6 +128,9 @@
   bool wide = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_WIDE));
   bool ref = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_OBJECT));
   OpSize size = LoadStoreOpSize(wide, ref);
+  if (data.is_volatile && !SupportsVolatileLoadStore(size)) {
+    return false;
+  }
 
   // The inliner doesn't distinguish kDouble or kFloat, use shorty.
   bool double_or_float = cu_->shorty[0] == 'F' || cu_->shorty[0] == 'D';
@@ -137,12 +140,14 @@
   LockArg(data.object_arg);
   RegLocation rl_dest = wide ? GetReturnWide(double_or_float) : GetReturn(double_or_float);
   RegStorage reg_obj = LoadArg(data.object_arg);
-  LoadBaseDisp(reg_obj, data.field_offset, rl_dest.reg, size);
   if (data.is_volatile) {
+    LoadBaseDispVolatile(reg_obj, data.field_offset, rl_dest.reg, size);
     // Without context sensitive analysis, we must issue the most conservative barriers.
     // In this case, either a load or store may follow so we issue both barriers.
     GenMemBarrier(kLoadLoad);
     GenMemBarrier(kLoadStore);
+  } else {
+    LoadBaseDisp(reg_obj, data.field_offset, rl_dest.reg, size);
   }
   return true;
 }
@@ -162,6 +167,9 @@
   bool wide = (data.op_variant == InlineMethodAnalyser::IPutVariant(Instruction::IPUT_WIDE));
   bool ref = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_OBJECT));
   OpSize size = LoadStoreOpSize(wide, ref);
+  if (data.is_volatile && !SupportsVolatileLoadStore(size)) {
+    return false;
+  }
 
   // Point of no return - no aborts after this
   GenPrintLabel(mir);
@@ -172,11 +180,11 @@
   if (data.is_volatile) {
     // There might have been a store before this volatile one so insert StoreStore barrier.
     GenMemBarrier(kStoreStore);
-  }
-  StoreBaseDisp(reg_obj, data.field_offset, reg_src, size);
-  if (data.is_volatile) {
+    StoreBaseDispVolatile(reg_obj, data.field_offset, reg_src, size);
     // A load might follow the volatile store so insert a StoreLoad barrier.
     GenMemBarrier(kStoreLoad);
+  } else {
+    StoreBaseDisp(reg_obj, data.field_offset, reg_src, size);
   }
   if (ref) {
     MarkGCCard(reg_src, reg_obj);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index f762047..74245a4 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1003,6 +1003,8 @@
     virtual bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
     virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0;
+    virtual LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) = 0;
     virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) = 0;
     virtual LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest,
@@ -1011,6 +1013,8 @@
                                      int displacement, RegStorage r_dest, OpSize size) = 0;
     virtual LIR* LoadConstantNoClobber(RegStorage r_dest, int value) = 0;
     virtual LIR* LoadConstantWide(RegStorage r_dest, int64_t value) = 0;
+    virtual LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                       OpSize size) = 0;
     virtual LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                OpSize size) = 0;
     virtual LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
@@ -1052,6 +1056,11 @@
     virtual int GetInsnSize(LIR* lir) = 0;
     virtual bool IsUnconditionalBranch(LIR* lir) = 0;
 
+    // Check support for volatile load/store of a given size.
+    virtual bool SupportsVolatileLoadStore(OpSize size) = 0;
+    // Get the register class for load/store of a field.
+    virtual RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) = 0;
+
     // Required for target - Dalvik-level generators.
     virtual void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                                    RegLocation rl_src1, RegLocation rl_src2) = 0;
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 7436e39..b8481e2 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -742,7 +742,7 @@
   EmitPrefixAndOpcode(entry);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
-  EmitImm(entry, static_cast<int16_t>(imm));
+  EmitImm(entry, imm);
 }
 
 void X86Mir2Lir::EmitRegThread(const X86EncodingMap* entry, uint8_t reg, int disp) {
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index c57b813..9648312 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -84,6 +88,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                            RegLocation rl_src2);
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index bc33cb1..c401baf 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -545,6 +545,21 @@
   return (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jmp32);
 }
 
+bool X86Mir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass X86Mir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On x86, atomic 64-bit load/store requires an fp register.
+    // Smaller aligned load/store is atomic for both core and fp registers.
+    if (size == k64 || size == kDouble) {
+      return kFPReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena),
       base_of_code_(nullptr), store_method_addr_(false), store_method_addr_used_(false),
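The kFPReg choice above relies on the fact that on x86 an aligned 8-byte SSE move (movq) is a single atomic memory access, whereas moving a 64-bit value through 32-bit core registers takes two 4-byte accesses that another thread could observe torn. A hedged stand-alone sketch with SSE2 intrinsics (assumes 8-byte-aligned addresses; not part of the patch):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // Copy a 64-bit value with single movq load/store instructions.
    static inline void AtomicCopy64(int64_t* dst, const int64_t* src) {
      __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));  // movq load
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), v);                // movq store
    }
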
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 03312fd..a4e1255 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -666,6 +666,13 @@
   return LoadBaseIndexedDisp(r_base, r_index, scale, 0, r_dest, size);
 }
 
+LIR* X86Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) {
+  // LoadBaseDisp() will emit the correct insn for an atomic load on x86,
+  // assuming r_dest is correctly prepared using RegClassForFieldLoadStore().
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* X86Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) {
   // TODO: base this on target.
@@ -755,6 +762,13 @@
   return StoreBaseIndexedDisp(r_base, r_index, scale, 0, r_src, size);
 }
 
+LIR* X86Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement,
+                                       RegStorage r_src, OpSize size) {
+  // StoreBaseDisp() will emit the correct insn for an atomic store on x86,
+  // assuming r_src is correctly prepared using RegClassForFieldLoadStore().
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* X86Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement,
                                RegStorage r_src, OpSize size) {
   // TODO: base this on target.
diff --git a/compiler/utils/debug_stack.h b/compiler/utils/debug_stack.h
index 2e02b43..1bb0624 100644
--- a/compiler/utils/debug_stack.h
+++ b/compiler/utils/debug_stack.h
@@ -118,7 +118,7 @@
     CheckTop();
   }
   DebugStackIndirectTopRefImpl& operator=(const DebugStackIndirectTopRefImpl& other) {
-    CHECK(ref_ == other->ref_);
+    CHECK(ref_ == other.ref_);
     CheckTop();
     return *this;
   }
diff --git a/compiler/utils/scoped_arena_allocator.h b/compiler/utils/scoped_arena_allocator.h
index d5b003c..c090062 100644
--- a/compiler/utils/scoped_arena_allocator.h
+++ b/compiler/utils/scoped_arena_allocator.h
@@ -235,8 +235,24 @@
 
   template <typename U>
   friend class ScopedArenaAllocatorAdapter;
+
+  template <typename U>
+  friend bool operator==(const ScopedArenaAllocatorAdapter<U>& lhs,
+                         const ScopedArenaAllocatorAdapter<U>& rhs);
 };
 
+template <typename T>
+inline bool operator==(const ScopedArenaAllocatorAdapter<T>& lhs,
+                       const ScopedArenaAllocatorAdapter<T>& rhs) {
+  return lhs.arena_stack_ == rhs.arena_stack_;
+}
+
+template <typename T>
+inline bool operator!=(const ScopedArenaAllocatorAdapter<T>& lhs,
+                       const ScopedArenaAllocatorAdapter<T>& rhs) {
+  return !(lhs == rhs);
+}
+
 inline ScopedArenaAllocatorAdapter<void> ScopedArenaAllocator::Adapter() {
   return ScopedArenaAllocatorAdapter<void>(this);
 }
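A brief illustration of why the new operator== matters: standard containers compare allocators to decide whether memory obtained through one can be released through another (e.g. on swap or move assignment), and two adapters here are interchangeable exactly when they wrap the same arena stack. Hypothetical stand-in types, not ART's:

    #include <cassert>

    template <typename T>
    struct ArenaAdapterSketch {
      void* arena_stack_;  // Stand-in for ScopedArenaAllocatorAdapter::arena_stack_.
    };

    template <typename T>
    bool operator==(const ArenaAdapterSketch<T>& lhs, const ArenaAdapterSketch<T>& rhs) {
      return lhs.arena_stack_ == rhs.arena_stack_;  // Same arena => interchangeable.
    }

    int main() {
      int stack_a = 0, stack_b = 0;
      ArenaAdapterSketch<int> a1{&stack_a}, a2{&stack_a}, b{&stack_b};
      assert(a1 == a2);    // Allocations may be released through either adapter.
      assert(!(a1 == b));  // Different arenas: containers must not mix these.
      return 0;
    }
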
diff --git a/compiler/utils/scoped_arena_containers.h b/compiler/utils/scoped_arena_containers.h
new file mode 100644
index 0000000..c6fefde
--- /dev/null
+++ b/compiler/utils/scoped_arena_containers.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_SCOPED_ARENA_CONTAINERS_H_
+#define ART_COMPILER_UTILS_SCOPED_ARENA_CONTAINERS_H_
+
+#include <vector>
+#include <set>
+
+#include "utils/scoped_arena_allocator.h"
+#include "safe_map.h"
+
+namespace art {
+
+template <typename T>
+using ScopedArenaVector = std::vector<T, ScopedArenaAllocatorAdapter<T> >;
+
+template <typename T, typename Comparator = std::less<T> >
+using ScopedArenaSet = std::set<T, Comparator, ScopedArenaAllocatorAdapter<T> >;
+
+template <typename K, typename V, typename Comparator = std::less<K> >
+using ScopedArenaSafeMap =
+    SafeMap<K, V, Comparator, ScopedArenaAllocatorAdapter<std::pair<const K, V> > >;
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_SCOPED_ARENA_CONTAINERS_H_
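These aliases are what let the three-line typedefs deleted from local_value_numbering.h shrink to single template-ids. A minimal sketch of the same alias-template technique, with std::allocator standing in for the arena adapter so the snippet compiles on its own:

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <map>
    #include <memory>
    #include <set>

    // Stand-in for ScopedArenaAllocatorAdapter<T>; the real adapter carries a
    // pointer to an ArenaStack and allocates from it.
    template <typename T>
    using ArenaAdapterSketch = std::allocator<T>;

    template <typename T, typename Comparator = std::less<T> >
    using ArenaSetSketch = std::set<T, Comparator, ArenaAdapterSketch<T> >;

    template <typename K, typename V, typename Comparator = std::less<K> >
    using ArenaMapSketch =
        std::map<K, V, Comparator, ArenaAdapterSketch<std::pair<const K, V> > >;

    int main() {
      ArenaSetSketch<uint16_t> value_names;            // Same shape as ValueNameSet.
      value_names.insert(42u);
      ArenaMapSketch<uint16_t, uint16_t> sreg_values;  // Same shape as SregValueMap.
      sreg_values[1] = 7;
      std::printf("%zu %d\n", value_names.size(), static_cast<int>(sreg_values[1]));
      return 0;
    }
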
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 7c0befc..3529c27 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -687,6 +687,12 @@
     } else if (feature == "nodiv") {
       // Turn off support for divide instruction.
       result.SetHasDivideInstruction(false);
+    } else if (feature == "lpae") {
+      // Supports Large Physical Address Extension.
+      result.SetHasLpae(true);
+    } else if (feature == "nolpae") {
+      // Turn off support for Large Physical Address Extension.
+      result.SetHasLpae(false);
     } else {
       Usage("Unknown instruction set feature: '%s'", feature.c_str());
     }
diff --git a/runtime/Android.mk b/runtime/Android.mk
index bc971a9..72f1774 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -61,6 +61,7 @@
 	gc/collector/sticky_mark_sweep.cc \
 	gc/gc_cause.cc \
 	gc/heap.cc \
+	gc/reference_processor.cc \
 	gc/reference_queue.cc \
 	gc/space/bump_pointer_space.cc \
 	gc/space/dlmalloc_space.cc \
@@ -114,6 +115,7 @@
 	native/java_lang_Thread.cc \
 	native/java_lang_Throwable.cc \
 	native/java_lang_VMClassLoader.cc \
+	native/java_lang_ref_Reference.cc \
 	native/java_lang_reflect_Array.cc \
 	native/java_lang_reflect_Constructor.cc \
 	native/java_lang_reflect_Field.cc \
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 4438f25..1d05540 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -827,7 +827,7 @@
         sirt_refs.push_back(ref);
       }
     }
-    LOG(DEBUG) << "Used " << sirt_refs.size() << " arrays to fill space.";
+    LOG(INFO) << "Used " << sirt_refs.size() << " arrays to fill space.";
 
     // Allocate simple objects till it fails.
     while (!self->IsExceptionPending()) {
diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S
index d7c88ba..909bd3e 100644
--- a/runtime/arch/x86/asm_support_x86.S
+++ b/runtime/arch/x86/asm_support_x86.S
@@ -86,7 +86,13 @@
     // Symbols.
 #if !defined(__APPLE__)
     #define SYMBOL(name) name
-    #define PLT_SYMBOL(name) name  // ## @PLT  // TODO: Disabled for old clang 3.3
+    #if defined(__clang__) && (__clang_major__ < 4) && (__clang_minor__ < 5)
+        // TODO: Disabled for old clang 3.3; this leads to text relocations and there should be a
+        // better fix.
+        #define PLT_SYMBOL(name) name // ## @PLT
+    #else
+        #define PLT_SYMBOL(name) name ## @PLT
+    #endif
 #else
     // Mac OS' symbols have an _ prefix.
     #define SYMBOL(name) _ ## name
diff --git a/runtime/base/logging.h b/runtime/base/logging.h
index c4461fa..6944278 100644
--- a/runtime/base/logging.h
+++ b/runtime/base/logging.h
@@ -286,17 +286,18 @@
 // and the "-verbose:" command line argument.
 struct LogVerbosity {
   bool class_linker;  // Enabled with "-verbose:class".
-  bool verifier;
   bool compiler;
-  bool heap;
   bool gc;
+  bool heap;
   bool jdwp;
   bool jni;
   bool monitor;
+  bool profiler;
+  bool signals;
   bool startup;
   bool third_party_jni;  // Enabled with "-verbose:third-party-jni".
   bool threads;
-  bool signals;
+  bool verifier;
 };
 
 extern LogVerbosity gLogVerbosity;
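Each bool in this struct gates one VLOG module, enabled from the "-verbose:" command-line list per the comment above the struct. A hedged sketch of how such a check stays a single cheap branch, using the dangling-else-safe macro idiom (illustrative names, not ART's exact macro):

    #include <iostream>

    struct LogVerbositySketch {
      bool signals = false;  // One flag per module, as in LogVerbosity above.
    };
    LogVerbositySketch gVerbositySketch;

    // If the flag is off, the stream expression after 'else' is never evaluated.
    #define VLOG_SKETCH(module) if (!gVerbositySketch.module) {} else std::cerr

    int main() {
      VLOG_SKETCH(signals) << "suppressed\n";      // Flag off: nothing printed.
      gVerbositySketch.signals = true;             // E.g. from "-verbose:signals".
      VLOG_SKETCH(signals) << "Handling fault\n";  // Flag on: printed.
      return 0;
    }
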
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index b50c098..3f35670 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -62,6 +62,7 @@
   kRosAllocBracketLock,
   kRosAllocBulkFreeLock,
   kAllocSpaceLock,
+  kReferenceProcessorLock,
   kDexFileMethodInlinerLock,
   kDexFileToMethodInlinerMapLock,
   kMarkSweepMarkStackLock,
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 1efd2e0..22a0e22 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -595,17 +595,17 @@
 void Dbg::GcDidFinish() {
   if (gDdmHpifWhen != HPIF_WHEN_NEVER) {
     ScopedObjectAccess soa(Thread::Current());
-    LOG(DEBUG) << "Sending heap info to DDM";
+    VLOG(jdwp) << "Sending heap info to DDM";
     DdmSendHeapInfo(gDdmHpifWhen);
   }
   if (gDdmHpsgWhen != HPSG_WHEN_NEVER) {
     ScopedObjectAccess soa(Thread::Current());
-    LOG(DEBUG) << "Dumping heap to DDM";
+    VLOG(jdwp) << "Dumping heap to DDM";
     DdmSendHeapSegments(false);
   }
   if (gDdmNhsgWhen != HPSG_WHEN_NEVER) {
     ScopedObjectAccess soa(Thread::Current());
-    LOG(DEBUG) << "Dumping native heap to DDM";
+    VLOG(jdwp) << "Dumping native heap to DDM";
     DdmSendHeapSegments(true);
   }
 }
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index 283faa2..4d7fd0a 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -35,13 +35,6 @@
 // Static fault manger object accessed by signal handler.
 FaultManager fault_manager;
 
-extern "C" {
-void art_sigsegv_fault() {
-  // Set a breakpoint here to be informed when a SIGSEGV is unhandled by ART.
-  LOG(ERROR)<< "Caught unknown SIGSEGV in ART fault handler";
-}
-}
-
 // Signal handler called on SIGSEGV.
 static void art_fault_handler(int sig, siginfo_t* info, void* context) {
   fault_manager.HandleFault(sig, info, context);
@@ -67,11 +60,15 @@
 }
 
 void FaultManager::HandleFault(int sig, siginfo_t* info, void* context) {
-  LOG(DEBUG) << "Handling fault";
+  // BE CAREFUL ALLOCATING HERE, INCLUDING USING LOG(...)
+  //
+  // If malloc calls abort, it will be holding its lock.
+  // If the handler tries to call malloc, it will deadlock.
+  VLOG(signals) << "Handling fault";
   if (IsInGeneratedCode(context, true)) {
-    LOG(DEBUG) << "in generated code, looking for handler";
+    VLOG(signals) << "in generated code, looking for handler";
     for (const auto& handler : generated_code_handlers_) {
-      LOG(DEBUG) << "invoking Action on handler " << handler;
+      VLOG(signals) << "invoking Action on handler " << handler;
       if (handler->Action(sig, info, context)) {
         return;
       }
@@ -82,10 +79,7 @@
       return;
     }
   }
-
-  // Allow the user to catch this problem with a simple breakpoint in art_sigsegv_fault.
-  art_sigsegv_fault();
-
+  LOG(ERROR) << "Caught unknown SIGSEGV in ART fault handler";
   oldaction_.sa_sigaction(sig, info, context);
 }
 
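The warning added above is worth underlining: if the fault was raised from inside malloc, any logging that allocates can deadlock on malloc's lock. A hedged sketch of an allocation-free fallback built only on write(2), which POSIX lists as async-signal-safe (illustrative helper, not part of the patch):

    #include <string.h>
    #include <unistd.h>

    // Log a fixed message from a signal handler without touching the heap or
    // any locks: bare write() calls to stderr.
    static void SignalSafeLog(const char* msg) {
      ssize_t rc = write(STDERR_FILENO, msg, strlen(msg));
      rc = write(STDERR_FILENO, "\n", 1);
      (void)rc;  // Best effort; nothing sensible to do on failure here.
    }
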
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index b8051c9..5de7026 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -31,6 +31,7 @@
 #include "gc/accounting/mod_union_table.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
+#include "gc/reference_processor.h"
 #include "gc/space/image_space.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
@@ -166,18 +167,9 @@
 void MarkSweep::ProcessReferences(Thread* self) {
   TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  GetHeap()->ProcessReferences(timings_, clear_soft_references_, &IsMarkedCallback,
-                               &MarkObjectCallback, &ProcessMarkStackPausedCallback, this);
-}
-
-void MarkSweep::PreProcessReferences() {
-  if (IsConcurrent()) {
-    // No reason to do this for non-concurrent GC since pre processing soft references only helps
-    // pauses.
-    timings_.NewSplit("PreProcessReferences");
-    GetHeap()->ProcessSoftReferences(timings_, clear_soft_references_, &IsMarkedCallback,
-                                     &MarkObjectCallback, &ProcessMarkStackPausedCallback, this);
-  }
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(
+      true, &timings_, clear_soft_references_, &IsMarkedCallback, &MarkObjectCallback,
+      &ProcessMarkStackCallback, this);
 }
 
 void MarkSweep::PausePhase() {
@@ -192,7 +184,6 @@
     // Scan dirty objects, this is only required if we are not doing concurrent GC.
     RecursiveMarkDirtyObjects(true, accounting::CardTable::kCardDirty);
   }
-  ProcessReferences(self);
   {
     TimingLogger::ScopedSplit split("SwapStacks", &timings_);
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
@@ -210,6 +201,9 @@
   // incorrectly sweep it. This also fixes a race where interning may attempt to return a strong
   // reference to a string that is about to be swept.
   Runtime::Current()->DisallowNewSystemWeaks();
+  // Enable the reference processing slow path; this needs to be done with mutators paused since
+  // there is no lock in the GetReferent fast path.
+  GetHeap()->GetReferenceProcessor()->EnableSlowPath();
 }
 
 void MarkSweep::PreCleanCards() {
@@ -265,7 +259,6 @@
   MarkReachableObjects();
   // Pre-clean dirtied cards to reduce pauses.
   PreCleanCards();
-  PreProcessReferences();
 }
 
 void MarkSweep::UpdateAndMarkModUnion() {
@@ -290,6 +283,8 @@
 void MarkSweep::ReclaimPhase() {
   TimingLogger::ScopedSplit split("ReclaimPhase", &timings_);
   Thread* self = Thread::Current();
+  // Process the references concurrently.
+  ProcessReferences(self);
   SweepSystemWeaks(self);
   Runtime::Current()->AllowNewSystemWeaks();
   {
@@ -383,7 +378,7 @@
       ++mark_sweep_->large_object_mark_;
     }
     space::LargeObjectSpace* large_object_space = mark_sweep_->GetHeap()->GetLargeObjectsSpace();
-    if (UNLIKELY(!IsAligned<kPageSize>(obj) ||
+    if (UNLIKELY(obj == nullptr || !IsAligned<kPageSize>(obj) ||
                  (kIsDebugBuild && !large_object_space->Contains(obj)))) {
       LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
       LOG(ERROR) << "Attempting see if it's a bad root";
@@ -1168,7 +1163,7 @@
   if (kCountJavaLangRefs) {
     ++reference_count_;
   }
-  heap_->DelayReferenceReferent(klass, ref, IsMarkedCallback, this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, ref, IsMarkedCallback, this);
 }
 
 class MarkObjectVisitor {
@@ -1198,8 +1193,8 @@
   ScanObjectVisit(obj, mark_visitor, ref_visitor);
 }
 
-void MarkSweep::ProcessMarkStackPausedCallback(void* arg) {
-  reinterpret_cast<MarkSweep*>(arg)->ProcessMarkStack(true);
+void MarkSweep::ProcessMarkStackCallback(void* arg) {
+  reinterpret_cast<MarkSweep*>(arg)->ProcessMarkStack(false);
 }
 
 void MarkSweep::ProcessMarkStackParallel(size_t thread_count) {
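A hedged sketch of the fast/slow path split implied by EnableSlowPath() and the new kReferenceProcessorLock (names and structure illustrative; the real implementation lives in the new gc/reference_processor.cc): readers check a flag without locking, and the flag is flipped only while mutators are suspended, so the unlocked fast path cannot race with enabling the slow path.

    #include <atomic>
    #include <mutex>

    class ReferenceProcessorSketch {
     public:
      // Called by mutators on Reference.get(); must stay cheap while GC is idle.
      void* GetReferent(void* reference) {
        if (!slow_path_enabled_.load(std::memory_order_relaxed)) {
          return ReadReferentField(reference);    // Fast path: no lock taken.
        }
        std::lock_guard<std::mutex> lock(lock_);  // ~ kReferenceProcessorLock
        // Slow path: wait for concurrent reference processing to decide the
        // referent's fate (details elided in this sketch).
        return ReadReferentField(reference);
      }

      // Must run with all mutators suspended, because GetReferent() reads the
      // flag without holding any lock.
      void EnableSlowPath() { slow_path_enabled_.store(true, std::memory_order_relaxed); }
      void DisableSlowPath() { slow_path_enabled_.store(false, std::memory_order_relaxed); }

     private:
      static void* ReadReferentField(void* reference) { return reference; }  // Stand-in.
      std::atomic<bool> slow_path_enabled_{false};
      std::mutex lock_;
    };
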
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index bfc70d1..3ebc0af 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -123,10 +123,6 @@
   void ProcessReferences(Thread* self)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void PreProcessReferences()
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Update and mark references from immune spaces.
   void UpdateAndMarkModUnion()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -191,8 +187,9 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void ProcessMarkStackPausedCallback(void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+  static void ProcessMarkStackCallback(void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   static void MarkRootParallelCallback(mirror::Object** root, void* arg, uint32_t thread_id,
                                        RootType root_type)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index f5d6299..a406f6d 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -30,6 +30,7 @@
 #include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
+#include "gc/reference_processor.h"
 #include "gc/space/bump_pointer_space.h"
 #include "gc/space/bump_pointer_space-inl.h"
 #include "gc/space/image_space.h"
@@ -64,8 +65,8 @@
 
 static constexpr bool kProtectFromSpace = true;
 static constexpr bool kStoreStackTraces = false;
-static constexpr bool kUseBytesPromoted = true;
 static constexpr size_t kBytesPromotedThreshold = 4 * MB;
+static constexpr size_t kLargeObjectBytesAllocatedThreshold = 16 * MB;
 
 void SemiSpace::BindBitmaps() {
   timings_.StartSplit("BindBitmaps");
@@ -104,8 +105,8 @@
       last_gc_to_space_end_(nullptr),
       bytes_promoted_(0),
       bytes_promoted_since_last_whole_heap_collection_(0),
+      large_object_bytes_allocated_at_last_whole_heap_collection_(0),
       whole_heap_collection_(true),
-      whole_heap_collection_interval_counter_(0),
       collector_name_(name_),
       swap_semi_spaces_(true) {
 }
@@ -162,8 +163,9 @@
 void SemiSpace::ProcessReferences(Thread* self) {
   TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  GetHeap()->ProcessReferences(timings_, clear_soft_references_, &MarkedForwardingAddressCallback,
-                               &MarkObjectCallback, &ProcessMarkStackCallback, this);
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(
+      false, &timings_, clear_soft_references_, &MarkedForwardingAddressCallback,
+      &MarkObjectCallback, &ProcessMarkStackCallback, this);
 }
 
 void SemiSpace::MarkingPhase() {
@@ -187,12 +189,8 @@
     if (gc_cause_ == kGcCauseExplicit || gc_cause_ == kGcCauseForNativeAlloc ||
         clear_soft_references_) {
       // If an explicit, native allocation-triggered, or last attempt
-      // collection, collect the whole heap (and reset the interval
-      // counter to be consistent.)
+      // collection, collect the whole heap.
       whole_heap_collection_ = true;
-      if (!kUseBytesPromoted) {
-        whole_heap_collection_interval_counter_ = 0;
-      }
     }
     if (whole_heap_collection_) {
       VLOG(heap) << "Whole heap collection";
@@ -698,7 +696,8 @@
 // Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
 // marked, put it on the appropriate list in the heap for later processing.
 void SemiSpace::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) {
-  heap_->DelayReferenceReferent(klass, reference, MarkedForwardingAddressCallback, this);
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, reference,
+                                                         MarkedForwardingAddressCallback, this);
 }
 
 class SemiSpaceMarkObjectVisitor {
@@ -798,32 +797,27 @@
     // only space collection at the next collection by updating
     // whole_heap_collection.
     if (!whole_heap_collection_) {
-      if (!kUseBytesPromoted) {
-        // Enable whole_heap_collection once every
-        // kDefaultWholeHeapCollectionInterval collections.
-        --whole_heap_collection_interval_counter_;
-        DCHECK_GE(whole_heap_collection_interval_counter_, 0);
-        if (whole_heap_collection_interval_counter_ == 0) {
-          whole_heap_collection_ = true;
-        }
-      } else {
-        // Enable whole_heap_collection if the bytes promoted since
-        // the last whole heap collection exceeds a threshold.
-        bytes_promoted_since_last_whole_heap_collection_ += bytes_promoted_;
-        if (bytes_promoted_since_last_whole_heap_collection_ >= kBytesPromotedThreshold) {
-          whole_heap_collection_ = true;
-        }
+      // Enable whole_heap_collection if the bytes promoted since the
+      // last whole heap collection or the large object bytes
+      // allocated exceed their thresholds.
+      bytes_promoted_since_last_whole_heap_collection_ += bytes_promoted_;
+      bool bytes_promoted_threshold_exceeded =
+          bytes_promoted_since_last_whole_heap_collection_ >= kBytesPromotedThreshold;
+      uint64_t current_los_bytes_allocated = GetHeap()->GetLargeObjectsSpace()->GetBytesAllocated();
+      uint64_t last_los_bytes_allocated =
+          large_object_bytes_allocated_at_last_whole_heap_collection_;
+      bool large_object_bytes_threshold_exceeded =
+          current_los_bytes_allocated >=
+          last_los_bytes_allocated + kLargeObjectBytesAllocatedThreshold;
+      if (bytes_promoted_threshold_exceeded || large_object_bytes_threshold_exceeded) {
+        whole_heap_collection_ = true;
       }
     } else {
-      if (!kUseBytesPromoted) {
-        DCHECK_EQ(whole_heap_collection_interval_counter_, 0);
-        whole_heap_collection_interval_counter_ = kDefaultWholeHeapCollectionInterval;
-        whole_heap_collection_ = false;
-      } else {
-        // Reset it.
-        bytes_promoted_since_last_whole_heap_collection_ = bytes_promoted_;
-        whole_heap_collection_ = false;
-      }
+      // Reset the counters.
+      bytes_promoted_since_last_whole_heap_collection_ = bytes_promoted_;
+      large_object_bytes_allocated_at_last_whole_heap_collection_ =
+          GetHeap()->GetLargeObjectsSpace()->GetBytesAllocated();
+      whole_heap_collection_ = false;
     }
   }
   // Clear all of the spaces' mark bitmaps.
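
Stripped of the old interval-counter branches, the new policy reduces to an OR of two watermarks. A sketch of the decision in isolation, reusing the two constants from this hunk (with MB as in ART's globals):

    bool ShouldCollectWholeHeap(uint64_t promoted_since_last_whole,
                                uint64_t los_bytes_now,
                                uint64_t los_bytes_at_last_whole) {
      // Promotion-heavy workloads trip the first test; LOS-heavy ones the second.
      return promoted_since_last_whole >= kBytesPromotedThreshold ||
             los_bytes_now >= los_bytes_at_last_whole + kLargeObjectBytesAllocatedThreshold;
    }
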
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 3b3e1b1..9fdf471 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -234,14 +234,14 @@
   // the non-moving space, since the last whole heap collection.
   uint64_t bytes_promoted_since_last_whole_heap_collection_;
 
+  // Used for the generational mode. Keeps track of how many bytes of
+  // large objects were allocated at the last whole heap collection.
+  uint64_t large_object_bytes_allocated_at_last_whole_heap_collection_;
+
   // Used for the generational mode. When true, collect the whole
   // heap. When false, collect only the bump pointer spaces.
   bool whole_heap_collection_;
 
-  // Used for the generational mode. A counter used to enable
-  // whole_heap_collection_ once per interval.
-  int whole_heap_collection_interval_counter_;
-
   // How many objects and bytes we moved, used so that we don't need to get the size of the
   // to_space_ when calculating how many objects and bytes we freed.
   size_t bytes_moved_;
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5d38b02..f2919e8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -39,6 +39,7 @@
 #include "gc/collector/partial_mark_sweep.h"
 #include "gc/collector/semi_space.h"
 #include "gc/collector/sticky_mark_sweep.h"
+#include "gc/reference_processor.h"
 #include "gc/space/bump_pointer_space.h"
 #include "gc/space/dlmalloc_space-inl.h"
 #include "gc/space/image_space.h"
@@ -175,7 +176,7 @@
     large_object_threshold_ = kDefaultLargeObjectThreshold;
     // Background compaction is currently not supported for command line runs.
     if (background_collector_type_ != foreground_collector_type_) {
-      LOG(WARNING) << "Disabling background compaction for non zygote";
+      VLOG(heap) << "Disabling background compaction for non zygote";
       background_collector_type_ = foreground_collector_type_;
     }
   }
@@ -771,102 +772,6 @@
   return FindDiscontinuousSpaceFromObject(obj, true);
 }
 
-struct SoftReferenceArgs {
-  IsMarkedCallback* is_marked_callback_;
-  MarkObjectCallback* mark_callback_;
-  void* arg_;
-};
-
-mirror::Object* Heap::PreserveSoftReferenceCallback(mirror::Object* obj, void* arg) {
-  SoftReferenceArgs* args = reinterpret_cast<SoftReferenceArgs*>(arg);
-  // TODO: Not preserve all soft references.
-  return args->mark_callback_(obj, args->arg_);
-}
-
-void Heap::ProcessSoftReferences(TimingLogger& timings, bool clear_soft,
-                                 IsMarkedCallback* is_marked_callback,
-                                 MarkObjectCallback* mark_object_callback,
-                                 ProcessMarkStackCallback* process_mark_stack_callback, void* arg) {
-  // Unless required to clear soft references with white references, preserve some white referents.
-  if (!clear_soft) {
-    // Don't clear for sticky GC.
-    SoftReferenceArgs soft_reference_args;
-    soft_reference_args.is_marked_callback_ = is_marked_callback;
-    soft_reference_args.mark_callback_ = mark_object_callback;
-    soft_reference_args.arg_ = arg;
-    // References with a marked referent are removed from the list.
-    soft_reference_queue_.PreserveSomeSoftReferences(&PreserveSoftReferenceCallback,
-                                                     &soft_reference_args);
-    process_mark_stack_callback(arg);
-  }
-}
-
-// Process reference class instances and schedule finalizations.
-void Heap::ProcessReferences(TimingLogger& timings, bool clear_soft,
-                             IsMarkedCallback* is_marked_callback,
-                             MarkObjectCallback* mark_object_callback,
-                             ProcessMarkStackCallback* process_mark_stack_callback, void* arg) {
-  timings.StartSplit("(Paused)ProcessReferences");
-  ProcessSoftReferences(timings, clear_soft, is_marked_callback, mark_object_callback,
-                        process_mark_stack_callback, arg);
-  // Clear all remaining soft and weak references with white referents.
-  soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
-  weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
-  timings.EndSplit();
-  // Preserve all white objects with finalize methods and schedule them for finalization.
-  timings.StartSplit("(Paused)EnqueueFinalizerReferences");
-  finalizer_reference_queue_.EnqueueFinalizerReferences(cleared_references_, is_marked_callback,
-                                                        mark_object_callback, arg);
-  process_mark_stack_callback(arg);
-  timings.EndSplit();
-  timings.StartSplit("(Paused)ProcessReferences");
-  // Clear all f-reachable soft and weak references with white referents.
-  soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
-  weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
-  // Clear all phantom references with white referents.
-  phantom_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
-  // At this point all reference queues other than the cleared references should be empty.
-  DCHECK(soft_reference_queue_.IsEmpty());
-  DCHECK(weak_reference_queue_.IsEmpty());
-  DCHECK(finalizer_reference_queue_.IsEmpty());
-  DCHECK(phantom_reference_queue_.IsEmpty());
-  timings.EndSplit();
-}
-
-// Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
-// marked, put it on the appropriate list in the heap for later processing.
-void Heap::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
-                                  IsMarkedCallback is_marked_callback, void* arg) {
-  // klass can be the class of the old object if the visitor already updated the class of ref.
-  DCHECK(klass->IsReferenceClass());
-  mirror::Object* referent = ref->GetReferent();
-  if (referent != nullptr) {
-    mirror::Object* forward_address = is_marked_callback(referent, arg);
-    // Null means that the object is not currently marked.
-    if (forward_address == nullptr) {
-      Thread* self = Thread::Current();
-      // TODO: Remove these locks, and use atomic stacks for storing references?
-      // We need to check that the references haven't already been enqueued since we can end up
-      // scanning the same reference multiple times due to dirty cards.
-      if (klass->IsSoftReferenceClass()) {
-        soft_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
-      } else if (klass->IsWeakReferenceClass()) {
-        weak_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
-      } else if (klass->IsFinalizerReferenceClass()) {
-        finalizer_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
-      } else if (klass->IsPhantomReferenceClass()) {
-        phantom_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
-      } else {
-        LOG(FATAL) << "Invalid reference type " << PrettyClass(klass) << " " << std::hex
-                   << klass->GetAccessFlags();
-      }
-    } else if (referent != forward_address) {
-      // Referent is already marked and we need to update it.
-      ref->SetReferent<false>(forward_address);
-    }
-  }
-}
-
 space::ImageSpace* Heap::GetImageSpace() const {
   for (const auto& space : continuous_spaces_) {
     if (space->IsImageSpace()) {
@@ -1477,7 +1382,7 @@
   ChangeCollector(collector_type);
   tl->ResumeAll();
   // Can't call into java code with all threads suspended.
-  EnqueueClearedReferences();
+  reference_processor_.EnqueueClearedReferences();
   uint64_t duration = NanoTime() - start_time;
   GrowForUtilization(semi_space_collector_);
   FinishGC(self, collector::kGcTypeFull);
@@ -1881,7 +1786,7 @@
   total_bytes_freed_ever_ += collector->GetFreedBytes();
   RequestHeapTrim();
   // Enqueue cleared references.
-  EnqueueClearedReferences();
+  reference_processor_.EnqueueClearedReferences();
   // Grow the heap so that we know when to perform the next GC.
   GrowForUtilization(collector);
   const size_t duration = collector->GetDurationNs();
@@ -1952,9 +1857,9 @@
 // Verify a reference from an object.
 class VerifyReferenceVisitor {
  public:
-  explicit VerifyReferenceVisitor(Heap* heap)
+  explicit VerifyReferenceVisitor(Heap* heap, bool verify_referent)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_)
-      : heap_(heap), failed_(false) {}
+      : heap_(heap), failed_(false), verify_referent_(verify_referent) {}
 
   bool Failed() const {
     return failed_;
@@ -1962,7 +1867,9 @@
 
   void operator()(mirror::Class* klass, mirror::Reference* ref) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    this->operator()(ref, mirror::Reference::ReferentOffset(), false);
+    if (verify_referent_) {
+      this->operator()(ref, mirror::Reference::ReferentOffset(), false);
+    }
   }
 
   void operator()(mirror::Object* obj, MemberOffset offset, bool /*is_static*/) const
@@ -2079,18 +1986,21 @@
  private:
   Heap* const heap_;
   mutable bool failed_;
+  bool verify_referent_;
 };
 
 // Verify all references within an object, for use with HeapBitmap::Visit.
 class VerifyObjectVisitor {
  public:
-  explicit VerifyObjectVisitor(Heap* heap) : heap_(heap), failed_(false) {}
+  explicit VerifyObjectVisitor(Heap* heap, bool verify_referent)
+      : heap_(heap), failed_(false), verify_referent_(verify_referent) {
+  }
 
   void operator()(mirror::Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     // Note: we are verifying the references in obj but not obj itself, this is because obj must
     // be live or else how did we find it in the live bitmap?
-    VerifyReferenceVisitor visitor(heap_);
+    VerifyReferenceVisitor visitor(heap_, verify_referent_);
     // The class doesn't count as a reference but we should verify it anyways.
     obj->VisitReferences<true>(visitor, visitor);
     failed_ = failed_ || visitor.Failed();
@@ -2109,10 +2019,11 @@
  private:
   Heap* const heap_;
   mutable bool failed_;
+  const bool verify_referent_;
 };
 
 // Must do this with mutators suspended since we are directly accessing the allocation stacks.
-bool Heap::VerifyHeapReferences() {
+bool Heap::VerifyHeapReferences(bool verify_referents) {
   Thread* self = Thread::Current();
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   // Lets sort our allocation stacks so that we can efficiently binary search them.
@@ -2121,7 +2032,7 @@
   // Since we sorted the allocation stack content, need to revoke all
   // thread-local allocation stacks.
   RevokeAllThreadLocalAllocationStacks(self);
-  VerifyObjectVisitor visitor(this);
+  VerifyObjectVisitor visitor(this, verify_referents);
   // Verify objects in the allocation stack since these will be objects which were:
   // 1. Allocated prior to the GC (pre GC verification).
   // 2. Allocated during the GC (pre sweep GC verification).
@@ -2399,7 +2310,9 @@
     // Swapping bound bitmaps does nothing.
     gc->SwapBitmaps();
     SwapSemiSpaces();
-    if (!VerifyHeapReferences()) {
+    // Pass in false since concurrent reference processing can mean that the reference referents
+    // may point to dead objects at the point at which PreSweepingGcVerification is called.
+    if (!VerifyHeapReferences(false)) {
       LOG(FATAL) << "Pre sweeping " << gc->GetName() << " GC verification failed";
     }
     SwapSemiSpaces();
@@ -2622,27 +2535,10 @@
   *object = soa.Decode<mirror::Object*>(arg.get());
 }
 
-void Heap::EnqueueClearedReferences() {
-  Thread* self = Thread::Current();
-  Locks::mutator_lock_->AssertNotHeld(self);
-  if (!cleared_references_.IsEmpty()) {
-    // When a runtime isn't started there are no reference queues to care about so ignore.
-    if (LIKELY(Runtime::Current()->IsStarted())) {
-      ScopedObjectAccess soa(self);
-      ScopedLocalRef<jobject> arg(self->GetJniEnv(),
-                                  soa.AddLocalReference<jobject>(cleared_references_.GetList()));
-      jvalue args[1];
-      args[0].l = arg.get();
-      InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_ReferenceQueue_add, args);
-    }
-    cleared_references_.Clear();
-  }
-}
-
 void Heap::RequestConcurrentGC(Thread* self) {
   // Make sure that we can do a concurrent GC.
   Runtime* runtime = Runtime::Current();
-  if (runtime == NULL || !runtime->IsFinishedStarting() || runtime->IsShuttingDown(self) ||
+  if (runtime == nullptr || !runtime->IsFinishedStarting() || runtime->IsShuttingDown(self) ||
       self->IsHandlingStackOverflow()) {
     return;
   }
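
Since verify_referents defaults to true, every existing VerifyHeapReferences() call keeps its old behavior and only the pre-sweeping check opts out. Illustrative call sites (not from the patch):

    heap->VerifyHeapReferences();       // default: referents are verified too
    heap->VerifyHeapReferences(false);  // pre-sweeping: referents may legitimately
                                        // be dead under concurrent processing
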
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 7a9ef1e..f71de1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -35,7 +35,7 @@
 #include "jni.h"
 #include "object_callbacks.h"
 #include "offsets.h"
-#include "reference_queue.h"
+#include "reference_processor.h"
 #include "safe_map.h"
 #include "thread_pool.h"
 #include "verify_object.h"
@@ -54,6 +54,9 @@
 }  // namespace mirror
 
 namespace gc {
+
+class ReferenceProcessor;
+
 namespace accounting {
   class HeapBitmap;
   class ModUnionTable;
@@ -215,7 +218,7 @@
 
   // Check sanity of all live references.
   void VerifyHeap() LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
-  bool VerifyHeapReferences()
+  bool VerifyHeapReferences(bool verify_referents = true)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
   bool VerifyMissingCardMarks()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
@@ -314,21 +317,6 @@
     return discontinuous_spaces_;
   }
 
-  static mirror::Object* PreserveSoftReferenceCallback(mirror::Object* obj, void* arg);
-  void ProcessSoftReferences(TimingLogger& timings, bool clear_soft,
-                             IsMarkedCallback* is_marked_callback,
-                             MarkObjectCallback* mark_object_callback,
-                             ProcessMarkStackCallback* process_mark_stack_callback, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-  void ProcessReferences(TimingLogger& timings, bool clear_soft,
-                         IsMarkedCallback* is_marked_callback,
-                         MarkObjectCallback* mark_object_callback,
-                         ProcessMarkStackCallback* process_mark_stack_callback,
-                         void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
   // Enable verification of object references when the runtime is sufficiently initialized.
   void EnableObjectValidation() {
     verify_object_mode_ = kVerifyObjectSupport;
@@ -565,6 +553,10 @@
   }
   bool HasImageSpace() const;
 
+  ReferenceProcessor* GetReferenceProcessor() {
+    return &reference_processor_;
+  }
+
  private:
   void Compact(space::ContinuousMemMapAllocSpace* target_space,
                space::ContinuousMemMapAllocSpace* source_space)
@@ -631,12 +623,6 @@
   bool IsValidContinuousSpaceObjectAddress(const mirror::Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void EnqueueClearedReferences();
-  // Returns true if the reference object has not yet been enqueued.
-  void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
-                              IsMarkedCallback is_marked_callback, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Run the finalizers.
   void RunFinalization(JNIEnv* env);
 
@@ -797,12 +783,8 @@
   Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_);
 
-  // Reference queues.
-  ReferenceQueue soft_reference_queue_;
-  ReferenceQueue weak_reference_queue_;
-  ReferenceQueue finalizer_reference_queue_;
-  ReferenceQueue phantom_reference_queue_;
-  ReferenceQueue cleared_references_;
+  // Reference processor.
+  ReferenceProcessor reference_processor_;
 
   // True while the garbage collector is running.
   volatile CollectorType collector_type_running_ GUARDED_BY(gc_complete_lock_);
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
new file mode 100644
index 0000000..a58df8e
--- /dev/null
+++ b/runtime/gc/reference_processor.cc
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "reference_processor.h"
+
+#include "mirror/object-inl.h"
+#include "mirror/reference-inl.h"
+#include "reflection.h"
+#include "ScopedLocalRef.h"
+#include "scoped_thread_state_change.h"
+#include "well_known_classes.h"
+
+namespace art {
+namespace gc {
+
+ReferenceProcessor::ReferenceProcessor()
+    : process_references_args_(nullptr, nullptr, nullptr), slow_path_enabled_(false),
+      preserving_references_(false), lock_("reference processor lock", kReferenceProcessorLock),
+      condition_("reference processor condition", lock_) {
+}
+
+void ReferenceProcessor::EnableSlowPath() {
+  Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
+  slow_path_enabled_ = true;
+}
+
+void ReferenceProcessor::DisableSlowPath(Thread* self) {
+  slow_path_enabled_ = false;
+  condition_.Broadcast(self);
+}
+
+mirror::Object* ReferenceProcessor::GetReferent(Thread* self, mirror::Reference* reference) {
+  mirror::Object* const referent = reference->GetReferent();
+  if (LIKELY(!slow_path_enabled_)) {
+    return referent;
+  }
+  // Another fast path: if the referent is already cleared, we can just return null since there
+  // is no scenario in which it becomes non-null again.
+  if (referent == nullptr) {
+    return nullptr;
+  }
+  MutexLock mu(self, lock_);
+  while (slow_path_enabled_) {
+    mirror::Object* const referent = reference->GetReferent();
+    // If the referent has been cleared, return null.
+    if (referent == nullptr) {
+      return nullptr;
+    }
+    // Try to see if the referent is already marked by using the is_marked_callback. We can
+    // return it to the mutator as long as the GC is not preserving references. If the GC is
+    // preserving references, the mutator could take a white referent and store it somewhere
+    // else in the heap, causing corruption since that field would still get swept.
+    IsMarkedCallback* const is_marked_callback = process_references_args_.is_marked_callback_;
+    if (!preserving_references_ && is_marked_callback != nullptr) {
+      mirror::Object* const obj = is_marked_callback(referent, process_references_args_.arg_);
+      // If it's null it means not marked, but the referent could still become marked if it is
+      // reachable from a finalizer. So we cannot return in this case and must block.
+      if (obj != nullptr) {
+        return obj;
+      }
+    }
+    condition_.WaitHoldingLocks(self);
+  }
+  return reference->GetReferent();
+}
+
+mirror::Object* ReferenceProcessor::PreserveSoftReferenceCallback(mirror::Object* obj, void* arg) {
+  auto* const args = reinterpret_cast<ProcessReferencesArgs*>(arg);
+  // TODO: Not preserve all soft references.
+  return args->mark_callback_(obj, args->arg_);
+}
+
+void ReferenceProcessor::StartPreservingReferences(Thread* self) {
+  MutexLock mu(self, lock_);
+  preserving_references_ = true;
+}
+
+void ReferenceProcessor::StopPreservingReferences(Thread* self) {
+  MutexLock mu(self, lock_);
+  preserving_references_ = false;
+  // We are done preserving references; waiters blocked in GetReferent may now see a marked
+  // referent.
+  condition_.Broadcast(self);
+}
+
+// Process reference class instances and schedule finalizations.
+void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timings,
+                                           bool clear_soft_references,
+                                           IsMarkedCallback* is_marked_callback,
+                                           MarkObjectCallback* mark_object_callback,
+                                           ProcessMarkStackCallback* process_mark_stack_callback,
+                                           void* arg) {
+  Thread* self = Thread::Current();
+  {
+    MutexLock mu(self, lock_);
+    process_references_args_.is_marked_callback_ = is_marked_callback;
+    process_references_args_.mark_callback_ = mark_object_callback;
+    process_references_args_.arg_ = arg;
+    CHECK_EQ(slow_path_enabled_, concurrent) << "Slow path must be enabled iff concurrent";
+  }
+  timings->StartSplit(concurrent ? "ProcessReferences" : "(Paused)ProcessReferences");
+  // Unless required to clear soft references with white references, preserve some white referents.
+  if (!clear_soft_references) {
+    TimingLogger::ScopedSplit split(concurrent ? "PreserveSomeSoftReferences" :
+        "(Paused)PreserveSomeSoftReferences", timings);
+    if (concurrent) {
+      StartPreservingReferences(self);
+    }
+    // References with a marked referent are removed from the list.
+    soft_reference_queue_.PreserveSomeSoftReferences(&PreserveSoftReferenceCallback,
+                                                     &process_references_args_);
+    process_mark_stack_callback(arg);
+    if (concurrent) {
+      StopPreservingReferences(self);
+    }
+  }
+  // Clear all remaining soft and weak references with white referents.
+  soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
+  weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
+  {
+    TimingLogger::ScopedSplit split(concurrent ? "EnqueueFinalizerReferences" :
+        "(Paused)EnqueueFinalizerReferences", timings);
+    if (concurrent) {
+      StartPreservingReferences(self);
+    }
+    // Preserve all white objects with finalize methods and schedule them for finalization.
+    finalizer_reference_queue_.EnqueueFinalizerReferences(cleared_references_, is_marked_callback,
+                                                          mark_object_callback, arg);
+    process_mark_stack_callback(arg);
+    if (concurrent) {
+      StopPreservingReferences(self);
+    }
+  }
+  // Clear all finalizer-referent-reachable soft and weak references with white referents.
+  soft_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
+  weak_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
+  // Clear all phantom references with white referents.
+  phantom_reference_queue_.ClearWhiteReferences(cleared_references_, is_marked_callback, arg);
+  // At this point all reference queues other than the cleared references should be empty.
+  DCHECK(soft_reference_queue_.IsEmpty());
+  DCHECK(weak_reference_queue_.IsEmpty());
+  DCHECK(finalizer_reference_queue_.IsEmpty());
+  DCHECK(phantom_reference_queue_.IsEmpty());
+  {
+    MutexLock mu(self, lock_);
+    // Need to always do this since the next GC may be concurrent. Doing this only for
+    // concurrent GCs could result in a stale is_marked_callback_ being called before reference
+    // processing starts, since there is a small window where the slow path is enabled but the
+    // callback isn't yet set.
+    process_references_args_.is_marked_callback_ = nullptr;
+    if (concurrent) {
+      // Done processing, disable the slow path and broadcast to the waiters.
+      DisableSlowPath(self);
+    }
+  }
+  timings->EndSplit();
+}
+
+// Process the "referent" field in a java.lang.ref.Reference.  If the referent has not yet been
+// marked, put it on the appropriate list in the heap for later processing.
+void ReferenceProcessor::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
+                                                IsMarkedCallback is_marked_callback, void* arg) {
+  // klass can be the class of the old object if the visitor already updated the class of ref.
+  DCHECK(klass->IsReferenceClass());
+  mirror::Object* referent = ref->GetReferent<kWithoutReadBarrier>();
+  if (referent != nullptr) {
+    mirror::Object* forward_address = is_marked_callback(referent, arg);
+    // Null means that the object is not currently marked.
+    if (forward_address == nullptr) {
+      Thread* self = Thread::Current();
+      // TODO: Remove these locks, and use atomic stacks for storing references?
+      // We need to check that the references haven't already been enqueued since we can end up
+      // scanning the same reference multiple times due to dirty cards.
+      if (klass->IsSoftReferenceClass()) {
+        soft_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
+      } else if (klass->IsWeakReferenceClass()) {
+        weak_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
+      } else if (klass->IsFinalizerReferenceClass()) {
+        finalizer_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
+      } else if (klass->IsPhantomReferenceClass()) {
+        phantom_reference_queue_.AtomicEnqueueIfNotEnqueued(self, ref);
+      } else {
+        LOG(FATAL) << "Invalid reference type " << PrettyClass(klass) << " " << std::hex
+                   << klass->GetAccessFlags();
+      }
+    } else if (referent != forward_address) {
+      // Referent is already marked and we need to update it.
+      ref->SetReferent<false>(forward_address);
+    }
+  }
+}
+
+void ReferenceProcessor::EnqueueClearedReferences() {
+  Thread* self = Thread::Current();
+  Locks::mutator_lock_->AssertNotHeld(self);
+  if (!cleared_references_.IsEmpty()) {
+    // When a runtime isn't started there are no reference queues to care about so ignore.
+    if (LIKELY(Runtime::Current()->IsStarted())) {
+      ScopedObjectAccess soa(self);
+      ScopedLocalRef<jobject> arg(self->GetJniEnv(),
+                                  soa.AddLocalReference<jobject>(cleared_references_.GetList()));
+      jvalue args[1];
+      args[0].l = arg.get();
+      InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_ReferenceQueue_add, args);
+    }
+    cleared_references_.Clear();
+  }
+}
+
+}  // namespace gc
+}  // namespace art
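
GetReferent above is a double-checked guarded wait: an unlocked fast-path read of slow_path_enabled_, then a re-check under lock_ in a condition-variable loop that can bail out early if the referent is provably safe. The same shape in portable C++, with standard-library types standing in for ART's Mutex and ConditionVariable (all names here are stand-ins, not ART API):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    class SlowPathGate {
     public:
      // Fast path: lock-free read (ART uses a volatile flag for the same purpose).
      bool Enabled() const { return enabled_.load(std::memory_order_acquire); }

      // Slow path: block until processing ends, unless try_safe() can prove a
      // value safe early (e.g. the referent is already marked).
      template <typename TrySafe>
      void* Wait(TrySafe try_safe) {
        std::unique_lock<std::mutex> mu(lock_);
        while (enabled_.load(std::memory_order_relaxed)) {
          if (void* safe = try_safe()) {
            return safe;
          }
          cond_.wait(mu);  // woken by the Broadcast in DisableSlowPath
        }
        return nullptr;  // caller re-reads the field once the slow path is off
      }

     private:
      std::mutex lock_;
      std::condition_variable cond_;
      std::atomic<bool> enabled_{false};
    };
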
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
new file mode 100644
index 0000000..f082a9e
--- /dev/null
+++ b/runtime/gc/reference_processor.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_REFERENCE_PROCESSOR_H_
+#define ART_RUNTIME_GC_REFERENCE_PROCESSOR_H_
+
+#include "base/mutex.h"
+#include "globals.h"
+#include "jni.h"
+#include "object_callbacks.h"
+#include "reference_queue.h"
+
+namespace art {
+
+class TimingLogger;
+
+namespace mirror {
+class Object;
+class Reference;
+}  // namespace mirror
+
+namespace gc {
+
+class Heap;
+
+// Used to process java.lang.References either concurrently or while the world is paused.
+class ReferenceProcessor {
+ public:
+  explicit ReferenceProcessor();
+  static mirror::Object* PreserveSoftReferenceCallback(mirror::Object* obj, void* arg);
+  void ProcessReferences(bool concurrent, TimingLogger* timings, bool clear_soft_references,
+                         IsMarkedCallback* is_marked_callback,
+                         MarkObjectCallback* mark_object_callback,
+                         ProcessMarkStackCallback* process_mark_stack_callback, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      LOCKS_EXCLUDED(lock_);
+  // Only allow setting this with mutators suspended so that we can avoid using a lock in the
+  // GetReferent fast path as an optimization.
+  void EnableSlowPath() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+  // Decode the referent; may block if references are being processed.
+  mirror::Object* GetReferent(Thread* self, mirror::Reference* reference)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(lock_);
+  void EnqueueClearedReferences() LOCKS_EXCLUDED(Locks::mutator_lock_);
+  void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* ref,
+                              IsMarkedCallback is_marked_callback, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+ private:
+  class ProcessReferencesArgs {
+   public:
+    ProcessReferencesArgs(IsMarkedCallback* is_marked_callback,
+                          MarkObjectCallback* mark_callback, void* arg)
+        : is_marked_callback_(is_marked_callback), mark_callback_(mark_callback), arg_(arg) {
+    }
+
+    // The is-marked callback is null when the args aren't set up.
+    IsMarkedCallback* is_marked_callback_;
+    MarkObjectCallback* mark_callback_;
+    void* arg_;
+  };
+  // Called by ProcessReferences.
+  void DisableSlowPath(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  // If we are preserving references it means that some dead objects may become live. We use
+  // start and stop preserving to block mutators calling GetReferent from getting access to
+  // these referents.
+  void StartPreservingReferences(Thread* self) LOCKS_EXCLUDED(lock_);
+  void StopPreservingReferences(Thread* self) LOCKS_EXCLUDED(lock_);
+  // Process args, used by GetReferent to return referents which are already marked.
+  ProcessReferencesArgs process_references_args_ GUARDED_BY(lock_);
+  // Boolean for whether or not we need to take the slow path in GetReferent.
+  volatile bool slow_path_enabled_;
+  // Boolean for whether or not we are preserving references (either soft references or finalizers).
+  // If this is true, then we cannot return a referent (see comment in GetReferent).
+  bool preserving_references_ GUARDED_BY(lock_);
+  // Lock that guards the reference processing.
+  Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  // Condition that callers wait on if they attempt to get the referent of a reference while
+  // processing is in progress.
+  ConditionVariable condition_ GUARDED_BY(lock_);
+  // Reference queues used by the GC.
+  ReferenceQueue soft_reference_queue_;
+  ReferenceQueue weak_reference_queue_;
+  ReferenceQueue finalizer_reference_queue_;
+  ReferenceQueue phantom_reference_queue_;
+  ReferenceQueue cleared_references_;
+};
+
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_REFERENCE_PROCESSOR_H_
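
The private Start/StopPreservingReferences pair is what makes the concurrent mode safe: while a preserving phase is active, GetReferent will not hand out even a marked referent, because the GC may still move white objects into the live set. Condensed from the .cc above, each phase is bracketed like this:

    if (concurrent) StartPreservingReferences(self);
    //   preserve soft references / enqueue finalizer references,
    //   then flush marking with process_mark_stack_callback(arg)
    if (concurrent) StopPreservingReferences(self);  // broadcasts to blocked waiters
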
diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc
index aee7891..caacef5 100644
--- a/runtime/gc/reference_queue.cc
+++ b/runtime/gc/reference_queue.cc
@@ -109,7 +109,7 @@
                                           void* arg) {
   while (!IsEmpty()) {
     mirror::Reference* ref = DequeuePendingReference();
-    mirror::Object* referent = ref->GetReferent();
+    mirror::Object* referent = ref->GetReferent<kWithoutReadBarrier>();
     if (referent != nullptr) {
       mirror::Object* forward_address = preserve_callback(referent, arg);
       if (forward_address == nullptr) {
@@ -131,17 +131,17 @@
 }
 
 void ReferenceQueue::EnqueueFinalizerReferences(ReferenceQueue& cleared_references,
-                                                IsMarkedCallback is_marked_callback,
-                                                MarkObjectCallback recursive_mark_callback,
+                                                IsMarkedCallback* is_marked_callback,
+                                                MarkObjectCallback* mark_object_callback,
                                                 void* arg) {
   while (!IsEmpty()) {
     mirror::FinalizerReference* ref = DequeuePendingReference()->AsFinalizerReference();
-    mirror::Object* referent = ref->GetReferent();
+    mirror::Object* referent = ref->GetReferent<kWithoutReadBarrier>();
     if (referent != nullptr) {
       mirror::Object* forward_address = is_marked_callback(referent, arg);
      // If the referent isn't marked, mark it and update the referent field.
       if (forward_address == nullptr) {
-        forward_address = recursive_mark_callback(referent, arg);
+        forward_address = mark_object_callback(referent, arg);
        // If the referent is non-null the reference must be queuable.
         DCHECK(ref->IsEnqueuable());
         // Move the updated referent to the zombie field.
@@ -160,11 +160,11 @@
   }
 }
 
-void ReferenceQueue::PreserveSomeSoftReferences(IsMarkedCallback preserve_callback, void* arg) {
+void ReferenceQueue::PreserveSomeSoftReferences(IsMarkedCallback* preserve_callback, void* arg) {
   ReferenceQueue cleared;
   while (!IsEmpty()) {
     mirror::Reference* ref = DequeuePendingReference();
-    mirror::Object* referent = ref->GetReferent();
+    mirror::Object* referent = ref->GetReferent<kWithoutReadBarrier>();
     if (referent != nullptr) {
       mirror::Object* forward_address = preserve_callback(referent, arg);
       if (forward_address == nullptr) {
@@ -180,4 +180,3 @@
 
 }  // namespace gc
 }  // namespace art
-
diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h
index 8d392ba..4f223e2 100644
--- a/runtime/gc/reference_queue.h
+++ b/runtime/gc/reference_queue.h
@@ -59,8 +59,8 @@
   // Enqueues finalizer references with white referents.  White referents are blackened, moved to the
   // zombie field, and the referent field is cleared.
   void EnqueueFinalizerReferences(ReferenceQueue& cleared_references,
-                                  IsMarkedCallback is_marked_callback,
-                                  MarkObjectCallback recursive_mark_callback, void* arg)
+                                  IsMarkedCallback* is_marked_callback,
+                                  MarkObjectCallback* mark_object_callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Walks the reference list marking any references subject to the reference clearing policy.
   // References with a black referent are removed from the list.  References with white referents
@@ -69,7 +69,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Unlink the reference list clearing references objects with white referents.  Cleared references
   // registered to a reference queue are scheduled for appending by the heap worker thread.
-  void ClearWhiteReferences(ReferenceQueue& cleared_references, IsMarkedCallback is_marked_callback,
+  void ClearWhiteReferences(ReferenceQueue& cleared_references,
+                            IsMarkedCallback* is_marked_callback,
                             void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void Dump(std::ostream& os) const
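
Assuming IsMarkedCallback is a function-type typedef in object_callbacks.h, as the surrounding code suggests, the signature changes in this header are purely cosmetic: C++ adjusts a function-typed parameter to a pointer anyway, so both spellings declare the same thing. For example:

    // Hypothetical shape of the typedef; the real declaration lives in
    // object_callbacks.h.
    typedef mirror::Object* IsMarkedCallback(mirror::Object* object, void* arg);

    // Equivalent declarations; the explicit '*' only makes the pointer visible.
    void Clear(IsMarkedCallback  is_marked_callback, void* arg);
    void Clear(IsMarkedCallback* is_marked_callback, void* arg);
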
diff --git a/runtime/indirect_reference_table-inl.h b/runtime/indirect_reference_table-inl.h
new file mode 100644
index 0000000..1a28347
--- /dev/null
+++ b/runtime/indirect_reference_table-inl.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INDIRECT_REFERENCE_TABLE_INL_H_
+#define ART_RUNTIME_INDIRECT_REFERENCE_TABLE_INL_H_
+
+#include "indirect_reference_table.h"
+
+#include "verify_object-inl.h"
+
+namespace art {
+namespace mirror {
+class Object;
+}  // namespace mirror
+
+// Verifies that the indirect table lookup is valid.
+// Returns "false" if something looks bad.
+inline bool IndirectReferenceTable::GetChecked(IndirectRef iref) const {
+  if (UNLIKELY(iref == nullptr)) {
+    LOG(WARNING) << "Attempt to look up NULL " << kind_;
+    return false;
+  }
+  if (UNLIKELY(GetIndirectRefKind(iref) == kSirtOrInvalid)) {
+    LOG(ERROR) << "JNI ERROR (app bug): invalid " << kind_ << " " << iref;
+    AbortIfNoCheckJNI();
+    return false;
+  }
+  const int topIndex = segment_state_.parts.topIndex;
+  int idx = ExtractIndex(iref);
+  if (UNLIKELY(idx >= topIndex)) {
+    LOG(ERROR) << "JNI ERROR (app bug): accessed stale " << kind_ << " "
+               << iref << " (index " << idx << " in a table of size " << topIndex << ")";
+    AbortIfNoCheckJNI();
+    return false;
+  }
+  if (UNLIKELY(table_[idx] == nullptr)) {
+    LOG(ERROR) << "JNI ERROR (app bug): accessed deleted " << kind_ << " " << iref;
+    AbortIfNoCheckJNI();
+    return false;
+  }
+  if (UNLIKELY(!CheckEntry("use", iref, idx))) {
+    return false;
+  }
+  return true;
+}
+
+// Make sure that the entry at "idx" is correctly paired with "iref".
+inline bool IndirectReferenceTable::CheckEntry(const char* what, IndirectRef iref, int idx) const {
+  const mirror::Object* obj = table_[idx];
+  IndirectRef checkRef = ToIndirectRef(obj, idx);
+  if (UNLIKELY(checkRef != iref)) {
+    LOG(ERROR) << "JNI ERROR (app bug): attempt to " << what
+               << " stale " << kind_ << " " << iref
+               << " (should be " << checkRef << ")";
+    AbortIfNoCheckJNI();
+    return false;
+  }
+  return true;
+}
+
+inline mirror::Object* IndirectReferenceTable::Get(IndirectRef iref) const {
+  if (!GetChecked(iref)) {
+    return kInvalidIndirectRefObject;
+  }
+  mirror::Object* obj = table_[ExtractIndex(iref)];
+  if (LIKELY(obj != kClearedJniWeakGlobal)) {
+    VerifyObject(obj);
+  }
+  return obj;
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_INDIRECT_REFERENCE_TABLE_INL_H_
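
Moving the checked lookup path into an -inl.h follows the usual ART header split: translation units on the hot path include the inline definitions, while everyone else keeps the cheaper declaration-only header. Illustrative include choice:

    // Hot paths (e.g. jni_internal.cc below) pay for inlining:
    #include "indirect_reference_table-inl.h"
    // Cold paths keep the lighter dependency:
    // #include "indirect_reference_table.h"
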
diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc
index 987df91..b81e43a 100644
--- a/runtime/indirect_reference_table.cc
+++ b/runtime/indirect_reference_table.cc
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "indirect_reference_table.h"
+#include "indirect_reference_table-inl.h"
+
 #include "jni_internal.h"
 #include "reference_table.h"
 #include "runtime.h"
@@ -53,7 +54,7 @@
   return os;
 }
 
-static void AbortMaybe() {
+void IndirectReferenceTable::AbortIfNoCheckJNI() {
   // If -Xcheck:jni is on, it'll give a more detailed error before aborting.
   if (!Runtime::Current()->GetJavaVM()->check_jni) {
     // Otherwise, we want to abort rather than hand back a bad reference.
@@ -67,12 +68,23 @@
   CHECK_LE(initialCount, maxCount);
   CHECK_NE(desiredKind, kSirtOrInvalid);
 
-  table_ = reinterpret_cast<mirror::Object**>(malloc(initialCount * sizeof(const mirror::Object*)));
-  CHECK(table_ != NULL);
-  memset(table_, 0xd1, initialCount * sizeof(const mirror::Object*));
+  std::string error_str;
+  const size_t initial_bytes = initialCount * sizeof(const mirror::Object*);
+  const size_t table_bytes = maxCount * sizeof(const mirror::Object*);
+  table_mem_map_.reset(MemMap::MapAnonymous("indirect ref table", nullptr, table_bytes,
+                                            PROT_READ | PROT_WRITE, false, &error_str));
+  CHECK(table_mem_map_.get() != nullptr) << error_str;
 
-  slot_data_ = reinterpret_cast<IndirectRefSlot*>(calloc(initialCount, sizeof(IndirectRefSlot)));
-  CHECK(slot_data_ != NULL);
+  table_ = reinterpret_cast<mirror::Object**>(table_mem_map_->Begin());
+  CHECK(table_ != nullptr);
+  memset(table_, 0xd1, initial_bytes);
+
+  const size_t slot_bytes = maxCount * sizeof(IndirectRefSlot);
+  slot_mem_map_.reset(MemMap::MapAnonymous("indirect ref table slots", nullptr, slot_bytes,
+                                           PROT_READ | PROT_WRITE, false, &error_str));
+  CHECK(slot_mem_map_.get() != nullptr) << error_str;
+  slot_data_ = reinterpret_cast<IndirectRefSlot*>(slot_mem_map_->Begin());
+  CHECK(slot_data_ != nullptr);
 
   segment_state_.all = IRT_FIRST_SEGMENT;
   alloc_entries_ = initialCount;
@@ -81,25 +93,6 @@
 }
 
 IndirectReferenceTable::~IndirectReferenceTable() {
-  free(table_);
-  free(slot_data_);
-  table_ = NULL;
-  slot_data_ = NULL;
-  alloc_entries_ = max_entries_ = -1;
-}
-
-// Make sure that the entry at "idx" is correctly paired with "iref".
-bool IndirectReferenceTable::CheckEntry(const char* what, IndirectRef iref, int idx) const {
-  const mirror::Object* obj = table_[idx];
-  IndirectRef checkRef = ToIndirectRef(obj, idx);
-  if (UNLIKELY(checkRef != iref)) {
-    LOG(ERROR) << "JNI ERROR (app bug): attempt to " << what
-               << " stale " << kind_ << " " << iref
-               << " (should be " << checkRef << ")";
-    AbortMaybe();
-    return false;
-  }
-  return true;
 }
 
 IndirectRef IndirectReferenceTable::Add(uint32_t cookie, mirror::Object* obj) {
@@ -127,20 +120,6 @@
     }
     DCHECK_GT(newSize, alloc_entries_);
 
-    table_ = reinterpret_cast<mirror::Object**>(realloc(table_, newSize * sizeof(mirror::Object*)));
-    slot_data_ = reinterpret_cast<IndirectRefSlot*>(realloc(slot_data_,
-                                                            newSize * sizeof(IndirectRefSlot)));
-    if (table_ == NULL || slot_data_ == NULL) {
-      LOG(FATAL) << "JNI ERROR (app bug): unable to expand "
-                 << kind_ << " table (from "
-                 << alloc_entries_ << " to " << newSize
-                 << ", max=" << max_entries_ << ")\n"
-                 << MutatorLockedDumpable<IndirectReferenceTable>(*this);
-    }
-
-    // Clear the newly-allocated slot_data_ elements.
-    memset(slot_data_ + alloc_entries_, 0, (newSize - alloc_entries_) * sizeof(IndirectRefSlot));
-
     alloc_entries_ = newSize;
   }
 
@@ -185,55 +164,6 @@
   }
 }
 
-// Verifies that the indirect table lookup is valid.
-// Returns "false" if something looks bad.
-bool IndirectReferenceTable::GetChecked(IndirectRef iref) const {
-  if (UNLIKELY(iref == NULL)) {
-    LOG(WARNING) << "Attempt to look up NULL " << kind_;
-    return false;
-  }
-  if (UNLIKELY(GetIndirectRefKind(iref) == kSirtOrInvalid)) {
-    LOG(ERROR) << "JNI ERROR (app bug): invalid " << kind_ << " " << iref;
-    AbortMaybe();
-    return false;
-  }
-
-  int topIndex = segment_state_.parts.topIndex;
-  int idx = ExtractIndex(iref);
-  if (UNLIKELY(idx >= topIndex)) {
-    LOG(ERROR) << "JNI ERROR (app bug): accessed stale " << kind_ << " "
-               << iref << " (index " << idx << " in a table of size " << topIndex << ")";
-    AbortMaybe();
-    return false;
-  }
-
-  if (UNLIKELY(table_[idx] == NULL)) {
-    LOG(ERROR) << "JNI ERROR (app bug): accessed deleted " << kind_ << " " << iref;
-    AbortMaybe();
-    return false;
-  }
-
-  if (UNLIKELY(!CheckEntry("use", iref, idx))) {
-    return false;
-  }
-
-  return true;
-}
-
-static int Find(mirror::Object* direct_pointer, int bottomIndex, int topIndex,
-                mirror::Object** table) {
-  for (int i = bottomIndex; i < topIndex; ++i) {
-    if (table[i] == direct_pointer) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-bool IndirectReferenceTable::ContainsDirectPointer(mirror::Object* direct_pointer) const {
-  return Find(direct_pointer, 0, segment_state_.parts.topIndex, table_) != -1;
-}
-
 // Removes an object. We extract the table offset bits from "iref"
 // and zap the corresponding entry, leaving a hole if it's not at the top.
 // If the entry is not between the current top index and the bottom index
@@ -346,15 +276,4 @@
   ReferenceTable::Dump(os, entries);
 }
 
-mirror::Object* IndirectReferenceTable::Get(IndirectRef iref) const {
-  if (!GetChecked(iref)) {
-    return kInvalidIndirectRefObject;
-  }
-  mirror::Object* obj = table_[ExtractIndex(iref)];;
-  if (obj != kClearedJniWeakGlobal) {
-    VerifyObject(obj);
-  }
-  return obj;
-}
-
 }  // namespace art
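
Backing the table with MemMaps sized for maxCount is what lets the whole realloc branch above disappear: the storage can never move, so "growing" is just raising alloc_entries_ over the same mapping. The core idea in plain POSIX terms (mmap standing in for MemMap::MapAnonymous, so this is a sketch of the idea rather than ART code):

    #include <sys/mman.h>
    #include <cstddef>

    // Reserve the maximum table size up front. Entries never relocate, which is
    // what later allows reads of the table without a lock.
    void* ReserveFixedTable(size_t max_entries, size_t entry_size) {
      void* mem = mmap(nullptr, max_entries * entry_size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      return (mem == MAP_FAILED) ? nullptr : mem;
    }
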
diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h
index a2de726..f365acc 100644
--- a/runtime/indirect_reference_table.h
+++ b/runtime/indirect_reference_table.h
@@ -24,6 +24,7 @@
 
 #include "base/logging.h"
 #include "base/mutex.h"
+#include "mem_map.h"
 #include "object_callbacks.h"
 #include "offsets.h"
 
@@ -72,7 +73,7 @@
  * To make everything fit nicely in 32-bit integers, the maximum size of
  * the table is capped at 64K.
  *
- * None of the table functions are synchronized.
+ * Only SynchronizedGet is synchronized.
  */
 
 /*
@@ -191,11 +192,6 @@
  * and local refs to improve performance.  A large circular buffer might
  * reduce the amortized cost of adding global references.
  *
- * TODO: if we can guarantee that the underlying storage doesn't move,
- * e.g. by using oversized mmap regions to handle expanding tables, we may
- * be able to avoid having to synchronize lookups.  Might make sense to
- * add a "synchronized lookup" call that takes the mutex as an argument,
- * and either locks or doesn't lock based on internal details.
  */
 union IRTSegmentState {
   uint32_t          all;
@@ -234,7 +230,7 @@
     }
   }
 
-  mirror::Object** table_;
+  mirror::Object** const table_;
   size_t i_;
   size_t capacity_;
 };
@@ -267,10 +263,15 @@
    *
    * Returns kInvalidIndirectRefObject if iref is invalid.
    */
-  mirror::Object* Get(IndirectRef iref) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  mirror::Object* Get(IndirectRef iref) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      ALWAYS_INLINE;
 
-  // TODO: remove when we remove work_around_app_jni_bugs support.
-  bool ContainsDirectPointer(mirror::Object* direct_pointer) const;
+  // Synchronized get which reads a reference, acquiring a lock if necessary.
+  mirror::Object* SynchronizedGet(Thread* /*self*/, ReaderWriterMutex* /*mutex*/,
+                                  IndirectRef iref) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return Get(iref);
+  }
 
   /*
    * Remove an existing entry.
@@ -351,6 +352,9 @@
     }
   }
 
+  // Abort if check_jni is not enabled.
+  static void AbortIfNoCheckJNI();
+
   /* extra debugging checks */
   bool GetChecked(IndirectRef) const;
   bool CheckEntry(const char*, IndirectRef, int) const;
@@ -358,6 +362,10 @@
   /* semi-public - read/write by jni down calls */
   IRTSegmentState segment_state_;
 
+  // Mem map where we store the indirect refs.
+  UniquePtr<MemMap> table_mem_map_;
+  // Mem map where we store the extended debugging info.
+  UniquePtr<MemMap> slot_mem_map_;
   /* bottom of the stack */
   mirror::Object** table_;
   /* bit mask, ORed into all irefs */
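
SynchronizedGet deliberately keeps a lock-aware signature even though it ignores both extra arguments today; if locking is ever needed again, call sites such as Thread::DecodeJObject (see jni_internal.h below) won't have to change. A hypothetical call site:

    // `self` and `&vm->globals_lock` are accepted but unused while the backing
    // storage is immovable.
    mirror::Object* obj = vm->globals.SynchronizedGet(self, &vm->globals_lock, iref);
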
diff --git a/runtime/indirect_reference_table_test.cc b/runtime/indirect_reference_table_test.cc
index 9b42e59..449817a 100644
--- a/runtime/indirect_reference_table_test.cc
+++ b/runtime/indirect_reference_table_test.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "indirect_reference_table.h"
+#include "indirect_reference_table-inl.h"
 
 #include "common_runtime_test.h"
 #include "mirror/object-inl.h"
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index bfbbbd6..1cea24b 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -59,7 +59,8 @@
 #endif
 
 enum InstructionFeatures {
-  kHwDiv = 1                  // Supports hardware divide.
+  kHwDiv  = 0x1,              // Supports hardware divide.
+  kHwLpae = 0x2,              // Supports Large Physical Address Extension.
 };
 
 // This is a bitmask of supported features per architecture.
@@ -78,6 +79,14 @@
     mask_ = (mask_ & ~kHwDiv) | (v ? kHwDiv : 0);
   }
 
+  bool HasLpae() const {
+    return (mask_ & kHwLpae) != 0;
+  }
+
+  void SetHasLpae(bool v) {
+    mask_ = (mask_ & ~kHwLpae) | (v ? kHwLpae : 0);
+  }
+
   std::string GetFeatureString() const;
 
   // Other features in here.
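
With the enum widened to distinct bits, each feature owns one flag, and the clear-then-set idiom in the setters leaves neighboring bits untouched. A sketch, assuming the divide accessors follow the pattern shown in this hunk and a zero-initialized mask:

    InstructionSetFeatures f;
    f.SetHasDivideInstruction(true);   // mask_ == 0x1 (kHwDiv)
    f.SetHasLpae(true);                // mask_ == 0x3 (kHwDiv | kHwLpae)
    f.SetHasDivideInstruction(false);  // mask_ == 0x2; the LPAE bit survives
    bool lpae_only = !f.HasDivideInstruction() && f.HasLpae();
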
diff --git a/runtime/jdwp/jdwp_adb.cc b/runtime/jdwp/jdwp_adb.cc
index cbf35be..fe91bb6 100644
--- a/runtime/jdwp/jdwp_adb.cc
+++ b/runtime/jdwp/jdwp_adb.cc
@@ -362,7 +362,7 @@
       }
 
       if (wake_pipe_[0] >= 0 && FD_ISSET(wake_pipe_[0], &readfds)) {
-        LOG(DEBUG) << "Got wake-up signal, bailing out of select";
+        VLOG(jdwp) << "Got wake-up signal, bailing out of select";
         goto fail;
       }
       if (control_sock_ >= 0 && FD_ISSET(control_sock_, &readfds)) {
@@ -385,7 +385,7 @@
           if (errno != EINTR) {
             goto fail;
           }
-          LOG(DEBUG) << "+++ EINTR hit";
+          VLOG(jdwp) << "+++ EINTR hit";
           return true;
         } else if (readCount == 0) {
           /* EOF hit -- far end went away */
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 223b7a1..4e2b0f8 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -280,7 +280,7 @@
   if (found) {
     Dbg::ManageDeoptimization();
   } else {
-    LOG(DEBUG) << StringPrintf("Odd: no match when removing event reqId=0x%04x", requestId);
+    LOG(WARNING) << StringPrintf("Odd: no match when removing event reqId=0x%04x", requestId);
   }
 }
 
diff --git a/runtime/jdwp/jdwp_main.cc b/runtime/jdwp/jdwp_main.cc
index f480256..2419ca6 100644
--- a/runtime/jdwp/jdwp_main.cc
+++ b/runtime/jdwp/jdwp_main.cc
@@ -573,7 +573,7 @@
  */
 int64_t JdwpState::LastDebuggerActivity() {
   if (!Dbg::IsDebuggerActive()) {
-    LOG(DEBUG) << "no active debugger";
+    LOG(WARNING) << "no active debugger";
     return -1;
   }
 
diff --git a/runtime/jdwp/jdwp_socket.cc b/runtime/jdwp/jdwp_socket.cc
index 3f5546e..4a80957 100644
--- a/runtime/jdwp/jdwp_socket.cc
+++ b/runtime/jdwp/jdwp_socket.cc
@@ -416,7 +416,7 @@
         if (listenSock >= 0) {
           LOG(ERROR) << "Exit wake set, but not exiting?";
         } else {
-          LOG(DEBUG) << "Got wake-up signal, bailing out of select";
+          VLOG(jdwp) << "Got wake-up signal, bailing out of select";
         }
         goto fail;
       }
@@ -442,7 +442,7 @@
           if (errno != EINTR) {
             goto fail;
           }
-          LOG(DEBUG) << "+++ EINTR hit";
+          VLOG(jdwp) << "+++ EINTR hit";
           return true;
         } else if (readCount == 0) {
           /* EOF hit -- far end went away */
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index e6a35d0..915f2c9 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -29,6 +29,7 @@
 #include "class_linker-inl.h"
 #include "dex_file-inl.h"
 #include "gc/accounting/card_table-inl.h"
+#include "indirect_reference_table-inl.h"
 #include "interpreter/interpreter.h"
 #include "jni.h"
 #include "mirror/art_field-inl.h"
diff --git a/runtime/jni_internal.h b/runtime/jni_internal.h
index ec911b2..cdf3c47 100644
--- a/runtime/jni_internal.h
+++ b/runtime/jni_internal.h
@@ -116,7 +116,8 @@
 
   // JNI global references.
   ReaderWriterMutex globals_lock DEFAULT_MUTEX_ACQUIRED_AFTER;
-  IndirectReferenceTable globals GUARDED_BY(globals_lock);
+  // Not guarded by globals_lock since we sometimes use SynchronizedGet in Thread::DecodeJObject.
+  IndirectReferenceTable globals;
 
   Mutex libraries_lock DEFAULT_MUTEX_ACQUIRED_AFTER;
   Libraries* libraries GUARDED_BY(libraries_lock);
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 14fc25c..778b9e5 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -987,9 +987,6 @@
     // Our local reference for the survivor is invalid because the survivor
     // gets a new local reference...
     EXPECT_EQ(JNIInvalidRefType, env_->GetObjectRefType(inner2));
-    // ...but the survivor should be in the local reference table.
-    JNIEnvExt* env = reinterpret_cast<JNIEnvExt*>(env_);
-    EXPECT_TRUE(env->locals.ContainsDirectPointer(inner2_direct_pointer));
 
     env_->PopLocalFrame(NULL);
   }
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index a556a1c..d454ae8 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -33,8 +33,14 @@
 namespace art {
 namespace mirror {
 
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline uint32_t Class::GetObjectSize() {
-  DCHECK(!IsVariableSize()) << " class=" << PrettyTypeOf(this);
+  if (kIsDebugBuild) {
+    // Use a local variable: (D)CHECK is a macro, so the comma between the
+    // two template params would be parsed as a macro argument separator.
+    bool is_variable_size = IsVariableSize<kVerifyFlags, kReadBarrierOption>();
+    CHECK(!is_variable_size) << " class=" << PrettyTypeOf(this);
+  }
   return GetField32(OFFSET_OF_OBJECT_MEMBER(Class, object_size_));
 }
 
@@ -514,6 +520,13 @@
   return this == ArtMethod::GetJavaLangReflectArtMethod<kReadBarrierOption>();
 }
 
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
+inline bool Class::IsClassClass() {
+  Class* java_lang_Class = GetClass<kVerifyFlags, kReadBarrierOption>()->
+      template GetClass<kVerifyFlags, kReadBarrierOption>();
+  return this == java_lang_Class;
+}
+
 }  // namespace mirror
 }  // namespace art
 
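
The local-variable dance in GetObjectSize above works around a general C++
macro limitation, not an ART quirk: the preprocessor splits macro arguments on
commas that are not protected by parentheses, and angle brackets do not
protect them. A standalone illustration with hypothetical names:

    #include <cassert>
    #define MY_CHECK(cond) assert(cond)

    template <int A, int B>
    bool TwoParams() { return A < B; }

    void Demo() {
      // MY_CHECK(TwoParams<1, 2>());  // error: the macro sees two arguments
      MY_CHECK((TwoParams<1, 2>()));   // extra parentheses work...
      bool ok = TwoParams<1, 2>();     // ...as does a local, as in the diff
      MY_CHECK(ok);
    }
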
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 662303e..ff63782 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -315,11 +315,6 @@
                          ClassHelper(klass2).GetDescriptor());
 }
 
-bool Class::IsClassClass() {
-  Class* java_lang_Class = GetClass()->GetClass();
-  return this == java_lang_Class;
-}
-
 bool Class::IsStringClass() const {
   return this == String::GetJavaLangString();
 }
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 00ecead..1f393db 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -366,6 +366,8 @@
     return GetComponentType<kVerifyFlags, kReadBarrierOption>() != NULL;
   }
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   bool IsClassClass() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   bool IsStringClass() const;
@@ -423,10 +425,13 @@
   Object* AllocNonMovableObject(Thread* self)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   bool IsVariableSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Classes and arrays vary in size, and so the object_size_ field cannot
     // be used to get their instance size
-    return IsClassClass() || IsArrayClass();
+    return IsClassClass<kVerifyFlags, kReadBarrierOption>() ||
+        IsArrayClass<kVerifyFlags, kReadBarrierOption>();
   }
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
@@ -443,6 +448,8 @@
   void SetClassSize(uint32_t new_class_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   uint32_t GetObjectSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void SetObjectSize(uint32_t new_object_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
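
For context on the IsVariableSize comment above, a simplified sketch of the
size dispatch (the names follow mirror::Object, but this is an illustration,
not the exact implementation):

    // Only fixed-size instances can use the per-class object_size_ field;
    // classes and arrays depend on per-object state.
    size_t SizeOfSketch(mirror::Object* obj) {
      if (obj->IsArrayInstance()) {
        return obj->AsArray()->SizeOf();        // grows with array length
      }
      if (obj->IsClass()) {
        return obj->AsClass()->SizeOf();        // grows with embedded tables
      }
      return obj->GetClass()->GetObjectSize();  // fixed per-class size
    }
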
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 064fe30..a2072a2 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -389,7 +389,8 @@
     result = AsClass<kNewFlags, kReadBarrierOption>()->
         template SizeOf<kNewFlags, kReadBarrierOption>();
   } else {
-    result = GetClass<kNewFlags, kReadBarrierOption>()->GetObjectSize();
+    result = GetClass<kNewFlags, kReadBarrierOption>()->
+        template GetObjectSize<kNewFlags, kReadBarrierOption>();
   }
   DCHECK_GE(result, sizeof(Object))
       << " class=" << PrettyTypeOf(GetClass<kNewFlags, kReadBarrierOption>());
diff --git a/runtime/mirror/reference.h b/runtime/mirror/reference.h
index cf65d20..0b6e759 100644
--- a/runtime/mirror/reference.h
+++ b/runtime/mirror/reference.h
@@ -42,8 +42,10 @@
     return OFFSET_OF_OBJECT_MEMBER(Reference, referent_);
   }
 
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   Object* GetReferent() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return GetFieldObjectVolatile<Object>(ReferentOffset());
+    return GetFieldObjectVolatile<Object, kDefaultVerifyFlags, kReadBarrierOption>(
+        ReferentOffset());
   }
   template<bool kTransactionActive>
   void SetReferent(Object* referent) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
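
A hypothetical pair of callers showing what the new template parameter on
GetReferent buys; kWithoutReadBarrier is the existing ReadBarrierOption value,
while the function names here are made up:

    // GC-internal code can inspect the referent without a read barrier.
    mirror::Object* PeekReferentForGc(mirror::Reference* ref) {
      return ref->GetReferent<kWithoutReadBarrier>();
    }

    // Ordinary callers keep the default, kWithReadBarrier.
    mirror::Object* PeekReferent(mirror::Reference* ref) {
      return ref->GetReferent();
    }
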
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index 1008491..f541633 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -58,12 +58,12 @@
     Runtime* runtime = Runtime::Current();
     JavaVMExt* vm = runtime->GetJavaVM();
     if (!vm->check_jni) {
-      LOG(DEBUG) << "Late-enabling -Xcheck:jni";
+      VLOG(jni) << "Late-enabling -Xcheck:jni";
       vm->SetCheckJniEnabled(true);
       // There's only one thread running at this point, so only one JNIEnv to fix up.
       Thread::Current()->GetJniEnv()->SetCheckJniEnabled(true);
     } else {
-      LOG(DEBUG) << "Not late-enabling -Xcheck:jni (already on)";
+      VLOG(jni) << "Not late-enabling -Xcheck:jni (already on)";
     }
     debug_flags &= ~DEBUG_ENABLE_CHECKJNI;
   }
diff --git a/runtime/native/java_lang_ref_Reference.cc b/runtime/native/java_lang_ref_Reference.cc
new file mode 100644
index 0000000..f221ac6
--- /dev/null
+++ b/runtime/native/java_lang_ref_Reference.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gc/heap.h"
+#include "gc/reference_processor.h"
+#include "jni_internal.h"
+#include "mirror/object-inl.h"
+#include "mirror/reference-inl.h"
+#include "scoped_fast_native_object_access.h"
+
+namespace art {
+
+static jobject Reference_get(JNIEnv* env, jobject javaThis) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::Reference* const ref = soa.Decode<mirror::Reference*>(javaThis);
+  mirror::Object* const referent =
+      Runtime::Current()->GetHeap()->GetReferenceProcessor()->GetReferent(soa.Self(), ref);
+  return soa.AddLocalReference<jobject>(referent);
+}
+
+static JNINativeMethod gMethods[] = {
+  NATIVE_METHOD(Reference, get, "!()Ljava/lang/Object;"),
+};
+
+void register_java_lang_ref_Reference(JNIEnv* env) {
+  REGISTER_NATIVE_METHODS("java/lang/ref/Reference");
+}
+
+}  // namespace art
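
The NATIVE_METHOD/REGISTER_NATIVE_METHODS macros above wrap plain JNI
registration, with the "!" signature prefix marking the method fast-native in
ART. Roughly equivalent raw-JNI code (a sketch, not the literal macro
expansion):

    static JNINativeMethod gGetMethod[] = {
      { "get", "!()Ljava/lang/Object;", reinterpret_cast<void*>(Reference_get) },
    };

    void RegisterSketch(JNIEnv* env) {
      jclass klass = env->FindClass("java/lang/ref/Reference");
      CHECK(klass != nullptr);
      int result = env->RegisterNatives(klass, gGetMethod,
                                        sizeof(gGetMethod) / sizeof(gGetMethod[0]));
      CHECK_EQ(result, JNI_OK);
    }
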
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 4b4b3d0..a1f4fd0 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '2', '3', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '2', '7', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 1562527..84ca23b 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -223,16 +223,17 @@
 
 //  gLogVerbosity.class_linker = true;  // TODO: don't check this in!
 //  gLogVerbosity.compiler = true;  // TODO: don't check this in!
-//  gLogVerbosity.verifier = true;  // TODO: don't check this in!
-//  gLogVerbosity.heap = true;  // TODO: don't check this in!
 //  gLogVerbosity.gc = true;  // TODO: don't check this in!
+//  gLogVerbosity.heap = true;  // TODO: don't check this in!
 //  gLogVerbosity.jdwp = true;  // TODO: don't check this in!
 //  gLogVerbosity.jni = true;  // TODO: don't check this in!
 //  gLogVerbosity.monitor = true;  // TODO: don't check this in!
+//  gLogVerbosity.profiler = true;  // TODO: don't check this in!
+//  gLogVerbosity.signals = true;  // TODO: don't check this in!
 //  gLogVerbosity.startup = true;  // TODO: don't check this in!
 //  gLogVerbosity.third_party_jni = true;  // TODO: don't check this in!
 //  gLogVerbosity.threads = true;  // TODO: don't check this in!
-//  gLogVerbosity.signals = true;  // TODO: don't check this in!
+//  gLogVerbosity.verifier = true;  // TODO: don't check this in!
 
   method_trace_ = false;
   method_trace_file_ = "/data/method-trace-file.bin";
@@ -254,7 +255,7 @@
 #ifdef HAVE_ANDROID_OS
   {
     char buf[PROP_VALUE_MAX];
-    property_get("dalvik.vm.implicit_checks", buf, "null,stack");
+    property_get("dalvik.vm.implicit_checks", buf, "none");
     std::string checks(buf);
     std::vector<std::string> checkvec;
     Split(checks, ',', checkvec);
@@ -445,28 +446,30 @@
       for (size_t i = 0; i < verbose_options.size(); ++i) {
         if (verbose_options[i] == "class") {
           gLogVerbosity.class_linker = true;
-        } else if (verbose_options[i] == "verifier") {
-          gLogVerbosity.verifier = true;
         } else if (verbose_options[i] == "compiler") {
           gLogVerbosity.compiler = true;
-        } else if (verbose_options[i] == "heap") {
-          gLogVerbosity.heap = true;
         } else if (verbose_options[i] == "gc") {
           gLogVerbosity.gc = true;
+        } else if (verbose_options[i] == "heap") {
+          gLogVerbosity.heap = true;
         } else if (verbose_options[i] == "jdwp") {
           gLogVerbosity.jdwp = true;
         } else if (verbose_options[i] == "jni") {
           gLogVerbosity.jni = true;
         } else if (verbose_options[i] == "monitor") {
           gLogVerbosity.monitor = true;
+        } else if (verbose_options[i] == "profiler") {
+          gLogVerbosity.profiler = true;
+        } else if (verbose_options[i] == "signals") {
+          gLogVerbosity.signals = true;
         } else if (verbose_options[i] == "startup") {
           gLogVerbosity.startup = true;
         } else if (verbose_options[i] == "third-party-jni") {
           gLogVerbosity.third_party_jni = true;
         } else if (verbose_options[i] == "threads") {
           gLogVerbosity.threads = true;
-        } else if (verbose_options[i] == "signals") {
-           gLogVerbosity.signals = true;
+        } else if (verbose_options[i] == "verifier") {
+          gLogVerbosity.verifier = true;
         } else {
           Usage("Unknown -verbose option %s\n", verbose_options[i].c_str());
           return false;
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 7b117f4..6e33f9d 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -147,7 +147,7 @@
 
       startup_delay = 0;
 
-      LOG(DEBUG) << "Delaying profile start for " << delay_secs << " secs";
+      VLOG(profiler) << "Delaying profile start for " << delay_secs << " secs";
       MutexLock mu(self, profiler->wait_lock_);
       profiler->period_condition_.TimedWait(self, delay_secs * 1000, 0);
 
@@ -167,7 +167,7 @@
     uint64_t end_us = start_us + profiler->duration_s_ * UINT64_C(1000000);
     uint64_t now_us = start_us;
 
-    LOG(DEBUG) << "Starting profiling run now for " << PrettyDuration((end_us - start_us) * 1000);
+    VLOG(profiler) << "Starting profiling run now for " << PrettyDuration((end_us - start_us) * 1000);
 
 
     SampleCheckpoint check_point(profiler);
@@ -221,7 +221,7 @@
       // After the profile has been taken, write it out.
       ScopedObjectAccess soa(self);   // Acquire the mutator lock.
       uint32_t size = profiler->WriteProfile();
-      LOG(DEBUG) << "Profile size: " << size;
+      VLOG(profiler) << "Profile size: " << size;
     }
   }
 
@@ -233,7 +233,7 @@
 // Write out the profile file if we are generating a profile.
 uint32_t BackgroundMethodSamplingProfiler::WriteProfile() {
   std::string full_name = profile_file_name_;
-  LOG(DEBUG) << "Saving profile to " << full_name;
+  VLOG(profiler) << "Saving profile to " << full_name;
 
   int fd = open(full_name.c_str(), O_RDWR);
   if (fd < 0) {
@@ -469,7 +469,7 @@
   num_null_methods_ += previous_num_null_methods_;
   num_boot_methods_ += previous_num_boot_methods_;
 
-  LOG(DEBUG) << "Profile: " << num_samples_ << "/" << num_null_methods_ << "/" << num_boot_methods_;
+  VLOG(profiler) << "Profile: " << num_samples_ << "/" << num_null_methods_ << "/" << num_boot_methods_;
   os << num_samples_ << "/" << num_null_methods_ << "/" << num_boot_methods_ << "\n";
   uint32_t num_methods = 0;
   for (int i = 0 ; i < kHashSize; i++) {
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index cbd51d4..d78be92 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -535,20 +535,9 @@
     GetInstrumentation()->ForceInterpretOnly();
   }
 
-  bool implicit_checks_supported = false;
-  switch (kRuntimeISA) {
-  case kArm:
-  case kThumb2:
-    implicit_checks_supported = true;
-    break;
-  default:
-    break;
-  }
-
-  if (implicit_checks_supported &&
-    (options->explicit_checks_ != (ParsedOptions::kExplicitSuspendCheck |
+  if (options->explicit_checks_ != (ParsedOptions::kExplicitSuspendCheck |
         ParsedOptions::kExplicitNullCheck |
-        ParsedOptions::kExplicitStackOverflowCheck) || kEnableJavaStackTraceHandler)) {
+        ParsedOptions::kExplicitStackOverflowCheck) || kEnableJavaStackTraceHandler) {
     fault_manager.Init();
 
    // These need to be in a specific order.  The null pointer check handler must be
@@ -744,6 +733,7 @@
   REGISTER(register_java_lang_System);
   REGISTER(register_java_lang_Thread);
   REGISTER(register_java_lang_VMClassLoader);
+  REGISTER(register_java_lang_ref_Reference);
   REGISTER(register_java_lang_reflect_Array);
   REGISTER(register_java_lang_reflect_Constructor);
   REGISTER(register_java_lang_reflect_Field);
@@ -1300,6 +1290,6 @@
 }
 
 void Runtime::UpdateProfilerState(int state) {
-  LOG(DEBUG) << "Profiler state updated to " << state;
+  VLOG(profiler) << "Profiler state updated to " << state;
 }
 }  // namespace art
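
For reference, the REGISTER list above follows the usual pattern of
forward-declaring and invoking each per-class registration function; a sketch
of the shape (assumed, simplified):

    // Sketch: each register_* function binds one class's natives.
    #define REGISTER(fn) extern void fn(JNIEnv*); fn(env)

    void RegisterRuntimeNatives(JNIEnv* env) {
      REGISTER(register_java_lang_ref_Reference);
      // ... one entry per class, kept sorted, as in the hunk above.
    }
    #undef REGISTER
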
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3a62cd5..00a66d7 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -44,6 +44,7 @@
 #include "gc/accounting/card_table-inl.h"
 #include "gc/heap.h"
 #include "gc/space/space.h"
+#include "indirect_reference_table-inl.h"
 #include "jni_internal.h"
 #include "mirror/art_field-inl.h"
 #include "mirror/art_method-inl.h"
@@ -1265,10 +1266,8 @@
       result = kInvalidIndirectRefObject;
     }
   } else if (kind == kGlobal) {
-    JavaVMExt* vm = Runtime::Current()->GetJavaVM();
-    IndirectReferenceTable& globals = vm->globals;
-    ReaderMutexLock mu(const_cast<Thread*>(this), vm->globals_lock);
-    result = const_cast<mirror::Object*>(globals.Get(ref));
+    JavaVMExt* const vm = Runtime::Current()->GetJavaVM();
+    result = vm->globals.SynchronizedGet(const_cast<Thread*>(this), &vm->globals_lock, ref);
   } else {
     DCHECK_EQ(kind, kWeakGlobal);
     result = Runtime::Current()->GetJavaVM()->DecodeWeakGlobal(const_cast<Thread*>(this), ref);
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 4eb580b..31d8d60 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -650,7 +650,7 @@
         // can happen if the debugger lets go while a SIGQUIT thread
         // dump event is pending (assuming SignalCatcher was resumed for
         // just long enough to try to grab the thread-suspend lock).
-        LOG(DEBUG) << *self << " still suspended after undo "
+        LOG(WARNING) << *self << " still suspended after undo "
                    << "(suspend count=" << self->GetSuspendCount() << ")";
       }
     }
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index bf1de86..41ff96e 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -3125,11 +3125,19 @@
     VLOG(verifier) << "Failed to get mirror::Class* from '" << actual_arg_type << "'";
     return nullptr;
   }
-  mirror::ObjectArray<mirror::ArtMethod>* vtable = actual_arg_type.GetClass()->GetVTable();
-  CHECK(vtable != nullptr) << PrettyDescriptor(actual_arg_type.GetClass());
+  mirror::ObjectArray<mirror::ArtMethod>* vtable = nullptr;
+  mirror::Class* klass = actual_arg_type.GetClass();
+  if (klass->IsInterface()) {
+    // Derive Object.class from Class.class.getSuperclass().
+    mirror::Class* object_klass = klass->GetClass()->GetSuperClass();
+    CHECK(object_klass->IsObjectClass());
+    vtable = object_klass->GetVTable();
+  } else {
+    vtable = klass->GetVTable();
+  }
+  CHECK(vtable != nullptr) << PrettyDescriptor(klass);
   uint16_t vtable_index = is_range ? inst->VRegB_3rc() : inst->VRegB_35c();
-  CHECK_LT(static_cast<int32_t>(vtable_index), vtable->GetLength())
-      << PrettyDescriptor(actual_arg_type.GetClass());
+  CHECK_LT(static_cast<int32_t>(vtable_index), vtable->GetLength()) << PrettyDescriptor(klass);
   mirror::ArtMethod* res_method = vtable->Get(vtable_index);
   CHECK(!Thread::Current()->IsExceptionPending());
   return res_method;
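
The verifier fix above handles a quickened invoke-virtual whose receiver
register is typed as an interface: interface classes carry no vtable of their
own, but every receiver is at least a java.lang.Object, and Object's class can
be recovered without a class-linker lookup because any Class object's class is
java.lang.Class, whose superclass is Object. Condensed, restating the hunk's
logic rather than adding behavior:

    // For any class K: K.getClass() == java.lang.Class, and
    // java.lang.Class.getSuperclass() == java.lang.Object.
    mirror::ObjectArray<mirror::ArtMethod>* VTableFor(mirror::Class* klass) {
      if (klass->IsInterface()) {
        return klass->GetClass()->GetSuperClass()->GetVTable();
      }
      return klass->GetVTable();
    }
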
diff --git a/test/Android.mk b/test/Android.mk
index 08a925c..aacd7b4 100644
--- a/test/Android.mk
+++ b/test/Android.mk
@@ -191,12 +191,13 @@
 ########################################################################
 
 TEST_ART_RUN_TEST_MAKE_TARGETS :=
+art_run_tests_dir := $(call intermediates-dir-for,PACKAGING,art-run-tests)/DATA
 
 # Helper to create individual build targets for tests.
 # Must be called with $(eval)
 # $(1): the test number
 define declare-make-art-run-test
-dmart_target := $(TARGET_OUT_DATA)/art-run-tests/$(1)/touch
+dmart_target := $(art_run_tests_dir)/art-run-tests/$(1)/touch
 $$(dmart_target): $(DX) $(HOST_OUT_EXECUTABLES)/jasmin
 	$(hide) rm -rf $$(dir $$@) && mkdir -p $$(dir $$@)
 	$(hide) DX=$(abspath $(DX)) JASMIN=$(abspath $(HOST_OUT_EXECUTABLES)/jasmin) $(LOCAL_PATH)/run-test --build-only --output-path $$(abspath $$(dir $$@)) $(1)
@@ -222,6 +223,9 @@
 LOCAL_MODULE_TAGS := tests
 LOCAL_MODULE := art-run-tests
 LOCAL_ADDITIONAL_DEPENDENCIES := $(TEST_ART_RUN_TEST_MAKE_TARGETS)
+# The build system uses this flag to pick up the files generated by declare-make-art-run-test.
+LOCAL_PICKUP_FILES := $(art_run_tests_dir)
+
 include $(BUILD_PHONY_PACKAGE)
 
 # clear temp vars
diff --git a/test/run-all-tests b/test/run-all-tests
index 3a991e9..885ee44 100755
--- a/test/run-all-tests
+++ b/test/run-all-tests
@@ -77,6 +77,9 @@
     elif [ "x$1" = "x-O" ]; then
         run_args="${run_args} -O"
         shift
+    elif [ "x$1" = "x--64" ]; then
+        run_args="${run_args} --64"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         usage="yes"
@@ -95,7 +98,7 @@
         echo "  Options are all passed to run-test; refer to that for " \
              "further documentation:"
         echo "    --debug --dev --host --interpreter --jvm --no-optimize"
-        echo "    --no-verify -O --update --valgrind --zygote"
+        echo "    --no-verify -O --update --valgrind --zygote --64"
         echo "  Specific Runtime Options:"
         echo "    --seq                Run tests one-by-one, avoiding failures caused by busy CPU"
     ) 1>&2