Change AtomicStack to use StackReference

Previously the stack stored Object* elements; using StackReference<Object>
instead saves memory on 64-bit devices, since each slot holds a 32-bit
reference rather than a full 64-bit pointer.
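
A minimal sketch of the idea (illustrative only; the real
art::StackReference lives in stack.h):

    #include <cstdint>

    // Stores a heap reference in 32 bits. Assumes managed heap
    // addresses fit in a uint32_t, which the runtime arranges on
    // 64-bit devices.
    template <typename T>
    class CompressedRef {
     public:
      T* AsMirrorPtr() const {
        return reinterpret_cast<T*>(static_cast<uintptr_t>(ref_));
      }
      void Assign(T* ptr) {
        ref_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(ptr));
      }
     private:
      uint32_t ref_;
    };

    // 4 bytes per stack slot instead of 8 for a raw T* on 64-bit.
    static_assert(sizeof(CompressedRef<int>) == 4, "compressed slot size");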

Bug: 12935052
Bug: 17643507
Change-Id: I035878690054eeeb24d655a900b8f26c837703ff
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 929a1d2..72734e9 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -25,15 +25,34 @@
 #include "base/logging.h"
 #include "base/macros.h"
 #include "mem_map.h"
+#include "stack.h"
 #include "utils.h"
 
 namespace art {
 namespace gc {
 namespace accounting {
 
+// Internal representation is StackReference<T>, so this only works with mirror::Object or its
+// subclasses.
 template <typename T>
 class AtomicStack {
  public:
+  class ObjectComparator {
+   public:
+    // These two comparators are for std::binary_search.
+    bool operator()(const T* a, const StackReference<T>& b) const NO_THREAD_SAFETY_ANALYSIS {
+      return a < b.AsMirrorPtr();
+    }
+    bool operator()(const StackReference<T>& a, const T* b) const NO_THREAD_SAFETY_ANALYSIS {
+      return a.AsMirrorPtr() < b;
+    }
+    // This comparator is for std::sort.
+    bool operator()(const StackReference<T>& a, const StackReference<T>& b) const
+        NO_THREAD_SAFETY_ANALYSIS {
+      return a.AsMirrorPtr() < b.AsMirrorPtr();
+    }
+  };
+
   // Capacity is how many elements we can store in the stack.
   static AtomicStack* Create(const std::string& name, size_t growth_limit, size_t capacity) {
     std::unique_ptr<AtomicStack> mark_stack(new AtomicStack(name, growth_limit, capacity));
@@ -45,7 +64,7 @@
 
   void Reset() {
     DCHECK(mem_map_.get() != nullptr);
-    DCHECK(begin_ != NULL);
+    DCHECK(begin_ != nullptr);
     front_index_.StoreRelaxed(0);
     back_index_.StoreRelaxed(0);
     debug_is_sorted_ = true;
@@ -55,18 +74,20 @@
   // Beware: Mixing atomic pushes and atomic pops will cause the ABA problem.
 
   // Returns false if we overflowed the stack.
-  bool AtomicPushBackIgnoreGrowthLimit(const T& value) {
+  bool AtomicPushBackIgnoreGrowthLimit(T* value) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return AtomicPushBackInternal(value, capacity_);
   }
 
   // Returns false if we overflowed the stack.
-  bool AtomicPushBack(const T& value) {
+  bool AtomicPushBack(T* value) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return AtomicPushBackInternal(value, growth_limit_);
   }
 
   // Atomically bump the back index by the given number of
   // slots. Returns false if we overflowed the stack.
-  bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+  bool AtomicBumpBack(size_t num_slots, StackReference<T>** start_address,
+                      StackReference<T>** end_address)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
     }
@@ -80,41 +101,41 @@
         return false;
       }
     } while (!back_index_.CompareExchangeWeakRelaxed(index, new_index));
-    *start_address = &begin_[index];
-    *end_address = &begin_[new_index];
+    *start_address = begin_ + index;
+    *end_address = begin_ + new_index;
     if (kIsDebugBuild) {
       // Sanity check that the memory is zero.
       for (int32_t i = index; i < new_index; ++i) {
-        DCHECK_EQ(begin_[i], static_cast<T>(0))
+        DCHECK_EQ(begin_[i].AsMirrorPtr(), static_cast<T*>(nullptr))
             << "i=" << i << " index=" << index << " new_index=" << new_index;
       }
     }
     return true;
   }
 
-  void AssertAllZero() {
+  void AssertAllZero() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (kIsDebugBuild) {
       for (size_t i = 0; i < capacity_; ++i) {
-        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+        DCHECK_EQ(begin_[i].AsMirrorPtr(), static_cast<T*>(nullptr)) << "i=" << i;
       }
     }
   }
 
-  void PushBack(const T& value) {
+  void PushBack(T* value) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
     }
-    int32_t index = back_index_.LoadRelaxed();
+    const int32_t index = back_index_.LoadRelaxed();
     DCHECK_LT(static_cast<size_t>(index), growth_limit_);
     back_index_.StoreRelaxed(index + 1);
-    begin_[index] = value;
+    begin_[index].Assign(value);
   }
 
-  T PopBack() {
+  T* PopBack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK_GT(back_index_.LoadRelaxed(), front_index_.LoadRelaxed());
     // Decrement the back index non atomically.
     back_index_.StoreRelaxed(back_index_.LoadRelaxed() - 1);
-    return begin_[back_index_.LoadRelaxed()];
+    return begin_[back_index_.LoadRelaxed()].AsMirrorPtr();
   }
 
   // Take an item from the front of the stack.
@@ -140,12 +161,11 @@
     return back_index_.LoadRelaxed() - front_index_.LoadRelaxed();
   }
 
-  T* Begin() const {
-    return const_cast<T*>(begin_ + front_index_.LoadRelaxed());
+  StackReference<T>* Begin() const {
+    return begin_ + front_index_.LoadRelaxed();
   }
-
-  T* End() const {
-    return const_cast<T*>(begin_ + back_index_.LoadRelaxed());
+  StackReference<T>* End() const {
+    return begin_ + back_index_.LoadRelaxed();
   }
 
   size_t Capacity() const {
@@ -162,7 +182,7 @@
   void Sort() {
     int32_t start_back_index = back_index_.LoadRelaxed();
     int32_t start_front_index = front_index_.LoadRelaxed();
-    std::sort(Begin(), End());
+    std::sort(Begin(), End(), ObjectComparator());
     CHECK_EQ(start_back_index, back_index_.LoadRelaxed());
     CHECK_EQ(start_front_index, front_index_.LoadRelaxed());
     if (kIsDebugBuild) {
@@ -170,13 +190,18 @@
     }
   }
 
-  bool ContainsSorted(const T& value) const {
+  bool ContainsSorted(const T* value) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(debug_is_sorted_);
-    return std::binary_search(Begin(), End(), value);
+    return std::binary_search(Begin(), End(), value, ObjectComparator());
   }
 
-  bool Contains(const T& value) const {
-    return std::find(Begin(), End(), value) != End();
+  bool Contains(const T* value) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (auto cur = Begin(), end = End(); cur != end; ++cur) {
+      if (cur->AsMirrorPtr() == value) {
+        return true;
+      }
+    }
+    return false;
   }
 
  private:
@@ -191,7 +216,8 @@
   }
 
   // Returns false if we overflowed the stack.
-  bool AtomicPushBackInternal(const T& value, size_t limit) ALWAYS_INLINE {
+  bool AtomicPushBackInternal(T* value, size_t limit) ALWAYS_INLINE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
     }
@@ -203,20 +229,20 @@
         return false;
       }
     } while (!back_index_.CompareExchangeWeakRelaxed(index, index + 1));
-    begin_[index] = value;
+    begin_[index].Assign(value);
     return true;
   }
 
   // Size in number of elements.
   void Init() {
     std::string error_msg;
-    mem_map_.reset(MemMap::MapAnonymous(name_.c_str(), NULL, capacity_ * sizeof(T),
+    mem_map_.reset(MemMap::MapAnonymous(name_.c_str(), NULL, capacity_ * sizeof(begin_[0]),
                                         PROT_READ | PROT_WRITE, false, &error_msg));
     CHECK(mem_map_.get() != NULL) << "couldn't allocate mark stack.\n" << error_msg;
     uint8_t* addr = mem_map_->Begin();
     CHECK(addr != NULL);
     debug_is_sorted_ = true;
-    begin_ = reinterpret_cast<T*>(addr);
+    begin_ = reinterpret_cast<StackReference<T>*>(addr);
     Reset();
   }
 
@@ -229,7 +255,7 @@
   // Front index, used for implementing PopFront.
   AtomicInteger front_index_;
   // Base of the atomic stack.
-  T* begin_;
+  StackReference<T>* begin_;
   // Current maximum which we can push back to, must be <= capacity_.
   size_t growth_limit_;
   // Maximum number of elements.
@@ -240,7 +266,7 @@
   DISALLOW_COPY_AND_ASSIGN(AtomicStack);
 };
 
-typedef AtomicStack<mirror::Object*> ObjectStack;
+typedef AtomicStack<mirror::Object> ObjectStack;
 
 }  // namespace accounting
 }  // namespace gc
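
Why ObjectComparator needs all three overloads: std::sort compares two
elements (StackReference<T> vs StackReference<T>), while
std::binary_search with a T* probe may invoke the comparator in either
argument order. A hedged sketch of the same pattern with plain types:

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct KeyCompare {
      // Both mixed orders can be called by std::binary_search when the
      // probe type (int) differs from the element type.
      bool operator()(int probe, const std::pair<int, char>& e) const {
        return probe < e.first;
      }
      bool operator()(const std::pair<int, char>& e, int probe) const {
        return e.first < probe;
      }
      // Homogeneous order used by std::sort.
      bool operator()(const std::pair<int, char>& a,
                      const std::pair<int, char>& b) const {
        return a.first < b.first;
      }
    };

    bool ContainsKey(std::vector<std::pair<int, char>>& v, int key) {
      std::sort(v.begin(), v.end(), KeyCompare());
      return std::binary_search(v.begin(), v.end(), key, KeyCompare());
    }
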
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 5fa3c8b..754e217 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -607,9 +607,9 @@
   // The alloc stack.
   {
     ConcurrentCopyingVerifyNoFromSpaceRefsVisitor ref_visitor(this);
-    for (mirror::Object** it = heap_->allocation_stack_->Begin(),
-             **end = heap_->allocation_stack_->End(); it < end; ++it) {
-      mirror::Object* obj = *it;
+    for (auto* it = heap_->allocation_stack_->Begin(), *end = heap_->allocation_stack_->End();
+        it < end; ++it) {
+      mirror::Object* const obj = it->AsMirrorPtr();
       if (obj != nullptr && obj->GetClass() != nullptr) {
         // TODO: need to call this only if obj is alive?
         ref_visitor(obj);
@@ -845,14 +845,14 @@
   // Objects on the allocation stack?
   if (ReadBarrier::kEnableReadBarrierInvariantChecks || kIsDebugBuild) {
     size_t count = GetAllocationStack()->Size();
-    mirror::Object** it = GetAllocationStack()->Begin();
-    mirror::Object** end = GetAllocationStack()->End();
+    auto* it = GetAllocationStack()->Begin();
+    auto* end = GetAllocationStack()->End();
     for (size_t i = 0; i < count; ++i, ++it) {
-      CHECK(it < end);
-      mirror::Object* obj = *it;
+      CHECK_LT(it, end);
+      mirror::Object* obj = it->AsMirrorPtr();
       if (obj != nullptr) {
         // Must have been cleared above.
-        CHECK(obj->GetReadBarrierPointer() == ReadBarrier::WhitePtr()) << obj;
+        CHECK_EQ(obj->GetReadBarrierPointer(), ReadBarrier::WhitePtr()) << obj;
       }
     }
   }
@@ -1446,10 +1446,7 @@
 bool ConcurrentCopying::IsOnAllocStack(mirror::Object* ref) {
   QuasiAtomic::ThreadFenceAcquire();
   accounting::ObjectStack* alloc_stack = GetAllocationStack();
-  mirror::Object** begin = alloc_stack->Begin();
-  // Important to read end once as it could be concurrently updated and screw up std::find().
-  mirror::Object** end = alloc_stack->End();
-  return std::find(begin, end, ref) != end;
+  return alloc_stack->Contains(ref);
 }
 
 mirror::Object* ConcurrentCopying::Mark(mirror::Object* from_ref) {
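
The deleted comment's concern still holds after the switch to
Contains(): the back index can advance concurrently, so the scan must
snapshot End() once. AtomicStack<T>::Contains does this by caching
End() in its loop initializer. The pattern in isolation (hypothetical
names):

    #include <atomic>

    // Scan a concurrently appended array; snapshot the end once so a
    // racing push cannot extend the range mid-scan.
    template <typename Ref, typename Ptr>
    bool ContainsSnapshot(Ref* begin, const std::atomic<Ref*>& end, Ptr value) {
      Ref* const snap = end.load(std::memory_order_relaxed);  // read once
      for (Ref* cur = begin; cur != snap; ++cur) {
        if (cur->AsMirrorPtr() == value) {
          return true;
        }
      }
      return false;
    }
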
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index d740bf2..e29e4a8 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -233,7 +233,7 @@
   void SetFwdPtr(mirror::Object* from_ref, mirror::Object* to_ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void FlipThreadRoots() LOCKS_EXCLUDED(Locks::mutator_lock_);
-  void SwapStacks(Thread* self);
+  void SwapStacks(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void RecordLiveStackFreezeSize(Thread* self);
   void ComputeUnevacFromSpaceLiveRatio();
 
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 18af005..ff3c893 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -274,11 +274,11 @@
 }
 
 void MarkCompact::ResizeMarkStack(size_t new_size) {
-  std::vector<Object*> temp(mark_stack_->Begin(), mark_stack_->End());
+  std::vector<StackReference<Object>> temp(mark_stack_->Begin(), mark_stack_->End());
   CHECK_LE(mark_stack_->Size(), new_size);
   mark_stack_->Resize(new_size);
-  for (const auto& obj : temp) {
-    mark_stack_->PushBack(obj);
+  for (auto& obj : temp) {
+    mark_stack_->PushBack(obj.AsMirrorPtr());
   }
 }
 
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index f6d473d..06304bf 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -46,7 +46,7 @@
 
 namespace accounting {
   template <typename T> class AtomicStack;
-  typedef AtomicStack<mirror::Object*> ObjectStack;
+  typedef AtomicStack<mirror::Object> ObjectStack;
 }  // namespace accounting
 
 namespace space {
@@ -156,13 +156,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Expand mark stack to 2x its current size.
-  void ResizeMarkStack(size_t new_size);
+  void ResizeMarkStack(size_t new_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns true if we should sweep the space.
   bool ShouldSweepSpace(space::ContinuousSpace* space) const;
 
   // Push an object onto the mark stack.
-  void MarkStackPush(mirror::Object* obj);
+  void MarkStackPush(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void UpdateAndMarkModUnion()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 04fb694..1959c09 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -330,11 +330,11 @@
     // Someone else acquired the lock and expanded the mark stack before us.
     return;
   }
-  std::vector<Object*> temp(mark_stack_->Begin(), mark_stack_->End());
+  std::vector<StackReference<Object>> temp(mark_stack_->Begin(), mark_stack_->End());
   CHECK_LE(mark_stack_->Size(), new_size);
   mark_stack_->Resize(new_size);
-  for (const auto& obj : temp) {
-    mark_stack_->PushBack(obj);
+  for (auto& obj : temp) {
+    mark_stack_->PushBack(obj.AsMirrorPtr());
   }
 }
 
@@ -554,7 +554,7 @@
 class MarkStackTask : public Task {
  public:
   MarkStackTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, size_t mark_stack_size,
-                Object** mark_stack)
+                StackReference<Object>* mark_stack)
       : mark_sweep_(mark_sweep),
         thread_pool_(thread_pool),
         mark_stack_pos_(mark_stack_size) {
@@ -627,11 +627,11 @@
   MarkSweep* const mark_sweep_;
   ThreadPool* const thread_pool_;
   // Thread local mark stack for this task.
-  Object* mark_stack_[kMaxSize];
+  StackReference<Object> mark_stack_[kMaxSize];
   // Mark stack position.
   size_t mark_stack_pos_;
 
-  void MarkStackPush(Object* obj) ALWAYS_INLINE {
+  ALWAYS_INLINE void MarkStackPush(Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (UNLIKELY(mark_stack_pos_ == kMaxSize)) {
       // Mark stack overflow, give 1/2 the stack to the thread pool as a new work task.
       mark_stack_pos_ /= 2;
@@ -641,7 +641,7 @@
     }
     DCHECK(obj != nullptr);
     DCHECK_LT(mark_stack_pos_, kMaxSize);
-    mark_stack_[mark_stack_pos_++] = obj;
+    mark_stack_[mark_stack_pos_++].Assign(obj);
   }
 
   virtual void Finalize() {
@@ -660,7 +660,7 @@
       Object* obj = nullptr;
       if (kUseMarkStackPrefetch) {
         while (mark_stack_pos_ != 0 && prefetch_fifo.size() < kFifoSize) {
-          Object* mark_stack_obj = mark_stack_[--mark_stack_pos_];
+          Object* const mark_stack_obj = mark_stack_[--mark_stack_pos_].AsMirrorPtr();
           DCHECK(mark_stack_obj != nullptr);
           __builtin_prefetch(mark_stack_obj);
           prefetch_fifo.push_back(mark_stack_obj);
@@ -674,7 +674,7 @@
         if (UNLIKELY(mark_stack_pos_ == 0)) {
           break;
         }
-        obj = mark_stack_[--mark_stack_pos_];
+        obj = mark_stack_[--mark_stack_pos_].AsMirrorPtr();
       }
       DCHECK(obj != nullptr);
       visitor(obj);
@@ -687,7 +687,7 @@
   CardScanTask(ThreadPool* thread_pool, MarkSweep* mark_sweep,
                accounting::ContinuousSpaceBitmap* bitmap,
                uint8_t* begin, uint8_t* end, uint8_t minimum_age, size_t mark_stack_size,
-               Object** mark_stack_obj, bool clear_card)
+               StackReference<Object>* mark_stack_obj, bool clear_card)
       : MarkStackTask<false>(thread_pool, mark_sweep, mark_stack_size, mark_stack_obj),
         bitmap_(bitmap),
         begin_(begin),
@@ -742,8 +742,8 @@
     TimingLogger::ScopedTiming t(paused ? "(Paused)ScanGrayObjects" : __FUNCTION__,
         GetTimings());
     // Try to take some of the mark stack since we can pass this off to the worker tasks.
-    Object** mark_stack_begin = mark_stack_->Begin();
-    Object** mark_stack_end = mark_stack_->End();
+    StackReference<Object>* mark_stack_begin = mark_stack_->Begin();
+    StackReference<Object>* mark_stack_end = mark_stack_->End();
     const size_t mark_stack_size = mark_stack_end - mark_stack_begin;
     // Estimated number of work tasks we will create.
     const size_t mark_stack_tasks = GetHeap()->GetContinuousSpaces().size() * thread_count;
@@ -954,9 +954,9 @@
 
 void MarkSweep::VerifyIsLive(const Object* obj) {
   if (!heap_->GetLiveBitmap()->Test(obj)) {
-    accounting::ObjectStack* allocation_stack = heap_->allocation_stack_.get();
-    CHECK(std::find(allocation_stack->Begin(), allocation_stack->End(), obj) !=
-        allocation_stack->End()) << "Found dead object " << obj << "\n" << heap_->DumpSpaces();
+    // TODO: Consider live stack? Has this code bitrotted?
+    CHECK(!heap_->allocation_stack_->Contains(obj))
+        << "Found dead object " << obj << "\n" << heap_->DumpSpaces();
   }
 }
 
@@ -1025,7 +1025,7 @@
   ObjectBytePair freed;
   ObjectBytePair freed_los;
   // How many objects are left in the array, modified after each space is swept.
-  Object** objects = allocations->Begin();
+  StackReference<Object>* objects = allocations->Begin();
   size_t count = allocations->Size();
   // Change the order to ensure that the non-moving space is swept last as an optimization.
   std::vector<space::ContinuousSpace*> sweep_spaces;
@@ -1053,9 +1053,9 @@
     if (swap_bitmaps) {
       std::swap(live_bitmap, mark_bitmap);
     }
-    Object** out = objects;
+    StackReference<Object>* out = objects;
     for (size_t i = 0; i < count; ++i) {
-      Object* obj = objects[i];
+      Object* const obj = objects[i].AsMirrorPtr();
       if (kUseThreadLocalAllocationStack && obj == nullptr) {
         continue;
       }
@@ -1072,7 +1072,7 @@
           chunk_free_buffer[chunk_free_pos++] = obj;
         }
       } else {
-        *(out++) = obj;
+        (out++)->Assign(obj);
       }
     }
     if (chunk_free_pos > 0) {
@@ -1094,7 +1094,7 @@
       std::swap(large_live_objects, large_mark_objects);
     }
     for (size_t i = 0; i < count; ++i) {
-      Object* obj = objects[i];
+      Object* const obj = objects[i].AsMirrorPtr();
       // Handle large objects.
       if (kUseThreadLocalAllocationStack && obj == nullptr) {
         continue;
@@ -1195,7 +1195,7 @@
                                      static_cast<size_t>(MarkStackTask<false>::kMaxSize));
   CHECK_GT(chunk_size, 0U);
   // Split the current mark stack up into work tasks.
-  for (mirror::Object **it = mark_stack_->Begin(), **end = mark_stack_->End(); it < end; ) {
+  for (auto* it = mark_stack_->Begin(), *end = mark_stack_->End(); it < end; ) {
     const size_t delta = std::min(static_cast<size_t>(end - it), chunk_size);
     thread_pool->AddTask(self, new MarkStackTask<false>(thread_pool, this, delta, it));
     it += delta;
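
The loop above carves the shared mark stack into fixed-size windows,
one MarkStackTask per window, so workers drain disjoint slices without
contending. The same chunking pattern in isolation (illustrative
helper, not an ART API):

    #include <algorithm>
    #include <cstddef>

    // Split [begin, end) into pieces of at most chunk_size elements
    // and hand each piece to add_task.
    template <typename Ref, typename AddTask>
    void SplitIntoTasks(Ref* begin, Ref* end, size_t chunk_size, AddTask add_task) {
      for (Ref* it = begin; it < end; ) {
        const size_t delta = std::min(static_cast<size_t>(end - it), chunk_size);
        add_task(it, delta);  // the worker consumes [it, it + delta)
        it += delta;
      }
    }
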
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index b787327..3f99e21 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -47,7 +47,7 @@
 
 namespace accounting {
   template<typename T> class AtomicStack;
-  typedef AtomicStack<mirror::Object*> ObjectStack;
+  typedef AtomicStack<mirror::Object> ObjectStack;
 }  // namespace accounting
 
 namespace collector {
@@ -136,7 +136,8 @@
 
   // Sweeps unmarked objects to complete the garbage collection. Virtual as by default it sweeps
   // all allocation spaces. Partial and sticky GCs want to just sweep a subset of the heap.
-  virtual void Sweep(bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  virtual void Sweep(bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Sweeps unmarked objects to complete the garbage collection.
   void SweepLargeObjects(bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
@@ -162,13 +163,14 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
 
   static mirror::Object* VerifySystemWeakIsLiveCallback(mirror::Object* obj, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   void VerifySystemWeaks()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
   // Verify that an object is live, either in a live bitmap or in the allocation stack.
   void VerifyIsLive(const mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   static mirror::Object* MarkObjectCallback(mirror::Object* obj, void* arg)
@@ -223,11 +225,12 @@
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   void MarkObjectNonNull(mirror::Object* obj)
-        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-        EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Marks an object atomically, safe to use from multiple threads.
-  void MarkObjectNonNullParallel(mirror::Object* obj);
+  void MarkObjectNonNullParallel(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns true if we need to add obj to a mark stack.
   bool MarkObjectParallel(const mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
@@ -238,8 +241,10 @@
       NO_THREAD_SAFETY_ANALYSIS;
 
   // Expand mark stack to 2x its current size.
-  void ExpandMarkStack() EXCLUSIVE_LOCKS_REQUIRED(mark_stack_lock_);
-  void ResizeMarkStack(size_t new_size) EXCLUSIVE_LOCKS_REQUIRED(mark_stack_lock_);
+  void ExpandMarkStack() EXCLUSIVE_LOCKS_REQUIRED(mark_stack_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ResizeMarkStack(size_t new_size) EXCLUSIVE_LOCKS_REQUIRED(mark_stack_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns how many threads we should use for the current GC phase based on if we are paused,
   // whether or not we care about pauses.
@@ -250,7 +255,7 @@
   void VerifyRoot(const mirror::Object* root, const RootInfo& root_info) NO_THREAD_SAFETY_ANALYSIS;
 
   // Push a single reference on a mark stack.
-  void PushOnMarkStack(mirror::Object* obj);
+  void PushOnMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Blackens objects grayed during a garbage collection.
   void ScanGrayObjects(bool paused, uint8_t minimum_age)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index fcc601f..8660eff 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -421,11 +421,11 @@
 }
 
 void SemiSpace::ResizeMarkStack(size_t new_size) {
-  std::vector<Object*> temp(mark_stack_->Begin(), mark_stack_->End());
+  std::vector<StackReference<Object>> temp(mark_stack_->Begin(), mark_stack_->End());
   CHECK_LE(mark_stack_->Size(), new_size);
   mark_stack_->Resize(new_size);
-  for (const auto& obj : temp) {
-    mark_stack_->PushBack(obj);
+  for (auto& obj : temp) {
+    mark_stack_->PushBack(obj.AsMirrorPtr());
   }
 }
 
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index f8fced8..192fb14 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -45,7 +45,7 @@
 
 namespace accounting {
   template <typename T> class AtomicStack;
-  typedef AtomicStack<mirror::Object*> ObjectStack;
+  typedef AtomicStack<mirror::Object> ObjectStack;
 }  // namespace accounting
 
 namespace space {
@@ -178,13 +178,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Expand mark stack to 2x its current size.
-  void ResizeMarkStack(size_t new_size);
+  void ResizeMarkStack(size_t new_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns true if we should sweep the space.
   virtual bool ShouldSweepSpace(space::ContinuousSpace* space) const;
 
   // Push an object onto the mark stack.
-  void MarkStackPush(mirror::Object* obj);
+  void MarkStackPush(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void UpdateAndMarkModUnion()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index ba06e05..9225a09 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -787,9 +787,8 @@
     bump_pointer_space_->Walk(callback, arg);
   }
   // TODO: Switch to standard begin and end to use a range-based loop.
-  for (mirror::Object** it = allocation_stack_->Begin(), **end = allocation_stack_->End();
-      it < end; ++it) {
-    mirror::Object* obj = *it;
+  for (auto* it = allocation_stack_->Begin(), *end = allocation_stack_->End(); it < end; ++it) {
+    mirror::Object* const obj = it->AsMirrorPtr();
     if (obj != nullptr && obj->GetClass() != nullptr) {
       // Avoid the race condition caused by the object not yet being written into the allocation
       // stack or the class not yet being written in the object. Or, if
@@ -2139,9 +2138,9 @@
                           accounting::ObjectStack* stack) {
   DCHECK(bitmap1 != nullptr);
   DCHECK(bitmap2 != nullptr);
-  mirror::Object** limit = stack->End();
-  for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
-    const mirror::Object* obj = *it;
+  const auto* limit = stack->End();
+  for (auto* it = stack->Begin(); it != limit; ++it) {
+    const mirror::Object* obj = it->AsMirrorPtr();
     if (!kUseThreadLocalAllocationStack || obj != nullptr) {
       if (bitmap1->HasAddress(obj)) {
         bitmap1->Set(obj);
@@ -2538,8 +2537,8 @@
 void Heap::PushOnThreadLocalAllocationStackWithInternalGC(Thread* self, mirror::Object** obj) {
   // Slow path, the allocation stack push back must have already failed.
   DCHECK(!self->PushOnThreadLocalAllocationStack(*obj));
-  mirror::Object** start_address;
-  mirror::Object** end_address;
+  StackReference<mirror::Object>* start_address;
+  StackReference<mirror::Object>* end_address;
   while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize, &start_address,
                                             &end_address)) {
     // TODO: Add handle VerifyObject.
@@ -2698,9 +2697,9 @@
   VerifyLiveStackReferences visitor(this);
   GetLiveBitmap()->Visit(visitor);
   // We can verify objects in the live stack since none of these should reference dead objects.
-  for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-    if (!kUseThreadLocalAllocationStack || *it != nullptr) {
-      visitor(*it);
+  for (auto* it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
+    if (!kUseThreadLocalAllocationStack || it->AsMirrorPtr() != nullptr) {
+      visitor(it->AsMirrorPtr());
     }
   }
   return !visitor.Failed();
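
On the AtomicBumpBack change earlier in this file: the heap reserves a
window of kThreadLocalAllocationStackSize StackReference slots per
thread, which that thread then fills without further atomic
operations. A hedged sketch of consuming such a window (stack and obj
are placeholders):

    StackReference<mirror::Object>* start_address;
    StackReference<mirror::Object>* end_address;
    if (stack->AtomicBumpBack(kThreadLocalAllocationStackSize,
                              &start_address, &end_address)) {
      // Only this thread writes [start_address, end_address), so plain
      // stores suffice for each push.
      start_address->Assign(obj);
      ++start_address;
    }
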
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index b0b53b0..2a0b466 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -515,6 +515,7 @@
 
   // Mark and empty stack.
   void FlushAllocStack()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Revoke all the thread-local allocation stacks.
@@ -528,10 +529,12 @@
                       accounting::SpaceBitmap<kObjectAlignment>* bitmap2,
                       accounting::SpaceBitmap<kLargeObjectAlignment>* large_objects,
                       accounting::ObjectStack* stack)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Mark the specified allocation stack as live.
   void MarkAllocStackAsLive(accounting::ObjectStack* stack)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Unbind any bound bitmaps.
@@ -818,7 +821,7 @@
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks(Thread* self);
+  void SwapStacks(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger* timings, bool use_rem_sets);