Revoke rosalloc thread-local buffers at the checkpoint.

In the mark sweep collector, rosalloc thread-local buffers used to be
revoked during the pause. For concurrent collections they are now
revoked at the thread checkpoint instead, which appears to help
reduce the pause time. Non-concurrent collections still revoke all
thread-local buffers during the pause.

In Ritz MemAllocTest, the average sticky pause time went down ~20%
(925 us -> 724 us).

Bug: 13394464
Bug: 9986565
Change-Id: I104992a11b46d59264c0b9aa2db82b1ccf2826bc
diff --git a/runtime/gc/accounting/remembered_set.cc b/runtime/gc/accounting/remembered_set.cc
index 72b9283..022d148 100644
--- a/runtime/gc/accounting/remembered_set.cc
+++ b/runtime/gc/accounting/remembered_set.cc
@@ -155,7 +155,8 @@
   for (const byte* card_addr : dirty_cards_) {
     auto start = reinterpret_cast<byte*>(card_table->AddrFromCard(card_addr));
     auto end = start + CardTable::kCardSize;
-    DCHECK(space_->Begin() <= start && end <= space_->Limit());
+    DCHECK_LE(space_->Begin(), start);
+    DCHECK_LE(end, space_->Limit());
   }
 }
 
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index ace9f9e..19fdc63 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1652,6 +1652,30 @@
   }
 }
 
+void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) {  // No-op unless kIsDebugBuild.
+  if (kIsDebugBuild) {
+    Thread* self = Thread::Current();
+    // Take the bulk free lock as a writer to avoid racing with BulkFree() (GC) on the bit maps.
+    WriterMutexLock wmu(self, bulk_free_lock_);
+    for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
+      MutexLock mu(self, *size_bracket_locks_[idx]);  // One lock per size bracket.
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+      DCHECK(thread_local_run == nullptr);  // Non-null would mean a run is still assigned.
+    }
+  }
+}
+
+void RosAlloc::AssertAllThreadLocalRunsAreRevoked() {  // No-op unless kIsDebugBuild.
+  if (kIsDebugBuild) {
+    MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+    MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
+    std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+    for (Thread* t : thread_list) {  // Iterates the by-value list taken above.
+      AssertThreadLocalRunsAreRevoked(t);  // Runs the per-thread check on each entry.
+    }
+  }
+}
+
 void RosAlloc::Initialize() {
   // Check the consistency of the number of size brackets.
   DCHECK_EQ(Thread::kRosAllocNumOfSizeBrackets, kNumOfSizeBrackets);
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 738d917..0b4b189 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -549,6 +549,10 @@
   void RevokeThreadLocalRuns(Thread* thread);
   // Releases the thread-local runs assigned to all the threads back to the common set of runs.
   void RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
+  // Assert the thread local runs of a thread are revoked.
+  void AssertThreadLocalRunsAreRevoked(Thread* thread);
+  // Assert all the thread local runs are revoked.
+  void AssertAllThreadLocalRunsAreRevoked() LOCKS_EXCLUDED(Locks::thread_list_lock_);
   // Dumps the page map for debugging.
   std::string DumpPageMap() EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index b190dab..65b5471 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -63,12 +63,6 @@
   total_freed_bytes_ = 0;
 }
 
-void GarbageCollector::RevokeAllThreadLocalBuffers() {
-  timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers");
-  GetHeap()->RevokeAllThreadLocalBuffers();
-  timings_.EndSplit();
-}
-
 void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) {
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
   Thread* self = Thread::Current();
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 2182430..93fd2ab 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -127,7 +127,8 @@
   // Called after the GC is finished. Done without mutators paused.
   virtual void FinishPhase() = 0;
 
-  void RevokeAllThreadLocalBuffers();
+  // Revoke all the thread-local buffers.
+  virtual void RevokeAllThreadLocalBuffers() = 0;
 
   static constexpr size_t kPauseBucketSize = 500;
   static constexpr size_t kPauseBucketCount = 32;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 9fe904c..579b781 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -89,6 +89,10 @@
 static constexpr bool kCheckLocks = kDebugLocking;
 static constexpr bool kVerifyRoots = kIsDebugBuild;
 
+// If true, revoke the rosalloc thread-local buffers at the
+// checkpoint, as opposed to during the pause.
+static constexpr bool kRevokeRosAllocThreadLocalBuffersAtCheckpoint = true;
+
 void MarkSweep::BindBitmaps() {
   timings_.StartSplit("BindBitmaps");
   WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
@@ -1028,6 +1032,9 @@
     if (kUseThreadLocalAllocationStack) {
       thread->RevokeThreadLocalAllocationStack();
     }
+    if (kRevokeRosAllocThreadLocalBuffersAtCheckpoint) {
+      mark_sweep_->GetHeap()->RevokeRosAllocThreadLocalBuffers(thread);
+    }
     mark_sweep_->GetBarrier().Pass(self);
   }
 
@@ -1360,6 +1367,19 @@
   large_objects->GetMarkObjects()->Clear();
 }
 
+void MarkSweep::RevokeAllThreadLocalBuffers() {
+  if (kRevokeRosAllocThreadLocalBuffersAtCheckpoint && IsConcurrent()) {
+    // If concurrent, rosalloc thread-local buffers are revoked at the
+    // thread checkpoint. Bump pointer space thread-local buffers must
+    // not be in use.
+    GetHeap()->AssertAllBumpPointerSpaceThreadLocalBuffersAreRevoked();
+  } else {  // Not revoking at the checkpoint, or not concurrent: revoke everything here.
+    timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers");  // Timed as its own split.
+    GetHeap()->RevokeAllThreadLocalBuffers();
+    timings_.EndSplit();
+  }
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index d88424d..b4dd8c7 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -338,6 +338,9 @@
   // IsExclusiveHeld.
   void RevokeAllThreadLocalAllocationStacks(Thread* self) NO_THREAD_SAFETY_ANALYSIS;
 
+  // Revoke all the thread-local buffers.
+  void RevokeAllThreadLocalBuffers();
+
   // Whether or not we count how many of each type of object were scanned.
   static const bool kCountScannedTypes = false;
 
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 5b9c397..565966a 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -858,6 +858,12 @@
   }
 }
 
+void SemiSpace::RevokeAllThreadLocalBuffers() {  // Always delegates to the heap.
+  timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers");  // Timed as its own split.
+  GetHeap()->RevokeAllThreadLocalBuffers();
+  timings_.EndSplit();
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 34cc1d3..7cc7f9b 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -246,6 +246,9 @@
 
   inline mirror::Object* GetForwardingAddressInFromSpace(mirror::Object* obj) const;
 
+  // Revoke all the thread-local buffers.
+  void RevokeAllThreadLocalBuffers();
+
   // Current space, we check this space first to avoid searching for the appropriate space for an
   // object.
   accounting::ObjectStack* mark_stack_;
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 2bab000..8d06673 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -2228,6 +2228,14 @@
   }
 }
 
+void Heap::AssertAllBumpPointerSpaceThreadLocalBuffersAreRevoked() {  // Debug builds only.
+  if (kIsDebugBuild) {
+    if (bump_pointer_space_ != nullptr) {  // Nothing to check without a bump pointer space.
+      bump_pointer_space_->AssertAllThreadLocalBuffersAreRevoked();
+    }
+  }
+}
+
 accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
   auto it = mod_union_tables_.find(space);
   if (it == mod_union_tables_.end()) {
@@ -2650,6 +2658,12 @@
   }
 }
 
+void Heap::RevokeRosAllocThreadLocalBuffers(Thread* thread) {  // Rosalloc space only.
+  if (rosalloc_space_ != nullptr) {  // No-op when the heap has no rosalloc space.
+    rosalloc_space_->RevokeThreadLocalBuffers(thread);
+  }
+}
+
 void Heap::RevokeAllThreadLocalBuffers() {
   if (rosalloc_space_ != nullptr) {
     rosalloc_space_->RevokeAllThreadLocalBuffers();
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 17c6f62..60b8450 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -437,7 +437,9 @@
   void Trim() LOCKS_EXCLUDED(heap_trim_request_lock_);
 
   void RevokeThreadLocalBuffers(Thread* thread);
+  void RevokeRosAllocThreadLocalBuffers(Thread* thread);
   void RevokeAllThreadLocalBuffers();
+  void AssertAllBumpPointerSpaceThreadLocalBuffersAreRevoked();
 
   void PreGcRosAllocVerification(TimingLogger* timings)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index fcd3b70..6148894 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -104,6 +104,26 @@
   }
 }
 
+void BumpPointerSpace::AssertThreadLocalBuffersAreRevoked(Thread* thread) {  // Debug only.
+  if (kIsDebugBuild) {
+    MutexLock mu(Thread::Current(), block_lock_);
+    DCHECK(!thread->HasTlab());  // |thread| must not have a TLAB at this point.
+  }
+}
+
+void BumpPointerSpace::AssertAllThreadLocalBuffersAreRevoked() {  // No-op unless kIsDebugBuild.
+  if (kIsDebugBuild) {
+    Thread* self = Thread::Current();
+    MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+    MutexLock mu2(self, *Locks::thread_list_lock_);
+    // TODO: Not do a copy of the thread list?
+    std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+    for (Thread* thread : thread_list) {
+      AssertThreadLocalBuffersAreRevoked(thread);  // Takes block_lock_ for each thread.
+    }
+  }
+}
+
 void BumpPointerSpace::UpdateMainBlock() {
   DCHECK_EQ(num_blocks_, 0U);
   main_block_size_ = Size();
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 031fccd..3ab5df4 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -103,6 +103,9 @@
   void RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(block_lock_);
   void RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
                                                     Locks::thread_list_lock_);
+  void AssertThreadLocalBuffersAreRevoked(Thread* thread) LOCKS_EXCLUDED(block_lock_);
+  void AssertAllThreadLocalBuffersAreRevoked() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
+                                                              Locks::thread_list_lock_);
 
   uint64_t GetBytesAllocated() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   uint64_t GetObjectsAllocated() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index fbb2afe..3c65205 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -308,6 +308,12 @@
   rosalloc_->RevokeAllThreadLocalRuns();
 }
 
+void RosAllocSpace::AssertAllThreadLocalBuffersAreRevoked() {  // Debug builds only.
+  if (kIsDebugBuild) {
+    rosalloc_->AssertAllThreadLocalRunsAreRevoked();  // Forwards to the allocator's check.
+  }
+}
+
 void RosAllocSpace::Clear() {
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
   GetLiveBitmap()->Clear();
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index 9b9adf8..949ec08 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -89,6 +89,7 @@
 
   void RevokeThreadLocalBuffers(Thread* thread);
   void RevokeAllThreadLocalBuffers();
+  void AssertAllThreadLocalBuffersAreRevoked();
 
   // Returns the class of a recently freed object.
   mirror::Class* FindRecentFreedObject(const mirror::Object* obj);