Add thread unsafe allocation methods to spaces.

Used by the SS/GSS collectors, since these run with mutators suspended and
only allocate from a single thread. Added AllocThreadUnsafe to
BumpPointerSpace and RosAllocSpace. Added RosAlloc::AllocFromRunThreadUnsafe,
which uses the current runs as if they were thread-local runs for
thread-unsafe allocation. Added code to revoke current runs which share
an index with the thread-local runs.
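
As an illustrative sketch (not part of the patch): with the world
stopped, the collector holds the mutator lock exclusively, so
AllocThreadUnsafe can forward to RosAlloc::Alloc<false> and skip
locking. The promotion path in semi_space.cc then looks roughly like:

  size_t bytes_allocated;
  // No locks needed: mutators are suspended, only the GC thread allocates.
  mirror::Object* forward_address =
      promo_dest_space->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
  if (UNLIKELY(forward_address == nullptr)) {
    // Out of space in the promo space; fall back to the to-space.
    forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
  }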

Changed:
The number of thread-local runs in each thread is now
kNumThreadLocalSizeBrackets, defined in RosAlloc, instead of the total
number of size brackets.
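
For reference, a simplified sketch of the resulting dispatch in
RosAlloc::AllocFromRun (refill and revocation handling omitted):

  size_t bracket_size;
  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
  void* slot_addr;
  if (idx < kNumThreadLocalSizeBrackets) {
    // Lock-free fast path: the run cached in the thread's rosalloc_runs[idx].
    Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
    slot_addr = thread_local_run->AllocSlot();
  } else {
    // Shared path: the current run for the bracket, under the per-bracket lock.
    MutexLock mu(self, *size_bracket_locks_[idx]);
    slot_addr = AllocFromCurrentRunUnlocked(self, idx);
  }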

Total GC time / time on EvaluateAndApplyChanges:
TLAB SS:
Before: 36.7s / 7254
After: 16.1s / 4837

TLAB GSS:
Before: 6.9s / 3973
After: 5.7s / 3778

Bug: 8981901

Change-Id: Id1d264ade3799f431bf7ebbdcca6146aefbeb632
diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h
index ac0f67b..c69ca48 100644
--- a/runtime/gc/allocator/rosalloc-inl.h
+++ b/runtime/gc/allocator/rosalloc-inl.h
@@ -23,11 +23,17 @@
 namespace gc {
 namespace allocator {
 
+template<bool kThreadSafe>
 inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* bytes_allocated) {
   if (UNLIKELY(size > kLargeSizeThreshold)) {
     return AllocLargeObject(self, size, bytes_allocated);
   }
-  void* m = AllocFromRun(self, size, bytes_allocated);
+  void* m;
+  if (kThreadSafe) {
+    m = AllocFromRun(self, size, bytes_allocated);
+  } else {
+    m = AllocFromRunThreadUnsafe(self, size, bytes_allocated);
+  }
   // Check if the returned memory is really all zero.
   if (kCheckZeroMemory && m != nullptr) {
     byte* bytes = reinterpret_cast<byte*>(m);
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index ff59016..f113030 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -67,11 +67,11 @@
              << std::hex << (intptr_t)(base_ + capacity_)
              << ", capacity=" << std::dec << capacity_
              << ", max_capacity=" << std::dec << max_capacity_;
-  memset(current_runs_, 0, sizeof(current_runs_));
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     size_bracket_lock_names[i] =
         StringPrintf("an rosalloc size bracket %d lock", static_cast<int>(i));
     size_bracket_locks_[i] = new Mutex(size_bracket_lock_names[i].c_str(), kRosAllocBracketLock);
+    current_runs_[i] = dedicated_full_run_;
   }
   DCHECK_EQ(footprint_, capacity_);
   size_t num_of_pages = footprint_ / kPageSize;
@@ -548,7 +548,7 @@
     DCHECK(!new_run->IsThreadLocal());
     DCHECK_EQ(new_run->first_search_vec_idx_, 0U);
     DCHECK(!new_run->to_be_bulk_freed_);
-    if (kUsePrefetchDuringAllocRun && idx <= kMaxThreadLocalSizeBracketIdx) {
+    if (kUsePrefetchDuringAllocRun && idx < kNumThreadLocalSizeBrackets) {
       // Take ownership of the cache lines if we are likely to be a thread-local run.
       if (kPrefetchNewRunDataByZeroing) {
         // Zeroing the data is sometimes faster than prefetching but it increases memory usage
@@ -584,6 +584,60 @@
   return AllocRun(self, idx);
 }
 
+void* RosAlloc::AllocFromCurrentRunUnlocked(Thread* self, size_t idx) {
+  Run* current_run = current_runs_[idx];
+  DCHECK(current_run != nullptr);
+  void* slot_addr = current_run->AllocSlot();
+  if (UNLIKELY(slot_addr == nullptr)) {
+    // The current run got full. Try to refill it.
+    DCHECK(current_run->IsFull());
+    if (kIsDebugBuild && current_run != dedicated_full_run_) {
+      full_runs_[idx].insert(current_run);
+      if (kTraceRosAlloc) {
+        LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex << reinterpret_cast<intptr_t>(current_run)
+                  << " into full_runs_[" << std::dec << idx << "]";
+      }
+      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
+      DCHECK(full_runs_[idx].find(current_run) != full_runs_[idx].end());
+    }
+    current_run = RefillRun(self, idx);
+    if (UNLIKELY(current_run == nullptr)) {
+      // Failed to allocate a new run; reset the current run to the dedicated full run.
+      current_runs_[idx] = dedicated_full_run_;
+      return nullptr;
+    }
+    DCHECK(current_run != nullptr);
+    DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
+    DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
+    current_run->SetIsThreadLocal(false);
+    current_runs_[idx] = current_run;
+    DCHECK(!current_run->IsFull());
+    slot_addr = current_run->AllocSlot();
+    // Must succeed now with a new run.
+    DCHECK(slot_addr != nullptr);
+  }
+  return slot_addr;
+}
+
+void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated) {
+  DCHECK_LE(size, kLargeSizeThreshold);
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  DCHECK_EQ(idx, SizeToIndex(size));
+  DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+  DCHECK_EQ(bracket_size, bracketSizes[idx]);
+  DCHECK_LE(size, bracket_size);
+  DCHECK(size > 512 || bracket_size - size < 16);
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  void* slot_addr = AllocFromCurrentRunUnlocked(self, idx);
+  if (LIKELY(slot_addr != nullptr)) {
+    DCHECK(bytes_allocated != nullptr);
+    *bytes_allocated = bracket_size;
+    // Caller verifies that it is all 0.
+  }
+  return slot_addr;
+}
+
 void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) {
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
@@ -596,7 +650,7 @@
 
   void* slot_addr;
 
-  if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) {
+  if (LIKELY(idx < kNumThreadLocalSizeBrackets)) {
     // Use a thread-local run.
     Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
     // Allow invalid since this will always fail the allocation.
@@ -631,7 +685,6 @@
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
         if (thread_local_run != dedicated_full_run_) {
-          self->SetRosAllocRun(idx, dedicated_full_run_);
           thread_local_run->SetIsThreadLocal(false);
           if (kIsDebugBuild) {
             full_runs_[idx].insert(thread_local_run);
@@ -646,8 +699,9 @@
         }
 
         thread_local_run = RefillRun(self, idx);
-        if (UNLIKELY(thread_local_run == NULL)) {
-          return NULL;
+        if (UNLIKELY(thread_local_run == nullptr)) {
+          self->SetRosAllocRun(idx, dedicated_full_run_);
+          return nullptr;
         }
         DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
         DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
@@ -656,12 +710,12 @@
         DCHECK(!thread_local_run->IsFull());
       }
 
-      DCHECK(thread_local_run != NULL);
+      DCHECK(thread_local_run != nullptr);
       DCHECK(!thread_local_run->IsFull());
       DCHECK(thread_local_run->IsThreadLocal());
       slot_addr = thread_local_run->AllocSlot();
       // Must succeed now with a new run.
-      DCHECK(slot_addr != NULL);
+      DCHECK(slot_addr != nullptr);
     }
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
@@ -671,48 +725,7 @@
   } else {
     // Use the (shared) current run.
     MutexLock mu(self, *size_bracket_locks_[idx]);
-    Run* current_run = current_runs_[idx];
-    if (UNLIKELY(current_run == NULL)) {
-      current_run = RefillRun(self, idx);
-      if (UNLIKELY(current_run == NULL)) {
-        return NULL;
-      }
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
-      current_run->SetIsThreadLocal(false);
-      current_runs_[idx] = current_run;
-      DCHECK(!current_run->IsFull());
-    }
-    DCHECK(current_run != NULL);
-    slot_addr = current_run->AllocSlot();
-    if (UNLIKELY(slot_addr == NULL)) {
-      // The current run got full. Try to refill it.
-      DCHECK(current_run->IsFull());
-      current_runs_[idx] = NULL;
-      if (kIsDebugBuild) {
-        // Insert it into full_runs and set the current run to NULL.
-        full_runs_[idx].insert(current_run);
-        if (kTraceRosAlloc) {
-          LOG(INFO) << "RosAlloc::AllocFromRun() : Inserted run 0x" << std::hex << reinterpret_cast<intptr_t>(current_run)
-                    << " into full_runs_[" << std::dec << idx << "]";
-        }
-      }
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) != full_runs_[idx].end());
-      current_run = RefillRun(self, idx);
-      if (UNLIKELY(current_run == NULL)) {
-        return NULL;
-      }
-      DCHECK(current_run != NULL);
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
-      current_run->SetIsThreadLocal(false);
-      current_runs_[idx] = current_run;
-      DCHECK(!current_run->IsFull());
-      slot_addr = current_run->AllocSlot();
-      // Must succeed now with a new run.
-      DCHECK(slot_addr != NULL);
-    }
+    slot_addr = AllocFromCurrentRunUnlocked(self, idx);
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
@@ -741,7 +754,7 @@
   }
   if (LIKELY(run->IsThreadLocal())) {
     // It's a thread-local run. Just mark the thread-local free bit map and return.
-    DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx);
+    DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
     run->MarkThreadLocalFreeBitMap(ptr);
@@ -766,7 +779,7 @@
       }
     }
     if (run == current_runs_[idx]) {
-      current_runs_[idx] = NULL;
+      current_runs_[idx] = dedicated_full_run_;
     }
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
@@ -1233,7 +1246,7 @@
     size_t idx = run->size_bracket_idx_;
     MutexLock mu(self, *size_bracket_locks_[idx]);
     if (run->IsThreadLocal()) {
-      DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx);
+      DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
       DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
       run->UnionBulkFreeBitMapToThreadLocalFreeBitMap();
@@ -1627,7 +1640,7 @@
   Thread* self = Thread::Current();
   // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
   WriterMutexLock wmu(self, bulk_free_lock_);
-  for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
+  for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
     Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
     CHECK(thread_local_run != nullptr);
@@ -1643,30 +1656,48 @@
       thread_local_run->MergeBulkFreeBitMapIntoAllocBitMap();
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
-      if (thread_local_run->IsFull()) {
-        if (kIsDebugBuild) {
-          full_runs_[idx].insert(thread_local_run);
-          DCHECK(full_runs_[idx].find(thread_local_run) != full_runs_[idx].end());
-          if (kTraceRosAlloc) {
-            LOG(INFO) << "RosAlloc::RevokeThreadLocalRuns() : Inserted run 0x" << std::hex
-                      << reinterpret_cast<intptr_t>(thread_local_run)
-                      << " into full_runs_[" << std::dec << idx << "]";
-          }
-        }
-      } else if (thread_local_run->IsAllFree()) {
-        MutexLock mu(self, lock_);
-        thread_local_run->ZeroHeader();
-        FreePages(self, thread_local_run, true);
-      } else {
-        non_full_runs_[idx].insert(thread_local_run);
-        DCHECK(non_full_runs_[idx].find(thread_local_run) != non_full_runs_[idx].end());
-        if (kTraceRosAlloc) {
-          LOG(INFO) << "RosAlloc::RevokeThreadLocalRuns() : Inserted run 0x" << std::hex
-                    << reinterpret_cast<intptr_t>(thread_local_run)
-                    << " into non_full_runs_[" << std::dec << idx << "]";
-        }
+      RevokeRun(self, idx, thread_local_run);
+    }
+  }
+}
+
+void RosAlloc::RevokeRun(Thread* self, size_t idx, Run* run) {
+  size_bracket_locks_[idx]->AssertHeld(self);
+  DCHECK(run != dedicated_full_run_);
+  if (run->IsFull()) {
+    if (kIsDebugBuild) {
+      full_runs_[idx].insert(run);
+      DCHECK(full_runs_[idx].find(run) != full_runs_[idx].end());
+      if (kTraceRosAlloc) {
+        LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex
+                  << reinterpret_cast<intptr_t>(run)
+                  << " into full_runs_[" << std::dec << idx << "]";
       }
     }
+  } else if (run->IsAllFree()) {
+    run->ZeroHeader();
+    MutexLock mu(self, lock_);
+    FreePages(self, run, true);
+  } else {
+    non_full_runs_[idx].insert(run);
+    DCHECK(non_full_runs_[idx].find(run) != non_full_runs_[idx].end());
+    if (kTraceRosAlloc) {
+      LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex
+                << reinterpret_cast<intptr_t>(run)
+                << " into non_full_runs_[" << std::dec << idx << "]";
+    }
+  }
+}
+
+void RosAlloc::RevokeThreadUnsafeCurrentRuns() {
+  // Revoke the current runs which share an index with the thread-local runs.
+  Thread* self = Thread::Current();
+  for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; ++idx) {
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    if (current_runs_[idx] != dedicated_full_run_) {
+      RevokeRun(self, idx, current_runs_[idx]);
+      current_runs_[idx] = dedicated_full_run_;
+    }
   }
 }
 
@@ -1679,6 +1710,7 @@
   for (Thread* thread : thread_list) {
     RevokeThreadLocalRuns(thread);
   }
+  RevokeThreadUnsafeCurrentRuns();
 }
 
 void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) {
@@ -1686,7 +1718,7 @@
     Thread* self = Thread::Current();
     // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
     WriterMutexLock wmu(self, bulk_free_lock_);
-    for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
+    for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
       Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
       DCHECK(thread_local_run == nullptr || thread_local_run == dedicated_full_run_);
@@ -1696,18 +1728,21 @@
 
 void RosAlloc::AssertAllThreadLocalRunsAreRevoked() {
   if (kIsDebugBuild) {
-    MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
-    MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
+    Thread* self = Thread::Current();
+    MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+    MutexLock mu2(self, *Locks::thread_list_lock_);
     std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
     for (Thread* t : thread_list) {
       AssertThreadLocalRunsAreRevoked(t);
     }
+    for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; ++idx) {
+      MutexLock mu(self, *size_bracket_locks_[idx]);
+      CHECK_EQ(current_runs_[idx], dedicated_full_run_);
+    }
   }
 }
 
 void RosAlloc::Initialize() {
-  // Check the consistency of the number of size brackets.
-  DCHECK_EQ(Thread::kRosAllocNumOfSizeBrackets, kNumOfSizeBrackets);
   // bracketSizes.
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     if (i < kNumOfSizeBrackets - 2) {
@@ -1911,15 +1946,34 @@
           break;
         }
         case kPageMapRunPart:
-          LOG(FATAL) << "Unreachable - page map type: " << pm << std::endl << DumpPageMap();
-          break;
+          // Fall-through.
         default:
           LOG(FATAL) << "Unreachable - page map type: " << pm << std::endl << DumpPageMap();
           break;
       }
     }
   }
-
+  std::list<Thread*> threads = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* thread : threads) {
+    for (size_t i = 0; i < kNumThreadLocalSizeBrackets; ++i) {
+      MutexLock mu(self, *size_bracket_locks_[i]);
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
+      CHECK(thread_local_run != nullptr);
+      CHECK(thread_local_run->IsThreadLocal());
+      CHECK(thread_local_run == dedicated_full_run_ ||
+            thread_local_run->size_bracket_idx_ == i);
+    }
+  }
+  for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
+    MutexLock mu(self, *size_bracket_locks_[i]);
+    Run* current_run = current_runs_[i];
+    CHECK(current_run != nullptr);
+    if (current_run != dedicated_full_run_) {
+      // The dedicated full run is currently marked as thread local.
+      CHECK(!current_run->IsThreadLocal());
+      CHECK_EQ(current_run->size_bracket_idx_, i);
+    }
+  }
   // Call Verify() here for the lock order.
   for (auto& run : runs) {
     run->Verify(self, this);
@@ -1952,7 +2006,7 @@
     std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
     for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
       Thread* thread = *it;
-      for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
+      for (size_t i = 0; i < kNumThreadLocalSizeBrackets; i++) {
         MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
         Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
         if (thread_local_run == this) {
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index f7fa2da..21044f3 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -405,11 +405,6 @@
   // at a page-granularity.
   static const size_t kLargeSizeThreshold = 2048;
 
-  // We use use thread-local runs for the size Brackets whose indexes
-  // are less than or equal to this index. We use shared (current)
-  // runs for the rest.
-  static const size_t kMaxThreadLocalSizeBracketIdx = 10;
-
   // If true, check that the returned memory is actually zero.
   static constexpr bool kCheckZeroMemory = kIsDebugBuild;
 
@@ -442,6 +437,10 @@
   // The default value for page_release_size_threshold_.
   static constexpr size_t kDefaultPageReleaseSizeThreshold = 4 * MB;
 
+  // We use thread-local runs for the size brackets whose indexes
+  // are less than this value. We use shared (current) runs for the rest.
+  static const size_t kNumThreadLocalSizeBrackets = 11;
+
  private:
   // The base address of the memory region that's managed by this allocator.
   byte* base_;
@@ -526,6 +525,12 @@
   // Allocate/free a run slot.
   void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated)
       LOCKS_EXCLUDED(lock_);
+  // Allocate a run slot without acquiring locks.
+  // TODO: EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
+  void* AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated)
+      LOCKS_EXCLUDED(lock_);
+  void* AllocFromCurrentRunUnlocked(Thread* self, size_t idx);
+
   // Returns the bracket size.
   size_t FreeFromRun(Thread* self, void* ptr, Run* run)
       LOCKS_EXCLUDED(lock_);
@@ -543,11 +548,20 @@
   // Allocates large objects.
   void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) LOCKS_EXCLUDED(lock_);
 
+  // Revoke a run by adding it to non_full_runs_ or freeing the pages.
+  void RevokeRun(Thread* self, size_t idx, Run* run);
+
+  // Revoke the current runs which share an index with the thread local runs.
+  void RevokeThreadUnsafeCurrentRuns();
+
  public:
   RosAlloc(void* base, size_t capacity, size_t max_capacity,
            PageReleaseMode page_release_mode,
            size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
   ~RosAlloc();
+  // If kThreadSafe is false then the allocator may avoid acquiring some locks as an optimization.
+  // If so used, this may cause race conditions if multiple threads are allocating at the same time.
+  template<bool kThreadSafe = true>
   void* Alloc(Thread* self, size_t size, size_t* bytes_allocated)
       LOCKS_EXCLUDED(lock_);
   size_t Free(Thread* self, void* ptr)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 0b26019..b53ee10 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -180,7 +180,9 @@
     runtime->SetFaultMessage(oss.str());
     CHECK_EQ(self_->SetStateUnsafe(old_state), kRunnable);
   }
-
+  // Revoke the thread-local buffers, since the GC may allocate into a RosAllocSpace; this helps
+  // prevent fragmentation.
+  RevokeAllThreadLocalBuffers();
   if (generational_) {
     if (gc_cause_ == kGcCauseExplicit || gc_cause_ == kGcCauseForNativeAlloc ||
         clear_soft_references_) {
@@ -332,11 +334,8 @@
 class SemiSpaceScanObjectVisitor {
  public:
   explicit SemiSpaceScanObjectVisitor(SemiSpace* ss) : semi_space_(ss) {}
-  void operator()(Object* obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
-    // TODO: fix NO_THREAD_SAFETY_ANALYSIS. ScanObject() requires an
-    // exclusive lock on the mutator lock, but
-    // SpaceBitmap::VisitMarkedRange() only requires the shared lock.
+  void operator()(Object* obj) const EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_,
+                                                              Locks::heap_bitmap_lock_) {
     DCHECK(obj != nullptr);
     semi_space_->ScanObject(obj);
   }
@@ -552,10 +551,11 @@
     // (pseudo-promote) it to the main free list space (as sort
     // of an old generation.)
     space::MallocSpace* promo_dest_space = GetHeap()->GetPrimaryFreeListSpace();
-    forward_address = promo_dest_space->Alloc(self_, object_size, &bytes_allocated, nullptr);
+    forward_address = promo_dest_space->AllocThreadUnsafe(self_, object_size, &bytes_allocated,
+                                                          nullptr);
     if (UNLIKELY(forward_address == nullptr)) {
       // If out of space, fall back to the to-space.
-      forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
+      forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
     } else {
       bytes_promoted_ += bytes_allocated;
      // Dirty the card at the destination as it may contain
@@ -599,7 +599,7 @@
     DCHECK(forward_address != nullptr);
   } else {
     // If it's allocated after the last GC (younger), copy it to the to-space.
-    forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
+    forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
   }
   ++objects_moved_;
   bytes_moved_ += bytes_allocated;
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 9b6df16..3b3e1b1 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -98,12 +98,10 @@
   // Returns the new address of the object.
   template<bool kPoisonReferences>
   void MarkObject(mirror::ObjectReference<kPoisonReferences, mirror::Object>* obj_ptr)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   void ScanObject(mirror::Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   void VerifyNoFromSpaceReferences(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
@@ -150,8 +148,7 @@
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   virtual mirror::Object* MarkNonForwardedObject(mirror::Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   // Schedules an unmarked object for reference processing.
   void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 4d074f1..b913d21 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -2396,8 +2396,7 @@
 }
 
 void Heap::PreGcVerification(collector::GarbageCollector* gc) {
-  if (verify_pre_gc_heap_ || verify_missing_card_marks_ || verify_mod_union_table_ ||
-      verify_pre_gc_rosalloc_) {
+  if (verify_pre_gc_heap_ || verify_missing_card_marks_ || verify_mod_union_table_) {
     collector::GarbageCollector::ScopedPause pause(gc);
     PreGcVerificationPaused(gc);
   }
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 70ab64b..497a61f 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -36,6 +36,26 @@
   return ret;
 }
 
+inline mirror::Object* BumpPointerSpace::AllocThreadUnsafe(Thread* self, size_t num_bytes,
+                                                           size_t* bytes_allocated,
+                                                           size_t* usable_size) {
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  num_bytes = RoundUp(num_bytes, kAlignment);
+  if (end_ + num_bytes > growth_end_) {
+    return nullptr;
+  }
+  mirror::Object* obj = reinterpret_cast<mirror::Object*>(end_);
+  end_ += num_bytes;
+  *bytes_allocated = num_bytes;
+  // Use the CAS-free versions as an optimization since the mutator lock is exclusively held.
+  objects_allocated_ = objects_allocated_ + 1;
+  bytes_allocated_ = bytes_allocated_ + num_bytes;
+  if (UNLIKELY(usable_size != nullptr)) {
+    *usable_size = num_bytes;
+  }
+  return obj;
+}
+
 inline mirror::Object* BumpPointerSpace::AllocNonvirtualWithoutAccounting(size_t num_bytes) {
   DCHECK(IsAligned<kAlignment>(num_bytes));
   byte* old_end;
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index e52a9a3..9e61f30 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -48,6 +48,11 @@
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                         size_t* usable_size) OVERRIDE;
+  // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
+  mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
+                                    size_t* usable_size)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   mirror::Object* AllocNonvirtual(size_t num_bytes);
   mirror::Object* AllocNonvirtualWithoutAccounting(size_t num_bytes);
 
diff --git a/runtime/gc/space/rosalloc_space-inl.h b/runtime/gc/space/rosalloc_space-inl.h
index d270885..fbfef45 100644
--- a/runtime/gc/space/rosalloc_space-inl.h
+++ b/runtime/gc/space/rosalloc_space-inl.h
@@ -46,11 +46,15 @@
   return size_by_size;
 }
 
+template<bool kThreadSafe>
 inline mirror::Object* RosAllocSpace::AllocCommon(Thread* self, size_t num_bytes,
                                                   size_t* bytes_allocated, size_t* usable_size) {
   size_t rosalloc_size = 0;
+  if (!kThreadSafe) {
+    Locks::mutator_lock_->AssertExclusiveHeld(self);
+  }
   mirror::Object* result = reinterpret_cast<mirror::Object*>(
-      rosalloc_->Alloc(self, num_bytes, &rosalloc_size));
+      rosalloc_->Alloc<kThreadSafe>(self, num_bytes, &rosalloc_size));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
       CHECK(Contains(result)) << "Allocation (" << reinterpret_cast<void*>(result)
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index f5c0e94..a1511e7 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -159,7 +159,7 @@
   }
   // Note RosAlloc zeroes memory internally.
   // Return the new allocation or NULL.
-  CHECK(!kDebugSpaces || result == NULL || Contains(result));
+  CHECK(!kDebugSpaces || result == nullptr || Contains(result));
   return result;
 }
 
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index a156738..2934af8 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -52,6 +52,11 @@
                         size_t* usable_size) OVERRIDE {
     return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size);
   }
+  mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
+                                    size_t* usable_size)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return AllocNonvirtualThreadUnsafe(self, num_bytes, bytes_allocated, usable_size);
+  }
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE {
     return AllocationSizeNonvirtual(obj, usable_size);
   }
@@ -65,6 +70,11 @@
     // RosAlloc zeroes memory internally.
     return AllocCommon(self, num_bytes, bytes_allocated, usable_size);
   }
+  mirror::Object* AllocNonvirtualThreadUnsafe(Thread* self, size_t num_bytes,
+                                              size_t* bytes_allocated, size_t* usable_size) {
+    // RosAlloc zeroes memory internally. Pass kThreadSafe = false for the thread-unsafe path.
+    return AllocCommon<false>(self, num_bytes, bytes_allocated, usable_size);
+  }
 
   // TODO: NO_THREAD_SAFETY_ANALYSIS because SizeOf() requires that mutator_lock is held.
   size_t AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size)
@@ -116,6 +126,7 @@
                 size_t starting_size, size_t initial_size, bool low_memory_mode);
 
  private:
+  template<bool kThreadSafe = true>
   mirror::Object* AllocCommon(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                               size_t* usable_size);
 
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 0a87a16..dcf5357 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -203,9 +203,17 @@
   // Allocate num_bytes without allowing growth. If the allocation
   // succeeds, the output parameter bytes_allocated will be set to the
   // actually allocated bytes which is >= num_bytes.
+  // Alloc can be called from multiple threads at the same time and must be thread-safe.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                 size_t* usable_size) = 0;
 
+  // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
+  virtual mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
+                                            size_t* usable_size)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return Alloc(self, num_bytes, bytes_allocated, usable_size);
+  }
+
   // Return the storage space required by obj.
   virtual size_t AllocationSize(mirror::Object* obj, size_t* usable_size) = 0;
 
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 7470670..e67a64f 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1018,7 +1018,8 @@
   tls32_.state_and_flags.as_struct.flags = 0;
   tls32_.state_and_flags.as_struct.state = kNative;
   memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
-  std::fill(tlsPtr_.rosalloc_runs, tlsPtr_.rosalloc_runs + kRosAllocNumOfSizeBrackets,
+  std::fill(tlsPtr_.rosalloc_runs,
+            tlsPtr_.rosalloc_runs + gc::allocator::RosAlloc::kNumThreadLocalSizeBrackets,
             gc::allocator::RosAlloc::GetDedicatedFullRun());
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
     tlsPtr_.checkpoint_functions[i] = nullptr;
diff --git a/runtime/thread.h b/runtime/thread.h
index f869285..8c17082 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -29,6 +29,7 @@
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/portable/portable_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "gc/allocator/rosalloc.h"
 #include "globals.h"
 #include "jvalue.h"
 #include "object_callbacks.h"
@@ -783,9 +784,6 @@
     return tlsPtr_.thread_local_objects;
   }
 
-  // ROS alloc TLS.
-  static constexpr size_t kRosAllocNumOfSizeBrackets = 34;
-
   void* GetRosAllocRun(size_t index) const {
     return tlsPtr_.rosalloc_runs[index];
   }
@@ -1060,12 +1058,8 @@
     byte* thread_local_end;
     size_t thread_local_objects;
 
-    // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
-    // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
-    // RosAlloc class due to a header file circular dependency issue.
-    // To compensate, we check that the two values match at RosAlloc
-    // initialization time.
-    void* rosalloc_runs[kRosAllocNumOfSizeBrackets];
+    // Thread-local rosalloc runs, one per thread-local size bracket
+    // (RosAlloc::kNumThreadLocalSizeBrackets).
+    void* rosalloc_runs[gc::allocator::RosAlloc::kNumThreadLocalSizeBrackets];
 
     // Thread-local allocation stack data/routines.
     mirror::Object** thread_local_alloc_stack_top;