RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~29% reduction)
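
The new fast path in Heap's allocation entry point (heap-inl.h) allocates
straight from the thread's thread-local rosalloc run
(RosAlloc::AllocFromThreadLocalRun / Run::AllocSlot) and skips the per-object
atomic update of Heap::num_bytes_allocated_ entirely. Instead, all free slots
of a thread-local run are counted in bulk (the new bytes_tl_bulk_allocated out
param) when the run is handed to a thread, and the slots that end up unused
are credited back through num_bytes_freed_revoke_ and Heap::RecordFreeRevoke()
when the run is revoked.

Illustrative sketch (not part of this patch) of how the bulk accounting
balances out; the type and method names below are simplified stand-ins for
Heap::num_bytes_allocated_ / num_bytes_freed_revoke_ and the RosAlloc hooks:

  #include <atomic>
  #include <cstddef>

  struct BulkAccounting {
    std::atomic<std::size_t> num_bytes_allocated{0};
    std::atomic<std::size_t> num_bytes_freed_revoke{0};

    // A thread is handed a fresh thread-local run: count all of its free
    // slots up front (the bytes_tl_bulk_allocated value).
    void OnNewThreadLocalRun(std::size_t num_free_slots, std::size_t bracket_size) {
      num_bytes_allocated.fetch_add(num_free_slots * bracket_size);
    }

    // Per-object allocations from that run update no shared counters.

    // At revoke time (e.g. GC pause or Zygote creation), slots that were
    // never used are credited back (cf. RosAlloc::RevokeThreadLocalRuns).
    void OnRevoke(std::size_t num_unused_slots, std::size_t bracket_size) {
      num_bytes_freed_revoke.fetch_add(num_unused_slots * bracket_size);
    }

    // Heap::RecordFreeRevoke() equivalent: fold the credit back into the
    // allocated counter at the next GC.
    void RecordFreeRevoke() {
      const std::size_t bytes = num_bytes_freed_revoke.load();
      num_bytes_freed_revoke.fetch_sub(bytes);
      num_bytes_allocated.fetch_sub(bytes);
    }
  };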

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 8486597..e0d62d7 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -263,6 +263,8 @@
   // pool is created by the runtime.
   runtime_->GetHeap()->CreateThreadPool();
   runtime_->GetHeap()->VerifyHeap();  // Check for heap corruption before the test
+  // Reduce timing-dependent flakiness in OOME behavior (e.g. StubTest.AllocObject).
+  runtime_->GetHeap()->SetMinIntervalHomogeneousSpaceCompactionByOom(0U);
 
   // Get the boot class path from the runtime so it can be used in tests.
   boot_class_path_ = class_linker_->GetBootClassPath();
diff --git a/runtime/gc/accounting/mod_union_table_test.cc b/runtime/gc/accounting/mod_union_table_test.cc
index 87ce166..7780935 100644
--- a/runtime/gc/accounting/mod_union_table_test.cc
+++ b/runtime/gc/accounting/mod_union_table_test.cc
@@ -48,9 +48,9 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     auto* klass = GetObjectArrayClass(self, space);
     const size_t size = ComputeArraySize(self, klass, component_count, 2);
-    size_t bytes_allocated = 0;
+    size_t bytes_allocated = 0, bytes_tl_bulk_allocated;
     auto* obj = down_cast<mirror::ObjectArray<mirror::Object>*>(
-        space->Alloc(self, size, &bytes_allocated, nullptr));
+        space->Alloc(self, size, &bytes_allocated, nullptr, &bytes_tl_bulk_allocated));
     if (obj != nullptr) {
       obj->SetClass(klass);
       obj->SetLength(static_cast<int32_t>(component_count));
@@ -77,9 +77,10 @@
       // copy of the class in the same space that we are allocating in.
       DCHECK(java_lang_object_array_ != nullptr);
       const size_t class_size = java_lang_object_array_->GetClassSize();
-      size_t bytes_allocated = 0;
+      size_t bytes_allocated = 0, bytes_tl_bulk_allocated;
       auto* klass = down_cast<mirror::Class*>(space->Alloc(self, class_size, &bytes_allocated,
-                                                           nullptr));
+                                                           nullptr,
+                                                           &bytes_tl_bulk_allocated));
       DCHECK(klass != nullptr);
       memcpy(klass, java_lang_object_array_, class_size);
       Runtime::Current()->GetHeap()->GetCardTable()->MarkCard(klass);
diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h
index f6c9d3c..bba92a1 100644
--- a/runtime/gc/allocator/rosalloc-inl.h
+++ b/runtime/gc/allocator/rosalloc-inl.h
@@ -28,15 +28,19 @@
 }
 
 template<bool kThreadSafe>
-inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* bytes_allocated) {
+inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* bytes_allocated,
+                                           size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated) {
   if (UNLIKELY(size > kLargeSizeThreshold)) {
-    return AllocLargeObject(self, size, bytes_allocated);
+    return AllocLargeObject(self, size, bytes_allocated, usable_size,
+                            bytes_tl_bulk_allocated);
   }
   void* m;
   if (kThreadSafe) {
-    m = AllocFromRun(self, size, bytes_allocated);
+    m = AllocFromRun(self, size, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
   } else {
-    m = AllocFromRunThreadUnsafe(self, size, bytes_allocated);
+    m = AllocFromRunThreadUnsafe(self, size, bytes_allocated, usable_size,
+                                 bytes_tl_bulk_allocated);
   }
   // Check if the returned memory is really all zero.
   if (ShouldCheckZeroMemory() && m != nullptr) {
@@ -48,6 +52,115 @@
   return m;
 }
 
+inline bool RosAlloc::Run::IsFull() {
+  const size_t num_vec = NumberOfBitmapVectors();
+  for (size_t v = 0; v < num_vec; ++v) {
+    if (~alloc_bit_map_[v] != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline bool RosAlloc::CanAllocFromThreadLocalRun(Thread* self, size_t size) {
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return false;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  DCHECK_EQ(idx, SizeToIndex(size));
+  DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+  DCHECK_EQ(bracket_size, bracketSizes[idx]);
+  DCHECK_LE(size, bracket_size);
+  DCHECK(size > 512 || bracket_size - size < 16);
+  DCHECK_LT(idx, kNumThreadLocalSizeBrackets);
+  Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
+  if (kIsDebugBuild) {
+    // Need the lock to prevent race conditions.
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    CHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
+    CHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
+  }
+  DCHECK(thread_local_run != nullptr);
+  DCHECK(thread_local_run->IsThreadLocal() || thread_local_run == dedicated_full_run_);
+  return !thread_local_run->IsFull();
+}
+
+inline void* RosAlloc::AllocFromThreadLocalRun(Thread* self, size_t size,
+                                               size_t* bytes_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return nullptr;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
+  if (kIsDebugBuild) {
+    // Need the lock to prevent race conditions.
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    CHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
+    CHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
+  }
+  DCHECK(thread_local_run != nullptr);
+  DCHECK(thread_local_run->IsThreadLocal() || thread_local_run == dedicated_full_run_);
+  void* slot_addr = thread_local_run->AllocSlot();
+  if (LIKELY(slot_addr != nullptr)) {
+    *bytes_allocated = bracket_size;
+  }
+  return slot_addr;
+}
+
+inline size_t RosAlloc::MaxBytesBulkAllocatedFor(size_t size) {
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return size;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  return numOfSlots[idx] * bracket_size;
+}
+
+inline void* RosAlloc::Run::AllocSlot() {
+  const size_t idx = size_bracket_idx_;
+  while (true) {
+    if (kIsDebugBuild) {
+      // Make sure that no slots leaked, the bitmap should be full for all previous vectors.
+      for (size_t i = 0; i < first_search_vec_idx_; ++i) {
+        CHECK_EQ(~alloc_bit_map_[i], 0U);
+      }
+    }
+    uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_];
+    uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr);
+    if (LIKELY(ffz1 != 0)) {
+      const uint32_t ffz = ffz1 - 1;
+      const uint32_t slot_idx = ffz +
+          first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte;
+      const uint32_t mask = 1U << ffz;
+      DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range";
+      // Found an empty slot. Set the bit.
+      DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U);
+      *alloc_bitmap_ptr |= mask;
+      DCHECK_NE(*alloc_bitmap_ptr & mask, 0U);
+      uint8_t* slot_addr = reinterpret_cast<uint8_t*>(this) +
+          headerSizes[idx] + slot_idx * bracketSizes[idx];
+      if (kTraceRosAlloc) {
+        LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex
+                  << reinterpret_cast<intptr_t>(slot_addr)
+                  << ", bracket_size=" << std::dec << bracketSizes[idx]
+                  << ", slot_idx=" << slot_idx;
+      }
+      return slot_addr;
+    }
+    const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32;
+    if (first_search_vec_idx_ + 1 >= num_words) {
+      DCHECK(IsFull());
+      // Already at the last word, return null.
+      return nullptr;
+    }
+    // Increase the index to the next word and try again.
+    ++first_search_vec_idx_;
+  }
+}
+
 }  // namespace allocator
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index f51093a..f64a4ff 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -454,7 +454,10 @@
   return byte_size;
 }
 
-void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated,
+                                 size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
   DCHECK_GT(size, kLargeSizeThreshold);
   size_t num_pages = RoundUp(size, kPageSize) / kPageSize;
   void* r;
@@ -470,6 +473,8 @@
   }
   const size_t total_bytes = num_pages * kPageSize;
   *bytes_allocated = total_bytes;
+  *usable_size = total_bytes;
+  *bytes_tl_bulk_allocated = total_bytes;
   if (kTraceRosAlloc) {
     LOG(INFO) << "RosAlloc::AllocLargeObject() : 0x" << std::hex << reinterpret_cast<intptr_t>(r)
               << "-0x" << (reinterpret_cast<intptr_t>(r) + num_pages * kPageSize)
@@ -622,7 +627,12 @@
   return slot_addr;
 }
 
-void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated,
+                                         size_t* usable_size,
+                                         size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
   size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
@@ -634,14 +644,19 @@
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   void* slot_addr = AllocFromCurrentRunUnlocked(self, idx);
   if (LIKELY(slot_addr != nullptr)) {
-    DCHECK(bytes_allocated != nullptr);
     *bytes_allocated = bracket_size;
-    // Caller verifies that it is all 0.
+    *usable_size = bracket_size;
+    *bytes_tl_bulk_allocated = bracket_size;
   }
+  // Caller verifies that it is all 0.
   return slot_addr;
 }
 
-void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated,
+                             size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
   size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
@@ -712,31 +727,43 @@
         self->SetRosAllocRun(idx, thread_local_run);
         DCHECK(!thread_local_run->IsFull());
       }
-
       DCHECK(thread_local_run != nullptr);
       DCHECK(!thread_local_run->IsFull());
       DCHECK(thread_local_run->IsThreadLocal());
+      // Account for all the free slots in the new or refreshed thread local run.
+      *bytes_tl_bulk_allocated = thread_local_run->NumberOfFreeSlots() * bracket_size;
       slot_addr = thread_local_run->AllocSlot();
       // Must succeed now with a new run.
       DCHECK(slot_addr != nullptr);
+    } else {
+      // The slot is already counted. Leave it as is.
+      *bytes_tl_bulk_allocated = 0;
     }
+    DCHECK(slot_addr != nullptr);
     if (kTraceRosAlloc) {
-      LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
+      LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex
+                << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
                 << "(" << std::dec << (bracket_size) << ")";
     }
+    *bytes_allocated = bracket_size;
+    *usable_size = bracket_size;
   } else {
     // Use the (shared) current run.
     MutexLock mu(self, *size_bracket_locks_[idx]);
     slot_addr = AllocFromCurrentRunUnlocked(self, idx);
     if (kTraceRosAlloc) {
-      LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
+      LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex
+                << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
                 << "(" << std::dec << (bracket_size) << ")";
     }
+    if (LIKELY(slot_addr != nullptr)) {
+      *bytes_allocated = bracket_size;
+      *usable_size = bracket_size;
+      *bytes_tl_bulk_allocated = bracket_size;
+    }
   }
-  DCHECK(bytes_allocated != nullptr);
-  *bytes_allocated = bracket_size;
   // Caller verifies that it is all 0.
   return slot_addr;
 }
@@ -852,44 +879,6 @@
   return stream.str();
 }
 
-inline void* RosAlloc::Run::AllocSlot() {
-  const size_t idx = size_bracket_idx_;
-  while (true) {
-    if (kIsDebugBuild) {
-      // Make sure that no slots leaked, the bitmap should be full for all previous vectors.
-      for (size_t i = 0; i < first_search_vec_idx_; ++i) {
-        CHECK_EQ(~alloc_bit_map_[i], 0U);
-      }
-    }
-    uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_];
-    uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr);
-    if (LIKELY(ffz1 != 0)) {
-      const uint32_t ffz = ffz1 - 1;
-      const uint32_t slot_idx = ffz + first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte;
-      const uint32_t mask = 1U << ffz;
-      DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range";
-      // Found an empty slot. Set the bit.
-      DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U);
-      *alloc_bitmap_ptr |= mask;
-      DCHECK_NE(*alloc_bitmap_ptr & mask, 0U);
-      uint8_t* slot_addr = reinterpret_cast<uint8_t*>(this) + headerSizes[idx] + slot_idx * bracketSizes[idx];
-      if (kTraceRosAlloc) {
-        LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
-                  << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx;
-      }
-      return slot_addr;
-    }
-    const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32;
-    if (first_search_vec_idx_ + 1 >= num_words) {
-      DCHECK(IsFull());
-      // Already at the last word, return null.
-      return nullptr;
-    }
-    // Increase the index to the next word and try again.
-    ++first_search_vec_idx_;
-  }
-}
-
 void RosAlloc::Run::FreeSlot(void* ptr) {
   DCHECK(!IsThreadLocal());
   const uint8_t idx = size_bracket_idx_;
@@ -920,6 +909,25 @@
   }
 }
 
+size_t RosAlloc::Run::NumberOfFreeSlots() {
+  size_t num_alloc_slots = 0;
+  const size_t idx = size_bracket_idx_;
+  const size_t num_slots = numOfSlots[idx];
+  const size_t num_vec = RoundUp(num_slots, 32) / 32;
+  DCHECK_NE(num_vec, 0U);
+  for (size_t v = 0; v < num_vec - 1; v++) {
+    num_alloc_slots += POPCOUNT(alloc_bit_map_[v]);
+  }
+  // Don't count the invalid bits in the last vector.
+  uint32_t last_vec_masked = alloc_bit_map_[num_vec - 1] &
+      ~GetBitmapLastVectorMask(num_slots, num_vec);
+  num_alloc_slots += POPCOUNT(last_vec_masked);
+  size_t num_free_slots = num_slots - num_alloc_slots;
+  DCHECK_LE(num_alloc_slots, num_slots);
+  DCHECK_LE(num_free_slots, num_slots);
+  return num_free_slots;
+}
+
 inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out) {
   DCHECK(IsThreadLocal());
   // Free slots in the alloc bit map based on the thread local free bit map.
@@ -1055,16 +1063,6 @@
   return alloc_bit_map_[num_vec - 1] == GetBitmapLastVectorMask(num_slots, num_vec);
 }
 
-inline bool RosAlloc::Run::IsFull() {
-  const size_t num_vec = NumberOfBitmapVectors();
-  for (size_t v = 0; v < num_vec; ++v) {
-    if (~alloc_bit_map_[v] != 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
 inline bool RosAlloc::Run::IsBulkFreeBitmapClean() {
   const size_t num_vec = NumberOfBitmapVectors();
   for (size_t v = 0; v < num_vec; v++) {
@@ -1654,10 +1652,11 @@
   }
 }
 
-void RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
+size_t RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
   Thread* self = Thread::Current();
   // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
   ReaderMutexLock wmu(self, bulk_free_lock_);
+  size_t free_bytes = 0U;
   for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
     Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
@@ -1665,9 +1664,12 @@
     // Invalid means already revoked.
     DCHECK(thread_local_run->IsThreadLocal());
     if (thread_local_run != dedicated_full_run_) {
+      // Note the thread local run may not be full here.
       thread->SetRosAllocRun(idx, dedicated_full_run_);
       DCHECK_EQ(thread_local_run->magic_num_, kMagicNum);
-      // Note the thread local run may not be full here.
+      // Count the number of free slots left.
+      size_t num_free_slots = thread_local_run->NumberOfFreeSlots();
+      free_bytes += num_free_slots * bracketSizes[idx];
       bool dont_care;
       thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
       thread_local_run->SetIsThreadLocal(false);
@@ -1677,6 +1679,7 @@
       RevokeRun(self, idx, thread_local_run);
     }
   }
+  return free_bytes;
 }
 
 void RosAlloc::RevokeRun(Thread* self, size_t idx, Run* run) {
@@ -1719,16 +1722,18 @@
   }
 }
 
-void RosAlloc::RevokeAllThreadLocalRuns() {
+size_t RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
   MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
   MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  size_t free_bytes = 0U;
   for (Thread* thread : thread_list) {
-    RevokeThreadLocalRuns(thread);
+    free_bytes += RevokeThreadLocalRuns(thread);
   }
   RevokeThreadUnsafeCurrentRuns();
+  return free_bytes;
 }
 
 void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) {
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 3269e10..d1e7ad9 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -230,8 +230,10 @@
     static uint32_t GetBitmapLastVectorMask(size_t num_slots, size_t num_vec);
     // Returns true if all the slots in the run are not in use.
     bool IsAllFree();
+    // Returns the number of free slots.
+    size_t NumberOfFreeSlots();
     // Returns true if all the slots in the run are in use.
-    bool IsFull();
+    ALWAYS_INLINE bool IsFull();
     // Returns true if the bulk free bit map is clean.
     bool IsBulkFreeBitmapClean();
     // Returns true if the thread local free bit map is clean.
@@ -309,6 +311,15 @@
     DCHECK(bracketSizes[idx] == size);
     return idx;
   }
+  // Returns true if the given allocation size is for a thread local allocation.
+  static bool IsSizeForThreadLocal(size_t size) {
+    DCHECK_GT(kNumThreadLocalSizeBrackets, 0U);
+    size_t max_thread_local_bracket_idx = kNumThreadLocalSizeBrackets - 1;
+    bool is_size_for_thread_local = size <= bracketSizes[max_thread_local_bracket_idx];
+    DCHECK(size > kLargeSizeThreshold ||
+           (is_size_for_thread_local == (SizeToIndex(size) < kNumThreadLocalSizeBrackets)));
+    return is_size_for_thread_local;
+  }
   // Rounds up the size up the nearest bracket size.
   static size_t RoundToBracketSize(size_t size) {
     DCHECK(size <= kLargeSizeThreshold);
@@ -504,11 +515,13 @@
   size_t FreePages(Thread* self, void* ptr, bool already_zero) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Allocate/free a run slot.
-  void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated)
+  void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated, size_t* usable_size,
+                     size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   // Allocate/free a run slot without acquiring locks.
   // TODO: EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-  void* AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated)
+  void* AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated,
+                                 size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   void* AllocFromCurrentRunUnlocked(Thread* self, size_t idx);
 
@@ -527,7 +540,9 @@
   size_t FreeInternal(Thread* self, void* ptr) LOCKS_EXCLUDED(lock_);
 
   // Allocates large objects.
-  void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) LOCKS_EXCLUDED(lock_);
+  void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated,
+                         size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      LOCKS_EXCLUDED(lock_);
 
   // Revoke a run by adding it to non_full_runs_ or freeing the pages.
   void RevokeRun(Thread* self, size_t idx, Run* run);
@@ -551,13 +566,26 @@
   // If kThreadUnsafe is true then the allocator may avoid acquiring some locks as an optimization.
   // If used, this may cause race conditions if multiple threads are allocating at the same time.
   template<bool kThreadSafe = true>
-  void* Alloc(Thread* self, size_t size, size_t* bytes_allocated)
+  void* Alloc(Thread* self, size_t size, size_t* bytes_allocated, size_t* usable_size,
+              size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   size_t Free(Thread* self, void* ptr)
       LOCKS_EXCLUDED(bulk_free_lock_);
   size_t BulkFree(Thread* self, void** ptrs, size_t num_ptrs)
       LOCKS_EXCLUDED(bulk_free_lock_);
 
+  // Returns true if the given allocation request can be allocated in
+  // an existing thread local run without allocating a new run.
+  ALWAYS_INLINE bool CanAllocFromThreadLocalRun(Thread* self, size_t size);
+  // Allocate the given allocation request in an existing thread local
+  // run without allocating a new run.
+  ALWAYS_INLINE void* AllocFromThreadLocalRun(Thread* self, size_t size, size_t* bytes_allocated);
+
+  // Returns the maximum bytes that could be allocated for the given
+  // size in bulk, that is, the maximum value for the
+  // bytes_tl_bulk_allocated out param returned by RosAlloc::Alloc().
+  ALWAYS_INLINE size_t MaxBytesBulkAllocatedFor(size_t size);
+
   // Returns the size of the allocated slot for a given allocated memory chunk.
   size_t UsableSize(const void* ptr);
   // Returns the size of the allocated slot for a given size.
@@ -586,9 +614,13 @@
   void SetFootprintLimit(size_t bytes) LOCKS_EXCLUDED(lock_);
 
   // Releases the thread-local runs assigned to the given thread back to the common set of runs.
-  void RevokeThreadLocalRuns(Thread* thread);
+  // Returns the total bytes of free slots in the revoked thread local runs. This is to be
+  // subtracted from Heap::num_bytes_allocated_ to cancel out the ahead-of-time counting.
+  size_t RevokeThreadLocalRuns(Thread* thread);
   // Releases the thread-local runs assigned to all the threads back to the common set of runs.
-  void RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
+  // Returns the total bytes of free slots in the revoked thread local runs. This is to be
+  // subtracted from Heap::num_bytes_allocated_ to cancel out the ahead-of-time counting.
+  size_t RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
   // Assert the thread local runs of a thread are revoked.
   void AssertThreadLocalRunsAreRevoked(Thread* thread);
   // Assert all the thread local runs are revoked.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index dd45eca..db7a4ef 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -1259,8 +1259,9 @@
   size_t region_space_bytes_allocated = 0U;
   size_t non_moving_space_bytes_allocated = 0U;
   size_t bytes_allocated = 0U;
+  size_t dummy;
   mirror::Object* to_ref = region_space_->AllocNonvirtual<true>(
-      region_space_alloc_size, &region_space_bytes_allocated, nullptr);
+      region_space_alloc_size, &region_space_bytes_allocated, nullptr, &dummy);
   bytes_allocated = region_space_bytes_allocated;
   if (to_ref != nullptr) {
     DCHECK_EQ(region_space_alloc_size, region_space_bytes_allocated);
@@ -1286,7 +1287,7 @@
       }
       fall_back_to_non_moving = true;
       to_ref = heap_->non_moving_space_->Alloc(Thread::Current(), obj_size,
-                                               &non_moving_space_bytes_allocated, nullptr);
+                                               &non_moving_space_bytes_allocated, nullptr, &dummy);
       CHECK(to_ref != nullptr) << "Fall-back non-moving space allocation failed";
       bytes_allocated = non_moving_space_bytes_allocated;
       // Mark it in the mark bitmap.
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 8be18be..eafcc45 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -48,6 +48,7 @@
   gc_cause_ = gc_cause;
   freed_ = ObjectBytePair();
   freed_los_ = ObjectBytePair();
+  freed_bytes_revoke_ = 0;
 }
 
 uint64_t Iteration::GetEstimatedThroughput() const {
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index b809469..ed5207a 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -75,6 +75,12 @@
   uint64_t GetFreedLargeObjects() const {
     return freed_los_.objects;
   }
+  uint64_t GetFreedRevokeBytes() const {
+    return freed_bytes_revoke_;
+  }
+  void SetFreedRevoke(uint64_t freed) {
+    freed_bytes_revoke_ = freed;
+  }
   void Reset(GcCause gc_cause, bool clear_soft_references);
   // Returns the estimated throughput of the iteration.
   uint64_t GetEstimatedThroughput() const;
@@ -99,6 +105,7 @@
   TimingLogger timings_;
   ObjectBytePair freed_;
   ObjectBytePair freed_los_;
+  uint64_t freed_bytes_revoke_;  // see Heap::num_bytes_freed_revoke_.
   std::vector<uint64_t> pause_times_;
 
   friend class GarbageCollector;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 8aac484..ee4e752 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -292,6 +292,7 @@
   Runtime::Current()->AllowNewSystemWeaks();
   {
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+    GetHeap()->RecordFreeRevoke();
     // Reclaim unmarked objects.
     Sweep(false);
     // Swap the live and mark bitmaps for each space which we modified space. This is an
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index c1ba5e3..b3d59f2 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -242,6 +242,7 @@
   // Revoke buffers before measuring how many objects were moved since the TLABs need to be revoked
   // before they are properly counted.
   RevokeAllThreadLocalBuffers();
+  GetHeap()->RecordFreeRevoke();  // this is for the non-moving rosalloc space used by GSS.
   // Record freed memory.
   const int64_t from_bytes = from_space_->GetBytesAllocated();
   const int64_t to_bytes = bytes_moved_;
@@ -489,17 +490,18 @@
 
 mirror::Object* SemiSpace::MarkNonForwardedObject(mirror::Object* obj) {
   const size_t object_size = obj->SizeOf();
-  size_t bytes_allocated;
+  size_t bytes_allocated, dummy;
   mirror::Object* forward_address = nullptr;
   if (generational_ && reinterpret_cast<uint8_t*>(obj) < last_gc_to_space_end_) {
     // If it's allocated before the last GC (older), move
     // (pseudo-promote) it to the main free list space (as sort
     // of an old generation.)
     forward_address = promo_dest_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated,
-                                                           nullptr);
+                                                           nullptr, &dummy);
     if (UNLIKELY(forward_address == nullptr)) {
       // If out of space, fall back to the to-space.
-      forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
+      forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr,
+                                                     &dummy);
       // No logic for marking the bitmap, so it must be null.
       DCHECK(to_space_live_bitmap_ == nullptr);
     } else {
@@ -544,7 +546,8 @@
     }
   } else {
     // If it's allocated after the last GC (younger), copy it to the to-space.
-    forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr);
+    forward_address = to_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated, nullptr,
+                                                   &dummy);
     if (forward_address != nullptr && to_space_live_bitmap_ != nullptr) {
       to_space_live_bitmap_->Set(forward_address);
     }
@@ -552,7 +555,7 @@
   // If it's still null, attempt to use the fallback space.
   if (UNLIKELY(forward_address == nullptr)) {
     forward_address = fallback_space_->AllocThreadUnsafe(self_, object_size, &bytes_allocated,
-                                                         nullptr);
+                                                         nullptr, &dummy);
     CHECK(forward_address != nullptr) << "Out of memory in the to-space and fallback space.";
     accounting::ContinuousSpaceBitmap* bitmap = fallback_space_->GetLiveBitmap();
     if (bitmap != nullptr) {
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index b8c2452..b770096 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -64,6 +64,7 @@
     // fragmentation.
   }
   AllocationTimer alloc_timer(this, &obj);
+  // Bytes allocated for the (individual) object.
   size_t bytes_allocated;
   size_t usable_size;
   size_t new_num_bytes_allocated = 0;
@@ -86,13 +87,29 @@
     usable_size = bytes_allocated;
     pre_fence_visitor(obj, usable_size);
     QuasiAtomic::ThreadFenceForConstructor();
+  } else if (!kInstrumented && allocator == kAllocatorTypeRosAlloc &&
+             (obj = rosalloc_space_->AllocThreadLocal(self, byte_count, &bytes_allocated)) &&
+             LIKELY(obj != nullptr)) {
+    DCHECK(!running_on_valgrind_);
+    obj->SetClass(klass);
+    if (kUseBakerOrBrooksReadBarrier) {
+      if (kUseBrooksReadBarrier) {
+        obj->SetReadBarrierPointer(obj);
+      }
+      obj->AssertReadBarrierPointer();
+    }
+    usable_size = bytes_allocated;
+    pre_fence_visitor(obj, usable_size);
+    QuasiAtomic::ThreadFenceForConstructor();
   } else {
+    // Bytes allocated, taking bulk thread-local buffer allocations into account.
+    size_t bytes_tl_bulk_allocated = 0;
     obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated,
-                                              &usable_size);
+                                              &usable_size, &bytes_tl_bulk_allocated);
     if (UNLIKELY(obj == nullptr)) {
       bool is_current_allocator = allocator == GetCurrentAllocator();
       obj = AllocateInternalWithGc(self, allocator, byte_count, &bytes_allocated, &usable_size,
-                                   &klass);
+                                   &bytes_tl_bulk_allocated, &klass);
       if (obj == nullptr) {
         bool after_is_current_allocator = allocator == GetCurrentAllocator();
         // If there is a pending exception, fail the allocation right away since the next one
@@ -126,9 +143,9 @@
       WriteBarrierField(obj, mirror::Object::ClassOffset(), klass);
     }
     pre_fence_visitor(obj, usable_size);
-    new_num_bytes_allocated =
-        static_cast<size_t>(num_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes_allocated))
-        + bytes_allocated;
+    new_num_bytes_allocated = static_cast<size_t>(
+        num_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes_tl_bulk_allocated))
+        + bytes_tl_bulk_allocated;
   }
   if (kIsDebugBuild && Runtime::Current()->IsStarted()) {
     CHECK_LE(obj->SizeOf(), usable_size);
@@ -196,8 +213,10 @@
 template <const bool kInstrumented, const bool kGrow>
 inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator_type,
                                            size_t alloc_size, size_t* bytes_allocated,
-                                           size_t* usable_size) {
+                                           size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated) {
   if (allocator_type != kAllocatorTypeTLAB && allocator_type != kAllocatorTypeRegionTLAB &&
+      allocator_type != kAllocatorTypeRosAlloc &&
       UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
     return nullptr;
   }
@@ -210,35 +229,56 @@
       if (LIKELY(ret != nullptr)) {
         *bytes_allocated = alloc_size;
         *usable_size = alloc_size;
+        *bytes_tl_bulk_allocated = alloc_size;
       }
       break;
     }
     case kAllocatorTypeRosAlloc: {
       if (kInstrumented && UNLIKELY(running_on_valgrind_)) {
         // If running on valgrind, we should be using the instrumented path.
-        ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size);
+        size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size);
+        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
+                                                      max_bytes_tl_bulk_allocated))) {
+          return nullptr;
+        }
+        ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+                                     bytes_tl_bulk_allocated);
       } else {
         DCHECK(!running_on_valgrind_);
-        ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size);
+        size_t max_bytes_tl_bulk_allocated =
+            rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size);
+        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
+                                                      max_bytes_tl_bulk_allocated))) {
+          return nullptr;
+        }
+        if (!kInstrumented) {
+          DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size));
+        }
+        ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+                                               bytes_tl_bulk_allocated);
       }
       break;
     }
     case kAllocatorTypeDlMalloc: {
       if (kInstrumented && UNLIKELY(running_on_valgrind_)) {
         // If running on valgrind, we should be using the instrumented path.
-        ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size);
+        ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+                                     bytes_tl_bulk_allocated);
       } else {
         DCHECK(!running_on_valgrind_);
-        ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size);
+        ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+                                               bytes_tl_bulk_allocated);
       }
       break;
     }
     case kAllocatorTypeNonMoving: {
-      ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size);
+      ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+                                     bytes_tl_bulk_allocated);
       break;
     }
     case kAllocatorTypeLOS: {
-      ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size);
+      ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+                                       bytes_tl_bulk_allocated);
       // Note that the bump pointer spaces aren't necessarily next to
       // the other continuous spaces like the non-moving alloc space or
       // the zygote space.
@@ -257,20 +297,22 @@
         if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
           return nullptr;
         }
-        *bytes_allocated = new_tlab_size;
+        *bytes_tl_bulk_allocated = new_tlab_size;
       } else {
-        *bytes_allocated = 0;
+        *bytes_tl_bulk_allocated = 0;
       }
       // The allocation can't fail.
       ret = self->AllocTlab(alloc_size);
       DCHECK(ret != nullptr);
+      *bytes_allocated = alloc_size;
       *usable_size = alloc_size;
       break;
     }
     case kAllocatorTypeRegion: {
       DCHECK(region_space_ != nullptr);
       alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment);
-      ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size);
+      ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+                                                  bytes_tl_bulk_allocated);
       break;
     }
     case kAllocatorTypeRegionTLAB: {
@@ -283,15 +325,17 @@
             // Try to allocate a tlab.
             if (!region_space_->AllocNewTlab(self)) {
               // Failed to allocate a tlab. Try non-tlab.
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size);
+              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+                                                          bytes_tl_bulk_allocated);
               return ret;
             }
-            *bytes_allocated = space::RegionSpace::kRegionSize;
+            *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
             // Fall-through.
           } else {
             // Check OOME for a non-tlab allocation.
             if (!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size)) {
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size);
+              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+                                                          bytes_tl_bulk_allocated);
               return ret;
             } else {
               // Neither tlab or non-tlab works. Give up.
@@ -301,18 +345,20 @@
         } else {
           // Large. Check OOME.
           if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
-            ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size);
+            ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+                                                        bytes_tl_bulk_allocated);
             return ret;
           } else {
             return nullptr;
           }
         }
       } else {
-        *bytes_allocated = 0;
+        *bytes_tl_bulk_allocated = 0;  // Allocated in an existing buffer.
       }
       // The allocation can't fail.
       ret = self->AllocTlab(alloc_size);
       DCHECK(ret != nullptr);
+      *bytes_allocated = alloc_size;
       *usable_size = alloc_size;
       break;
     }
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 7534515..a41d65c 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -156,6 +156,7 @@
       total_objects_freed_ever_(0),
       num_bytes_allocated_(0),
       native_bytes_allocated_(0),
+      num_bytes_freed_revoke_(0),
       verify_missing_card_marks_(false),
       verify_system_weaks_(false),
       verify_pre_gc_heap_(verify_pre_gc_heap),
@@ -1344,6 +1345,19 @@
   }
 }
 
+void Heap::RecordFreeRevoke() {
+  // Subtract num_bytes_freed_revoke_ from num_bytes_allocated_ to cancel out the
+  // ahead-of-time, bulk counting of bytes allocated in rosalloc thread-local buffers.
+  // If there's a concurrent revoke, it is OK if num_bytes_freed_revoke_ is not reset all
+  // the way back to zero, as the remainder will be subtracted at the next GC.
+  size_t bytes_freed = num_bytes_freed_revoke_.LoadSequentiallyConsistent();
+  CHECK_GE(num_bytes_freed_revoke_.FetchAndSubSequentiallyConsistent(bytes_freed),
+           bytes_freed) << "num_bytes_freed_revoke_ underflow";
+  CHECK_GE(num_bytes_allocated_.FetchAndSubSequentiallyConsistent(bytes_freed),
+           bytes_freed) << "num_bytes_allocated_ underflow";
+  GetCurrentGcIteration()->SetFreedRevoke(bytes_freed);
+}
+
 space::RosAllocSpace* Heap::GetRosAllocSpace(gc::allocator::RosAlloc* rosalloc) const {
   for (const auto& space : continuous_spaces_) {
     if (space->AsContinuousSpace()->IsRosAllocSpace()) {
@@ -1358,6 +1372,7 @@
 mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocator,
                                              size_t alloc_size, size_t* bytes_allocated,
                                              size_t* usable_size,
+                                             size_t* bytes_tl_bulk_allocated,
                                              mirror::Class** klass) {
   bool was_default_allocator = allocator == GetCurrentAllocator();
   // Make sure there is no pending exception since we may need to throw an OOME.
@@ -1377,7 +1392,7 @@
     }
     // A GC was in progress and we blocked, retry allocation now that memory has been freed.
     mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                     usable_size);
+                                                     usable_size, bytes_tl_bulk_allocated);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -1391,7 +1406,7 @@
   }
   if (gc_ran) {
     mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                     usable_size);
+                                                     usable_size, bytes_tl_bulk_allocated);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -1411,7 +1426,7 @@
     if (plan_gc_ran) {
       // Did we free sufficient memory for the allocation to succeed?
       mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                       usable_size);
+                                                       usable_size, bytes_tl_bulk_allocated);
       if (ptr != nullptr) {
         return ptr;
       }
@@ -1420,7 +1435,7 @@
   // Allocations have failed after GCs;  this is an exceptional state.
   // Try harder, growing the heap if necessary.
   mirror::Object* ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                                  usable_size);
+                                                  usable_size, bytes_tl_bulk_allocated);
   if (ptr != nullptr) {
     return ptr;
   }
@@ -1437,7 +1452,8 @@
   if (was_default_allocator && allocator != GetCurrentAllocator()) {
     return nullptr;
   }
-  ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated, usable_size);
+  ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated, usable_size,
+                                  bytes_tl_bulk_allocated);
   if (ptr == nullptr) {
     const uint64_t current_time = NanoTime();
     switch (allocator) {
@@ -1453,7 +1469,7 @@
             case HomogeneousSpaceCompactResult::kSuccess:
               // If the allocation succeeded, we delayed an oom.
               ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                              usable_size);
+                                              usable_size, bytes_tl_bulk_allocated);
               if (ptr != nullptr) {
                 count_delayed_oom_++;
               }
@@ -1498,7 +1514,7 @@
           } else {
             LOG(WARNING) << "Disabled moving GC due to the non moving space being full";
             ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                            usable_size);
+                                            usable_size, bytes_tl_bulk_allocated);
           }
         }
         break;
@@ -1984,8 +2000,8 @@
     if (it == bins_.end()) {
       // No available space in the bins, place it in the target space instead (grows the zygote
       // space).
-      size_t bytes_allocated;
-      forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
+      size_t bytes_allocated, dummy;
+      forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr, &dummy);
       if (to_space_live_bitmap_ != nullptr) {
         to_space_live_bitmap_->Set(forward_address);
       } else {
@@ -3084,7 +3100,8 @@
     SetIdealFootprint(target_size);
     if (IsGcConcurrent()) {
       const uint64_t freed_bytes = current_gc_iteration_.GetFreedBytes() +
-          current_gc_iteration_.GetFreedLargeObjectBytes();
+          current_gc_iteration_.GetFreedLargeObjectBytes() +
+          current_gc_iteration_.GetFreedRevokeBytes();
       // Bytes allocated will shrink by freed_bytes after the GC runs, so if we want to figure out
       // how many bytes were allocated during the GC we need to add freed_bytes back on.
       CHECK_GE(bytes_allocated + freed_bytes, bytes_allocated_before_gc);
@@ -3290,31 +3307,43 @@
 
 void Heap::RevokeThreadLocalBuffers(Thread* thread) {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
   if (bump_pointer_space_ != nullptr) {
-    bump_pointer_space_->RevokeThreadLocalBuffers(thread);
+    CHECK_EQ(bump_pointer_space_->RevokeThreadLocalBuffers(thread), 0U);
   }
   if (region_space_ != nullptr) {
-    region_space_->RevokeThreadLocalBuffers(thread);
+    CHECK_EQ(region_space_->RevokeThreadLocalBuffers(thread), 0U);
   }
 }
 
 void Heap::RevokeRosAllocThreadLocalBuffers(Thread* thread) {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
 }
 
 void Heap::RevokeAllThreadLocalBuffers() {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeAllThreadLocalBuffers();
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeAllThreadLocalBuffers();
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
   if (bump_pointer_space_ != nullptr) {
-    bump_pointer_space_->RevokeAllThreadLocalBuffers();
+    CHECK_EQ(bump_pointer_space_->RevokeAllThreadLocalBuffers(), 0U);
   }
   if (region_space_ != nullptr) {
-    region_space_->RevokeAllThreadLocalBuffers();
+    CHECK_EQ(region_space_->RevokeAllThreadLocalBuffers(), 0U);
   }
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index d41e17f..959ff18 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -390,6 +390,9 @@
   // free-list backed space.
   void RecordFree(uint64_t freed_objects, int64_t freed_bytes);
 
+  // Record the bytes freed by thread-local buffer revoke.
+  void RecordFreeRevoke();
+
   // Must be called if a field of an Object in the heap changes, and before any GC safe-point.
   // The call is not needed if NULL is stored in the field.
   ALWAYS_INLINE void WriteBarrierField(const mirror::Object* dst, MemberOffset /*offset*/,
@@ -664,6 +667,11 @@
   // Whether or not we may use a garbage collector, used so that we only create collectors we need.
   bool MayUseCollector(CollectorType type) const;
 
+  // Used by tests to reduce timing-dependent flakiness in OOME behavior.
+  void SetMinIntervalHomogeneousSpaceCompactionByOom(uint64_t interval) {
+    min_interval_homogeneous_space_compaction_by_oom_ = interval;
+  }
+
  private:
   class ConcurrentGCTask;
   class CollectorTransitionTask;
@@ -724,6 +732,7 @@
   // an initial allocation attempt failed.
   mirror::Object* AllocateInternalWithGc(Thread* self, AllocatorType allocator, size_t num_bytes,
                                          size_t* bytes_allocated, size_t* usable_size,
+                                         size_t* bytes_tl_bulk_allocated,
                                          mirror::Class** klass)
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -742,7 +751,8 @@
   template <const bool kInstrumented, const bool kGrow>
   ALWAYS_INLINE mirror::Object* TryToAllocate(Thread* self, AllocatorType allocator_type,
                                               size_t alloc_size, size_t* bytes_allocated,
-                                              size_t* usable_size)
+                                              size_t* usable_size,
+                                              size_t* bytes_tl_bulk_allocated)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type)
@@ -998,6 +1008,13 @@
   // Bytes which are allocated and managed by native code but still need to be accounted for.
   Atomic<size_t> native_bytes_allocated_;
 
+  // Number of bytes freed by thread local buffer revokes. This will
+  // cancel out the ahead-of-time bulk counting of bytes allocated in
+  // rosalloc thread-local buffers.  It is temporarily accumulated
+  // here to be subtracted from num_bytes_allocated_ later at the next
+  // GC.
+  Atomic<size_t> num_bytes_freed_revoke_;
+
   // Info related to the current or previous GC iteration.
   collector::Iteration current_gc_iteration_;
 
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 9f1f953..14a93d1 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -24,7 +24,8 @@
 namespace space {
 
 inline mirror::Object* BumpPointerSpace::Alloc(Thread*, size_t num_bytes, size_t* bytes_allocated,
-                                               size_t* usable_size) {
+                                               size_t* usable_size,
+                                               size_t* bytes_tl_bulk_allocated) {
   num_bytes = RoundUp(num_bytes, kAlignment);
   mirror::Object* ret = AllocNonvirtual(num_bytes);
   if (LIKELY(ret != nullptr)) {
@@ -32,13 +33,15 @@
     if (usable_size != nullptr) {
       *usable_size = num_bytes;
     }
+    *bytes_tl_bulk_allocated = num_bytes;
   }
   return ret;
 }
 
 inline mirror::Object* BumpPointerSpace::AllocThreadUnsafe(Thread* self, size_t num_bytes,
                                                            size_t* bytes_allocated,
-                                                           size_t* usable_size) {
+                                                           size_t* usable_size,
+                                                           size_t* bytes_tl_bulk_allocated) {
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   num_bytes = RoundUp(num_bytes, kAlignment);
   uint8_t* end = end_.LoadRelaxed();
@@ -54,6 +57,7 @@
   if (UNLIKELY(usable_size != nullptr)) {
     *usable_size = num_bytes;
   }
+  *bytes_tl_bulk_allocated = num_bytes;
   return obj;
 }
 
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index fbfc449..1303d77 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -93,12 +93,13 @@
   return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
 }
 
-void BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) {
+size_t BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) {
   MutexLock mu(Thread::Current(), block_lock_);
   RevokeThreadLocalBuffersLocked(thread);
+  return 0U;
 }
 
-void BumpPointerSpace::RevokeAllThreadLocalBuffers() {
+size_t BumpPointerSpace::RevokeAllThreadLocalBuffers() {
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -107,6 +108,7 @@
   for (Thread* thread : thread_list) {
     RevokeThreadLocalBuffers(thread);
   }
+  return 0U;
 }
 
 void BumpPointerSpace::AssertThreadLocalBuffersAreRevoked(Thread* thread) {
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 089ede4..c496a42 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -47,10 +47,10 @@
 
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
   // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                    size_t* usable_size)
+                                    size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   mirror::Object* AllocNonvirtual(size_t num_bytes);
@@ -103,9 +103,9 @@
 
   void Dump(std::ostream& os) const;
 
-  void RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(block_lock_);
-  void RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
-                                                    Locks::thread_list_lock_);
+  size_t RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(block_lock_);
+  size_t RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
+                                                      Locks::thread_list_lock_);
   void AssertThreadLocalBuffersAreRevoked(Thread* thread) LOCKS_EXCLUDED(block_lock_);
   void AssertAllThreadLocalBuffersAreRevoked() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
                                                               Locks::thread_list_lock_);
diff --git a/runtime/gc/space/dlmalloc_space-inl.h b/runtime/gc/space/dlmalloc_space-inl.h
index 4c8a35e..9eace89 100644
--- a/runtime/gc/space/dlmalloc_space-inl.h
+++ b/runtime/gc/space/dlmalloc_space-inl.h
@@ -27,11 +27,13 @@
 
 inline mirror::Object* DlMallocSpace::AllocNonvirtual(Thread* self, size_t num_bytes,
                                                       size_t* bytes_allocated,
-                                                      size_t* usable_size) {
+                                                      size_t* usable_size,
+                                                      size_t* bytes_tl_bulk_allocated) {
   mirror::Object* obj;
   {
     MutexLock mu(self, lock_);
-    obj = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size);
+    obj = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size,
+                                   bytes_tl_bulk_allocated);
   }
   if (LIKELY(obj != NULL)) {
     // Zero freshly allocated memory, done while not holding the space's lock.
@@ -49,9 +51,11 @@
   return size + kChunkOverhead;
 }
 
-inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(Thread* /*self*/, size_t num_bytes,
-                                                               size_t* bytes_allocated,
-                                                               size_t* usable_size) {
+inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(
+    Thread* /*self*/, size_t num_bytes,
+    size_t* bytes_allocated,
+    size_t* usable_size,
+    size_t* bytes_tl_bulk_allocated) {
   mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_, num_bytes));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
@@ -61,6 +65,7 @@
     size_t allocation_size = AllocationSizeNonvirtual(result, usable_size);
     DCHECK(bytes_allocated != NULL);
     *bytes_allocated = allocation_size;
+    *bytes_tl_bulk_allocated = allocation_size;
   }
   return result;
 }
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index b8a9dd6..225861d 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -123,7 +123,8 @@
 }
 
 mirror::Object* DlMallocSpace::AllocWithGrowth(Thread* self, size_t num_bytes,
-                                               size_t* bytes_allocated, size_t* usable_size) {
+                                               size_t* bytes_allocated, size_t* usable_size,
+                                               size_t* bytes_tl_bulk_allocated) {
   mirror::Object* result;
   {
     MutexLock mu(self, lock_);
@@ -131,7 +132,8 @@
     size_t max_allowed = Capacity();
     mspace_set_footprint_limit(mspace_, max_allowed);
     // Try the allocation.
-    result = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size);
+    result = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size,
+                                      bytes_tl_bulk_allocated);
     // Shrink back down as small as possible.
     size_t footprint = mspace_footprint(mspace_);
     mspace_set_footprint_limit(mspace_, footprint);
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 6ce138c..1f80f1f 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -48,11 +48,15 @@
 
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                          size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_);
+                                          size_t* usable_size,
+                                          size_t* bytes_tl_bulk_allocated)
+      OVERRIDE LOCKS_EXCLUDED(lock_);
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_) {
-    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size);
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE LOCKS_EXCLUDED(lock_) {
+    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size,
+                           bytes_tl_bulk_allocated);
   }
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE {
@@ -67,15 +71,22 @@
       LOCKS_EXCLUDED(lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // DlMallocSpaces don't have thread local state.
-  void RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+  size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE {
+    return num_bytes;
   }
-  void RevokeAllThreadLocalBuffers() OVERRIDE {
+
+  // DlMallocSpaces don't have thread local state.
+  size_t RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+    return 0U;
+  }
+  size_t RevokeAllThreadLocalBuffers() OVERRIDE {
+    return 0U;
   }
 
   // Faster non-virtual allocation path.
   mirror::Object* AllocNonvirtual(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                  size_t* usable_size) LOCKS_EXCLUDED(lock_);
+                                  size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      LOCKS_EXCLUDED(lock_);
 
   // Faster non-virtual allocation size path.
   size_t AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size);
@@ -134,7 +145,8 @@
 
  private:
   mirror::Object* AllocWithoutGrowthLocked(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                           size_t* usable_size)
+                                           size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated)
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 7523de5..5c8e4b9 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -38,10 +38,11 @@
   }
 
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                size_t* usable_size) OVERRIDE {
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE {
     mirror::Object* obj =
         LargeObjectMapSpace::Alloc(self, num_bytes + kValgrindRedZoneBytes * 2, bytes_allocated,
-                                   usable_size);
+                                   usable_size, bytes_tl_bulk_allocated);
     mirror::Object* object_without_rdz = reinterpret_cast<mirror::Object*>(
         reinterpret_cast<uintptr_t>(obj) + kValgrindRedZoneBytes);
     VALGRIND_MAKE_MEM_NOACCESS(reinterpret_cast<void*>(obj), kValgrindRedZoneBytes);
@@ -108,7 +109,8 @@
 }
 
 mirror::Object* LargeObjectMapSpace::Alloc(Thread* self, size_t num_bytes,
-                                           size_t* bytes_allocated, size_t* usable_size) {
+                                           size_t* bytes_allocated, size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated) {
   std::string error_msg;
   MemMap* mem_map = MemMap::MapAnonymous("large object space allocation", nullptr, num_bytes,
                                          PROT_READ | PROT_WRITE, true, false, &error_msg);
@@ -131,6 +133,8 @@
   if (usable_size != nullptr) {
     *usable_size = allocation_size;
   }
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
+  *bytes_tl_bulk_allocated = allocation_size;
   num_bytes_allocated_ += allocation_size;
   total_bytes_allocated_ += allocation_size;
   ++num_objects_allocated_;
@@ -413,7 +417,7 @@
 }
 
 mirror::Object* FreeListSpace::Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                     size_t* usable_size) {
+                                     size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
   MutexLock mu(self, lock_);
   const size_t allocation_size = RoundUp(num_bytes, kAlignment);
   AllocationInfo temp_info;
@@ -451,6 +455,8 @@
   if (usable_size != nullptr) {
     *usable_size = allocation_size;
   }
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
+  *bytes_tl_bulk_allocated = allocation_size;
   // Need to do these inside of the lock.
   ++num_objects_allocated_;
   ++total_objects_allocated_;
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index 847f575..d1f9386 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -62,9 +62,11 @@
   }
   size_t FreeList(Thread* self, size_t num_ptrs, mirror::Object** ptrs) OVERRIDE;
   // LargeObjectSpaces don't have thread local state.
-  void RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+  size_t RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+    return 0U;
   }
-  void RevokeAllThreadLocalBuffers() OVERRIDE {
+  size_t RevokeAllThreadLocalBuffers() OVERRIDE {
+    return 0U;
   }
   bool IsAllocSpace() const OVERRIDE {
     return true;
@@ -124,7 +126,7 @@
   // Return the storage space required by obj.
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size);
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size);
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated);
   size_t Free(Thread* self, mirror::Object* ptr);
   void Walk(DlMallocSpace::WalkCallback, void* arg) OVERRIDE LOCKS_EXCLUDED(lock_);
   // TODO: disabling thread safety analysis as this may be called when we already hold lock_.
@@ -153,7 +155,7 @@
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
   size_t Free(Thread* self, mirror::Object* obj) OVERRIDE;
   void Walk(DlMallocSpace::WalkCallback callback, void* arg) OVERRIDE LOCKS_EXCLUDED(lock_);
   void Dump(std::ostream& os) const;
diff --git a/runtime/gc/space/large_object_space_test.cc b/runtime/gc/space/large_object_space_test.cc
index e17bad8..a261663 100644
--- a/runtime/gc/space/large_object_space_test.cc
+++ b/runtime/gc/space/large_object_space_test.cc
@@ -49,11 +49,13 @@
       while (requests.size() < num_allocations) {
         size_t request_size = test_rand(&rand_seed) % max_allocation_size;
         size_t allocation_size = 0;
+        size_t bytes_tl_bulk_allocated;
         mirror::Object* obj = los->Alloc(Thread::Current(), request_size, &allocation_size,
-                                         nullptr);
+                                         nullptr, &bytes_tl_bulk_allocated);
         ASSERT_TRUE(obj != nullptr);
         ASSERT_EQ(allocation_size, los->AllocationSize(obj, nullptr));
         ASSERT_GE(allocation_size, request_size);
+        ASSERT_EQ(allocation_size, bytes_tl_bulk_allocated);
         // Fill in our magic value.
         uint8_t magic = (request_size & 0xFF) | 1;
         memset(obj, magic, request_size);
@@ -83,9 +85,10 @@
     // Test that dump doesn't crash.
     los->Dump(LOG(INFO));
 
-    size_t bytes_allocated = 0;
+    size_t bytes_allocated = 0, bytes_tl_bulk_allocated;
     // Checks that the coalescing works.
-    mirror::Object* obj = los->Alloc(Thread::Current(), 100 * MB, &bytes_allocated, nullptr);
+    mirror::Object* obj = los->Alloc(Thread::Current(), 100 * MB, &bytes_allocated, nullptr,
+                                     &bytes_tl_bulk_allocated);
     EXPECT_TRUE(obj != nullptr);
     los->Free(Thread::Current(), obj);
 
@@ -102,8 +105,9 @@
 
   void Run(Thread* self) {
     for (size_t i = 0; i < iterations_ ; ++i) {
-      size_t alloc_size;
-      mirror::Object* ptr = los_->Alloc(self, size_, &alloc_size, nullptr);
+      size_t alloc_size, bytes_tl_bulk_allocated;
+      mirror::Object* ptr = los_->Alloc(self, size_, &alloc_size, nullptr,
+                                        &bytes_tl_bulk_allocated);
 
       NanoSleep((id_ + 3) * 1000);  // (3+id) mu s
 
diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h
index 06239e5..bbf1bbb 100644
--- a/runtime/gc/space/malloc_space.h
+++ b/runtime/gc/space/malloc_space.h
@@ -55,10 +55,11 @@
 
   // Allocate num_bytes allowing the underlying space to grow.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes,
-                                          size_t* bytes_allocated, size_t* usable_size) = 0;
+                                          size_t* bytes_allocated, size_t* usable_size,
+                                          size_t* bytes_tl_bulk_allocated) = 0;
   // Allocate num_bytes without allowing the underlying space to grow.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                size_t* usable_size) = 0;
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated) = 0;
   // Return the storage space required by obj. If usable_size isn't nullptr then it is set to the
   // amount of the storage space that may be used by obj.
   virtual size_t AllocationSize(mirror::Object* obj, size_t* usable_size) = 0;
@@ -67,6 +68,11 @@
   virtual size_t FreeList(Thread* self, size_t num_ptrs, mirror::Object** ptrs)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
 
+  // Returns the maximum number of bytes that could be allocated in bulk for a request of
+  // the given size, i.e. the maximum value of the bytes_tl_bulk_allocated out param
+  // returned by MallocSpace::Alloc().
+  virtual size_t MaxBytesBulkAllocatedFor(size_t num_bytes) = 0;
+
 #ifndef NDEBUG
   virtual void CheckMoreCoreForPrecondition() {}  // to be overridden in the debug build.
 #else
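
As a minimal sketch of the intended use of MaxBytesBulkAllocatedFor(): a thread-local allocation may bulk-allocate more than num_bytes up front, so a caller has to check the worst case against the footprint before attempting the allocation. The caller below, the IsOutOfMemoryOnAllocation() helper and the surrounding control flow are assumptions for illustration, not part of this change; DlMallocSpace (above) simply returns num_bytes since it never allocates in bulk, while RosAllocSpace delegates to RosAlloc.

  // Hypothetical caller sketch: bound the worst-case footprint growth before allocating.
  size_t max_bytes_tl_bulk_allocated = malloc_space->MaxBytesBulkAllocatedFor(num_bytes);
  if (UNLIKELY(IsOutOfMemoryOnAllocation(max_bytes_tl_bulk_allocated))) {  // assumed helper
    return nullptr;  // Let the caller fall back to GC or a slower path.
  }
  size_t bytes_allocated, usable_size, bytes_tl_bulk_allocated;
  mirror::Object* obj = malloc_space->Alloc(self, num_bytes, &bytes_allocated, &usable_size,
                                            &bytes_tl_bulk_allocated);
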
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index a4ed718..1cdf69d 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -24,30 +24,36 @@
 namespace space {
 
 inline mirror::Object* RegionSpace::Alloc(Thread*, size_t num_bytes, size_t* bytes_allocated,
-                                          size_t* usable_size) {
+                                          size_t* usable_size,
+                                          size_t* bytes_tl_bulk_allocated) {
   num_bytes = RoundUp(num_bytes, kAlignment);
-  return AllocNonvirtual<false>(num_bytes, bytes_allocated, usable_size);
+  return AllocNonvirtual<false>(num_bytes, bytes_allocated, usable_size,
+                                bytes_tl_bulk_allocated);
 }
 
 inline mirror::Object* RegionSpace::AllocThreadUnsafe(Thread* self, size_t num_bytes,
                                                       size_t* bytes_allocated,
-                                                      size_t* usable_size) {
+                                                      size_t* usable_size,
+                                                      size_t* bytes_tl_bulk_allocated) {
   Locks::mutator_lock_->AssertExclusiveHeld(self);
-  return Alloc(self, num_bytes, bytes_allocated, usable_size);
+  return Alloc(self, num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
 }
 
 template<bool kForEvac>
 inline mirror::Object* RegionSpace::AllocNonvirtual(size_t num_bytes, size_t* bytes_allocated,
-                                                    size_t* usable_size) {
+                                                    size_t* usable_size,
+                                                    size_t* bytes_tl_bulk_allocated) {
   DCHECK(IsAligned<kAlignment>(num_bytes));
   mirror::Object* obj;
   if (LIKELY(num_bytes <= kRegionSize)) {
     // Non-large object.
     if (!kForEvac) {
-      obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size);
+      obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size,
+                                   bytes_tl_bulk_allocated);
     } else {
       DCHECK(evac_region_ != nullptr);
-      obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size);
+      obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size,
+                                bytes_tl_bulk_allocated);
     }
     if (LIKELY(obj != nullptr)) {
       return obj;
@@ -55,9 +61,11 @@
     MutexLock mu(Thread::Current(), region_lock_);
     // Retry with current region since another thread may have updated it.
     if (!kForEvac) {
-      obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size);
+      obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size,
+                                   bytes_tl_bulk_allocated);
     } else {
-      obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size);
+      obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size,
+                                bytes_tl_bulk_allocated);
     }
     if (LIKELY(obj != nullptr)) {
       return obj;
@@ -73,7 +81,7 @@
           r->Unfree(time_);
           r->SetNewlyAllocated();
           ++num_non_free_regions_;
-          obj = r->Alloc(num_bytes, bytes_allocated, usable_size);
+          obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
           CHECK(obj != nullptr);
           current_region_ = r;
           return obj;
@@ -85,7 +93,7 @@
         if (r->IsFree()) {
           r->Unfree(time_);
           ++num_non_free_regions_;
-          obj = r->Alloc(num_bytes, bytes_allocated, usable_size);
+          obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
           CHECK(obj != nullptr);
           evac_region_ = r;
           return obj;
@@ -94,7 +102,8 @@
     }
   } else {
     // Large object.
-    obj = AllocLarge<kForEvac>(num_bytes, bytes_allocated, usable_size);
+    obj = AllocLarge<kForEvac>(num_bytes, bytes_allocated, usable_size,
+                               bytes_tl_bulk_allocated);
     if (LIKELY(obj != nullptr)) {
       return obj;
     }
@@ -103,7 +112,8 @@
 }
 
 inline mirror::Object* RegionSpace::Region::Alloc(size_t num_bytes, size_t* bytes_allocated,
-                                                  size_t* usable_size) {
+                                                  size_t* usable_size,
+                                                  size_t* bytes_tl_bulk_allocated) {
   DCHECK(IsAllocated() && IsInToSpace());
   DCHECK(IsAligned<kAlignment>(num_bytes));
   Atomic<uint8_t*>* atomic_top = reinterpret_cast<Atomic<uint8_t*>*>(&top_);
@@ -124,6 +134,7 @@
   if (usable_size != nullptr) {
     *usable_size = num_bytes;
   }
+  *bytes_tl_bulk_allocated = num_bytes;
   return reinterpret_cast<mirror::Object*>(old_top);
 }
 
@@ -253,7 +264,8 @@
 
 template<bool kForEvac>
 mirror::Object* RegionSpace::AllocLarge(size_t num_bytes, size_t* bytes_allocated,
-                                        size_t* usable_size) {
+                                        size_t* usable_size,
+                                        size_t* bytes_tl_bulk_allocated) {
   DCHECK(IsAligned<kAlignment>(num_bytes));
   DCHECK_GT(num_bytes, kRegionSize);
   size_t num_regs = RoundUp(num_bytes, kRegionSize) / kRegionSize;
@@ -300,6 +312,7 @@
       if (usable_size != nullptr) {
         *usable_size = num_regs * kRegionSize;
       }
+      *bytes_tl_bulk_allocated = num_bytes;
       return reinterpret_cast<mirror::Object*>(first_reg->Begin());
     } else {
       // right points to the non-free region. Start with the one after it.
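
For context, Region::Alloc() reserves space with a compare-and-swap on the region's top pointer before reaching the tail shown above; the reservation loop is elided from this hunk, so the following is only a simplified sketch of that bump-pointer pattern, with member and method names approximated from ART's Atomic<> wrapper.

  // Simplified bump-pointer reservation over an Atomic<uint8_t*> top pointer (sketch).
  uint8_t* old_top;
  uint8_t* new_top;
  do {
    old_top = atomic_top->LoadRelaxed();
    new_top = old_top + num_bytes;
    if (UNLIKELY(new_top > end_)) {
      return nullptr;  // Region is full; the caller retries with another region.
    }
  } while (!atomic_top->CompareExchangeWeakSequentiallyConsistent(old_top, new_top));
  *bytes_allocated = num_bytes;
  *bytes_tl_bulk_allocated = num_bytes;  // No thread-local buffering here, so it equals bytes_allocated.
  return reinterpret_cast<mirror::Object*>(old_top);
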
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index 8bb73d6..814ab6c 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -76,7 +76,7 @@
   current_region_ = &full_region_;
   evac_region_ = nullptr;
   size_t ignored;
-  DCHECK(full_region_.Alloc(kAlignment, &ignored, nullptr) == nullptr);
+  DCHECK(full_region_.Alloc(kAlignment, &ignored, nullptr, &ignored) == nullptr);
 }
 
 size_t RegionSpace::FromSpaceSize() {
@@ -356,9 +356,10 @@
   return false;
 }
 
-void RegionSpace::RevokeThreadLocalBuffers(Thread* thread) {
+size_t RegionSpace::RevokeThreadLocalBuffers(Thread* thread) {
   MutexLock mu(Thread::Current(), region_lock_);
   RevokeThreadLocalBuffersLocked(thread);
+  return 0U;
 }
 
 void RegionSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
@@ -377,7 +378,7 @@
   thread->SetTlab(nullptr, nullptr);
 }
 
-void RegionSpace::RevokeAllThreadLocalBuffers() {
+size_t RegionSpace::RevokeAllThreadLocalBuffers() {
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -385,6 +386,7 @@
   for (Thread* thread : thread_list) {
     RevokeThreadLocalBuffers(thread);
   }
+  return 0U;
 }
 
 void RegionSpace::AssertThreadLocalBuffersAreRevoked(Thread* thread) {
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 4160547..b88ce24 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -42,18 +42,20 @@
 
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
   // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                    size_t* usable_size)
+                                    size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   // The main allocation routine.
   template<bool kForEvac>
   ALWAYS_INLINE mirror::Object* AllocNonvirtual(size_t num_bytes, size_t* bytes_allocated,
-                                                size_t* usable_size);
+                                                size_t* usable_size,
+                                                size_t* bytes_tl_bulk_allocated);
   // Allocate/free large objects (objects that are larger than the region size.)
   template<bool kForEvac>
-  mirror::Object* AllocLarge(size_t num_bytes, size_t* bytes_allocated, size_t* usable_size);
+  mirror::Object* AllocLarge(size_t num_bytes, size_t* bytes_allocated, size_t* usable_size,
+                             size_t* bytes_tl_bulk_allocated);
   void FreeLarge(mirror::Object* large_obj, size_t bytes_allocated);
 
   // Return the storage space required by obj.
@@ -87,10 +89,10 @@
   void DumpRegions(std::ostream& os);
   void DumpNonFreeRegions(std::ostream& os);
 
-  void RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(region_lock_);
+  size_t RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(region_lock_);
   void RevokeThreadLocalBuffersLocked(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(region_lock_);
-  void RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
-                                                    Locks::thread_list_lock_);
+  size_t RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
+                                                      Locks::thread_list_lock_);
   void AssertThreadLocalBuffersAreRevoked(Thread* thread) LOCKS_EXCLUDED(region_lock_);
   void AssertAllThreadLocalBuffersAreRevoked() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
                                                               Locks::thread_list_lock_);
@@ -269,7 +271,8 @@
     }
 
     ALWAYS_INLINE mirror::Object* Alloc(size_t num_bytes, size_t* bytes_allocated,
-                                        size_t* usable_size);
+                                        size_t* usable_size,
+                                        size_t* bytes_tl_bulk_allocated);
 
     bool IsFree() const {
       bool is_free = state_ == RegionState::kRegionStateFree;
diff --git a/runtime/gc/space/rosalloc_space-inl.h b/runtime/gc/space/rosalloc_space-inl.h
index 5d6642d..9d582a3 100644
--- a/runtime/gc/space/rosalloc_space-inl.h
+++ b/runtime/gc/space/rosalloc_space-inl.h
@@ -26,13 +26,19 @@
 namespace gc {
 namespace space {
 
+template<bool kMaybeRunningOnValgrind>
 inline size_t RosAllocSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
   // obj is a valid object. Use its class in the header to get the size.
   // Don't use verification since the object may be dead if we are sweeping.
   size_t size = obj->SizeOf<kVerifyNone>();
-  bool running_on_valgrind = RUNNING_ON_VALGRIND != 0;
-  if (running_on_valgrind) {
-    size += 2 * kDefaultValgrindRedZoneBytes;
+  bool running_on_valgrind = false;
+  if (kMaybeRunningOnValgrind) {
+    running_on_valgrind = RUNNING_ON_VALGRIND != 0;
+    if (running_on_valgrind) {
+      size += 2 * kDefaultValgrindRedZoneBytes;
+    }
+  } else {
+    DCHECK_EQ(RUNNING_ON_VALGRIND, 0U);
   }
   size_t size_by_size = rosalloc_->UsableSize(size);
   if (kIsDebugBuild) {
@@ -55,28 +61,50 @@
 
 template<bool kThreadSafe>
 inline mirror::Object* RosAllocSpace::AllocCommon(Thread* self, size_t num_bytes,
-                                                  size_t* bytes_allocated, size_t* usable_size) {
-  size_t rosalloc_size = 0;
+                                                  size_t* bytes_allocated, size_t* usable_size,
+                                                  size_t* bytes_tl_bulk_allocated) {
+  size_t rosalloc_bytes_allocated = 0;
+  size_t rosalloc_usable_size = 0;
+  size_t rosalloc_bytes_tl_bulk_allocated = 0;
   if (!kThreadSafe) {
     Locks::mutator_lock_->AssertExclusiveHeld(self);
   }
   mirror::Object* result = reinterpret_cast<mirror::Object*>(
-      rosalloc_->Alloc<kThreadSafe>(self, num_bytes, &rosalloc_size));
+      rosalloc_->Alloc<kThreadSafe>(self, num_bytes, &rosalloc_bytes_allocated,
+                                    &rosalloc_usable_size,
+                                    &rosalloc_bytes_tl_bulk_allocated));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
       CHECK(Contains(result)) << "Allocation (" << reinterpret_cast<void*>(result)
             << ") not in bounds of allocation space " << *this;
     }
     DCHECK(bytes_allocated != NULL);
-    *bytes_allocated = rosalloc_size;
-    DCHECK_EQ(rosalloc_size, rosalloc_->UsableSize(result));
+    *bytes_allocated = rosalloc_bytes_allocated;
+    DCHECK_EQ(rosalloc_usable_size, rosalloc_->UsableSize(result));
     if (usable_size != nullptr) {
-      *usable_size = rosalloc_size;
+      *usable_size = rosalloc_usable_size;
     }
+    DCHECK(bytes_tl_bulk_allocated != NULL);
+    *bytes_tl_bulk_allocated = rosalloc_bytes_tl_bulk_allocated;
   }
   return result;
 }
 
+inline bool RosAllocSpace::CanAllocThreadLocal(Thread* self, size_t num_bytes) {
+  return rosalloc_->CanAllocFromThreadLocalRun(self, num_bytes);
+}
+
+inline mirror::Object* RosAllocSpace::AllocThreadLocal(Thread* self, size_t num_bytes,
+                                                       size_t* bytes_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  return reinterpret_cast<mirror::Object*>(
+      rosalloc_->AllocFromThreadLocalRun(self, num_bytes, bytes_allocated));
+}
+
+inline size_t RosAllocSpace::MaxBytesBulkAllocatedForNonvirtual(size_t num_bytes) {
+  return rosalloc_->MaxBytesBulkAllocatedFor(num_bytes);
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index ced25a4..f140021 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -154,7 +154,8 @@
 }
 
 mirror::Object* RosAllocSpace::AllocWithGrowth(Thread* self, size_t num_bytes,
-                                               size_t* bytes_allocated, size_t* usable_size) {
+                                               size_t* bytes_allocated, size_t* usable_size,
+                                               size_t* bytes_tl_bulk_allocated) {
   mirror::Object* result;
   {
     MutexLock mu(self, lock_);
@@ -162,7 +163,8 @@
     size_t max_allowed = Capacity();
     rosalloc_->SetFootprintLimit(max_allowed);
     // Try the allocation.
-    result = AllocCommon(self, num_bytes, bytes_allocated, usable_size);
+    result = AllocCommon(self, num_bytes, bytes_allocated, usable_size,
+                         bytes_tl_bulk_allocated);
     // Shrink back down as small as possible.
     size_t footprint = rosalloc_->Footprint();
     rosalloc_->SetFootprintLimit(footprint);
@@ -209,7 +211,7 @@
       __builtin_prefetch(reinterpret_cast<char*>(ptrs[i + kPrefetchLookAhead]));
     }
     if (kVerifyFreedBytes) {
-      verify_bytes += AllocationSizeNonvirtual(ptrs[i], nullptr);
+      verify_bytes += AllocationSizeNonvirtual<true>(ptrs[i], nullptr);
     }
   }
 
@@ -338,12 +340,12 @@
   }
 }
 
-void RosAllocSpace::RevokeThreadLocalBuffers(Thread* thread) {
-  rosalloc_->RevokeThreadLocalRuns(thread);
+size_t RosAllocSpace::RevokeThreadLocalBuffers(Thread* thread) {
+  return rosalloc_->RevokeThreadLocalRuns(thread);
 }
 
-void RosAllocSpace::RevokeAllThreadLocalBuffers() {
-  rosalloc_->RevokeAllThreadLocalRuns();
+size_t RosAllocSpace::RevokeAllThreadLocalBuffers() {
+  return rosalloc_->RevokeAllThreadLocalRuns();
 }
 
 void RosAllocSpace::AssertThreadLocalBuffersAreRevoked(Thread* thread) {
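
With RevokeThreadLocalRuns() now reporting a byte count, a caller on the heap side can keep the global allocation counter consistent when thread-local runs are revoked. A minimal sketch follows; the counter name and the atomic update are assumptions for illustration, not part of this file.

  // Sketch only: subtract the free bytes of the revoked runs from the running total that
  // the earlier bulk thread-local allocation added to.
  size_t freed_bytes = rosalloc_space->RevokeThreadLocalBuffers(thread);
  if (freed_bytes > 0U) {
    num_bytes_allocated_.FetchAndSubSequentiallyConsistent(freed_bytes);  // assumed counter
  }
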
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index c856e95..36268f7 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -47,18 +47,21 @@
                                          bool low_memory_mode, bool can_move_objects);
 
   mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                  size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_);
+                                  size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE LOCKS_EXCLUDED(lock_);
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE {
-    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size);
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE {
+    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size,
+                           bytes_tl_bulk_allocated);
   }
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                    size_t* usable_size)
+                                    size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return AllocNonvirtualThreadUnsafe(self, num_bytes, bytes_allocated, usable_size);
+    return AllocNonvirtualThreadUnsafe(self, num_bytes, bytes_allocated, usable_size,
+                                       bytes_tl_bulk_allocated);
   }
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE {
-    return AllocationSizeNonvirtual(obj, usable_size);
+    return AllocationSizeNonvirtual<true>(obj, usable_size);
   }
   size_t Free(Thread* self, mirror::Object* ptr) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -66,17 +69,33 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   mirror::Object* AllocNonvirtual(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                  size_t* usable_size) {
+                                  size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
     // RosAlloc zeroes memory internally.
-    return AllocCommon(self, num_bytes, bytes_allocated, usable_size);
+    return AllocCommon(self, num_bytes, bytes_allocated, usable_size,
+                       bytes_tl_bulk_allocated);
   }
   mirror::Object* AllocNonvirtualThreadUnsafe(Thread* self, size_t num_bytes,
-                                              size_t* bytes_allocated, size_t* usable_size) {
+                                              size_t* bytes_allocated, size_t* usable_size,
+                                              size_t* bytes_tl_bulk_allocated) {
     // RosAlloc zeroes memory internally. Pass in false for thread unsafe.
-    return AllocCommon<false>(self, num_bytes, bytes_allocated, usable_size);
+    return AllocCommon<false>(self, num_bytes, bytes_allocated, usable_size,
+                              bytes_tl_bulk_allocated);
   }
 
+  // Returns true if the given allocation request can be satisfied from
+  // an existing thread local run without allocating a new run.
+  ALWAYS_INLINE bool CanAllocThreadLocal(Thread* self, size_t num_bytes);
+  // Allocates the request from an existing thread local run without
+  // allocating a new run.
+  ALWAYS_INLINE mirror::Object* AllocThreadLocal(Thread* self, size_t num_bytes,
+                                                 size_t* bytes_allocated);
+  size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE {
+    return MaxBytesBulkAllocatedForNonvirtual(num_bytes);
+  }
+  ALWAYS_INLINE size_t MaxBytesBulkAllocatedForNonvirtual(size_t num_bytes);
+
   // TODO: NO_THREAD_SAFETY_ANALYSIS because SizeOf() requires that mutator_lock is held.
+  template<bool kMaybeRunningOnValgrind>
   size_t AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size)
       NO_THREAD_SAFETY_ANALYSIS;
 
@@ -99,8 +118,8 @@
   uint64_t GetBytesAllocated() OVERRIDE;
   uint64_t GetObjectsAllocated() OVERRIDE;
 
-  void RevokeThreadLocalBuffers(Thread* thread);
-  void RevokeAllThreadLocalBuffers();
+  size_t RevokeThreadLocalBuffers(Thread* thread);
+  size_t RevokeAllThreadLocalBuffers();
   void AssertThreadLocalBuffersAreRevoked(Thread* thread);
   void AssertAllThreadLocalBuffersAreRevoked();
 
@@ -134,7 +153,7 @@
  private:
   template<bool kThreadSafe = true>
   mirror::Object* AllocCommon(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                              size_t* usable_size);
+                              size_t* usable_size, size_t* bytes_tl_bulk_allocated);
 
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
                         size_t maximum_size, bool low_memory_mode) OVERRIDE {
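
The CanAllocThreadLocal()/AllocThreadLocal() pair declared above is what lets the common case skip the shared, CAS-based path. A hedged sketch of how a caller might wire them up (the fallback structure and accounting comments are illustrative only):

  // Fast path: serve the request from the current thread-local run with no CAS.
  mirror::Object* obj = nullptr;
  size_t bytes_allocated = 0;
  if (LIKELY(rosalloc_space->CanAllocThreadLocal(self, num_bytes))) {
    obj = rosalloc_space->AllocThreadLocal(self, num_bytes, &bytes_allocated);
    // The run's bytes were already counted in bulk when it was handed to this thread,
    // so nothing further is added to the heap's byte counter here.
  } else {
    size_t usable_size, bytes_tl_bulk_allocated;
    obj = rosalloc_space->AllocNonvirtual(self, num_bytes, &bytes_allocated, &usable_size,
                                          &bytes_tl_bulk_allocated);
    // bytes_tl_bulk_allocated is what gets added to the heap's byte counter.
  }
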
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index d24650b..f2378d9 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -203,14 +203,24 @@
   // succeeds, the output parameter bytes_allocated will be set to the
   // actually allocated bytes which is >= num_bytes.
   // Alloc can be called from multiple threads at the same time and must be thread-safe.
+  //
+  // bytes_tl_bulk_allocated - bytes allocated in bulk ahead of time for a thread local
+  // allocation, if applicable. It can be
+  // 1) equal to bytes_allocated if this is not a thread local allocation,
+  // 2) greater than bytes_allocated if this is a thread local allocation
+  //    that required a new buffer, or
+  // 3) zero if this is a thread local allocation served from an existing
+  //    buffer.
+  // This is the amount to be added to Heap::num_bytes_allocated_.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                size_t* usable_size) = 0;
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated) = 0;
 
   // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
   virtual mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                            size_t* usable_size)
+                                            size_t* usable_size,
+                                            size_t* bytes_tl_bulk_allocated)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return Alloc(self, num_bytes, bytes_allocated, usable_size);
+    return Alloc(self, num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
   }
 
   // Return the storage space required by obj.
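
To make the three cases above concrete, here is a minimal accounting sketch on the caller side; the atomic counter and its update call are assumptions for illustration, not part of this interface.

  size_t bytes_allocated, usable_size, bytes_tl_bulk_allocated;
  mirror::Object* obj = space->Alloc(self, num_bytes, &bytes_allocated, &usable_size,
                                     &bytes_tl_bulk_allocated);
  if (obj != nullptr) {
    // Case 1: non-thread-local allocation     -> bytes_tl_bulk_allocated == bytes_allocated.
    // Case 2: thread-local, new buffer taken  -> bytes_tl_bulk_allocated >  bytes_allocated.
    // Case 3: thread-local, existing buffer   -> bytes_tl_bulk_allocated == 0 (already counted).
    num_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes_tl_bulk_allocated);  // assumed
  }
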
@@ -224,11 +234,15 @@
 
   // Revoke any sort of thread-local buffers that are used to speed up allocations for the given
   // thread, if the alloc space implementation uses any.
-  virtual void RevokeThreadLocalBuffers(Thread* thread) = 0;
+  // Returns the total free bytes in the revoked thread local buffers, which are to be
+  // subtracted from Heap::num_bytes_allocated_, or zero if there is nothing to subtract.
+  virtual size_t RevokeThreadLocalBuffers(Thread* thread) = 0;
 
   // Revoke any sort of thread-local buffers that are used to speed up allocations for all the
   // threads, if the alloc space implementation uses any.
-  virtual void RevokeAllThreadLocalBuffers() = 0;
+  // Returns the total free bytes in the revoked thread local buffers, which are to be
+  // subtracted from Heap::num_bytes_allocated_, or zero if there is nothing to subtract.
+  virtual size_t RevokeAllThreadLocalBuffers() = 0;
 
   virtual void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) = 0;
 
diff --git a/runtime/gc/space/space_test.h b/runtime/gc/space/space_test.h
index 09d10dd..3e9e9f7 100644
--- a/runtime/gc/space/space_test.h
+++ b/runtime/gc/space/space_test.h
@@ -61,11 +61,13 @@
   }
 
   mirror::Object* Alloc(space::MallocSpace* alloc_space, Thread* self, size_t bytes,
-                        size_t* bytes_allocated, size_t* usable_size)
+                        size_t* bytes_allocated, size_t* usable_size,
+                        size_t* bytes_tl_bulk_allocated)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     StackHandleScope<1> hs(self);
     Handle<mirror::Class> byte_array_class(hs.NewHandle(GetByteArrayClass(self)));
-    mirror::Object* obj = alloc_space->Alloc(self, bytes, bytes_allocated, usable_size);
+    mirror::Object* obj = alloc_space->Alloc(self, bytes, bytes_allocated, usable_size,
+                                             bytes_tl_bulk_allocated);
     if (obj != nullptr) {
       InstallClass(obj, byte_array_class.Get(), bytes);
     }
@@ -73,11 +75,13 @@
   }
 
   mirror::Object* AllocWithGrowth(space::MallocSpace* alloc_space, Thread* self, size_t bytes,
-                                  size_t* bytes_allocated, size_t* usable_size)
+                                  size_t* bytes_allocated, size_t* usable_size,
+                                  size_t* bytes_tl_bulk_allocated)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     StackHandleScope<1> hs(self);
     Handle<mirror::Class> byte_array_class(hs.NewHandle(GetByteArrayClass(self)));
-    mirror::Object* obj = alloc_space->AllocWithGrowth(self, bytes, bytes_allocated, usable_size);
+    mirror::Object* obj = alloc_space->AllocWithGrowth(self, bytes, bytes_allocated, usable_size,
+                                                       bytes_tl_bulk_allocated);
     if (obj != nullptr) {
       InstallClass(obj, byte_array_class.Get(), bytes);
     }
@@ -182,34 +186,38 @@
   ScopedObjectAccess soa(self);
 
   // Succeeds, fits without adjusting the footprint limit.
-  size_t ptr1_bytes_allocated, ptr1_usable_size;
+  size_t ptr1_bytes_allocated, ptr1_usable_size, ptr1_bytes_tl_bulk_allocated;
   StackHandleScope<3> hs(soa.Self());
   MutableHandle<mirror::Object> ptr1(
-      hs.NewHandle(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size)));
+      hs.NewHandle(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size,
+                         &ptr1_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr1.Get() != nullptr);
   EXPECT_LE(1U * MB, ptr1_bytes_allocated);
   EXPECT_LE(1U * MB, ptr1_usable_size);
   EXPECT_LE(ptr1_usable_size, ptr1_bytes_allocated);
+  EXPECT_EQ(ptr1_bytes_tl_bulk_allocated, ptr1_bytes_allocated);
 
   // Fails, requires a higher footprint limit.
-  mirror::Object* ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr2 == nullptr);
 
   // Succeeds, adjusts the footprint.
-  size_t ptr3_bytes_allocated, ptr3_usable_size;
+  size_t ptr3_bytes_allocated, ptr3_usable_size, ptr3_bytes_tl_bulk_allocated;
   MutableHandle<mirror::Object> ptr3(
-      hs.NewHandle(AllocWithGrowth(space, self, 8 * MB, &ptr3_bytes_allocated, &ptr3_usable_size)));
+      hs.NewHandle(AllocWithGrowth(space, self, 8 * MB, &ptr3_bytes_allocated, &ptr3_usable_size,
+                                   &ptr3_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr3.Get() != nullptr);
   EXPECT_LE(8U * MB, ptr3_bytes_allocated);
   EXPECT_LE(8U * MB, ptr3_usable_size);
   EXPECT_LE(ptr3_usable_size, ptr3_bytes_allocated);
+  EXPECT_EQ(ptr3_bytes_tl_bulk_allocated, ptr3_bytes_allocated);
 
   // Fails, requires a higher footprint limit.
-  mirror::Object* ptr4 = space->Alloc(self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr4 = space->Alloc(self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr4 == nullptr);
 
   // Also fails, requires a higher allowed footprint.
-  mirror::Object* ptr5 = space->AllocWithGrowth(self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr5 = space->AllocWithGrowth(self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr5 == nullptr);
 
   // Release some memory.
@@ -219,13 +227,15 @@
   EXPECT_LE(8U * MB, free3);
 
   // Succeeds, now that memory has been freed.
-  size_t ptr6_bytes_allocated, ptr6_usable_size;
+  size_t ptr6_bytes_allocated, ptr6_usable_size, ptr6_bytes_tl_bulk_allocated;
   Handle<mirror::Object> ptr6(
-      hs.NewHandle(AllocWithGrowth(space, self, 9 * MB, &ptr6_bytes_allocated, &ptr6_usable_size)));
+      hs.NewHandle(AllocWithGrowth(space, self, 9 * MB, &ptr6_bytes_allocated, &ptr6_usable_size,
+                                   &ptr6_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr6.Get() != nullptr);
   EXPECT_LE(9U * MB, ptr6_bytes_allocated);
   EXPECT_LE(9U * MB, ptr6_usable_size);
   EXPECT_LE(ptr6_usable_size, ptr6_bytes_allocated);
+  EXPECT_EQ(ptr6_bytes_tl_bulk_allocated, ptr6_bytes_allocated);
 
   // Final clean up.
   size_t free1 = space->AllocationSize(ptr1.Get(), nullptr);
@@ -233,7 +243,7 @@
   EXPECT_LE(1U * MB, free1);
 
   // Make sure that the zygote space isn't directly at the start of the space.
-  EXPECT_TRUE(space->Alloc(self, 1U * MB, &dummy, nullptr) != nullptr);
+  EXPECT_TRUE(space->Alloc(self, 1U * MB, &dummy, nullptr, &dummy) != nullptr);
 
   gc::Heap* heap = Runtime::Current()->GetHeap();
   space::Space* old_space = space;
@@ -250,22 +260,26 @@
   AddSpace(space, false);
 
   // Succeeds, fits without adjusting the footprint limit.
-  ptr1.Assign(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size));
+  ptr1.Assign(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size,
+                    &ptr1_bytes_tl_bulk_allocated));
   EXPECT_TRUE(ptr1.Get() != nullptr);
   EXPECT_LE(1U * MB, ptr1_bytes_allocated);
   EXPECT_LE(1U * MB, ptr1_usable_size);
   EXPECT_LE(ptr1_usable_size, ptr1_bytes_allocated);
+  EXPECT_EQ(ptr1_bytes_tl_bulk_allocated, ptr1_bytes_allocated);
 
   // Fails, requires a higher footprint limit.
-  ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr);
+  ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr2 == nullptr);
 
   // Succeeds, adjusts the footprint.
-  ptr3.Assign(AllocWithGrowth(space, self, 2 * MB, &ptr3_bytes_allocated, &ptr3_usable_size));
+  ptr3.Assign(AllocWithGrowth(space, self, 2 * MB, &ptr3_bytes_allocated, &ptr3_usable_size,
+                              &ptr3_bytes_tl_bulk_allocated));
   EXPECT_TRUE(ptr3.Get() != nullptr);
   EXPECT_LE(2U * MB, ptr3_bytes_allocated);
   EXPECT_LE(2U * MB, ptr3_usable_size);
   EXPECT_LE(ptr3_usable_size, ptr3_bytes_allocated);
+  EXPECT_EQ(ptr3_bytes_tl_bulk_allocated, ptr3_bytes_allocated);
   space->Free(self, ptr3.Assign(nullptr));
 
   // Final clean up.
@@ -285,34 +299,38 @@
   AddSpace(space);
 
   // Succeeds, fits without adjusting the footprint limit.
-  size_t ptr1_bytes_allocated, ptr1_usable_size;
+  size_t ptr1_bytes_allocated, ptr1_usable_size, ptr1_bytes_tl_bulk_allocated;
   StackHandleScope<3> hs(soa.Self());
   MutableHandle<mirror::Object> ptr1(
-      hs.NewHandle(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size)));
+      hs.NewHandle(Alloc(space, self, 1 * MB, &ptr1_bytes_allocated, &ptr1_usable_size,
+                         &ptr1_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr1.Get() != nullptr);
   EXPECT_LE(1U * MB, ptr1_bytes_allocated);
   EXPECT_LE(1U * MB, ptr1_usable_size);
   EXPECT_LE(ptr1_usable_size, ptr1_bytes_allocated);
+  EXPECT_EQ(ptr1_bytes_tl_bulk_allocated, ptr1_bytes_allocated);
 
   // Fails, requires a higher footprint limit.
-  mirror::Object* ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr2 = Alloc(space, self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr2 == nullptr);
 
   // Succeeds, adjusts the footprint.
-  size_t ptr3_bytes_allocated, ptr3_usable_size;
+  size_t ptr3_bytes_allocated, ptr3_usable_size, ptr3_bytes_tl_bulk_allocated;
   MutableHandle<mirror::Object> ptr3(
-      hs.NewHandle(AllocWithGrowth(space, self, 8 * MB, &ptr3_bytes_allocated, &ptr3_usable_size)));
+      hs.NewHandle(AllocWithGrowth(space, self, 8 * MB, &ptr3_bytes_allocated, &ptr3_usable_size,
+                                   &ptr3_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr3.Get() != nullptr);
   EXPECT_LE(8U * MB, ptr3_bytes_allocated);
   EXPECT_LE(8U * MB, ptr3_usable_size);
   EXPECT_LE(ptr3_usable_size, ptr3_bytes_allocated);
+  EXPECT_EQ(ptr3_bytes_tl_bulk_allocated, ptr3_bytes_allocated);
 
   // Fails, requires a higher footprint limit.
-  mirror::Object* ptr4 = Alloc(space, self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr4 = Alloc(space, self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr4 == nullptr);
 
   // Also fails, requires a higher allowed footprint.
-  mirror::Object* ptr5 = AllocWithGrowth(space, self, 8 * MB, &dummy, nullptr);
+  mirror::Object* ptr5 = AllocWithGrowth(space, self, 8 * MB, &dummy, nullptr, &dummy);
   EXPECT_TRUE(ptr5 == nullptr);
 
   // Release some memory.
@@ -322,13 +340,15 @@
   EXPECT_LE(8U * MB, free3);
 
   // Succeeds, now that memory has been freed.
-  size_t ptr6_bytes_allocated, ptr6_usable_size;
+  size_t ptr6_bytes_allocated, ptr6_usable_size, ptr6_bytes_tl_bulk_allocated;
   Handle<mirror::Object> ptr6(
-      hs.NewHandle(AllocWithGrowth(space, self, 9 * MB, &ptr6_bytes_allocated, &ptr6_usable_size)));
+      hs.NewHandle(AllocWithGrowth(space, self, 9 * MB, &ptr6_bytes_allocated, &ptr6_usable_size,
+                                   &ptr6_bytes_tl_bulk_allocated)));
   EXPECT_TRUE(ptr6.Get() != nullptr);
   EXPECT_LE(9U * MB, ptr6_bytes_allocated);
   EXPECT_LE(9U * MB, ptr6_usable_size);
   EXPECT_LE(ptr6_usable_size, ptr6_bytes_allocated);
+  EXPECT_EQ(ptr6_bytes_tl_bulk_allocated, ptr6_bytes_allocated);
 
   // Final clean up.
   size_t free1 = space->AllocationSize(ptr1.Get(), nullptr);
@@ -348,14 +368,16 @@
   // Succeeds, fits without adjusting the max allowed footprint.
   mirror::Object* lots_of_objects[1024];
   for (size_t i = 0; i < arraysize(lots_of_objects); i++) {
-    size_t allocation_size, usable_size;
+    size_t allocation_size, usable_size, bytes_tl_bulk_allocated;
     size_t size_of_zero_length_byte_array = SizeOfZeroLengthByteArray();
     lots_of_objects[i] = Alloc(space, self, size_of_zero_length_byte_array, &allocation_size,
-                               &usable_size);
+                               &usable_size, &bytes_tl_bulk_allocated);
     EXPECT_TRUE(lots_of_objects[i] != nullptr);
     size_t computed_usable_size;
     EXPECT_EQ(allocation_size, space->AllocationSize(lots_of_objects[i], &computed_usable_size));
     EXPECT_EQ(usable_size, computed_usable_size);
+    EXPECT_TRUE(bytes_tl_bulk_allocated == 0 ||
+                bytes_tl_bulk_allocated >= allocation_size);
   }
 
   // Release memory.
@@ -363,12 +385,15 @@
 
   // Succeeds, fits by adjusting the max allowed footprint.
   for (size_t i = 0; i < arraysize(lots_of_objects); i++) {
-    size_t allocation_size, usable_size;
-    lots_of_objects[i] = AllocWithGrowth(space, self, 1024, &allocation_size, &usable_size);
+    size_t allocation_size, usable_size, bytes_tl_bulk_allocated;
+    lots_of_objects[i] = AllocWithGrowth(space, self, 1024, &allocation_size, &usable_size,
+                                         &bytes_tl_bulk_allocated);
     EXPECT_TRUE(lots_of_objects[i] != nullptr);
     size_t computed_usable_size;
     EXPECT_EQ(allocation_size, space->AllocationSize(lots_of_objects[i], &computed_usable_size));
     EXPECT_EQ(usable_size, computed_usable_size);
+    EXPECT_TRUE(bytes_tl_bulk_allocated == 0 ||
+                bytes_tl_bulk_allocated >= allocation_size);
   }
 
   // Release memory.
@@ -425,10 +450,13 @@
       StackHandleScope<1> hs(soa.Self());
       auto object(hs.NewHandle<mirror::Object>(nullptr));
       size_t bytes_allocated = 0;
+      size_t bytes_tl_bulk_allocated;
       if (round <= 1) {
-        object.Assign(Alloc(space, self, alloc_size, &bytes_allocated, nullptr));
+        object.Assign(Alloc(space, self, alloc_size, &bytes_allocated, nullptr,
+                            &bytes_tl_bulk_allocated));
       } else {
-        object.Assign(AllocWithGrowth(space, self, alloc_size, &bytes_allocated, nullptr));
+        object.Assign(AllocWithGrowth(space, self, alloc_size, &bytes_allocated, nullptr,
+                                      &bytes_tl_bulk_allocated));
       }
       footprint = space->GetFootprint();
       EXPECT_GE(space->Size(), footprint);  // invariant
@@ -441,6 +469,8 @@
         } else {
           EXPECT_GE(allocation_size, 8u);
         }
+        EXPECT_TRUE(bytes_tl_bulk_allocated == 0 ||
+                    bytes_tl_bulk_allocated >= allocation_size);
         amount_allocated += allocation_size;
         break;
       }
@@ -518,11 +548,13 @@
   auto large_object(hs.NewHandle<mirror::Object>(nullptr));
   size_t three_quarters_space = (growth_limit / 2) + (growth_limit / 4);
   size_t bytes_allocated = 0;
+  size_t bytes_tl_bulk_allocated;
   if (round <= 1) {
-    large_object.Assign(Alloc(space, self, three_quarters_space, &bytes_allocated, nullptr));
+    large_object.Assign(Alloc(space, self, three_quarters_space, &bytes_allocated, nullptr,
+                              &bytes_tl_bulk_allocated));
   } else {
     large_object.Assign(AllocWithGrowth(space, self, three_quarters_space, &bytes_allocated,
-                                        nullptr));
+                                        nullptr, &bytes_tl_bulk_allocated));
   }
   EXPECT_TRUE(large_object.Get() != nullptr);
 
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/valgrind_malloc_space-inl.h
index ae8e892..bc329e1 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/valgrind_malloc_space-inl.h
@@ -32,10 +32,15 @@
 template <size_t kValgrindRedZoneBytes, bool kUseObjSizeForUsable>
 inline mirror::Object* AdjustForValgrind(void* obj_with_rdz, size_t num_bytes,
                                          size_t bytes_allocated, size_t usable_size,
-                                         size_t* bytes_allocated_out, size_t* usable_size_out) {
+                                         size_t bytes_tl_bulk_allocated,
+                                         size_t* bytes_allocated_out, size_t* usable_size_out,
+                                         size_t* bytes_tl_bulk_allocated_out) {
   if (bytes_allocated_out != nullptr) {
     *bytes_allocated_out = bytes_allocated;
   }
+  if (bytes_tl_bulk_allocated_out != nullptr) {
+    *bytes_tl_bulk_allocated_out = bytes_tl_bulk_allocated;
+  }
 
   // This cuts over-provision and is a trade-off between testing the over-provisioning code paths
   // vs checking overflows in the regular paths.
@@ -82,20 +87,25 @@
                     kValgrindRedZoneBytes,
                     kAdjustForRedzoneInAllocSize,
                     kUseObjSizeForUsable>::AllocWithGrowth(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::AllocWithGrowth(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                          &bytes_allocated, &usable_size);
+                                          &bytes_allocated, &usable_size,
+                                          &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
-                                             kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
-                                                                   bytes_allocated, usable_size,
-                                                                   bytes_allocated_out,
-                                                                   usable_size_out);
+  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+      obj_with_rdz, num_bytes,
+      bytes_allocated, usable_size,
+      bytes_tl_bulk_allocated,
+      bytes_allocated_out,
+      usable_size_out,
+      bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -106,11 +116,13 @@
                                     kValgrindRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::Alloc(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::Alloc(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                &bytes_allocated, &usable_size);
+                                &bytes_allocated, &usable_size, &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
@@ -118,8 +130,10 @@
   return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
                                              kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
                                                                    bytes_allocated, usable_size,
+                                                                   bytes_tl_bulk_allocated,
                                                                    bytes_allocated_out,
-                                                                   usable_size_out);
+                                                                   usable_size_out,
+                                                                   bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -130,20 +144,25 @@
                                     kValgrindRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::AllocThreadUnsafe(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::AllocThreadUnsafe(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                &bytes_allocated, &usable_size);
+                                            &bytes_allocated, &usable_size,
+                                            &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
-                                             kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
-                                                                   bytes_allocated, usable_size,
-                                                                   bytes_allocated_out,
-                                                                   usable_size_out);
+  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+      obj_with_rdz, num_bytes,
+      bytes_allocated, usable_size,
+      bytes_tl_bulk_allocated,
+      bytes_allocated_out,
+      usable_size_out,
+      bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -226,6 +245,17 @@
                               mem_map->Size() - initial_size);
 }
 
+template <typename S,
+          size_t kValgrindRedZoneBytes,
+          bool kAdjustForRedzoneInAllocSize,
+          bool kUseObjSizeForUsable>
+size_t ValgrindMallocSpace<S,
+                           kValgrindRedZoneBytes,
+                           kAdjustForRedzoneInAllocSize,
+                           kUseObjSizeForUsable>::MaxBytesBulkAllocatedFor(size_t num_bytes) {
+  return S::MaxBytesBulkAllocatedFor(num_bytes + 2 * kValgrindRedZoneBytes);
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/valgrind_malloc_space.h b/runtime/gc/space/valgrind_malloc_space.h
index 707ea69..a6b010a 100644
--- a/runtime/gc/space/valgrind_malloc_space.h
+++ b/runtime/gc/space/valgrind_malloc_space.h
@@ -34,12 +34,13 @@
 class ValgrindMallocSpace FINAL : public BaseMallocSpaceType {
  public:
   mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                  size_t* usable_size) OVERRIDE;
+                                  size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE;
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                    size_t* usable_size) OVERRIDE
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+                                    size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE;
 
@@ -53,6 +54,8 @@
     UNUSED(ptr);
   }
 
+  size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE;
+
   template <typename... Params>
   explicit ValgrindMallocSpace(MemMap* mem_map, size_t initial_size, Params... params);
   virtual ~ValgrindMallocSpace() {}
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index a868e68..9e882a8 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -77,7 +77,7 @@
       << ",name=\"" << GetName() << "\"]";
 }
 
-mirror::Object* ZygoteSpace::Alloc(Thread*, size_t, size_t*, size_t*) {
+mirror::Object* ZygoteSpace::Alloc(Thread*, size_t, size_t*, size_t*, size_t*) {
   UNIMPLEMENTED(FATAL);
   UNREACHABLE();
 }
diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h
index 0cf4bb1..934a234 100644
--- a/runtime/gc/space/zygote_space.h
+++ b/runtime/gc/space/zygote_space.h
@@ -46,7 +46,7 @@
   }
 
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
 
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE;
 
@@ -55,9 +55,11 @@
   size_t FreeList(Thread* self, size_t num_ptrs, mirror::Object** ptrs) OVERRIDE;
 
   // ZygoteSpaces don't have thread local state.
-  void RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+  size_t RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+    return 0U;
   }
-  void RevokeAllThreadLocalBuffers() OVERRIDE {
+  size_t RevokeAllThreadLocalBuffers() OVERRIDE {
+    return 0U;
   }
 
   uint64_t GetBytesAllocated() {