Merge "Add concurrent card graying for immune spaces"
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 08a752f..8afc6ba 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -7177,10 +7177,10 @@
           instruction, root, /* unpoison_ref_before_marking */ false);
       codegen_->AddSlowPath(slow_path);
 
-      // Test the entrypoint (`Thread::Current()->pReadBarrierMarkReg ## root.reg()`).
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(root.reg());
-      __ fs()->cmpl(Address::Absolute(entry_point_offset), Immediate(0));
-      // The entrypoint is null when the GC is not marking.
+      // Test if the GC is marking. Note that X86 and X86_64 never switch the read barrier
+      // entrypoints on marking state changes; they stay installed at all times.
+      const int32_t is_marking_offset = Thread::IsGcMarkingOffset<kX86PointerSize>().Int32Value();
+      __ fs()->cmpl(Address::Absolute(is_marking_offset), Immediate(0));
+      // The is_gc_marking flag is non-zero only while the GC is marking.
       __ j(kNotEqual, slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
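For orientation, a minimal C++ rendering of the fast path this hunk emits; GcRootReadBarrier and SlowPathMark are hypothetical names, not ART source:

    // Sketch: before this change, the null test on pReadBarrierMarkRegN doubled as
    // the "is the GC marking?" test. x86/x86_64 now test the thread-local
    // is_gc_marking flag directly, since on these architectures the mark
    // entrypoints stay installed at all times (see runtime/thread.cc below).
    inline mirror::Object* GcRootReadBarrier(mirror::Object* root) {
      if (Thread::Current()->GetIsGcMarking()) {  // fs:[is_gc_marking] != 0
        root = SlowPathMark(root);  // hypothetical stand-in for the slow path stub
      }
      return root;
    }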
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ff6e099..c2b1a31 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -6542,10 +6542,11 @@
           instruction, root, /* unpoison_ref_before_marking */ false);
       codegen_->AddSlowPath(slow_path);
 
-      // Test the `Thread::Current()->pReadBarrierMarkReg ## root.reg()` entrypoint.
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(root.reg());
-      __ gs()->cmpl(Address::Absolute(entry_point_offset, /* no_rip */ true), Immediate(0));
-      // The entrypoint is null when the GC is not marking.
+      // Test if the GC is marking. Note that X86 and X86_64 never switch the read barrier
+      // entrypoints on marking state changes; they stay installed at all times.
+      const int32_t is_marking_offset =
+          Thread::IsGcMarkingOffset<kX86_64PointerSize>().Int32Value();
+      __ gs()->cmpl(Address::Absolute(is_marking_offset, /* no_rip */ true), Immediate(0));
+      // The is_gc_marking flag is non-zero only while the GC is marking.
       __ j(kNotEqual, slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 2414b5f..03ae63a 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -373,19 +373,19 @@
   bool IsSharedHeld(const Thread* self) const;
 
   // Assert the current thread has shared access to the ReaderWriterMutex.
-  void AssertSharedHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
+  ALWAYS_INLINE void AssertSharedHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
     if (kDebugLocking && (gAborting == 0)) {
       // TODO: we can only assert this well when self != null.
       CHECK(IsSharedHeld(self) || self == nullptr) << *this;
     }
   }
-  void AssertReaderHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
+  ALWAYS_INLINE void AssertReaderHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
     AssertSharedHeld(self);
   }
 
   // Assert the current thread doesn't hold this ReaderWriterMutex either in shared or exclusive
   // mode.
-  void AssertNotHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(!this) {
+  ALWAYS_INLINE void AssertNotHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(!this) {
     if (kDebugLocking && (gAborting == 0)) {
       CHECK(!IsSharedHeld(self)) << *this;
     }
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 68ef15d..14aa730 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -51,6 +51,7 @@
   static constexpr size_t kCardSize = 1 << kCardShift;
   static constexpr uint8_t kCardClean = 0x0;
   static constexpr uint8_t kCardDirty = 0x70;
+  static constexpr uint8_t kCardAged = kCardDirty - 1;
 
   static CardTable* Create(const uint8_t* heap_begin, size_t heap_capacity);
   ~CardTable();
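A short sketch of how the new constant is used by the lambda later in this patch (GrayAllDirtyImmuneObjects); AgeCard is a hypothetical helper name:

    // Sketch: the concurrent graying pass demotes every non-clean card to aged, so
    // the pause can recognize cards dirtied after that pass (still kCardDirty).
    constexpr uint8_t AgeCard(uint8_t card) {
      return (card != CardTable::kCardClean) ? CardTable::kCardAged : card;
    }
    static_assert(CardTable::kCardAged == 0x6F, "aged sits one below dirty (0x70)");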
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index dd449f9..1f06f15 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -152,7 +152,8 @@
 
 inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
   mirror::Object* ret;
-  if (from_ref == nullptr) {
+  // We can get here before marking starts since we gray immune objects before the marking phase.
+  if (from_ref == nullptr || !Thread::Current()->GetIsGcMarking()) {
     return from_ref;
   }
   // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
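Why the early-out is safe, sketched as the interleaving this patch assumes:

    // GC:      GrayAllDirtyImmuneObjects()  // immune objects turn gray, pre-flip
    // Mutator: loads a gray object -> read barrier slow path -> MarkFromReadBarrier
    //          -> GetIsGcMarking() == false -> return from_ref unchanged. Safe:
    //          nothing has been copied yet, so from_ref is still the valid object.
    // GC:      FlipThreadRoots()            // sets is_gc_marking = true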
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 4192f34..9c3ce0b 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -163,6 +163,12 @@
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
     InitializePhase();
   }
+  if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
+    // Gray dirty immune objects concurrently to reduce GC pause times. Cards dirtied after this
+    // pass are re-scanned in the pause by GrayAllNewlyDirtyImmuneObjects.
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    GrayAllDirtyImmuneObjects();
+  }
   FlipThreadRoots();
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
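Together with the flip callback below, the RunPhases() change yields this ordering (a sketch using names from this patch):

    // InitializePhase();                  // mutator_lock_ held shared
    // GrayAllDirtyImmuneObjects();        // concurrent: age cards, CAS objects gray
    // FlipThreadRoots();                  // pause; inside the flip callback:
    //   GrayAllNewlyDirtyImmuneObjects(); //   handles only cards dirtied since
    //                                     //   the concurrent pass (kCardDirty)
    // MarkingPhase();                     // scans and whitens gray immune objects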
@@ -352,9 +358,12 @@
     if (kVerifyNoMissingCardMarks) {
       cc->VerifyNoMissingCardMarks();
     }
-    CHECK(thread == self);
+    CHECK_EQ(thread, self);
     Locks::mutator_lock_->AssertExclusiveHeld(self);
-    cc->region_space_->SetFromSpace(cc->rb_table_, cc->force_evacuate_all_);
+    {
+      TimingLogger::ScopedTiming split2("(Paused)SetFromSpace", cc->GetTimings());
+      cc->region_space_->SetFromSpace(cc->rb_table_, cc->force_evacuate_all_);
+    }
     cc->SwapStacks();
     if (ConcurrentCopying::kEnableFromSpaceAccountingCheck) {
       cc->RecordLiveStackFreezeSize(self);
@@ -368,11 +377,11 @@
     }
     if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) {
       CHECK(Runtime::Current()->IsAotCompiler());
-      TimingLogger::ScopedTiming split2("(Paused)VisitTransactionRoots", cc->GetTimings());
+      TimingLogger::ScopedTiming split3("(Paused)VisitTransactionRoots", cc->GetTimings());
       Runtime::Current()->VisitTransactionRoots(cc);
     }
     if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
-      cc->GrayAllDirtyImmuneObjects();
+      cc->GrayAllNewlyDirtyImmuneObjects();
       if (kIsDebugBuild) {
         // Check that all non-gray immune objects only reference immune objects.
         cc->VerifyGrayImmuneObjects();
@@ -519,8 +528,8 @@
 
 void ConcurrentCopying::VerifyNoMissingCardMarkCallback(mirror::Object* obj, void* arg) {
   auto* collector = reinterpret_cast<ConcurrentCopying*>(arg);
-  // Objects not on dirty cards should never have references to newly allocated regions.
-  if (!collector->heap_->GetCardTable()->IsDirty(obj)) {
+  // Objects not on dirty or aged cards should never have references to newly allocated regions.
+  if (collector->heap_->GetCardTable()->GetCard(obj) == gc::accounting::CardTable::kCardClean) {
     VerifyNoMissingCardMarkVisitor visitor(collector, /*holder*/ obj);
     obj->VisitReferences</*kVisitNativeRoots*/true, kVerifyNone, kWithoutReadBarrier>(
         visitor,
@@ -583,53 +592,100 @@
   }
 }
 
+template <bool kConcurrent>
 class ConcurrentCopying::GrayImmuneObjectVisitor {
  public:
-  explicit GrayImmuneObjectVisitor() {}
+  explicit GrayImmuneObjectVisitor(Thread* self) : self_(self) {}
 
   ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (kUseBakerReadBarrier) {
-      if (kIsDebugBuild) {
-        Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
+    if (kUseBakerReadBarrier && obj->GetReadBarrierState() == ReadBarrier::WhiteState()) {
+      if (kConcurrent) {
+        Locks::mutator_lock_->AssertSharedHeld(self_);
+        obj->AtomicSetReadBarrierState(ReadBarrier::WhiteState(), ReadBarrier::GrayState());
+        // Mod union table VisitObjects may visit the same object multiple times so we can't check
+        // the result of the atomic set.
+      } else {
+        Locks::mutator_lock_->AssertExclusiveHeld(self_);
+        obj->SetReadBarrierState(ReadBarrier::GrayState());
       }
-      obj->SetReadBarrierState(ReadBarrier::GrayState());
     }
   }
 
   static void Callback(mirror::Object* obj, void* arg) REQUIRES_SHARED(Locks::mutator_lock_) {
-    reinterpret_cast<GrayImmuneObjectVisitor*>(arg)->operator()(obj);
+    reinterpret_cast<GrayImmuneObjectVisitor<kConcurrent>*>(arg)->operator()(obj);
   }
+
+ private:
+  Thread* const self_;
 };
 
 void ConcurrentCopying::GrayAllDirtyImmuneObjects() {
-  TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings());
-  gc::Heap* const heap = Runtime::Current()->GetHeap();
-  accounting::CardTable* const card_table = heap->GetCardTable();
-  WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+  TimingLogger::ScopedTiming split("GrayAllDirtyImmuneObjects", GetTimings());
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  Thread* const self = Thread::Current();
+  using VisitorType = GrayImmuneObjectVisitor</* kConcurrent */ true>;
+  VisitorType visitor(self);
+  WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   for (space::ContinuousSpace* space : immune_spaces_.GetSpaces()) {
     DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
-    GrayImmuneObjectVisitor visitor;
-    accounting::ModUnionTable* table = heap->FindModUnionTableFromSpace(space);
+    accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
     // Mark all the objects on dirty cards since these may point to objects in other spaces.
     // Once these are marked, the GC will eventually clear them later.
     // Table is non null for boot image and zygote spaces. It is only null for application image
     // spaces.
     if (table != nullptr) {
-      // TODO: Consider adding precleaning outside the pause.
       table->ProcessCards();
-      table->VisitObjects(GrayImmuneObjectVisitor::Callback, &visitor);
-      // Since the cards are recorded in the mod-union table and this is paused, we can clear
-      // the cards for the space (to madvise).
+      table->VisitObjects(&VisitorType::Callback, &visitor);
+      // Don't clear cards here since we need to rescan in the pause. If we cleared the cards here,
+      // there would be races with the mutator marking new cards.
+    } else {
+      // Keep cards aged if we don't have a mod-union table since we may need to scan them in future
+      // GCs. This case is for app images.
+      card_table->ModifyCardsAtomic(
+          space->Begin(),
+          space->End(),
+          [](uint8_t card) {
+            return (card != gc::accounting::CardTable::kCardClean)
+                ? gc::accounting::CardTable::kCardAged
+                : card;
+          },
+          /* card modified visitor */ VoidFunctor());
+      card_table->Scan</* kClearCard */ false>(space->GetMarkBitmap(),
+                                               space->Begin(),
+                                               space->End(),
+                                               visitor,
+                                               gc::accounting::CardTable::kCardAged);
+    }
+  }
+}
+
+void ConcurrentCopying::GrayAllNewlyDirtyImmuneObjects() {
+  TimingLogger::ScopedTiming split("(Paused)GrayAllNewlyDirtyImmuneObjects", GetTimings());
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  using VisitorType = GrayImmuneObjectVisitor</* kConcurrent */ false>;
+  Thread* const self = Thread::Current();
+  VisitorType visitor(self);
+  WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+  for (space::ContinuousSpace* space : immune_spaces_.GetSpaces()) {
+    DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
+    accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+
+    // No need to scan aged cards since we already scanned them before the pause. Note that
+    // scanning dirty cards here also covers the cards recorded in the mod-union table.
+    card_table->Scan</* kClearCard */ false>(space->GetMarkBitmap(),
+                                             space->Begin(),
+                                             space->End(),
+                                             visitor,
+                                             gc::accounting::CardTable::kCardDirty);
+    if (table != nullptr) {
+      // Add the cards to the mod-union table so that we can clear cards to save RAM.
+      table->ProcessCards();
       TimingLogger::ScopedTiming split2("(Paused)ClearCards", GetTimings());
       card_table->ClearCardRange(space->Begin(),
                                  AlignDown(space->End(), accounting::CardTable::kCardSize));
-    } else {
-      // TODO: Consider having a mark bitmap for app image spaces and avoid scanning during the
-      // pause because app image spaces are all dirty pages anyways.
-      card_table->Scan<false>(space->GetMarkBitmap(), space->Begin(), space->End(), visitor);
     }
   }
-  // Since all of the objects that may point to other spaces are marked, we can avoid all the read
+  // Since all of the objects that may point to other spaces are gray, we can avoid all the read
   // barriers in the immune spaces.
   updated_all_immune_objects_.StoreRelaxed(true);
 }
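The card-state machine the two functions above implement, sketched from the constants and transforms in this patch:

    // kCardClean --mutator reference write------------------> kCardDirty
    // kCardDirty --GrayAllDirtyImmuneObjects (concurrent)---> kCardAged
    //              (objects on the card are CAS'd white to gray)
    // kCardDirty, re-dirtied after aging
    //            --GrayAllNewlyDirtyImmuneObjects (paused)--> scanned, recorded in
    //              the mod-union table (if any), and the card range cleared
    // kCardAged  --pause-----------------------------------> skipped, already done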
@@ -658,6 +714,7 @@
 
   ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_) {
     if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
+      // Only need to scan gray objects.
       if (obj->GetReadBarrierState() == ReadBarrier::GrayState()) {
         collector_->ScanImmuneObject(obj);
         // Done scanning the object, go back to white.
@@ -707,6 +764,7 @@
       if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects && table != nullptr) {
         table->VisitObjects(ImmuneSpaceScanObjVisitor::Callback, &visitor);
       } else {
+        // TODO: Scan only the aged cards.
         live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
                                       reinterpret_cast<uintptr_t>(space->Limit()),
                                       visitor);
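How the gray state set up earlier is consumed during marking, as a hedged sketch (ScanAndWhiten is a hypothetical helper; the real logic is ImmuneSpaceScanObjVisitor above):

    void ScanAndWhiten(ConcurrentCopying* collector, mirror::Object* obj) {
      if (obj->GetReadBarrierState() == ReadBarrier::GrayState()) {
        collector->ScanImmuneObject(obj);  // visit and mark outgoing references
        // Flip back to white so later read barriers take the fast path.
        obj->AtomicSetReadBarrierState(ReadBarrier::GrayState(),
                                       ReadBarrier::WhiteState());
      }
    }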
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index f877314..d8dc9f6 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -162,6 +162,9 @@
   void GrayAllDirtyImmuneObjects()
       REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
+  void GrayAllNewlyDirtyImmuneObjects()
+      REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_);
   void VerifyGrayImmuneObjects()
       REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
@@ -336,7 +339,7 @@
   class DisableMarkingCheckpoint;
   class DisableWeakRefAccessCallback;
   class FlipCallback;
-  class GrayImmuneObjectVisitor;
+  template <bool kConcurrent> class GrayImmuneObjectVisitor;
   class ImmuneSpaceScanObjVisitor;
   class LostCopyVisitor;
   class RefFieldsVisitor;
diff --git a/runtime/oat.h b/runtime/oat.h
index 05706252..84ca2a6 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '1', '1', '9', '\0' };  // Add thread_local_limit.
+  // Go back to checking is_marking for x86 and x86_64.
+  static constexpr uint8_t kOatVersion[] = { '1', '2', '0', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 62a616b..0a81bfd 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -134,6 +134,13 @@
 void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
   CHECK(kUseReadBarrier);
   tls32_.is_gc_marking = is_marking;
+  if (kUseReadBarrier && (kRuntimeISA == kX86_64 || kRuntimeISA == kX86)) {
+    // Disable entrypoint switching for x86/x86_64 since compiled code checks the gray bit
+    // without always checking is_marking first. Otherwise there is a race between
+    // GrayAllDirtyImmuneObjects and FlipThreadRoots where a mutator may take the slow path
+    // through a still-null entrypoint. The fix is to keep the entrypoints always installed.
+    is_marking = true;
+  }
   UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
   ResetQuickAllocEntryPointsForThread(is_marking);
 }
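The race this guard closes, sketched as an interleaving (assumed from the comment above):

    // Without the guard:
    //   GC:      GrayAllDirtyImmuneObjects()  // immune objects turn gray
    //   Mutator: hits a gray object, takes the read barrier slow path, and
    //            calls pReadBarrierMarkRegN   // still null: crash, since the
    //                                         // flip has not installed it yet
    //   GC:      FlipThreadRoots()            // would have installed it, too late
    // With the guard, the mark entrypoints are installed once and kept, and
    // compiled x86/x86_64 code gates the slow path on is_gc_marking instead
    // (see code_generator_x86*.cc above).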