More parallel GC, rewritten parallel mark stack processing.

Card scanning may now be done in parallel. This speeds up sticky GC and
reduces pause times for all GC types.

Speedup on my mako (ritz perf):
Average pause time for sticky GC (~250 samples):
Without parallel card scanning enabled: 2.524904215ms
Parallel card scanning (num_gc_threads_): 1.552123552ms
Throughput (~250 samples):
Sticky GC throughput with parallel card scanning: 69MB/s
Sticky GC throughput without parallel card scanning: 51MB/s

Rewrote the mark stack processing to be LIFO and use a prefetch queue
like the non-parallel version.

Cleaned up some of the logcat printing for the activity manager
process state listening.

Added unlikely hints to object scanning since arrays and classes are
scanned much less often than normal objects.

Fixed a bug where the number of GC threads was clamped to 1 due to a
bool being used instead of a size_t.

Fixed a race condition when we added references to the reference
queues. Sharded the reference queue lock into one lock for each reference
type (weak, soft, phantom, finalizer).

Changed timing splits to be different for processing gray objects with
and without mutators paused since sticky GC does both.

Mask out the class bit when visiting fields as an optimization; this is
valid since classes are held live by the class linker.

Partially completed: Parallel recursive mark + finger.

Bug: 10245302
Bug: 9969166
Bug: 9986532
Bug: 9961698

Change-Id: I142d09718c4609b7c2387cb28f517a6983c73288
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 8db03d3..ba12e64 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -54,7 +54,6 @@
   class ContinuousSpace;
 }  // namespace space
 
-class CheckObjectVisitor;
 class Heap;
 
 namespace collector {
@@ -126,7 +125,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Builds a mark stack with objects on dirty cards and recursively mark until it empties.
-  void RecursiveMarkDirtyObjects(byte minimum_age)
+  void RecursiveMarkDirtyObjects(bool paused, byte minimum_age)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -260,8 +259,13 @@
   // Unmarks an object by clearing the bit inside of the corresponding bitmap, or if it is in a
   // space set, removing the object from the set.
   void UnMarkObjectNonNull(const mirror::Object* obj)
-        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-        EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+  // Mark the vm thread roots.
+  virtual void MarkThreadRoots(Thread* self)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Marks an object atomically, safe to use from multiple threads.
   void MarkObjectNonNullParallel(const mirror::Object* obj);
@@ -342,20 +346,20 @@
   }
 
   // Blackens objects grayed during a garbage collection.
-  void ScanGrayObjects(byte minimum_age)
+  void ScanGrayObjects(bool paused, byte minimum_age)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Schedules an unmarked object for reference processing.
-  void DelayReferenceReferent(mirror::Object* reference)
+  void DelayReferenceReferent(mirror::Class* klass, mirror::Object* reference)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   // Recursively blackens objects on the mark stack.
-  void ProcessMarkStack()
+  void ProcessMarkStack(bool paused)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void ProcessMarkStackParallel()
+  void ProcessMarkStackParallel(bool paused)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -402,6 +406,9 @@
   mirror::Object* phantom_reference_list_;
   mirror::Object* cleared_reference_list_;
 
+  // Parallel finger.
+  AtomicInteger atomic_finger_;
+
   // Number of bytes freed in this collection.
   AtomicInteger freed_bytes_;
   // Number of objects freed in this collection.
@@ -431,7 +438,6 @@
  private:
   friend class AddIfReachesAllocSpaceVisitor;  // Used by mod-union table.
   friend class CheckBitmapVisitor;
-  friend class CheckObjectVisitor;
   friend class CheckReferenceVisitor;
   friend class art::gc::Heap;
   friend class InternTableEntryIsUnmarked;
@@ -445,7 +451,7 @@
   friend class ModUnionScanImageRootVisitor;
   friend class ScanBitmapVisitor;
   friend class ScanImageRootVisitor;
-  friend class MarkStackChunk;
+  template<bool kUseFinger> friend class MarkStackTask;
   friend class FifoMarkStackChunk;
 
   DISALLOW_COPY_AND_ASSIGN(MarkSweep);