Parallel mark stack processing

Enabled parallel mark stack processing by using a thread pool.

Optimized object scanning by removing dependent loads for IsClass.

Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.

Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/heap.h b/src/heap.h
index 8ed5881..6c4c38b 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -31,6 +31,7 @@
 #include "offsets.h"
 #include "safe_map.h"
 #include "timing_logger.h"
+#include "thread_pool.h"
 
 #define VERIFY_OBJECT_ENABLED 0
 
@@ -312,6 +313,13 @@
   // GC performance measuring
   void DumpGcPerformanceInfo();
 
+  // Thread pool.
+  void CreateThreadPool();
+  void DeleteThreadPool();
+  ThreadPool* GetThreadPool() {
+    return thread_pool_.get();
+  }
+
  private:
   // Allocates uninitialized storage. Passing in a null space tries to place the object in the
   // large object space.
@@ -408,6 +416,9 @@
   Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_);
 
+  // Reference queue lock
+  UniquePtr<Mutex> reference_queue_lock_;
+
   // True while the garbage collector is running.
   volatile bool is_gc_running_ GUARDED_BY(gc_complete_lock_);
 
@@ -450,6 +461,9 @@
   const bool verify_post_gc_heap_;
   const bool verify_mod_union_table_;
 
+  // Parallel GC data structures.
+  UniquePtr<ThreadPool> thread_pool_;
+
   // After how many GCs we force to do a partial GC instead of sticky mark bits GC.
   const size_t partial_gc_frequency_;