Fix GC performance regression

Enable concurrent mark sweep (CMS) and fix a performance regression caused by recursively marking the image spaces on every GC. Depends on my Java change list.
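
The key pieces: allocation now checks a watermark (concurrent_start_bytes_)
and, once it is crossed, signals the Java GC daemon instead of collecting
inline, and the mark phase no longer recursively walks the image spaces.
A minimal sketch of the trigger flow, using only names introduced in this
change (the daemon-side plumbing lives in the dependent Java change list):

    // Allocation fast path, heap lock held:
    if (!is_gc_running_ && num_bytes_allocated_ >= concurrent_start_bytes_) {
      RequestConcurrentGC();  // JNI up-call to Daemons.requestGC()
    }

    // The daemon thread then calls back down:
    //   VMRuntime.concurrentGC() -> Heap::ConcurrentGC()
    // which waits out any in-flight GC, runs
    // CollectGarbageInternal(true /* concurrent */, false), and broadcasts
    // condition_ to wake threads blocked in WaitForConcurrentGcToComplete().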

Change-Id: I4765792aa8226e811ac158f04ab88217db755573
diff --git a/src/heap.cc b/src/heap.cc
index 5129a41..49657e0 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -30,6 +30,7 @@
 #include "object_utils.h"
 #include "os.h"
 #include "scoped_heap_lock.h"
+#include "ScopedLocalRef.h"
 #include "space.h"
 #include "stl_util.h"
 #include "thread_list.h"
@@ -140,8 +141,13 @@
       card_table_(NULL),
       card_marking_disabled_(false),
       is_gc_running_(false),
+      concurrent_start_size_(128 * KB),
+      concurrent_min_free_(256 * KB),
+      try_running_gc_(false),
+      requesting_gc_(false),
       num_bytes_allocated_(0),
       num_objects_allocated_(0),
+      last_trim_time_(0),
       reference_referent_offset_(0),
       reference_queue_offset_(0),
       reference_queueNext_offset_(0),
@@ -245,6 +251,9 @@
   // but we can create the heap lock now. We don't create it earlier to
   // make it clear that you can't use locks during heap initialization.
   lock_ = new Mutex("Heap lock", kHeapLock);
+  condition_ = new ConditionVariable("Heap condition variable");
+
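+  // Start with the concurrent GC watermark disabled (max() means "never
+  // request"); the post-GC sizing code arms it once real limits are known.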
+  concurrent_start_bytes_ = std::numeric_limits<size_t>::max();
 
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() exiting";
@@ -266,6 +275,7 @@
   delete live_bitmap_;
   delete card_table_;
   delete mark_stack_;
+  delete condition_;
   delete lock_;
 }
 
@@ -296,6 +306,10 @@
     DCHECK_GE(byte_count, sizeof(Object));
     Object* obj = AllocateLocked(byte_count);
     if (obj != NULL) {
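+      // If the watermark has been crossed and no GC is running, ask the
+      // daemon thread for a concurrent GC rather than collecting inline.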
+      if (!is_gc_running_ && num_bytes_allocated_ >= concurrent_start_bytes_) {
+        RequestConcurrentGC();
+      }
+
       obj->SetClass(c);
       if (Dbg::IsAllocTrackingEnabled()) {
         Dbg::RecordAllocation(c, byte_count);
@@ -597,17 +611,26 @@
     mark_sweep.Init();
     timings.AddSplit("Init");
 
+    if (concurrent) {
+      card_table_->ClearNonImageSpaceCards(this);
+    }
+
     mark_sweep.MarkRoots();
     timings.AddSplit("MarkRoots");
 
+    if (!concurrent) {
+      mark_sweep.ScanDirtyImageRoots();
+      timings.AddSplit("ScanDirtyImageRoots");
+    }
+
     // Roots are marked on the bitmap and the mark_stack is empty
     DCHECK(mark_sweep.IsMarkStackEmpty());
 
     if (concurrent) {
-      timings.AddSplit("RootEnd");
       Unlock();
       thread_list->ResumeAll();
       rootEnd = NanoTime();
+      timings.AddSplit("RootEnd");
     }
 
     // Recursively mark all bits set in the non-image mark bitmap
@@ -623,12 +646,12 @@
       // Re-mark root set.
       mark_sweep.ReMarkRoots();
       timings.AddSplit("ReMarkRoots");
-    }
 
-    // Scan dirty objects, this is required even if we are not doing a
-    // concurrent GC since we use the card table to locate image roots.
-    mark_sweep.RecursiveMarkDirtyObjects();
-    timings.AddSplit("RecursiveMarkDirtyObjects");
+      // Scan dirty objects; this is only needed for a concurrent GC, since
+      // the non-concurrent case scans the dirty image roots up front.
+      mark_sweep.RecursiveMarkDirtyObjects();
+      timings.AddSplit("RecursiveMarkDirtyObjects");
+    }
 
     mark_sweep.ProcessReferences(clear_soft_references);
     timings.AddSplit("ProcessReferences");
@@ -667,7 +690,7 @@
     duration_ns = (duration_ns / 1000) * 1000;
     if (concurrent) {
       uint64_t pauseRootsTime = (rootEnd - t0) / 1000 * 1000;
-      uint64_t pauseDirtyTime = (t1 - dirtyBegin) / 1000 * 1000;
+      uint64_t pauseDirtyTime = (dirtyEnd - dirtyBegin) / 1000 * 1000;
       LOG(INFO) << "GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree() << "% free, "
                 << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory()) << ", "
                 << "paused " << PrettyDuration(pauseRootsTime) << "+" << PrettyDuration(pauseDirtyTime)
@@ -688,6 +711,15 @@
 
 void Heap::WaitForConcurrentGcToComplete() {
   lock_->AssertHeld();
+
+  // Wait on the heap condition variable until any in-progress GC completes.
+  if (is_gc_running_) {
+    uint64_t waitStart = NanoTime();
+    do {
+      condition_->Wait(*lock_);
+    } while (is_gc_running_);
+    LOG(INFO) << "WaitForConcurrentGcToComplete blocked for " << PrettyDuration(NanoTime() - waitStart);
+  }
 }
 
 void Heap::DumpForSigQuit(std::ostream& os) {
@@ -731,6 +763,14 @@
     target_size = num_bytes_allocated_ + kHeapMinFree;
   }
 
+  // Calculate when to perform the next ConcurrentGC.
+  if (GetTotalMemory() - num_bytes_allocated_ < concurrent_min_free_) {
+    // Not enough free memory to perform concurrent GC.
+    concurrent_start_bytes_ = std::numeric_limits<size_t>::max();
+  } else {
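+    // Arm the watermark concurrent_start_size_ bytes below the footprint
+    // limit, so a concurrent GC can finish before allocations hit the limit.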
+    concurrent_start_bytes_ = alloc_space_->GetFootprintLimit() - concurrent_start_size_;
+  }
+
   SetIdealFootprint(target_size);
 }
 
@@ -854,6 +894,33 @@
   }
 }
 
+void Heap::RequestConcurrentGC() {
+  // Bail out if we are already requesting (the JNI call below can allocate
+  // and re-enter) or if the daemon threads have not been started yet.
+  if (requesting_gc_ || !Runtime::Current()->IsFinishedStarting()) {
+    return;
+  }
+
+  requesting_gc_ = true;
+  JNIEnv* env = Thread::Current()->GetJniEnv();
+  env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, WellKnownClasses::java_lang_Daemons_requestGC);
+  CHECK(!env->ExceptionCheck());
+  requesting_gc_ = false;
+}
+
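+// Runs on the GC daemon thread, entered via VMRuntime.concurrentGC().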
+void Heap::ConcurrentGC() {
+  ScopedHeapLock heap_lock;
+  WaitForConcurrentGcToComplete();
+  // Current thread needs to be runnable or else we can't suspend all threads.
+  ScopedThreadStateChange tsc(Thread::Current(), kRunnable);
+  CollectGarbageInternal(true, false);
+  condition_->Broadcast();  // Wake anyone blocked in WaitForConcurrentGcToComplete.
+}
+
+void Heap::Trim() {
+  ScopedHeapLock heap_lock;
+  GetAllocSpace()->Trim();
+}
+
 void Heap::RequestHeapTrim() {
   // We don't have a good measure of how worthwhile a trim might be. We can't use the live bitmap
   // because that only marks object heads, so a large array looks like lots of empty space. We
@@ -862,15 +929,17 @@
   // We could try mincore(2) but that's only a measure of how many pages we haven't given away,
   // not how much use we're making of those pages.
   float utilization = static_cast<float>(num_bytes_allocated_) / alloc_space_->Size();
-  if (utilization > 0.75f) {
-    // Don't bother trimming the heap if it's more than 75% utilized.
-    // (This percentage was picked arbitrarily.)
+  uint64_t ms_time = NsToMs(NanoTime());
+  if (utilization > 0.75f || ms_time - last_trim_time_ < 2 * 1000) {
+    // Don't bother trimming the heap if it's more than 75% utilized, or if a
+    // heap trim occurred in the last two seconds.
     return;
   }
-  if (!Runtime::Current()->IsStarted()) {
-    // Heap trimming isn't supported without a Java runtime (such as at dex2oat time)
+  if (!Runtime::Current()->IsFinishedStarting()) {
+    // Heap trimming isn't supported without a Java runtime and its daemon
+    // threads (such as at dex2oat time).
     return;
   }
+  last_trim_time_ = ms_time;
   JNIEnv* env = Thread::Current()->GetJniEnv();
   env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, WellKnownClasses::java_lang_Daemons_requestHeapTrim);
   CHECK(!env->ExceptionCheck());
diff --git a/src/heap.h b/src/heap.h
index de3caa2..a6fb4d4 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -84,6 +84,10 @@
   // Initiates an explicit garbage collection.
   void CollectGarbage(bool clear_soft_references);
 
+  // Does a concurrent GC; should only be called by the GC daemon thread,
+  // via the runtime.
+  void ConcurrentGC();
+
   // Implements java.lang.Runtime.maxMemory.
   int64_t GetMaxMemory();
   // Implements java.lang.Runtime.totalMemory.
@@ -218,8 +222,21 @@
     return alloc_space_;
   }
 
+  size_t GetConcurrentStartSize() const { return concurrent_start_size_; }
+
+  void SetConcurrentStartSize(size_t size) {
+    concurrent_start_size_ = size;
+  }
+
+  size_t GetConcurrentMinFree() const { return concurrent_min_free_; }
+
+  void SetConcurrentMinFree(size_t size) {
+    concurrent_min_free_ = size;
+  }
+
   void DumpForSigQuit(std::ostream& os);
 
+  void Trim();
  private:
   // Allocates uninitialized storage.
   Object* AllocateLocked(size_t num_bytes);
@@ -229,6 +246,7 @@
   void EnqueueClearedReferences(Object** cleared_references);
 
   void RequestHeapTrim();
+  void RequestConcurrentGC();
 
   void RecordAllocationLocked(AllocSpace* space, const Object* object);
   void RecordImageAllocations(Space* space);
@@ -251,6 +269,7 @@
   static void VerificationCallback(Object* obj, void* arg);
 
   Mutex* lock_;
+  ConditionVariable* condition_;
 
   std::vector<Space*> spaces_;
 
@@ -272,6 +291,19 @@
   // True while the garbage collector is running.
   bool is_gc_running_;
 
+  // Watermark: once num_bytes_allocated_ reaches this, request a concurrent GC.
+  size_t concurrent_start_bytes_;
+  size_t concurrent_start_size_;
+  size_t concurrent_min_free_;
+
+  // True while the garbage collector is trying to signal the GC daemon thread.
+  // This flag prevents recursion, since the JNI calls used to signal the
+  // daemon can themselves allocate memory and request another GC.
+  bool try_running_gc_;
+
+  // Used to ensure that we don't ever recursively request GC.
+  bool requesting_gc_;
+
   // Mark stack that we reuse to avoid re-allocating the mark stack
   MarkStack* mark_stack_;
 
@@ -281,6 +313,9 @@
   // Number of objects allocated.  Adjusted after each allocation and free.
   size_t num_objects_allocated_;
 
+  // Time of the last heap trim, in milliseconds.
+  uint64_t last_trim_time_;
+
   // offset of java.lang.ref.Reference.referent
   MemberOffset reference_referent_offset_;
 
diff --git a/src/mark_sweep.cc b/src/mark_sweep.cc
index d070a57..45ad0fe 100644
--- a/src/mark_sweep.cc
+++ b/src/mark_sweep.cc
@@ -58,9 +58,6 @@
   live_bitmap_ = heap_->GetLiveBits();
   mark_stack_->Reset();
 
-  // TODO: if concurrent, clear the card table.
-  heap_->GetCardTable()->ClearNonImageSpaceCards(heap_);
-
   // TODO: if concurrent, enable card marking in compiler
 
   // TODO: check that the mark bitmap is entirely clear.
@@ -204,9 +201,11 @@
   void* arg = reinterpret_cast<void*>(this);
   const std::vector<Space*>& spaces = heap_->GetSpaces();
   for (size_t i = 0; i < spaces.size(); ++i) {
-    uintptr_t begin = reinterpret_cast<uintptr_t>(spaces[i]->Begin());
-    uintptr_t end = reinterpret_cast<uintptr_t>(spaces[i]->End());
-    mark_bitmap_->ScanWalk(begin, end, &MarkSweep::ScanBitmapCallback, arg);
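+    // Image spaces are never collected, so recursively marking them only
+    // re-traces immutable image objects on every GC (the regression this
+    // change fixes); dirty image roots are found via the card table instead.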
+    if (!spaces[i]->IsImageSpace()) {
+      uintptr_t begin = reinterpret_cast<uintptr_t>(spaces[i]->Begin());
+      uintptr_t end = reinterpret_cast<uintptr_t>(spaces[i]->End());
+      mark_bitmap_->ScanWalk(begin, end, &MarkSweep::ScanBitmapCallback, arg);
+    }
   }
   finger_ = reinterpret_cast<Object*>(~0);
   // TODO: tune the frequency of emptying the mark stack
diff --git a/src/native/dalvik_system_VMRuntime.cc b/src/native/dalvik_system_VMRuntime.cc
index a500f6a..09ca251 100644
--- a/src/native/dalvik_system_VMRuntime.cc
+++ b/src/native/dalvik_system_VMRuntime.cc
@@ -166,7 +166,8 @@
   size_t alloc_space_size = heap->GetAllocSpace()->Size();
   float utilization = static_cast<float>(heap->GetBytesAllocated()) / alloc_space_size;
   uint64_t start_ns = NanoTime();
-  heap->GetAllocSpace()->Trim();
+
+  heap->Trim();
 
   // Trim the native heap.
   dlmalloc_trim(0);
@@ -181,11 +182,16 @@
             << " heap with " << static_cast<int>(100 * utilization) << "% utilization";
 }
 
+static void VMRuntime_concurrentGC(JNIEnv*, jobject) {
+  Runtime::Current()->GetHeap()->ConcurrentGC();
+}
+
 static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(VMRuntime, addressOf, "(Ljava/lang/Object;)J"),
   NATIVE_METHOD(VMRuntime, bootClassPath, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, classPath, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, clearGrowthLimit, "()V"),
+  NATIVE_METHOD(VMRuntime, concurrentGC, "()V"),
   NATIVE_METHOD(VMRuntime, disableJitCompilation, "()V"),
   NATIVE_METHOD(VMRuntime, getTargetHeapUtilization, "()F"),
   NATIVE_METHOD(VMRuntime, isDebuggerActive, "()Z"),
diff --git a/src/runtime.cc b/src/runtime.cc
index 784a3f6..1a66951 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -69,6 +69,7 @@
       system_class_loader_(NULL),
       shutting_down_(false),
       started_(false),
+      finished_starting_(false),
       vfprintf_(NULL),
       exit_(NULL),
       abort_(NULL),
@@ -573,6 +574,8 @@
   Thread::Current()->GetJniEnv()->locals.AssertEmpty();
 
   VLOG(startup) << "Runtime::Start exiting";
+
+  finished_starting_ = true;
 }
 
 void Runtime::DidForkFromZygote() {
diff --git a/src/runtime.h b/src/runtime.h
index 55dab07..d80f1c9 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -116,6 +116,10 @@
     return started_;
   }
 
+  bool IsFinishedStarting() const {
+    return finished_starting_;
+  }
+
   static Runtime* Current() {
     return instance_;
   }
@@ -371,6 +375,11 @@
   bool shutting_down_;
   bool started_;
 
+  // Set once the runtime has finished starting: the daemon threads have been
+  // created and the system class loader has been created. Needed to know when
+  // it is safe to request a concurrent GC (CMS).
+  bool finished_starting_;
+
   // Hooks supported by JNI_CreateJavaVM
   jint (*vfprintf_)(FILE* stream, const char* format, va_list ap);
   void (*exit_)(jint status);
diff --git a/src/thread.cc b/src/thread.cc
index e1764ef..f6b9a44 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -756,7 +756,9 @@
     delay = new_delay;
     if (delay == 0) {
       sched_yield();
-      delay = 10000;
+      // Default to 1 millisecond (note that the 500us is doubled before the
+      // first sleep).
+      delay = 500;
     } else {
       usleep(delay);
       total_delay += delay;
diff --git a/src/thread_list.cc b/src/thread_list.cc
index 0dc06f8..a1e9d2f 100644
--- a/src/thread_list.cc
+++ b/src/thread_list.cc
@@ -23,6 +23,7 @@
 #include "debugger.h"
 #include "scoped_heap_lock.h"
 #include "scoped_thread_list_lock.h"
+#include "timing_logger.h"
 #include "utils.h"
 
 namespace art {
@@ -159,7 +160,6 @@
   CHECK_EQ(self->GetState(), kRunnable);
   ScopedThreadListLock thread_list_lock;
   Thread* debug_thread = Dbg::GetDebugThread();
-
   {
     // Increment everybody's suspend count (except our own).
     MutexLock mu(thread_suspend_count_lock_);
diff --git a/src/thread_list.h b/src/thread_list.h
index 9e45bfb..dc6e820 100644
--- a/src/thread_list.h
+++ b/src/thread_list.h
@@ -22,6 +22,8 @@
 
 namespace art {
 
+class TimingLogger;
+
 class ThreadList {
  public:
   static const uint32_t kMaxThreadId = 0xFFFF;
diff --git a/src/well_known_classes.cc b/src/well_known_classes.cc
index 20e71d5..a0397ff 100644
--- a/src/well_known_classes.cc
+++ b/src/well_known_classes.cc
@@ -45,6 +45,7 @@
 jmethodID WellKnownClasses::com_android_dex_Dex_create;
 jmethodID WellKnownClasses::java_lang_ClassNotFoundException_init;
 jmethodID WellKnownClasses::java_lang_ClassLoader_loadClass;
+jmethodID WellKnownClasses::java_lang_Daemons_requestGC;
 jmethodID WellKnownClasses::java_lang_Daemons_requestHeapTrim;
 jmethodID WellKnownClasses::java_lang_Daemons_start;
 jmethodID WellKnownClasses::java_lang_ref_FinalizerReference_add;
@@ -123,6 +124,8 @@
   com_android_dex_Dex_create = CacheMethod(env, com_android_dex_Dex, true, "create", "(Ljava/nio/ByteBuffer;)Lcom/android/dex/Dex;");
   java_lang_ClassNotFoundException_init = CacheMethod(env, java_lang_ClassNotFoundException, false, "<init>", "(Ljava/lang/String;Ljava/lang/Throwable;)V");
   java_lang_ClassLoader_loadClass = CacheMethod(env, java_lang_ClassLoader, false, "loadClass", "(Ljava/lang/String;)Ljava/lang/Class;");
+
+  java_lang_Daemons_requestGC = CacheMethod(env, java_lang_Daemons, true, "requestGC", "()V");
   java_lang_Daemons_requestHeapTrim = CacheMethod(env, java_lang_Daemons, true, "requestHeapTrim", "()V");
   java_lang_Daemons_start = CacheMethod(env, java_lang_Daemons, true, "start", "()V");
 
diff --git a/src/well_known_classes.h b/src/well_known_classes.h
index 6a6dd60..d2c4959 100644
--- a/src/well_known_classes.h
+++ b/src/well_known_classes.h
@@ -53,6 +53,7 @@
   static jmethodID com_android_dex_Dex_create;
   static jmethodID java_lang_ClassLoader_loadClass;
   static jmethodID java_lang_ClassNotFoundException_init;
+  static jmethodID java_lang_Daemons_requestGC;
   static jmethodID java_lang_Daemons_requestHeapTrim;
   static jmethodID java_lang_Daemons_start;
   static jmethodID java_lang_ref_FinalizerReference_add;