Parallel mark stack processing
Enabled parallel mark stack processing by using a thread pool.
Optimized object scanning by removing dependent loads for IsClass.
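For reviewers, the shape of the change is roughly the following. This is a
minimal, self-contained sketch of parallel mark stack draining, not ART's
actual MarkSweep/ThreadPool code; Object, ScanChunk, and ParallelMark are
hypothetical stand-ins:

    // Minimal sketch (C++11, compile with -pthread). Object, ScanChunk and
    // ParallelMark are illustrative, not ART classes.
    #include <algorithm>
    #include <atomic>
    #include <thread>
    #include <vector>

    struct Object {
      std::atomic<bool> marked{false};
      std::vector<Object*> refs;  // outgoing references
    };

    // Each worker drains its own chunk of the shared mark stack, pushing
    // newly discovered objects onto a worker-local stack so pushes need no
    // synchronization.
    static void ScanChunk(Object** begin, Object** end) {
      std::vector<Object*> local(begin, end);
      while (!local.empty()) {
        Object* obj = local.back();
        local.pop_back();
        for (Object* ref : obj->refs) {
          // Claim the object atomically so exactly one worker scans it.
          bool expected = false;
          if (ref->marked.compare_exchange_strong(expected, true)) {
            local.push_back(ref);
          }
        }
      }
    }

    void ParallelMark(std::vector<Object*>& mark_stack) {
      // As in the patch: N-1 workers, because the GC thread also scans a
      // chunk while it's waiting for the workers to complete.
      size_t n = std::thread::hardware_concurrency();
      size_t num_workers = n > 1 ? n - 1 : 0;
      size_t chunks = num_workers + 1;
      size_t chunk_size = (mark_stack.size() + chunks - 1) / chunks;
      Object** data = mark_stack.data();
      std::vector<std::thread> pool;
      for (size_t i = 1; i < chunks; ++i) {
        size_t b = std::min(i * chunk_size, mark_stack.size());
        size_t e = std::min(b + chunk_size, mark_stack.size());
        pool.emplace_back(ScanChunk, data + b, data + e);
      }
      ScanChunk(data, data + std::min(chunk_size, mark_stack.size()));
      for (std::thread& t : pool) {
        t.join();
      }
      mark_stack.clear();
    }

The real change hands work to a reusable ThreadPool rather than spawning
threads per GC; the sketch only shows where the atomic claim and the N-1
sizing come from.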
Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.
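
One note on the locking below: with several workers scanning objects at once,
java.lang.ref references can be discovered concurrently, so appending to the
shared circular pending list now needs the new reference_queue_lock_ (see the
TODO about replacing it with atomic stacks). A simplified stand-in for the
enqueue path, not ART's real field layout:

    // Reference/pending_next are illustrative; ART stores pendingNext as an
    // object field at reference_pendingNext_offset_.
    #include <mutex>

    struct Reference {
      Reference* pending_next = nullptr;
    };

    std::mutex reference_queue_lock;

    // *list points at the tail of a circular singly linked list; the tail's
    // pending_next is the head. Without the lock, two workers could read the
    // same head and one of the two splices would be lost.
    void EnqueuePendingReference(Reference* ref, Reference** list) {
      std::lock_guard<std::mutex> mu(reference_queue_lock);
      if (*list == nullptr) {
        ref->pending_next = ref;  // singleton list points at itself
        *list = ref;
      } else {
        ref->pending_next = (*list)->pending_next;
        (*list)->pending_next = ref;
      }
    }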
Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/heap.cc b/src/heap.cc
index d12d20e..b4cf4a9 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -45,6 +45,7 @@
namespace art {
+static const bool kDumpGcPerformanceOnShutdown = false;
const double Heap::kDefaultTargetUtilization = 0.5;
static bool GenerateImage(const std::string& image_file_name) {
@@ -282,6 +283,11 @@
gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable",
*gc_complete_lock_));
+ // Create the reference queue lock; this is required for parallel object scanning in the GC.
+ reference_queue_lock_.reset(new Mutex("reference queue lock"));
+
+ CHECK(max_allowed_footprint_ != 0);
+
// Set up the cumulative timing loggers.
for (size_t i = static_cast<size_t>(kGcTypeSticky); i < static_cast<size_t>(kGcTypeMax);
++i) {
@@ -296,6 +302,17 @@
}
}
+void Heap::CreateThreadPool() {
+ // TODO: Make sysconf(_SC_NPROCESSORS_CONF) be a helper function?
+ // Use the number of processors - 1 since the thread doing the GC does work while it's waiting
+ // for the workers to complete.
+ thread_pool_.reset(new ThreadPool(sysconf(_SC_NPROCESSORS_CONF) - 1));
+}
+
+void Heap::DeleteThreadPool() {
+ thread_pool_.reset(NULL);
+}
+
// Sort spaces based on begin address
struct SpaceSorter {
bool operator ()(const ContinuousSpace* a, const ContinuousSpace* b) const {
@@ -369,6 +386,10 @@
}
Heap::~Heap() {
+ if (kDumpGcPerformanceOnShutdown) {
+ DumpGcPerformanceInfo();
+ }
+
// If we don't reset then the mark stack complains in its destructor.
allocation_stack_->Reset();
live_stack_->Reset();
@@ -947,12 +968,11 @@
// We need to do partial GCs every now and then to avoid the heap growing too much and
// fragmenting.
if (gc_type == kGcTypeSticky && ++sticky_gc_count_ > partial_gc_frequency_) {
- gc_type = kGcTypePartial;
+ gc_type = have_zygote_space_ ? kGcTypePartial : kGcTypeFull;
}
if (gc_type != kGcTypeSticky) {
sticky_gc_count_ = 0;
}
-
if (concurrent_gc_) {
CollectGarbageConcurrentMarkSweepPlan(self, gc_type, gc_cause, clear_soft_references);
} else {
@@ -1049,9 +1069,6 @@
mark_sweep.MarkConcurrentRoots();
timings.AddSplit("MarkRoots");
- // Roots are marked on the bitmap and the mark_stack is empty.
- DCHECK(mark_stack_->IsEmpty());
-
UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
if (gc_type != kGcTypeSticky) {
@@ -1079,15 +1096,14 @@
mark_sweep.ProcessReferences(clear_soft_references);
timings.AddSplit("ProcessReferences");
-#ifndef NDEBUG
- // Verify that we only reach marked objects from the image space
- mark_sweep.VerifyImageRoots();
- timings.AddSplit("VerifyImageRoots");
-#endif
+ if (kIsDebugBuild) {
+ // Verify that we only reach marked objects from the image space
+ mark_sweep.VerifyImageRoots();
+ timings.AddSplit("VerifyImageRoots");
+ }
if (gc_type != kGcTypeSticky) {
- mark_sweep.Sweep(gc_type == kGcTypePartial, false);
- timings.AddSplit("Sweep");
+ mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
mark_sweep.SweepLargeObjects(false);
timings.AddSplit("SweepLargeObjects");
} else {
@@ -1098,15 +1114,12 @@
// Unbind the live and mark bitmaps.
mark_sweep.UnBindBitmaps();
-
- const bool swap = true;
- if (swap) {
- if (gc_type == kGcTypeSticky) {
- SwapLargeObjects();
- } else {
- SwapBitmaps(gc_type);
- }
+ if (gc_type == kGcTypeSticky) {
+ SwapLargeObjects();
+ } else {
+ SwapBitmaps(gc_type);
}
+ timings.AddSplit("SwapBitmaps");
if (verify_system_weaks_) {
mark_sweep.VerifySystemWeaks();
@@ -1600,9 +1613,6 @@
ClearCards(timings);
}
- // Roots are marked on the bitmap and the mark_stack is empty.
- DCHECK(mark_stack_->IsEmpty());
-
if (verify_mod_union_table_) {
ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
zygote_mod_union_table_->Update();
@@ -1657,7 +1667,7 @@
timings.AddSplit("ReMarkRoots");
// Scan dirty objects; this is only required if we are not doing concurrent GC.
- mark_sweep.RecursiveMarkDirtyObjects(false);
+ mark_sweep.RecursiveMarkDirtyObjects();
timings.AddSplit("RecursiveMarkDirtyObjects");
}
@@ -1721,8 +1731,7 @@
// TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above).
if (gc_type != kGcTypeSticky) {
WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
- mark_sweep.Sweep(gc_type == kGcTypePartial, false);
- timings.AddSplit("Sweep");
+ mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
mark_sweep.SweepLargeObjects(false);
timings.AddSplit("SweepLargeObjects");
} else {
@@ -1740,14 +1749,12 @@
// Swap the live and mark bitmaps for each space that we modified. This is an
// optimization that enables us to not clear live bits inside of the sweep.
- const bool swap = true;
- if (swap) {
- if (gc_type == kGcTypeSticky) {
- SwapLargeObjects();
- } else {
- SwapBitmaps(gc_type);
- }
+ if (gc_type == kGcTypeSticky) {
+ SwapLargeObjects();
+ } else {
+ SwapBitmaps(gc_type);
}
+ timings.AddSplit("SwapBitmaps");
}
if (verify_system_weaks_) {
@@ -1850,7 +1857,7 @@
void Heap::GrowForUtilization() {
// We know what our utilization is at this moment.
// This doesn't actually resize any memory. It just lets the heap grow more when necessary.
- size_t target_size = num_bytes_allocated_ / Heap::GetTargetHeapUtilization();
+ size_t target_size = num_bytes_allocated_ / GetTargetHeapUtilization();
if (target_size > num_bytes_allocated_ + max_free_) {
target_size = num_bytes_allocated_ + max_free_;
} else if (target_size < num_bytes_allocated_ + min_free_) {
@@ -1870,7 +1877,6 @@
}
void Heap::ClearGrowthLimit() {
- WaitForConcurrentGcToComplete(Thread::Current());
alloc_space_->ClearGrowthLimit();
}
@@ -1922,6 +1928,8 @@
DCHECK(ref != NULL);
DCHECK(list != NULL);
+ // TODO: Remove this lock; use atomic stacks for storing references.
+ MutexLock mu(Thread::Current(), *reference_queue_lock_);
if (*list == NULL) {
ref->SetFieldObject(reference_pendingNext_offset_, ref, false);
*list = ref;
@@ -1937,6 +1945,9 @@
DCHECK(*list != NULL);
Object* head = (*list)->GetFieldObject<Object*>(reference_pendingNext_offset_, false);
Object* ref;
+
+ // TODO: Remove this lock; use atomic stacks for storing references.
+ MutexLock mu(Thread::Current(), *reference_queue_lock_);
if (*list == head) {
ref = *list;
*list = NULL;