Parallel mark stack processing
Enabled parallel mark stack processing by using a thread pool.
Optimized object scanning by removing dependent loads for IsClass.
Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.
Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
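
For context, the chunked scheme implemented in src/gc/mark_sweep.cc can be
sketched standalone: the mark stack is split into fixed-size chunks, each
chunk becomes a thread-pool task, and references discovered while scanning
are buffered into a fresh chunk that is queued as another task. All types
below are illustrative stand-ins, not the ART ones, and where the real
MarkStackChunk caps its output buffer at max_size and queues it as it fills,
this sketch queues one output chunk per input chunk for brevity.

    // Illustrative sketch only; simplified types, not the ART implementation.
    #include <algorithm>
    #include <atomic>
    #include <deque>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Object {
      std::vector<Object*> refs;
      std::atomic<bool> marked{false};
    };

    // Stand-in for SpaceBitmap::AtomicTestAndSet(): true iff we won the race.
    static bool AtomicMark(Object* obj) {
      return !obj->marked.exchange(true, std::memory_order_relaxed);
    }

    static std::mutex queue_lock;
    static std::deque<std::vector<Object*>> work_queue;
    static size_t outstanding = 0;  // chunks queued or currently being scanned

    static void Enqueue(std::vector<Object*> chunk) {
      std::lock_guard<std::mutex> lock(queue_lock);
      ++outstanding;
      work_queue.push_back(std::move(chunk));
    }

    static void Worker() {
      for (;;) {
        std::vector<Object*> chunk;
        {
          std::lock_guard<std::mutex> lock(queue_lock);
          if (work_queue.empty()) {
            if (outstanding == 0) return;  // nothing queued, nothing in flight
          } else {
            chunk = std::move(work_queue.front());
            work_queue.pop_front();
          }
        }
        if (chunk.empty()) continue;  // spin; a real pool blocks on a condvar
        std::vector<Object*> output;  // plays the role of MarkStackChunk::output_
        for (Object* obj : chunk) {
          for (Object* ref : obj->refs) {
            if (ref != nullptr && AtomicMark(ref)) {
              output.push_back(ref);
            }
          }
        }
        if (!output.empty()) {
          Enqueue(std::move(output));  // hand newly marked objects to any worker
        }
        std::lock_guard<std::mutex> lock(queue_lock);
        --outstanding;
      }
    }

    // Roots are assumed marked already; 1024 mirrors MarkStackChunk::max_size.
    void ParallelMark(const std::vector<Object*>& mark_stack, size_t num_threads) {
      const size_t kChunkSize = 1024;
      for (size_t i = 0; i < mark_stack.size(); i += kChunkSize) {
        size_t end = std::min(i + kChunkSize, mark_stack.size());
        Enqueue(std::vector<Object*>(mark_stack.begin() + i, mark_stack.begin() + end));
      }
      std::vector<std::thread> threads;
      for (size_t i = 0; i < num_threads; ++i) threads.emplace_back(Worker);
      for (std::thread& t : threads) t.join();
    }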
diff --git a/src/atomic_integer.h b/src/atomic_integer.h
index adf3e77..22cc7b4 100644
--- a/src/atomic_integer.h
+++ b/src/atomic_integer.h
@@ -71,6 +71,10 @@
int32_t operator -- () {
return android_atomic_dec(&value_) - 1;
}
+
+ int CompareAndSwap(int expected_value, int new_value) {
+ return android_atomic_cas(expected_value, new_value, &value_);
+ }
private:
int32_t value_;
};
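
Note the return convention inherited from libcutils: android_atomic_cas()
yields 0 on success and nonzero when the stored value did not equal
expected_value, so callers loop until CompareAndSwap() returns 0. A hedged
usage sketch (AtomicMax is hypothetical, and the implicit int conversion is
assumed from AtomicInteger's uses elsewhere in this change):

    // Hypothetical helper, not part of this change: raise value to at least
    // candidate without a lock.
    void AtomicMax(AtomicInteger& value, int candidate) {
      int old_value = value;  // assumed implicit conversion to int
      while (old_value < candidate &&
             value.CompareAndSwap(old_value, candidate) != 0) {
        old_value = value;  // another thread moved it; reload and retry
      }
    }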
diff --git a/src/barrier_test.cc b/src/barrier_test.cc
index 43b279e..7b31e29 100644
--- a/src/barrier_test.cc
+++ b/src/barrier_test.cc
@@ -24,9 +24,9 @@
#include "UniquePtr.h"
namespace art {
-class CheckWaitClosure : public Closure {
+class CheckWaitTask : public Task {
public:
- CheckWaitClosure(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
+ CheckWaitTask(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
AtomicInteger* count3)
: barrier_(barrier),
count1_(count1),
@@ -44,6 +44,9 @@
barrier_->Wait(self);
++*count3_;
LOG(INFO) << "After barrier 2 " << self;
+ }
+
+ virtual void Finalize() {
delete this;
}
private:
@@ -69,7 +72,7 @@
AtomicInteger count2 = 0;
AtomicInteger count3 = 0;
for (int32_t i = 0; i < num_threads; ++i) {
- thread_pool.AddTask(self, new CheckWaitClosure(&barrier, &count1, &count2, &count3));
+ thread_pool.AddTask(self, new CheckWaitTask(&barrier, &count1, &count2, &count3));
}
thread_pool.StartWorkers(self);
barrier.Increment(self, num_threads);
@@ -91,9 +94,9 @@
EXPECT_EQ(num_threads, count3);
}
-class CheckPassClosure : public Closure {
+class CheckPassTask : public Task {
public:
- CheckPassClosure(Barrier* barrier, AtomicInteger* count, size_t subtasks)
+ CheckPassTask(Barrier* barrier, AtomicInteger* count, size_t subtasks)
: barrier_(barrier),
count_(count),
subtasks_(subtasks) {
@@ -106,6 +109,9 @@
// Pass through to next subtask.
barrier_->Pass(self);
}
+ }
+
+ void Finalize() {
delete this;
}
private:
@@ -123,7 +129,7 @@
const int32_t num_tasks = num_threads * 4;
const int32_t num_sub_tasks = 128;
for (int32_t i = 0; i < num_tasks; ++i) {
- thread_pool.AddTask(self, new CheckPassClosure(&barrier, &count, num_sub_tasks));
+ thread_pool.AddTask(self, new CheckPassTask(&barrier, &count, num_sub_tasks));
}
thread_pool.StartWorkers(self);
const int32_t expected_total_tasks = num_sub_tasks * num_tasks;
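
The Closure-to-Task renames in this test (and in compiler.cc below) track a
new two-phase lifecycle: the pool now calls Run() and then Finalize() on each
task (see ThreadPoolWorker::Run() in thread_pool.cc), so self-owning tasks
move `delete this` into Finalize() and callers no longer need
STLDeleteElements. thread_pool.h is not part of this diff; the assumed shape
of the interface:

    // Assumed from usage in this change, not shown in the diff:
    class Task : public Closure {
     public:
      // Inherited from Closure: virtual void Run(Thread* self) = 0;
      // Invoked by the worker after Run() returns; tasks that own themselves
      // implement this as `delete this`.
      virtual void Finalize() {}
    };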
diff --git a/src/class_linker.cc b/src/class_linker.cc
index dc86aed..ce9b37b 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -240,6 +240,7 @@
SirtRef<Class>
java_lang_Class(self, down_cast<Class*>(heap->AllocObject(self, NULL, sizeof(ClassClass))));
CHECK(java_lang_Class.get() != NULL);
+ Class::SetClassClass(java_lang_Class.get());
java_lang_Class->SetClass(java_lang_Class.get());
java_lang_Class->SetClassSize(sizeof(ClassClass));
// AllocClass(Class*) can now be used
@@ -972,11 +973,12 @@
Object* dex_caches_object = space->GetImageHeader().GetImageRoot(ImageHeader::kDexCaches);
ObjectArray<DexCache>* dex_caches = dex_caches_object->AsObjectArray<DexCache>();
+ ObjectArray<Class>* class_roots =
+ space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots)->AsObjectArray<Class>();
+
// Special case of setting up the String class early so that we can test arbitrary objects
// as being Strings or not
- Class* java_lang_String = space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots)
- ->AsObjectArray<Class>()->Get(kJavaLangString);
- String::SetClass(java_lang_String);
+ String::SetClass(class_roots->Get(kJavaLangString));
CHECK_EQ(oat_file->GetOatHeader().GetDexFileCount(),
static_cast<uint32_t>(dex_caches->GetLength()));
@@ -1004,9 +1006,8 @@
}
// reinit class_roots_
- Object* class_roots_object =
- heap->GetImageSpace()->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots);
- class_roots_ = class_roots_object->AsObjectArray<Class>();
+ Class::SetClassClass(class_roots->Get(kJavaLangClass));
+ class_roots_ = class_roots;
// reinit array_iftable_ from any array class instance, they should be ==
array_iftable_ = GetClassRoot(kObjectArrayClass)->GetIfTable();
@@ -1112,6 +1113,7 @@
ClassLinker::~ClassLinker() {
+ Class::ResetClass();
String::ResetClass();
Field::ResetClass();
AbstractMethod::ResetClasses();
diff --git a/src/common_test.h b/src/common_test.h
index 67d2266..f564bbd 100644
--- a/src/common_test.h
+++ b/src/common_test.h
@@ -388,6 +388,10 @@
compiler_.reset(new Compiler(compiler_backend, instruction_set, true, 2, false, image_classes_.get(),
true, true));
+ // Create the heap thread pool so that the GC runs in parallel for tests. Normally, the thread
+ // pool is created by the runtime.
+ runtime_->GetHeap()->CreateThreadPool();
+
runtime_->GetHeap()->VerifyHeap(); // Check for heap corruption before the test
}
diff --git a/src/compiler.cc b/src/compiler.cc
index b096912..4c9860c 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -993,7 +993,7 @@
self->AssertNoPendingException();
CHECK_GT(work_units, 0U);
- std::vector<Closure*> closures(work_units);
+ std::vector<ForAllClosure*> closures(work_units);
for (size_t i = 0; i < work_units; ++i) {
closures[i] = new ForAllClosure(this, begin + i, end, callback, work_units);
thread_pool_->AddTask(self, closures[i]);
@@ -1006,13 +1006,11 @@
// Wait for all the worker threads to finish.
thread_pool_->Wait(self);
-
- STLDeleteElements(&closures);
}
private:
- class ForAllClosure : public Closure {
+ class ForAllClosure : public Task {
public:
ForAllClosure(CompilationContext* context, size_t begin, size_t end, Callback* callback,
size_t stripe)
@@ -1031,6 +1029,10 @@
self->AssertNoPendingException();
}
}
+
+ virtual void Finalize() {
+ delete this;
+ }
private:
CompilationContext* const context_;
const size_t begin_;
diff --git a/src/gc/heap_bitmap.h b/src/gc/heap_bitmap.h
index 1610df8..666fcc7 100644
--- a/src/gc/heap_bitmap.h
+++ b/src/gc/heap_bitmap.h
@@ -38,9 +38,9 @@
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
SpaceBitmap* bitmap = GetSpaceBitmap(obj);
if (LIKELY(bitmap != NULL)) {
- return bitmap->Clear(obj);
+ bitmap->Clear(obj);
} else {
- return large_objects_->Clear(obj);
+ large_objects_->Clear(obj);
}
}
diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc
index e93eb1a..4662cf6 100644
--- a/src/gc/mark_sweep.cc
+++ b/src/gc/mark_sweep.cc
@@ -41,8 +41,17 @@
namespace art {
+// Performance options.
+static const bool kParallelMarkStack = true;
+static const bool kDisableFinger = true;
static const bool kUseMarkStackPrefetch = true;
+// Profiling and information flags.
+static const bool kCountClassesMarked = false;
+static const bool kProfileLargeObjects = false;
+static const bool kMeasureOverhead = false;
+static const bool kCountTasks = false;
+
class SetFingerVisitor {
public:
SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
@@ -71,17 +80,20 @@
cleared_reference_list_(NULL),
freed_bytes_(0), freed_objects_(0),
class_count_(0), array_count_(0), other_count_(0),
- gc_barrier_(new Barrier) {
+ large_object_test_(0), large_object_mark_(0),
+ classes_marked_(0), overhead_time_(0),
+ work_chunks_created_(0), work_chunks_deleted_(0),
+ gc_barrier_(new Barrier),
+ large_object_lock_("large object lock") {
DCHECK(mark_stack_ != NULL);
}
void MarkSweep::Init() {
+ java_lang_Class_ = Class::GetJavaLangClass();
+ CHECK(java_lang_Class_ != NULL);
heap_ = Runtime::Current()->GetHeap();
mark_stack_->Reset();
- // TODO: C++0x auto
FindDefaultMarkBitmap();
- // TODO: if concurrent, enable card marking in compiler
- // TODO: check that the mark bitmap is entirely clear.
// Mark any concurrent roots as dirty since we need to scan them at least once during this GC.
Runtime::Current()->DirtyRoots();
}
@@ -99,7 +111,7 @@
LOG(FATAL) << "Could not find a default mark bitmap";
}
-inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) {
+inline void MarkSweep::MarkObjectNonNull(const Object* obj, bool check_finger) {
DCHECK(obj != NULL);
if (obj >= immune_begin_ && obj < immune_end_) {
@@ -109,32 +121,21 @@
// Try to take advantage of locality of references within a space, failing this find the space
// the hard way.
- if (UNLIKELY(!current_mark_bitmap_->HasAddress(obj))) {
+ SpaceBitmap* object_bitmap = current_mark_bitmap_;
+ if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
if (new_bitmap != NULL) {
- current_mark_bitmap_ = new_bitmap;
+ object_bitmap = new_bitmap;
} else {
- LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
- SpaceSetMap* large_objects = large_object_space->GetMarkObjects();
- if (!large_objects->Test(obj)) {
- if (!large_object_space->Contains(obj)) {
- LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
- LOG(ERROR) << "Attempting see if it's a bad root";
- VerifyRoots();
- LOG(FATAL) << "Can't mark bad root";
- }
- large_objects->Set(obj);
- // Don't need to check finger since large objects never have any object references.
- }
- // TODO: Improve clarity of control flow in this function?
+ MarkLargeObject(obj);
return;
}
}
// This object was not previously marked.
- if (!current_mark_bitmap_->Test(obj)) {
- current_mark_bitmap_->Set(obj);
- if (check_finger && obj < finger_) {
+ if (!object_bitmap->Test(obj)) {
+ object_bitmap->Set(obj);
+ if (kDisableFinger || (check_finger && obj < finger_)) {
// Do we need to expand the mark stack?
if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) {
std::vector<Object*> temp;
@@ -150,6 +151,57 @@
}
}
+// Rare case, probably not worth inlining since it will increase instruction cache miss rate.
+bool MarkSweep::MarkLargeObject(const Object* obj) {
+ LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
+ SpaceSetMap* large_objects = large_object_space->GetMarkObjects();
+ if (kProfileLargeObjects) {
+ ++large_object_test_;
+ }
+ if (UNLIKELY(!large_objects->Test(obj))) {
+ if (!large_object_space->Contains(obj)) {
+ LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
+ LOG(ERROR) << "Attempting see if it's a bad root";
+ VerifyRoots();
+ LOG(FATAL) << "Can't mark bad root";
+ }
+ if (kProfileLargeObjects) {
+ ++large_object_mark_;
+ }
+ large_objects->Set(obj);
+ // Don't need to check finger since large objects never have any object references.
+ return true;
+ }
+ return false;
+}
+
+inline bool MarkSweep::MarkObjectParallel(const Object* obj) {
+ DCHECK(obj != NULL);
+
+ if (obj >= immune_begin_ && obj < immune_end_) {
+ DCHECK(IsMarked(obj));
+ return false;
+ }
+
+ // Try to take advantage of locality of references within a space, failing this find the space
+ // the hard way.
+ SpaceBitmap* object_bitmap = current_mark_bitmap_;
+ if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
+ SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
+ if (new_bitmap != NULL) {
+ object_bitmap = new_bitmap;
+ } else {
+ // TODO: Remove the Thread::Current here?
+ // TODO: Convert this to some kind of atomic marking?
+ MutexLock mu(Thread::Current(), large_object_lock_);
+ return MarkLargeObject(obj);
+ }
+ }
+
+ // Return true if the object was not previously marked.
+ return !object_bitmap->AtomicTestAndSet(obj);
+}
+
// Used to mark objects when recursing. Recursion is done by moving
// the finger across the bitmaps in address order and marking child
// objects. Any newly-marked objects whose addresses are lower than
@@ -157,22 +209,22 @@
// need to be added to the mark stack.
void MarkSweep::MarkObject(const Object* obj) {
if (obj != NULL) {
- MarkObject0(obj, true);
+ MarkObjectNonNull(obj, true);
}
}
-void MarkSweep::MarkObjectVisitor(const Object* root, void* arg) {
+void MarkSweep::MarkObjectCallback(const Object* root, void* arg) {
DCHECK(root != NULL);
DCHECK(arg != NULL);
MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
- mark_sweep->MarkObject0(root, false);
+ mark_sweep->MarkObjectNonNull(root, false);
}
void MarkSweep::ReMarkObjectVisitor(const Object* root, void* arg) {
DCHECK(root != NULL);
DCHECK(arg != NULL);
MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
- mark_sweep->MarkObject0(root, true);
+ mark_sweep->MarkObjectNonNull(root, true);
}
void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
@@ -201,15 +253,15 @@
// Marks all objects in the root set.
void MarkSweep::MarkRoots() {
- Runtime::Current()->VisitNonConcurrentRoots(MarkObjectVisitor, this);
+ Runtime::Current()->VisitNonConcurrentRoots(MarkObjectCallback, this);
}
void MarkSweep::MarkNonThreadRoots() {
- Runtime::Current()->VisitNonThreadRoots(MarkObjectVisitor, this);
+ Runtime::Current()->VisitNonThreadRoots(MarkObjectCallback, this);
}
void MarkSweep::MarkConcurrentRoots() {
- Runtime::Current()->VisitConcurrentRoots(MarkObjectVisitor, this);
+ Runtime::Current()->VisitConcurrentRoots(MarkObjectCallback, this);
}
class CheckObjectVisitor {
@@ -259,26 +311,26 @@
alloc_space->mark_bitmap_.reset(live_bitmap);
}
-class ScanImageRootVisitor {
+class ScanObjectVisitor {
public:
- ScanImageRootVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+ ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+
}
- void operator ()(const Object* root) const
+ void operator ()(const Object* obj) const
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
- DCHECK(root != NULL);
- mark_sweep_->ScanObject(root);
+ mark_sweep_->ScanObject(obj);
}
private:
MarkSweep* const mark_sweep_;
};
-void MarkSweep::ScanGrayObjects(bool update_finger) {
+void MarkSweep::ScanGrayObjects() {
const Spaces& spaces = heap_->GetSpaces();
CardTable* card_table = heap_->GetCardTable();
- ScanImageRootVisitor image_root_visitor(this);
+ ScanObjectVisitor visitor(this);
SetFingerVisitor finger_visitor(this);
// TODO: C++ 0x auto
for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
@@ -287,11 +339,7 @@
byte* end = space->End();
// Image spaces are handled properly since live == marked for them.
SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
- if (update_finger) {
- card_table->Scan(mark_bitmap, begin, end, image_root_visitor, finger_visitor);
- } else {
- card_table->Scan(mark_bitmap, begin, end, image_root_visitor, IdentityFunctor());
- }
+ card_table->Scan(mark_bitmap, begin, end, visitor, IdentityFunctor());
}
}
@@ -330,22 +378,6 @@
}
}
-class ScanObjectVisitor {
- public:
- ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
-
- }
-
- void operator ()(const Object* obj) const
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
- mark_sweep_->ScanObject(obj);
- }
-
- private:
- MarkSweep* const mark_sweep_;
-};
-
// Populates the mark stack based on the set of marked objects and
// recursively marks until the mark stack is emptied.
void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) {
@@ -358,12 +390,11 @@
CHECK(cleared_reference_list_ == NULL);
const Spaces& spaces = heap_->GetSpaces();
-
SetFingerVisitor set_finger_visitor(this);
ScanObjectVisitor scan_visitor(this);
for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
ContinuousSpace* space = *it;
- if (space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect ||
+ if ((!kDisableFinger && space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect) ||
(!partial && space->GetGcRetentionPolicy() == kGcRetentionPolicyFullCollect)
) {
current_mark_bitmap_ = space->GetMarkBitmap();
@@ -386,7 +417,7 @@
void MarkSweep::RecursiveMarkCards(CardTable* card_table, const std::vector<byte*>& cards,
TimingLogger& timings) {
- ScanImageRootVisitor image_root_visitor(this);
+ ScanObjectVisitor image_root_visitor(this);
SetFingerVisitor finger_visitor(this);
const size_t card_count = cards.size();
SpaceBitmap* active_bitmap = NULL;
@@ -399,14 +430,16 @@
}
if (active_bitmap == NULL || !active_bitmap->HasAddress(start_obj)) {
active_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(start_obj);
-#ifndef NDEBUG
- if (active_bitmap == NULL) {
+ if (kIsDebugBuild && active_bitmap == NULL) {
GetHeap()->DumpSpaces();
LOG(FATAL) << "Object " << reinterpret_cast<const void*>(start_obj);
}
-#endif
}
- active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
+ if (kDisableFinger) {
+ active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, IdentityFunctor());
+ } else {
+ active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
+ }
}
timings.AddSplit("RecursiveMarkCards");
ProcessMarkStack();
@@ -419,8 +452,8 @@
!reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
}
-void MarkSweep::RecursiveMarkDirtyObjects(bool update_finger) {
- ScanGrayObjects(update_finger);
+void MarkSweep::RecursiveMarkDirtyObjects() {
+ ScanGrayObjects();
ProcessMarkStack();
}
@@ -539,7 +572,7 @@
DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
<< thread->GetState();
WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
- thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_);
+ thread->VisitRoots(MarkSweep::MarkObjectCallback, mark_sweep_);
mark_sweep_->GetBarrier().Pass(self);
}
@@ -561,8 +594,6 @@
}
void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
- size_t freed_objects = num_ptrs;
- size_t freed_bytes = 0;
SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
MarkSweep* mark_sweep = context->mark_sweep;
Heap* heap = mark_sweep->GetHeap();
@@ -572,17 +603,9 @@
// Use a bulk free, that merges consecutive objects before freeing or free per object?
// Documentation suggests better free performance with merging, but this may be at the expensive
// of allocation.
- // TODO: investigate performance
- static const bool kUseFreeList = true;
- if (kUseFreeList) {
- // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
- freed_bytes += space->FreeList(self, num_ptrs, ptrs);
- } else {
- for (size_t i = 0; i < num_ptrs; ++i) {
- freed_bytes += space->Free(self, ptrs[i]);
- }
- }
-
+ size_t freed_objects = num_ptrs;
+ // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
+ size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
heap->RecordFree(freed_objects, freed_bytes);
mark_sweep->freed_objects_ += freed_objects;
mark_sweep->freed_bytes_ += freed_bytes;
@@ -606,7 +629,6 @@
// If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
// bitmap, resulting in occasional frees of Weaks which are still in use.
- // TODO: Fix when sweeping weaks works properly with mutators unpaused + allocation list.
SweepSystemWeaksArray(allocations);
logger.AddSplit("SweepSystemWeaks");
@@ -656,12 +678,13 @@
logger.AddSplit("Reset stack");
}
-void MarkSweep::Sweep(bool partial, bool swap_bitmaps) {
+void MarkSweep::Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps) {
DCHECK(mark_stack_->IsEmpty());
// If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
// bitmap, resulting in occasional frees of Weaks which are still in use.
SweepSystemWeaks();
+ timings.AddSplit("SweepSystemWeaks");
const Spaces& spaces = heap_->GetSpaces();
SweepCallbackContext scc;
@@ -693,6 +716,7 @@
}
}
}
+ timings.AddSplit("Sweep");
}
void MarkSweep::SweepLargeObjects(bool swap_bitmaps) {
@@ -721,53 +745,6 @@
GetHeap()->RecordFree(freed_objects, freed_bytes);
}
-// Scans instance fields.
-inline void MarkSweep::ScanInstanceFields(const Object* obj) {
- DCHECK(obj != NULL);
- Class* klass = obj->GetClass();
- DCHECK(klass != NULL);
- ScanFields(obj, klass->GetReferenceInstanceOffsets(), false);
-}
-
-// Scans static storage on a Class.
-inline void MarkSweep::ScanStaticFields(const Class* klass) {
- DCHECK(klass != NULL);
- ScanFields(klass, klass->GetReferenceStaticOffsets(), true);
-}
-
-inline void MarkSweep::ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static) {
- if (ref_offsets != CLASS_WALK_SUPER) {
- // Found a reference offset bitmap. Mark the specified offsets.
- while (ref_offsets != 0) {
- const size_t right_shift = CLZ(ref_offsets);
- MemberOffset byte_offset = CLASS_OFFSET_FROM_CLZ(right_shift);
- const Object* ref = obj->GetFieldObject<const Object*>(byte_offset, false);
- MarkObject(ref);
- ref_offsets ^= CLASS_HIGH_BIT >> right_shift;
- }
- } else {
- // There is no reference offset bitmap. In the non-static case,
- // walk up the class inheritance hierarchy and find reference
- // offsets the hard way. In the static case, just consider this
- // class.
- for (const Class* klass = is_static ? obj->AsClass() : obj->GetClass();
- klass != NULL;
- klass = is_static ? NULL : klass->GetSuperClass()) {
- size_t num_reference_fields = (is_static
- ? klass->NumReferenceStaticFields()
- : klass->NumReferenceInstanceFields());
- for (size_t i = 0; i < num_reference_fields; ++i) {
- Field* field = (is_static
- ? klass->GetStaticField(i)
- : klass->GetInstanceField(i));
- MemberOffset field_offset = field->GetOffset();
- const Object* ref = obj->GetFieldObject<const Object*>(field_offset, false);
- MarkObject(ref);
- }
- }
- }
-}
-
void MarkSweep::CheckReference(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) {
const Spaces& spaces = heap_->GetSpaces();
// TODO: C++0x auto
@@ -812,32 +789,6 @@
}
}
-// Scans the header, static field references, and interface pointers
-// of a class object.
-inline void MarkSweep::ScanClass(const Object* obj) {
-#ifndef NDEBUG
- ++class_count_;
-#endif
- ScanInstanceFields(obj);
- ScanStaticFields(obj->AsClass());
-}
-
-// Scans the header of all array objects. If the array object is
-// specialized to a reference type, scans the array data as well.
-inline void MarkSweep::ScanArray(const Object* obj) {
-#ifndef NDEBUG
- ++array_count_;
-#endif
- MarkObject(obj->GetClass());
- if (obj->IsObjectArray()) {
- const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
- for (int32_t i = 0; i < array->GetLength(); ++i) {
- const Object* element = array->GetWithoutChecks(i);
- MarkObject(element);
- }
- }
-}
-
// Process the "referent" field in a java.lang.ref.Reference. If the
// referent has not yet been marked, put it on the appropriate list in
// the gcHeap for later processing.
@@ -860,49 +811,205 @@
list = &phantom_reference_list_;
}
DCHECK(list != NULL) << PrettyClass(klass) << " " << std::hex << klass->GetAccessFlags();
+ // TODO: One lock per list?
heap_->EnqueuePendingReference(obj, list);
}
}
-// Scans the header and field references of a data object. If the
-// scanned object is a reference subclass, it is scheduled for later
-// processing.
-inline void MarkSweep::ScanOther(const Object* obj) {
-#ifndef NDEBUG
- ++other_count_;
-#endif
- ScanInstanceFields(obj);
- if (obj->GetClass()->IsReferenceClass()) {
- DelayReferenceReferent(const_cast<Object*>(obj));
- }
-}
-
void MarkSweep::ScanRoot(const Object* obj) {
ScanObject(obj);
}
+class MarkObjectVisitor {
+ public:
+ MarkObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+ }
+
+ void operator ()(const Object* /* obj */, const Object* ref, const MemberOffset& /* offset */,
+ bool /* is_static */) const
+ EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+ SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+ mark_sweep_->MarkObject(ref);
+ }
+
+ private:
+ MarkSweep* const mark_sweep_;
+};
+
// Scans an object reference. Determines the type of the reference
// and dispatches to a specialized scanning routine.
void MarkSweep::ScanObject(const Object* obj) {
- DCHECK(obj != NULL);
- DCHECK(obj->GetClass() != NULL);
-#ifndef NDEBUG
- if (!IsMarked(obj)) {
- heap_->DumpSpaces();
- LOG(FATAL) << "Scanning unmarked object " << reinterpret_cast<const void*>(obj);
+ MarkObjectVisitor visitor(this);
+ ScanObjectVisit(obj, visitor);
+}
+
+class MarkStackChunk : public Task {
+public:
+ MarkStackChunk(ThreadPool* thread_pool, MarkSweep* mark_sweep, Object** begin, Object** end)
+ : mark_sweep_(mark_sweep),
+ thread_pool_(thread_pool),
+ index_(0),
+ length_(0),
+ output_(NULL) {
+ length_ = end - begin;
+ if (begin != end) {
+ // Cost not significant since we only do this for the initial set of mark stack chunks.
+ memcpy(data_, begin, length_ * sizeof(*begin));
+ }
+ if (kCountTasks) {
+ ++mark_sweep_->work_chunks_created_;
+ }
}
-#endif
- if (obj->IsClass()) {
- ScanClass(obj);
- } else if (obj->IsArrayInstance()) {
- ScanArray(obj);
- } else {
- ScanOther(obj);
+
+ ~MarkStackChunk() {
+ DCHECK(output_ == NULL || output_->length_ == 0);
+ DCHECK_GE(index_, length_);
+ delete output_;
+ if (kCountTasks) {
+ ++mark_sweep_->work_chunks_deleted_;
+ }
}
+
+ MarkSweep* const mark_sweep_;
+ ThreadPool* const thread_pool_;
+ static const size_t max_size = 1 * KB;
+ // Index of which object we are scanning. Only needs to be atomic if we are doing work stealing.
+ size_t index_;
+ // Input / output mark stack. We add newly marked references to data_ until length reaches
+ // max_size. This is an optimization so that fewer tasks are created.
+ // TODO: Investigate using a bounded buffer FIFO.
+ Object* data_[max_size];
+ // How many elements in data_ we need to scan.
+ size_t length_;
+ // Output block; newly marked references get added to the output block so that another thread can
+ // scan them.
+ MarkStackChunk* output_;
+
+ class MarkObjectParallelVisitor {
+ public:
+ MarkObjectParallelVisitor(MarkStackChunk* chunk_task) : chunk_task_(chunk_task) {
+
+ }
+
+ void operator ()(const Object* /* obj */, const Object* ref,
+ const MemberOffset& /* offset */, bool /* is_static */) const {
+ if (ref != NULL && chunk_task_->mark_sweep_->MarkObjectParallel(ref)) {
+ chunk_task_->MarkStackPush(ref);
+ }
+ }
+
+ private:
+ MarkStackChunk* const chunk_task_;
+ };
+
+ // Push an object into the block.
+ // Don't need to use atomic ++ since only one thread is writing to an output block at any
+ // given time.
+ void Push(Object* obj) {
+ data_[length_++] = obj;
+ }
+
+ void MarkStackPush(const Object* obj) {
+ if (static_cast<size_t>(length_) < max_size) {
+ Push(const_cast<Object*>(obj));
+ } else {
+ // Internal buffer is full, push to a new buffer instead.
+ if (UNLIKELY(output_ == NULL)) {
+ AllocateOutputChunk();
+ } else if (UNLIKELY(static_cast<size_t>(output_->length_) == max_size)) {
+ // Output block is full, queue it up for processing and obtain a new block.
+ EnqueueOutput();
+ AllocateOutputChunk();
+ }
+ output_->Push(const_cast<Object*>(obj));
+ }
+ }
+
+ void ScanObject(Object* obj) {
+ mark_sweep_->ScanObjectVisit(obj, MarkObjectParallelVisitor(this));
+ }
+
+ void EnqueueOutput() {
+ if (output_ != NULL) {
+ uint64_t start = 0;
+ if (kMeasureOverhead) {
+ start = NanoTime();
+ }
+ thread_pool_->AddTask(Thread::Current(), output_);
+ output_ = NULL;
+ if (kMeasureOverhead) {
+ mark_sweep_->overhead_time_ += NanoTime() - start;
+ }
+ }
+ }
+
+ void AllocateOutputChunk() {
+ uint64_t start = 0;
+ if (kMeasureOverhead) {
+ start = NanoTime();
+ }
+ output_ = new MarkStackChunk(thread_pool_, mark_sweep_, NULL, NULL);
+ if (kMeasureOverhead) {
+ mark_sweep_->overhead_time_ += NanoTime() - start;
+ }
+ }
+
+ void Finalize() {
+ EnqueueOutput();
+ delete this;
+ }
+
+ // Scans all of the objects in the chunk.
+ virtual void Run(Thread* self) {
+ int index;
+ while ((index = index_++) < length_) {
+ static const size_t prefetch_look_ahead = 4;
+ if (kUseMarkStackPrefetch) {
+ if (index + prefetch_look_ahead < length_) {
+ __builtin_prefetch(&data_[index + prefetch_look_ahead]);
+ } else {
+ __builtin_prefetch(&data_[length_ - 1]);
+ }
+ }
+ Object* obj = data_[index];
+ DCHECK(obj != NULL);
+ ScanObject(obj);
+ }
+ }
+};
+
+void MarkSweep::ProcessMarkStackParallel() {
+ CHECK(kDisableFinger) << "parallel mark stack processing cannot work when finger is enabled";
+ Thread* self = Thread::Current();
+ ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+ // Split the current mark stack up into work tasks.
+ const size_t num_threads = thread_pool->GetThreadCount();
+ const size_t stack_size = mark_stack_->Size();
+ const size_t chunk_size =
+ std::min((stack_size + num_threads - 1) / num_threads,
+ static_cast<size_t>(MarkStackChunk::max_size));
+ size_t index = 0;
+ for (size_t i = 0; i < num_threads || index < stack_size; ++i) {
+ Object** begin = &mark_stack_->Begin()[std::min(stack_size, index)];
+ Object** end = &mark_stack_->Begin()[std::min(stack_size, index + chunk_size)];
+ index += chunk_size;
+ thread_pool->AddTask(self, new MarkStackChunk(thread_pool, this, begin, end));
+ }
+ thread_pool->StartWorkers(self);
+ mark_stack_->Reset();
+ thread_pool->Wait(self, true);
+ //LOG(INFO) << "Idle wait time " << PrettyDuration(thread_pool->GetWaitTime());
+ CHECK_EQ(work_chunks_created_, work_chunks_deleted_) << " some of the work chunks were leaked";
}
// Scan anything that's on the mark stack.
void MarkSweep::ProcessMarkStack() {
+ ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+ if (kParallelMarkStack && thread_pool != NULL && thread_pool->GetThreadCount() > 0) {
+ ProcessMarkStackParallel();
+ return;
+ }
+
if (kUseMarkStackPrefetch) {
const size_t fifo_size = 4;
const size_t fifo_mask = fifo_size - 1;
@@ -1096,13 +1203,31 @@
}
MarkSweep::~MarkSweep() {
-#ifndef NDEBUG
- VLOG(heap) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ << " other=" << other_count_;
-#endif
+ if (class_count_ != 0 || array_count_ != 0 || other_count_ != 0) {
+ LOG(INFO) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_
+ << " other=" << other_count_;
+ }
+
+ if (kCountTasks) {
+ LOG(INFO) << "Total number of work chunks allocated: " << work_chunks_created_;
+ }
+
+ if (kMeasureOverhead) {
+ LOG(INFO) << "Overhead time " << PrettyDuration(overhead_time_);
+ }
+
+ if (kProfileLargeObjects) {
+ LOG(INFO) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_;
+ }
+
+ if (kCountClassesMarked) {
+ LOG(INFO) << "Classes marked " << classes_marked_;
+ }
+
// Ensure that the mark stack is empty.
CHECK(mark_stack_->IsEmpty());
- // Clear all of the alloc spaces' mark bitmaps.
+ // Clear all of the spaces' mark bitmaps.
const Spaces& spaces = heap_->GetSpaces();
// TODO: C++0x auto
for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
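
One note on the splitting arithmetic in ProcessMarkStackParallel() above:
chunk_size is the per-thread share of the stack, rounded up and clamped to
MarkStackChunk::max_size, and the loop condition
(i < num_threads || index < stack_size) keeps emitting chunks past
num_threads whenever the stack is large. Worked through once with
illustrative numbers:

    // Mirrors the computation in ProcessMarkStackParallel().
    #include <algorithm>
    #include <cstddef>

    size_t InitialChunkCount(size_t stack_size, size_t num_threads, size_t max_size) {
      if (stack_size == 0) {
        return num_threads;  // the loop still makes one empty chunk per thread
      }
      const size_t chunk_size =
          std::min((stack_size + num_threads - 1) / num_threads, max_size);
      return std::max(num_threads, (stack_size + chunk_size - 1) / chunk_size);
    }
    // E.g. with stack_size = 10000, num_threads = 3 and max_size = 1024 (the
    // value of MarkStackChunk::max_size): chunk_size = min(3334, 1024) = 1024,
    // so 10 chunks are created rather than one oversized chunk per thread.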
diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h
index d64439f..f510943 100644
--- a/src/gc/mark_sweep.h
+++ b/src/gc/mark_sweep.h
@@ -35,6 +35,7 @@
class ModUnionTableBitmap;
class Object;
class TimingLogger;
+class MarkStackChunk;
class MarkSweep {
public:
@@ -80,9 +81,8 @@
void UnBindBitmaps()
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
- // Builds a mark stack with objects on dirty cards and recursively mark
- // until it empties.
- void RecursiveMarkDirtyObjects(bool update_finger)
+ // Builds a mark stack with objects on dirty cards and recursively mark until it empties.
+ void RecursiveMarkDirtyObjects()
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -111,7 +111,7 @@
}
// Sweeps unmarked objects to complete the garbage collection.
- void Sweep(bool partial, bool swap_bitmaps)
+ void Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
// Sweeps unmarked objects to complete the garbage collection.
@@ -136,6 +136,41 @@
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ template <typename MarkVisitor>
+ void ScanObjectVisit(const Object* obj, const MarkVisitor& visitor)
+ NO_THREAD_SAFETY_ANALYSIS {
+ DCHECK(obj != NULL);
+ if (kIsDebugBuild && !IsMarked(obj)) {
+ heap_->DumpSpaces();
+ LOG(FATAL) << "Scanning unmarked object " << obj;
+ }
+ Class* klass = obj->GetClass();
+ DCHECK(klass != NULL);
+ if (klass == java_lang_Class_) {
+ DCHECK_EQ(klass->GetClass(), java_lang_Class_);
+ if (kCountScannedTypes) {
+ ++class_count_;
+ }
+ VisitClassReferences(klass, obj, visitor);
+ } else if (klass->IsArrayClass()) {
+ if (kCountScannedTypes) {
+ ++array_count_;
+ }
+ visitor(obj, klass, Object::ClassOffset(), false);
+ if (klass->IsObjectArrayClass()) {
+ VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor);
+ }
+ } else {
+ if (kCountScannedTypes) {
+ ++other_count_;
+ }
+ VisitOtherReferences(klass, obj, visitor);
+ if (klass->IsReferenceClass()) {
+ DelayReferenceReferent(const_cast<Object*>(obj));
+ }
+ }
+ }
+
void SetFinger(Object* new_finger) {
finger_ = new_finger;
}
@@ -181,16 +216,29 @@
Locks::mutator_lock_) {
DCHECK(obj != NULL);
DCHECK(obj->GetClass() != NULL);
- if (obj->IsClass()) {
- VisitClassReferences(obj, visitor);
- } else if (obj->IsArrayInstance()) {
- VisitArrayReferences(obj, visitor);
+
+ Class* klass = obj->GetClass();
+ DCHECK(klass != NULL);
+ if (klass == Class::GetJavaLangClass()) {
+ DCHECK_EQ(klass->GetClass(), Class::GetJavaLangClass());
+ VisitClassReferences(klass, obj, visitor);
} else {
- VisitOtherReferences(obj, visitor);
+ if (klass->IsArrayClass()) {
+ visitor(obj, klass, Object::ClassOffset(), false);
+ if (klass->IsObjectArrayClass()) {
+ VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor);
+ }
+ } else {
+ VisitOtherReferences(klass, obj, visitor);
+ }
}
}
- static void MarkObjectVisitor(const Object* root, void* arg)
+ static void MarkObjectCallback(const Object* root, void* arg)
+ EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+ // Marks an object.
+ void MarkObject(const Object* obj)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
Barrier& GetBarrier();
@@ -216,14 +264,15 @@
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- // Marks an object.
- void MarkObject(const Object* obj)
+ void MarkObjectNonNull(const Object* obj, bool check_finger)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
- // Yuck.
- void MarkObject0(const Object* obj, bool check_finger)
+ bool MarkLargeObject(const Object* obj)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+ // Returns true if we need to add obj to a mark stack.
+ bool MarkObjectParallel(const Object* obj) NO_THREAD_SAFETY_ANALYSIS;
+
static void ScanBitmapCallback(Object* obj, void* finger, void* arg)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -241,11 +290,6 @@
void CheckObject(const Object* obj)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
- // Grays references in instance fields.
- void ScanInstanceFields(const Object* obj)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
// Verify the roots of the heap and print out information related to any invalid roots.
// Called in MarkObject, so we may not hold the mutator lock.
void VerifyRoots()
@@ -258,32 +302,23 @@
NO_THREAD_SAFETY_ANALYSIS;
template <typename Visitor>
- static void VisitInstanceFieldsReferences(const Object* obj, const Visitor& visitor)
+ static void VisitInstanceFieldsReferences(const Class* klass, const Object* obj,
+ const Visitor& visitor)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
DCHECK(obj != NULL);
- Class* klass = obj->GetClass();
DCHECK(klass != NULL);
VisitFieldsReferences(obj, klass->GetReferenceInstanceOffsets(), false, visitor);
}
- // Blackens a class object.
- void ScanClass(const Object* obj)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
-
+ // Visit the header, static field references, and interface pointers of a class object.
template <typename Visitor>
- static void VisitClassReferences(const Object* obj, const Visitor& visitor)
+ static void VisitClassReferences(const Class* klass, const Object* obj,
+ const Visitor& visitor)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
- VisitInstanceFieldsReferences(obj, visitor);
+ VisitInstanceFieldsReferences(klass, obj, visitor);
VisitStaticFieldsReferences(obj->AsClass(), visitor);
}
- // Grays references in static fields.
- void ScanStaticFields(const Class* klass)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
template <typename Visitor>
static void VisitStaticFieldsReferences(const Class* klass, const Visitor& visitor)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
@@ -291,11 +326,6 @@
VisitFieldsReferences(klass, klass->GetReferenceStaticOffsets(), true, visitor);
}
- // Used by ScanInstanceFields and ScanStaticFields
- void ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
template <typename Visitor>
static void VisitFieldsReferences(const Object* obj, uint32_t ref_offsets, bool is_static,
const Visitor& visitor)
@@ -333,37 +363,30 @@
}
}
- // Grays references in an array.
- void ScanArray(const Object* obj)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
+ // Visit all of the references in an object array.
template <typename Visitor>
- static void VisitArrayReferences(const Object* obj, const Visitor& visitor)
+ static void VisitObjectArrayReferences(const ObjectArray<Object>* array,
+ const Visitor& visitor)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
- visitor(obj, obj->GetClass(), Object::ClassOffset(), false);
- if (obj->IsObjectArray()) {
- const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
- for (int32_t i = 0; i < array->GetLength(); ++i) {
- const Object* element = array->GetWithoutChecks(i);
- size_t width = sizeof(Object*);
- visitor(obj, element, MemberOffset(i * width + Array::DataOffset(width).Int32Value()), false);
- }
+ const int32_t length = array->GetLength();
+ for (int32_t i = 0; i < length; ++i) {
+ const Object* element = array->GetWithoutChecks(i);
+ const size_t width = sizeof(Object*);
+ MemberOffset offset = MemberOffset(i * width + Array::DataOffset(width).Int32Value());
+ visitor(array, element, offset, false);
}
}
- void ScanOther(const Object* obj)
- EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
+ // Visits the header and field references of a data object.
template <typename Visitor>
- static void VisitOtherReferences(const Object* obj, const Visitor& visitor)
+ static void VisitOtherReferences(const Class* klass, const Object* obj,
+ const Visitor& visitor)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
- return VisitInstanceFieldsReferences(obj, visitor);
+ return VisitInstanceFieldsReferences(klass, obj, visitor);
}
// Blackens objects grayed during a garbage collection.
- void ScanGrayObjects(bool update_finger)
+ void ScanGrayObjects()
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
// Schedules an unmarked object for reference processing.
@@ -375,6 +398,10 @@
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void ProcessMarkStackParallel()
+ EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+ SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
void EnqueueFinalizerReferences(Object** ref)
EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -396,9 +423,15 @@
void SweepJniWeakGlobals(Heap::IsMarkedTester is_marked, void* arg)
SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+ // Whether or not we count how many of each type of object were scanned.
+ static const bool kCountScannedTypes = false;
+
// Current space, we check this space first to avoid searching for the appropriate space for an object.
SpaceBitmap* current_mark_bitmap_;
+ // Cache java.lang.Class for optimization.
+ Class* java_lang_Class_;
+
ObjectStack* mark_stack_;
Heap* heap_;
@@ -410,23 +443,25 @@
Object* immune_end_;
Object* soft_reference_list_;
-
Object* weak_reference_list_;
-
Object* finalizer_reference_list_;
-
Object* phantom_reference_list_;
-
Object* cleared_reference_list_;
- size_t freed_bytes_;
- size_t freed_objects_;
-
- size_t class_count_;
- size_t array_count_;
- size_t other_count_;
+ AtomicInteger freed_bytes_;
+ AtomicInteger freed_objects_;
+ AtomicInteger class_count_;
+ AtomicInteger array_count_;
+ AtomicInteger other_count_;
+ AtomicInteger large_object_test_;
+ AtomicInteger large_object_mark_;
+ AtomicInteger classes_marked_;
+ AtomicInteger overhead_time_;
+ AtomicInteger work_chunks_created_;
+ AtomicInteger work_chunks_deleted_;
UniquePtr<Barrier> gc_barrier_;
+ Mutex large_object_lock_;
friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table.
friend class CheckBitmapVisitor;
@@ -443,6 +478,8 @@
friend class ModUnionScanImageRootVisitor;
friend class ScanBitmapVisitor;
friend class ScanImageRootVisitor;
+ friend class MarkStackChunk;
+ friend class FifoMarkStackChunk;
DISALLOW_COPY_AND_ASSIGN(MarkSweep);
};
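
The template dispatch above is where the "removing dependent loads for
IsClass" line of the commit message lands. obj->IsClass() needs two loads
that depend on each other (obj->klass_, then a field of that Class), so the
second cannot issue until the first completes; ScanObjectVisit instead
compares the klass pointer, which the scan loads anyway, against
java_lang_Class_, cached in the collector by MarkSweep::Init(). In sketch
form (the exact old IsClass() body is assumed, not shown in this diff):

    // Old classification (assumed): two dependent loads per object.
    //   bool is_class = obj->IsClass();  // loads obj->klass_, then reads
    //                                    // through it to decide
    // New classification: one load plus a pointer compare.
    Class* klass = obj->GetClass();       // the only load; needed for scanning
    if (klass == java_lang_Class_) {
      // scan static fields, etc. (VisitClassReferences)
    } else if (klass->IsArrayClass()) {
      // scan elements if it is an object array (VisitObjectArrayReferences)
    } else {
      // scan instance fields (VisitOtherReferences)
    }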
diff --git a/src/gc/space_bitmap.h b/src/gc/space_bitmap.h
index 885491f..25fd538 100644
--- a/src/gc/space_bitmap.h
+++ b/src/gc/space_bitmap.h
@@ -22,6 +22,8 @@
#include <stdint.h>
#include <vector>
+#include "cutils/atomic.h"
+#include "cutils/atomic-inline.h"
#include "UniquePtr.h"
#include "globals.h"
#include "logging.h"
@@ -64,12 +66,32 @@
return static_cast<uintptr_t>(kWordHighBitMask) >> ((offset_ / kAlignment) % kBitsPerWord);
}
- inline void Set(const Object* obj) {
- Modify(obj, true);
+ inline bool Set(const Object* obj) {
+ return Modify(obj, true);
}
- inline void Clear(const Object* obj) {
- Modify(obj, false);
+ inline bool Clear(const Object* obj) {
+ return Modify(obj, false);
+ }
+
+ // Returns true if the object was previously marked.
+ inline bool AtomicTestAndSet(const Object* obj) {
+ uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
+ DCHECK_GE(addr, heap_begin_);
+ const uintptr_t offset = addr - heap_begin_;
+ const size_t index = OffsetToIndex(offset);
+ const word mask = OffsetToMask(offset);
+ word* const address = &bitmap_begin_[index];
+ DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
+ word old_word;
+ do {
+ old_word = *address;
+ // Fast path: The bit is already set.
+ if ((old_word & mask) != 0) {
+ return true;
+ }
+ } while (UNLIKELY(android_atomic_cas(old_word, old_word | mask, address) != 0));
+ return false;
}
void Clear();
@@ -229,6 +251,12 @@
std::string GetName() const;
void SetName(const std::string& name);
+ const void* GetObjectWordAddress(const Object* obj) const {
+ uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
+ const uintptr_t offset = addr - heap_begin_;
+ const size_t index = OffsetToIndex(offset);
+ return &bitmap_begin_[index];
+ }
private:
// TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
// however, we document that this is expected on heap_end_
@@ -237,18 +265,21 @@
heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
name_(name) {}
- inline void Modify(const Object* obj, bool do_set) {
+ inline bool Modify(const Object* obj, bool do_set) {
uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
DCHECK_GE(addr, heap_begin_);
const uintptr_t offset = addr - heap_begin_;
const size_t index = OffsetToIndex(offset);
const word mask = OffsetToMask(offset);
DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
+ word* address = &bitmap_begin_[index];
+ word old_word = *address;
if (do_set) {
- bitmap_begin_[index] |= mask;
+ *address = old_word | mask;
} else {
- bitmap_begin_[index] &= ~mask;
+ *address = old_word & ~mask;
}
+ return (old_word & mask) != 0;
}
// Backing storage for bitmap.
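
AtomicTestAndSet() above is the heart of MarkObjectParallel(): an already-set
bit returns true without writing anything, so concurrent re-marks of a
popular object stay read-only and do not bounce the cache line. Restated
standalone with std::atomic in place of android_atomic_cas() (which returns
0 on success):

    #include <atomic>
    #include <cstdint>

    // Sketch; `word` and `mask` correspond to bitmap_begin_[index] and
    // OffsetToMask(offset) in the real code.
    bool AtomicTestAndSetBit(std::atomic<uint32_t>* word, uint32_t mask) {
      uint32_t old_word = word->load(std::memory_order_relaxed);
      do {
        if ((old_word & mask) != 0) {
          return true;  // already marked: no store at all
        }
        // compare_exchange_weak refreshes old_word on failure, so a racing
        // writer just costs one more trip around the loop.
      } while (!word->compare_exchange_weak(old_word, old_word | mask,
                                            std::memory_order_relaxed));
      return false;  // this thread set the bit
    }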
diff --git a/src/heap.cc b/src/heap.cc
index d12d20e..b4cf4a9 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -45,6 +45,7 @@
namespace art {
+static const bool kDumpGcPerformanceOnShutdown = false;
const double Heap::kDefaultTargetUtilization = 0.5;
static bool GenerateImage(const std::string& image_file_name) {
@@ -282,6 +283,11 @@
gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable",
*gc_complete_lock_));
+ // Create the reference queue lock; this is required for parallel object scanning in the GC.
+ reference_queue_lock_.reset(new Mutex("reference queue lock"));
+
+ CHECK(max_allowed_footprint_ != 0);
+
// Set up the cumulative timing loggers.
for (size_t i = static_cast<size_t>(kGcTypeSticky); i < static_cast<size_t>(kGcTypeMax);
++i) {
@@ -296,6 +302,17 @@
}
}
+void Heap::CreateThreadPool() {
+ // TODO: Make sysconf(_SC_NPROCESSORS_CONF) be a helper function?
+ // Use the number of processors - 1 since the thread doing the GC does work while it is waiting for
+ // workers to complete.
+ thread_pool_.reset(new ThreadPool(sysconf(_SC_NPROCESSORS_CONF) - 1));
+}
+
+void Heap::DeleteThreadPool() {
+ thread_pool_.reset(NULL);
+}
+
// Sort spaces based on begin address
struct SpaceSorter {
bool operator ()(const ContinuousSpace* a, const ContinuousSpace* b) const {
@@ -369,6 +386,10 @@
}
Heap::~Heap() {
+ if (kDumpGcPerformanceOnShutdown) {
+ DumpGcPerformanceInfo();
+ }
+
// If we don't reset then the mark stack complains in it's destructor.
allocation_stack_->Reset();
live_stack_->Reset();
@@ -947,12 +968,11 @@
// We need to do partial GCs every now and then to avoid the heap growing too much and
// fragmenting.
if (gc_type == kGcTypeSticky && ++sticky_gc_count_ > partial_gc_frequency_) {
- gc_type = kGcTypePartial;
+ gc_type = have_zygote_space_ ? kGcTypePartial : kGcTypeFull;
}
if (gc_type != kGcTypeSticky) {
sticky_gc_count_ = 0;
}
-
if (concurrent_gc_) {
CollectGarbageConcurrentMarkSweepPlan(self, gc_type, gc_cause, clear_soft_references);
} else {
@@ -1049,9 +1069,6 @@
mark_sweep.MarkConcurrentRoots();
timings.AddSplit("MarkRoots");
- // Roots are marked on the bitmap and the mark_stack is empty.
- DCHECK(mark_stack_->IsEmpty());
-
UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
if (gc_type != kGcTypeSticky) {
@@ -1079,15 +1096,14 @@
mark_sweep.ProcessReferences(clear_soft_references);
timings.AddSplit("ProcessReferences");
-#ifndef NDEBUG
- // Verify that we only reach marked objects from the image space
- mark_sweep.VerifyImageRoots();
- timings.AddSplit("VerifyImageRoots");
-#endif
+ if (kIsDebugBuild) {
+ // Verify that we only reach marked objects from the image space
+ mark_sweep.VerifyImageRoots();
+ timings.AddSplit("VerifyImageRoots");
+ }
if (gc_type != kGcTypeSticky) {
- mark_sweep.Sweep(gc_type == kGcTypePartial, false);
- timings.AddSplit("Sweep");
+ mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
mark_sweep.SweepLargeObjects(false);
timings.AddSplit("SweepLargeObjects");
} else {
@@ -1098,15 +1114,12 @@
// Unbind the live and mark bitmaps.
mark_sweep.UnBindBitmaps();
-
- const bool swap = true;
- if (swap) {
- if (gc_type == kGcTypeSticky) {
- SwapLargeObjects();
- } else {
- SwapBitmaps(gc_type);
- }
+ if (gc_type == kGcTypeSticky) {
+ SwapLargeObjects();
+ } else {
+ SwapBitmaps(gc_type);
}
+ timings.AddSplit("SwapBitmaps");
if (verify_system_weaks_) {
mark_sweep.VerifySystemWeaks();
@@ -1600,9 +1613,6 @@
ClearCards(timings);
}
- // Roots are marked on the bitmap and the mark_stack is empty.
- DCHECK(mark_stack_->IsEmpty());
-
if (verify_mod_union_table_) {
ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
zygote_mod_union_table_->Update();
@@ -1657,7 +1667,7 @@
timings.AddSplit("ReMarkRoots");
// Scan dirty objects, this is only required if we are not doing concurrent GC.
- mark_sweep.RecursiveMarkDirtyObjects(false);
+ mark_sweep.RecursiveMarkDirtyObjects();
timings.AddSplit("RecursiveMarkDirtyObjects");
}
@@ -1721,8 +1731,7 @@
// TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above).
if (gc_type != kGcTypeSticky) {
WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
- mark_sweep.Sweep(gc_type == kGcTypePartial, false);
- timings.AddSplit("Sweep");
+ mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
mark_sweep.SweepLargeObjects(false);
timings.AddSplit("SweepLargeObjects");
} else {
@@ -1740,14 +1749,12 @@
// Swap the live and mark bitmaps for each space which we modified space. This is an
// optimization that enables us to not clear live bits inside of the sweep.
- const bool swap = true;
- if (swap) {
- if (gc_type == kGcTypeSticky) {
- SwapLargeObjects();
- } else {
- SwapBitmaps(gc_type);
- }
+ if (gc_type == kGcTypeSticky) {
+ SwapLargeObjects();
+ } else {
+ SwapBitmaps(gc_type);
}
+ timings.AddSplit("SwapBitmaps");
}
if (verify_system_weaks_) {
@@ -1850,7 +1857,7 @@
void Heap::GrowForUtilization() {
// We know what our utilization is at this moment.
// This doesn't actually resize any memory. It just lets the heap grow more when necessary.
- size_t target_size = num_bytes_allocated_ / Heap::GetTargetHeapUtilization();
+ size_t target_size = num_bytes_allocated_ / GetTargetHeapUtilization();
if (target_size > num_bytes_allocated_ + max_free_) {
target_size = num_bytes_allocated_ + max_free_;
} else if (target_size < num_bytes_allocated_ + min_free_) {
@@ -1870,7 +1877,6 @@
}
void Heap::ClearGrowthLimit() {
- WaitForConcurrentGcToComplete(Thread::Current());
alloc_space_->ClearGrowthLimit();
}
@@ -1922,6 +1928,8 @@
DCHECK(ref != NULL);
DCHECK(list != NULL);
+ // TODO: Remove this lock, use atomic stacks for storing references.
+ MutexLock mu(Thread::Current(), *reference_queue_lock_);
if (*list == NULL) {
ref->SetFieldObject(reference_pendingNext_offset_, ref, false);
*list = ref;
@@ -1937,6 +1945,9 @@
DCHECK(*list != NULL);
Object* head = (*list)->GetFieldObject<Object*>(reference_pendingNext_offset_, false);
Object* ref;
+
+ // TODO: Remove this lock, use atomic stacks for storing references.
+ MutexLock mu(Thread::Current(), *reference_queue_lock_);
if (*list == head) {
ref = *list;
*list = NULL;
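
Two notes on the heap.cc changes above. The reference queue lock exists
because DelayReferenceReferent() can now run on several pool workers at once,
all appending to the same singly linked pending lists (the TODOs flag atomic
stacks as the eventual replacement). And the pool is sized at nproc - 1
because the GC thread does not block while waiting; it drains tasks itself
via the new Wait(self, true) overload in thread_pool.cc. The intended call
sequence, roughly:

    // Illustrative sequence for a parallel phase, using the APIs added here.
    Thread* self = Thread::Current();
    ThreadPool* pool = Runtime::Current()->GetHeap()->GetThreadPool();
    // ... AddTask() one MarkStackChunk per slice of the mark stack ...
    pool->StartWorkers(self);  // the nproc - 1 workers begin draining tasks
    pool->Wait(self, true);    // do_work == true: this thread runs tasks too,
                               // so no core idles on a blocked waiter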
diff --git a/src/heap.h b/src/heap.h
index 8ed5881..6c4c38b 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -31,6 +31,7 @@
#include "offsets.h"
#include "safe_map.h"
#include "timing_logger.h"
+#include "thread_pool.h"
#define VERIFY_OBJECT_ENABLED 0
@@ -312,6 +313,13 @@
// GC performance measuring
void DumpGcPerformanceInfo();
+ // Thread pool.
+ void CreateThreadPool();
+ void DeleteThreadPool();
+ ThreadPool* GetThreadPool() {
+ return thread_pool_.get();
+ }
+
private:
// Allocates uninitialized storage. Passing in a null space tries to place the object in the
// large object space.
@@ -408,6 +416,9 @@
Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_);
+ // Reference queue lock
+ UniquePtr<Mutex> reference_queue_lock_;
+
// True while the garbage collector is running.
volatile bool is_gc_running_ GUARDED_BY(gc_complete_lock_);
@@ -450,6 +461,9 @@
const bool verify_post_gc_heap_;
const bool verify_mod_union_table_;
+ // Parallel GC data structures.
+ UniquePtr<ThreadPool> thread_pool_;
+
// After how many GCs we force to do a partial GC instead of sticky mark bits GC.
const size_t partial_gc_frequency_;
diff --git a/src/mutex.cc b/src/mutex.cc
index e2044d6..1cdbc4d 100644
--- a/src/mutex.cc
+++ b/src/mutex.cc
@@ -758,7 +758,6 @@
void ConditionVariable::Wait(Thread* self) {
DCHECK(self == NULL || self == Thread::Current());
guard_.AssertExclusiveHeld(self);
- guard_.CheckSafeToWait(self);
unsigned int old_recursion_count = guard_.recursion_count_;
#if ART_USE_FUTEXES
int32_t cur_state = state_;
@@ -794,7 +793,6 @@
void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) {
DCHECK(self == NULL || self == Thread::Current());
guard_.AssertExclusiveHeld(self);
- guard_.CheckSafeToWait(self);
unsigned int old_recursion_count = guard_.recursion_count_;
#if ART_USE_FUTEXES
// Record the original end time so that if the futex call fails we can recompute the appropriate
diff --git a/src/object.cc b/src/object.cc
index 9a4588a..cebbb2a 100644
--- a/src/object.cc
+++ b/src/object.cc
@@ -733,6 +733,19 @@
RegisterNative(self, Runtime::Current()->GetJniDlsymLookupStub()->GetData());
}
+Class* Class::java_lang_Class_ = NULL;
+
+void Class::SetClassClass(Class* java_lang_Class) {
+ CHECK(java_lang_Class_ == NULL) << java_lang_Class_ << " " << java_lang_Class;
+ CHECK(java_lang_Class != NULL);
+ java_lang_Class_ = java_lang_Class;
+}
+
+void Class::ResetClass() {
+ CHECK(java_lang_Class_ != NULL);
+ java_lang_Class_ = NULL;
+}
+
void Class::SetStatus(Status new_status) {
CHECK(new_status > GetStatus() || new_status == kStatusError || !Runtime::Current()->IsStarted())
<< PrettyClass(this) << " " << GetStatus() << " -> " << new_status;
diff --git a/src/object.h b/src/object.h
index 79df4e2..efd456e 100644
--- a/src/object.h
+++ b/src/object.h
@@ -1521,6 +1521,10 @@
return !IsPrimitive() && !IsInterface() && !IsAbstract();
}
+ bool IsObjectArrayClass() const {
+ return GetComponentType() != NULL && !GetComponentType()->IsPrimitive();
+ }
+
// Creates a raw object instance but does not invoke the default constructor.
Object* AllocObject(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -2008,6 +2012,15 @@
SetField32(OFFSET_OF_OBJECT_MEMBER(Class, dex_type_idx_), type_idx, false);
}
+ static Class* GetJavaLangClass() {
+ DCHECK(java_lang_Class_ != NULL);
+ return java_lang_Class_;
+ }
+
+ // Can't call this SetClass or else it gets called instead of Object::SetClass in places.
+ static void SetClassClass(Class* java_lang_Class);
+ static void ResetClass();
+
private:
void SetVerifyErrorClass(Class* klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
CHECK(klass != NULL) << PrettyClass(this);
@@ -2127,6 +2140,9 @@
// Location of first static field.
uint32_t fields_[0];
+ // java.lang.Class
+ static Class* java_lang_Class_;
+
friend struct ClassOffsets; // for verifying offset information
DISALLOW_IMPLICIT_CONSTRUCTORS(Class);
};
diff --git a/src/runtime.cc b/src/runtime.cc
index 79d1fb2..3fa3123 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -115,6 +115,8 @@
}
Runtime::~Runtime() {
+ heap_->DeleteThreadPool();
+
Thread* self = Thread::Current();
{
MutexLock mu(self, *Locks::runtime_shutdown_lock_);
@@ -696,6 +698,9 @@
void Runtime::DidForkFromZygote() {
is_zygote_ = false;
+ // Create the thread pool.
+ heap_->CreateThreadPool();
+
StartSignalCatcher();
// Start the JDWP thread. If the command-line debugger flags specified "suspend=y",
@@ -1030,7 +1035,9 @@
}
void Runtime::DirtyRoots() {
+ CHECK(intern_table_ != NULL);
intern_table_->Dirty();
+ CHECK(class_linker_ != NULL);
class_linker_->Dirty();
}
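
The thread_pool.cc diff below also introduces an experimental work-stealing
pool. Its lifetime rule, enforced by the bookkeeping in
WorkStealingWorker::Run(): every thread that runs or steals from a
WorkStealingTask first increments its ref_count_ under work_steal_lock_, and
whichever thread drops the count back to zero finalizes the task. The class
itself lives in thread_pool.h; its assumed shape:

    // Assumed from usage in this change (thread_pool.h is not in the diff):
    class WorkStealingTask : public Task {
     public:
      WorkStealingTask() : ref_count_(0) {}
      // Called on a task that finished early, passing the victim whose
      // remaining work should be taken.
      virtual void StealFrom(Thread* self, WorkStealingTask* source) = 0;
     private:
      // Guarded by WorkStealingThreadPool::work_steal_lock_: the number of
      // threads currently running or stealing from this task.
      size_t ref_count_;
      friend class WorkStealingWorker;
    };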
diff --git a/src/thread_pool.cc b/src/thread_pool.cc
index ba53113..26c83d2 100644
--- a/src/thread_pool.cc
+++ b/src/thread_pool.cc
@@ -1,3 +1,4 @@
+#include "casts.h"
#include "runtime.h"
#include "stl_util.h"
#include "thread.h"
@@ -24,9 +25,10 @@
void ThreadPoolWorker::Run() {
Thread* self = Thread::Current();
- Closure* task = NULL;
+ Task* task = NULL;
while ((task = thread_pool_->GetTask(self)) != NULL) {
task->Run(self);
+ task->Finalize();
}
}
@@ -40,7 +42,7 @@
return NULL;
}
-void ThreadPool::AddTask(Thread* self, Closure* task){
+void ThreadPool::AddTask(Thread* self, Task* task){
MutexLock mu(self, task_queue_lock_);
tasks_.push_back(task);
// If we have any waiters, signal one.
@@ -49,14 +51,6 @@
}
}
-void ThreadPool::AddThread(size_t stack_size) {
- threads_.push_back(
- new ThreadPoolWorker(
- this,
- StringPrintf("Thread pool worker %d", static_cast<int>(GetThreadCount())),
- stack_size));
-}
-
ThreadPool::ThreadPool(size_t num_threads)
: task_queue_lock_("task queue lock"),
task_queue_condition_("task queue condition", task_queue_lock_),
@@ -65,7 +59,8 @@
shutting_down_(false),
waiting_count_(0) {
while (GetThreadCount() < num_threads) {
- AddThread(ThreadPoolWorker::kDefaultStackSize);
+ const std::string name = StringPrintf("Thread pool worker %zu", GetThreadCount());
+ threads_.push_back(new ThreadPoolWorker(this, name, ThreadPoolWorker::kDefaultStackSize));
}
}
@@ -75,9 +70,9 @@
MutexLock mu(self, task_queue_lock_);
// Tell any remaining workers to shut down.
shutting_down_ = true;
- android_memory_barrier();
// Broadcast to everyone waiting.
task_queue_condition_.Broadcast(self);
+ completion_condition_.Broadcast(self);
}
// Wait for the threads to finish.
STLDeleteElements(&threads_);
@@ -86,22 +81,21 @@
void ThreadPool::StartWorkers(Thread* self) {
MutexLock mu(self, task_queue_lock_);
started_ = true;
- android_memory_barrier();
task_queue_condition_.Broadcast(self);
+ start_time_ = NanoTime();
+ total_wait_time_ = 0;
}
void ThreadPool::StopWorkers(Thread* self) {
MutexLock mu(self, task_queue_lock_);
started_ = false;
- android_memory_barrier();
}
-Closure* ThreadPool::GetTask(Thread* self) {
+Task* ThreadPool::GetTask(Thread* self) {
MutexLock mu(self, task_queue_lock_);
- while (!shutting_down_) {
- if (started_ && !tasks_.empty()) {
- Closure* task = tasks_.front();
- tasks_.pop_front();
+ while (!IsShuttingDown()) {
+ Task* task = TryGetTaskLocked(self);
+ if (task != NULL) {
return task;
}
@@ -110,7 +104,10 @@
// We may be done, so broadcast on the completion condition.
completion_condition_.Broadcast(self);
}
+ const uint64_t wait_start = NanoTime();
task_queue_condition_.Wait(self);
+ const uint64_t wait_end = NanoTime();
+ total_wait_time_ += wait_end - std::max(wait_start, start_time_);
waiting_count_--;
}
@@ -118,12 +115,151 @@
return NULL;
}
-void ThreadPool::Wait(Thread* self) {
+Task* ThreadPool::TryGetTask(Thread* self) {
MutexLock mu(self, task_queue_lock_);
+ return TryGetTaskLocked(self);
+}
+
+Task* ThreadPool::TryGetTaskLocked(Thread* self) {
+ if (started_ && !tasks_.empty()) {
+ Task* task = tasks_.front();
+ tasks_.pop_front();
+ return task;
+ }
+ return NULL;
+}
+
+void ThreadPool::Wait(Thread* self, bool do_work) {
+ if (do_work) {
+ // Help out by draining queued tasks on the waiting thread.
+ Task* task = NULL;
+ while ((task = TryGetTask(self)) != NULL) {
+ task->Run(self);
+ task->Finalize();
+ }
+ }
+
// Wait until each thread is waiting and the task list is empty.
- while (waiting_count_ != GetThreadCount() || !tasks_.empty()) {
+ MutexLock mu(self, task_queue_lock_);
+ while (!shutting_down_ && (waiting_count_ != GetThreadCount() || !tasks_.empty())) {
completion_condition_.Wait(self);
}
}
+size_t ThreadPool::GetTaskCount(Thread* self) {
+ MutexLock mu(self, task_queue_lock_);
+ return tasks_.size();
+}
+
+WorkStealingWorker::WorkStealingWorker(ThreadPool* thread_pool, const std::string& name,
+ size_t stack_size)
+ : ThreadPoolWorker(thread_pool, name, stack_size),
+ task_(NULL) {
+}
+
+void WorkStealingWorker::Run() {
+ Thread* self = Thread::Current();
+ Task* task = NULL;
+ WorkStealingThreadPool* thread_pool = down_cast<WorkStealingThreadPool*>(thread_pool_);
+ while ((task = thread_pool_->GetTask(self)) != NULL) {
+ WorkStealingTask* stealing_task = down_cast<WorkStealingTask*>(task);
+
+ {
+ CHECK(task_ == NULL);
+ MutexLock mu(self, thread_pool->work_steal_lock_);
+ // Register that we are running the task.
+ ++stealing_task->ref_count_;
+ task_ = stealing_task;
+ }
+ stealing_task->Run(self);
+ // Mark ourselves as not running a task so that nobody tries to steal from us.
+ // There is a race where someone may start stealing from us at this point; this is okay
+ // due to the reference counting.
+ task_ = NULL;
+
+ bool finalize;
+
+ // Steal work from other tasks until there is nothing left to steal. Note: there is a race,
+ // but all that happens when it occurs is that we steal some work instead of processing a
+ // task from the queue.
+ while (thread_pool->GetTaskCount(self) == 0) {
+ WorkStealingTask* steal_from_task = NULL;
+
+ {
+ MutexLock mu(self, thread_pool->work_steal_lock_);
+ // Try finding a task to steal from.
+ steal_from_task = thread_pool->FindTaskToStealFrom(self);
+ if (steal_from_task != NULL) {
+ CHECK_NE(stealing_task, steal_from_task)
+ << "Attempting to steal from our own completed task";
+ steal_from_task->ref_count_++;
+ } else {
+ break;
+ }
+ }
+
+ if (steal_from_task != NULL) {
+ // The task which completed earlier now steals some work from steal_from_task.
+ stealing_task->StealFrom(self, steal_from_task);
+
+ {
+ // We are done stealing from the task; decrement its reference count.
+ MutexLock mu(self, thread_pool->work_steal_lock_);
+ finalize = !--steal_from_task->ref_count_;
+ }
+
+ if (finalize) {
+ steal_from_task->Finalize();
+ }
+ }
+ }
+
+ {
+ MutexLock mu(self, thread_pool->work_steal_lock_);
+ // If nobody else is referencing the task, we can finalize it.
+ finalize = !--stealing_task->ref_count_;
+ }
+
+ if (finalize) {
+ stealing_task->Finalize();
+ }
+ }
+}
+
+WorkStealingWorker::~WorkStealingWorker() {}
+
+WorkStealingThreadPool::WorkStealingThreadPool(size_t num_threads)
+ : ThreadPool(0),
+ work_steal_lock_("work stealing lock"),
+ steal_index_(0) {
+ while (GetThreadCount() < num_threads) {
+ const std::string name = StringPrintf("Work stealing worker %zu", GetThreadCount());
+ threads_.push_back(new WorkStealingWorker(this, name, ThreadPoolWorker::kDefaultStackSize));
+ }
+}
+
+WorkStealingTask* WorkStealingThreadPool::FindTaskToStealFrom(Thread* self) {
+ const size_t thread_count = GetThreadCount();
+ for (size_t i = 0; i < thread_count; ++i) {
+ // TODO: Use CAS instead of lock.
+ ++steal_index_;
+ if (steal_index_ >= thread_count) {
+ steal_index_ -= thread_count;
+ }
+
+ WorkStealingWorker* worker = down_cast<WorkStealingWorker*>(threads_[steal_index_]);
+ WorkStealingTask* task = worker->task_;
+ if (task != NULL) {
+ // Not NULL; we can probably steal from this worker.
+ return task;
+ }
+ }
+ // Couldn't find anything to steal.
+ return NULL;
+}
+
+WorkStealingThreadPool::~WorkStealingThreadPool() {}
+
} // namespace art
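
The TODO above FindTaskToStealFrom points out that advancing the round-robin index does not inherently need work_steal_lock_. A sketch of a lock-free advance, assuming steal_index_ were changed to an AtomicInteger (this diff keeps a plain size_t; the helper name is hypothetical):

// Hypothetical: advance the round-robin steal index without work_steal_lock_.
size_t WorkStealingThreadPool::NextStealIndex() {
  int old_index, new_index;
  do {
    old_index = steal_index_;
    new_index = (old_index + 1) % GetThreadCount();
  } while (steal_index_.CompareAndSwap(old_index, new_index) != 0);  // 0 means the swap took.
  return new_index;
}

The task_ lookup and the ref_count_ increment would still need the lock: the count is what keeps a victim alive while it is being stolen from, so it must be bumped atomically with observing a non-NULL task_.
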
diff --git a/src/thread_pool.h b/src/thread_pool.h
index 1d0f85d..c8f6056 100644
--- a/src/thread_pool.h
+++ b/src/thread_pool.h
@@ -20,14 +20,20 @@
#include <deque>
#include <vector>
+#include "closure.h"
#include "locks.h"
#include "../src/mutex.h"
namespace art {
-class Closure;
class ThreadPool;
+class Task : public Closure {
+ public:
+ // Called when the task is finished with: after Run for plain pool tasks, or once the
+ // reference count reaches 0 for work-stealing tasks.
+ virtual void Finalize() { }
+};
+
class ThreadPoolWorker {
public:
static const size_t kDefaultStackSize = 1 * MB;
@@ -38,10 +44,10 @@
virtual ~ThreadPoolWorker();
- private:
+ protected:
ThreadPoolWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size);
static void* Callback(void* arg) LOCKS_EXCLUDED(Locks::mutator_lock_);
- void Run();
+ virtual void Run();
ThreadPool* thread_pool_;
const std::string name_;
@@ -67,20 +73,33 @@
// Add a new task; the first available started worker will process it. Does not delete the task
// after running it; that is the caller's responsibility.
- void AddTask(Thread* self, Closure* task);
+ void AddTask(Thread* self, Task* task);
ThreadPool(size_t num_threads);
virtual ~ThreadPool();
// Wait for all tasks currently on the queue to complete. If do_work is true, the calling
// thread helps by running queued tasks itself while it waits.
- void Wait(Thread* self);
+ void Wait(Thread* self, bool do_work = true);
- private:
- // Add a new task.
- void AddThread(size_t stack_size);
+ size_t GetTaskCount(Thread* self);
+ // Returns the total amount of time workers have waited for tasks.
+ uint64_t GetWaitTime() const {
+ return total_wait_time_;
+ }
+
+ protected:
// Get a task to run; blocks if there are no tasks left.
- Closure* GetTask(Thread* self);
+ virtual Task* GetTask(Thread* self);
+
+ // Try to get a task, returning NULL if there is none available.
+ Task* TryGetTask(Thread* self);
+ Task* TryGetTaskLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_);
+
+ // Are we shutting down?
+ bool IsShuttingDown() const EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_) {
+ return shutting_down_;
+ }
Mutex task_queue_lock_;
ConditionVariable task_queue_condition_ GUARDED_BY(task_queue_lock_);
@@ -89,14 +108,71 @@
volatile bool shutting_down_ GUARDED_BY(task_queue_lock_);
// How many worker threads are waiting on the condition.
volatile size_t waiting_count_ GUARDED_BY(task_queue_lock_);
- std::deque<Closure*> tasks_ GUARDED_BY(task_queue_lock_);
+ std::deque<Task*> tasks_ GUARDED_BY(task_queue_lock_);
// TODO: make this immutable/const?
std::vector<ThreadPoolWorker*> threads_;
+ // Work balance detection: when the workers started and how long they have spent waiting.
+ uint64_t start_time_ GUARDED_BY(task_queue_lock_);
+ uint64_t total_wait_time_;
friend class ThreadPoolWorker;
+ friend class WorkStealingWorker;
DISALLOW_COPY_AND_ASSIGN(ThreadPool);
};
+class WorkStealingTask : public Task {
+ public:
+ WorkStealingTask() : ref_count_(0) { }
+
+ size_t GetRefCount() const {
+ return ref_count_;
+ }
+
+ virtual void StealFrom(Thread* self, WorkStealingTask* source) = 0;
+
+ private:
+ // How many workers are referencing this task.
+ size_t ref_count_;
+
+ friend class WorkStealingWorker;
+};
+
+class WorkStealingWorker : public ThreadPoolWorker {
+ public:
+ virtual ~WorkStealingWorker();
+
+ bool IsRunningTask() const {
+ return task_ != NULL;
+ }
+
+ protected:
+ WorkStealingTask* task_;
+
+ WorkStealingWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size);
+ virtual void Run();
+
+ friend class WorkStealingThreadPool;
+ DISALLOW_COPY_AND_ASSIGN(WorkStealingWorker);
+};
+
+class WorkStealingThreadPool : public ThreadPool {
+ public:
+ WorkStealingThreadPool(size_t num_threads);
+ virtual ~WorkStealingThreadPool();
+
+ private:
+ Mutex work_steal_lock_;
+ // Which thread we are stealing from (round robin).
+ size_t steal_index_;
+
+ // Find a task to steal from.
+ WorkStealingTask* FindTaskToStealFrom(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(work_steal_lock_);
+
+ friend class WorkStealingWorker;
+};
+
} // namespace art
#endif // ART_SRC_THREAD_POOL_H_
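
Nothing in this change shows a concrete WorkStealingTask, so the contract is worth spelling out: Run executes on the worker that dequeued the task; StealFrom is called on an already-finished task with a still-running victim and must take and process part of the victim's remaining work before returning; Finalize runs once the reference count drops to zero. A hypothetical subclass over an index range (RangeTask, ProcessElement, and the per-task lock are all assumptions, not part of this change):

class RangeTask : public WorkStealingTask {
 public:
  RangeTask(size_t begin, size_t end) : lock_("range task lock"), begin_(begin), end_(end) { }

  virtual void Run(Thread* self) {
    // Claim one element at a time so a thief can take the remainder mid-run.
    for (;;) {
      size_t index;
      {
        MutexLock mu(self, lock_);
        if (begin_ == end_) {
          return;
        }
        index = begin_++;
      }
      ProcessElement(index);  // Hypothetical per-element work.
    }
  }

  virtual void StealFrom(Thread* self, WorkStealingTask* source) {
    RangeTask* victim = down_cast<RangeTask*>(source);
    size_t begin, end;
    {
      // Take the upper half of whatever the victim has left.
      MutexLock mu(self, victim->lock_);
      const size_t mid = victim->begin_ + (victim->end_ - victim->begin_) / 2;
      begin = mid;
      end = victim->end_;
      victim->end_ = mid;
    }
    // The stolen work is processed here, before returning to the worker loop.
    for (size_t i = begin; i < end; ++i) {
      ProcessElement(i);
    }
  }

  virtual void Finalize() {
    delete this;
  }

 private:
  Mutex lock_;
  size_t begin_ GUARDED_BY(lock_);
  size_t end_ GUARDED_BY(lock_);
};

Halving the victim's remaining range is only one policy; the pool requires only that StealFrom leave both tasks consistent.
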
diff --git a/src/thread_pool_test.cc b/src/thread_pool_test.cc
index 783f786..bac6002 100644
--- a/src/thread_pool_test.cc
+++ b/src/thread_pool_test.cc
@@ -23,9 +23,9 @@
namespace art {
-class CountClosure : public Closure {
+class CountTask : public Task {
public:
- CountClosure(AtomicInteger* count) : count_(count) {
+ CountTask(AtomicInteger* count) : count_(count) {
}
@@ -34,6 +34,9 @@
usleep(100);
// Increment the counter which keeps track of work completed.
++*count_;
+ }
+
+ void Finalize() {
delete this;
}
@@ -55,7 +58,7 @@
AtomicInteger count = 0;
static const int32_t num_tasks = num_threads * 4;
for (int32_t i = 0; i < num_tasks; ++i) {
- thread_pool.AddTask(self, new CountClosure(&count));
+ thread_pool.AddTask(self, new CountTask(&count));
}
thread_pool.StartWorkers(self);
// Wait for tasks to complete.
@@ -70,7 +73,7 @@
AtomicInteger count = 0;
static const int32_t num_tasks = num_threads * 4;
for (int32_t i = 0; i < num_tasks; ++i) {
- thread_pool.AddTask(self, new CountClosure(&count));
+ thread_pool.AddTask(self, new CountTask(&count));
}
usleep(200);
// Check that no threads started prematurely.
@@ -80,15 +83,15 @@
usleep(200);
thread_pool.StopWorkers(self);
AtomicInteger bad_count = 0;
- thread_pool.AddTask(self, new CountClosure(&bad_count));
+ thread_pool.AddTask(self, new CountTask(&bad_count));
usleep(200);
// Ensure that the task added after the workers were stopped doesn't get run.
EXPECT_EQ(0, bad_count);
}
-class TreeClosure : public Closure {
+class TreeTask : public Task {
public:
- TreeClosure(ThreadPool* const thread_pool, AtomicInteger* count, int depth)
+ TreeTask(ThreadPool* const thread_pool, AtomicInteger* count, int depth)
: thread_pool_(thread_pool),
count_(count),
depth_(depth) {
@@ -97,11 +100,14 @@
void Run(Thread* self) {
if (depth_ > 1) {
- thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1));
- thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1));
+ thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1));
+ thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1));
}
// Increment the counter which keeps track of work completed.
++*count_;
+ }
+
+ void Finalize() {
delete this;
}
@@ -117,7 +123,7 @@
ThreadPool thread_pool(num_threads);
AtomicInteger count = 0;
static const int depth = 8;
- thread_pool.AddTask(self, new TreeClosure(&thread_pool, &count, depth));
+ thread_pool.AddTask(self, new TreeTask(&thread_pool, &count, depth));
thread_pool.StartWorkers(self);
thread_pool.Wait(self);
EXPECT_EQ((1 << depth) - 1, count);
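
The tests exercise only the plain pool and the default Wait(self). For a caller that must not run tasks itself, for example a thread holding locks the tasks might also want, the new do_work flag can be passed as false; a minimal usage sketch reusing CountTask:

// Hypothetical: block until the workers drain the queue without helping out.
ThreadPool thread_pool(num_threads);
AtomicInteger count = 0;
thread_pool.AddTask(self, new CountTask(&count));
thread_pool.StartWorkers(self);
thread_pool.Wait(self, false);  // Only wait; do not run tasks on this thread.
EXPECT_EQ(1, count);
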