Fix the empty checkpoint timeout.
The problem happens when a thread is indirectly blocked on a mutex
that another thread holds and is blocked on a weak ref access.
Add a way to do a dummy wakeup on a thread that's blocked on a mutex
so that the thread will respond to the empty checkpoint request. Do
this for the mutexes that are expected to be held when a weak ref is
accessed. Add a check that detects an unexpected case.
Bug: 33006388
Bug: 12687968
Test: test-art-host.
Change-Id: Iefec69b9a21aa25a928cb31fcf4fb872f867a8c2
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 6e102be..7bba944 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -72,6 +72,7 @@
ReaderWriterMutex* Locks::jni_globals_lock_ = nullptr;
Mutex* Locks::jni_weak_globals_lock_ = nullptr;
ReaderWriterMutex* Locks::dex_lock_ = nullptr;
+std::vector<BaseMutex*> Locks::expected_mutexes_on_weak_ref_access_;
struct AllMutexData {
// A guard for all_mutexes_ that's not a mutex (Mutexes must CAS to acquire and busy wait).
@@ -146,7 +147,10 @@
const uint64_t start_nano_time_;
};
-BaseMutex::BaseMutex(const char* name, LockLevel level) : level_(level), name_(name) {
+BaseMutex::BaseMutex(const char* name, LockLevel level)
+ : level_(level),
+ name_(name),
+ should_respond_to_empty_checkpoint_request_(false) {
if (kLogLockContentions) {
ScopedAllMutexesLock mu(this);
std::set<BaseMutex*>** all_mutexes_ptr = &gAllMutexData->all_mutexes;
@@ -377,6 +381,9 @@
// Failed to acquire, hang up.
ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
num_contenders_++;
+ if (UNLIKELY(should_respond_to_empty_checkpoint_request_)) {
+ self->CheckEmptyCheckpointFromMutex();
+ }
if (futex(state_.Address(), FUTEX_WAIT, 1, nullptr, nullptr, 0) != 0) {
// EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
// We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -519,6 +526,18 @@
return os;
}
+void Mutex::WakeupToRespondToEmptyCheckpoint() {
+#if ART_USE_FUTEXES
+ // Wake up all the waiters so they will respond to the empty checkpoint.
+ DCHECK(should_respond_to_empty_checkpoint_request_);
+ if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) {
+ futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
+ }
+#else
+ LOG(FATAL) << "Non futex case isn't supported.";
+#endif
+}
+
ReaderWriterMutex::ReaderWriterMutex(const char* name, LockLevel level)
: BaseMutex(name, level)
#if ART_USE_FUTEXES
@@ -563,6 +582,9 @@
// Failed to acquire, hang up.
ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
++num_pending_writers_;
+ if (UNLIKELY(should_respond_to_empty_checkpoint_request_)) {
+ self->CheckEmptyCheckpointFromMutex();
+ }
if (futex(state_.Address(), FUTEX_WAIT, cur_state, nullptr, nullptr, 0) != 0) {
// EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
// We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -639,6 +661,9 @@
}
ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
++num_pending_writers_;
+ if (UNLIKELY(should_respond_to_empty_checkpoint_request_)) {
+ self->CheckEmptyCheckpointFromMutex();
+ }
if (futex(state_.Address(), FUTEX_WAIT, cur_state, &rel_ts, nullptr, 0) != 0) {
if (errno == ETIMEDOUT) {
--num_pending_writers_;
@@ -677,6 +702,9 @@
// Owner holds it exclusively, hang up.
ScopedContentionRecorder scr(this, GetExclusiveOwnerTid(), SafeGetTid(self));
++num_pending_readers_;
+ if (UNLIKELY(should_respond_to_empty_checkpoint_request_)) {
+ self->CheckEmptyCheckpointFromMutex();
+ }
if (futex(state_.Address(), FUTEX_WAIT, cur_state, nullptr, nullptr, 0) != 0) {
if (errno != EAGAIN && errno != EINTR) {
PLOG(FATAL) << "futex wait failed for " << name_;
@@ -749,6 +777,19 @@
return os;
}
+void ReaderWriterMutex::WakeupToRespondToEmptyCheckpoint() {
+#if ART_USE_FUTEXES
+ // Wake up all the waiters so they will respond to the empty checkpoint.
+ DCHECK(should_respond_to_empty_checkpoint_request_);
+ if (UNLIKELY(num_pending_readers_.LoadRelaxed() > 0 ||
+ num_pending_writers_.LoadRelaxed() > 0)) {
+ futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
+ }
+#else
+ LOG(FATAL) << "Non futex case isn't supported.";
+#endif
+}
+
ConditionVariable::ConditionVariable(const char* name, Mutex& guard)
: name_(name), guard_(guard) {
#if ART_USE_FUTEXES
@@ -1121,6 +1162,12 @@
#undef UPDATE_CURRENT_LOCK_LEVEL
+ // List of mutexes that we may hold when accessing a weak ref.
+ dex_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
+ expected_mutexes_on_weak_ref_access_.push_back(dex_lock_);
+ classlinker_classes_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
+ expected_mutexes_on_weak_ref_access_.push_back(classlinker_classes_lock_);
+
InitConditions();
}
}
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index ffe18c6..9b6938f 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -152,6 +152,16 @@
static void DumpAll(std::ostream& os);
+ bool ShouldRespondToEmptyCheckpointRequest() const {
+ return should_respond_to_empty_checkpoint_request_;
+ }
+
+ void SetShouldRespondToEmptyCheckpointRequest(bool value) {
+ should_respond_to_empty_checkpoint_request_ = value;
+ }
+
+ virtual void WakeupToRespondToEmptyCheckpoint() = 0;
+
protected:
friend class ConditionVariable;
@@ -168,6 +178,7 @@
const LockLevel level_; // Support for lock hierarchy.
const char* const name_;
+ bool should_respond_to_empty_checkpoint_request_;
// A log entry that records contention but makes no guarantee that either tid will be held live.
struct ContentionLogEntry {
@@ -266,6 +277,8 @@
// For negative capabilities in clang annotations.
const Mutex& operator!() const { return *this; }
+ void WakeupToRespondToEmptyCheckpoint() OVERRIDE;
+
private:
#if ART_USE_FUTEXES
// 0 is unheld, 1 is held.
@@ -386,6 +399,8 @@
// For negative capabilities in clang annotations.
const ReaderWriterMutex& operator!() const { return *this; }
+ void WakeupToRespondToEmptyCheckpoint() OVERRIDE;
+
private:
#if ART_USE_FUTEXES
// Out-of-inline path for handling contention for a SharedLock.
@@ -713,6 +728,12 @@
// Have an exclusive logging thread.
static Mutex* logging_lock_ ACQUIRED_AFTER(unexpected_signal_lock_);
+
+ // List of mutexes that we expect a thread may hold when accessing weak refs. This is used to
+ // avoid a deadlock in the empty checkpoint while weak ref access is disabled (b/34964016). If we
+ // encounter an unexpected mutex on accessing weak refs,
+ // Thread::CheckEmptyCheckpointFromWeakRefAccess will detect it.
+ static std::vector<BaseMutex*> expected_mutexes_on_weak_ref_access_;
};
class Roles {
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index e18a955..122f779 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -292,7 +292,7 @@
(kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::alloc_tracker_lock_);
new_record_condition_.WaitHoldingLocks(self);
}
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index f12ad80..f18ffb4 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -835,65 +835,9 @@
void ConcurrentCopying::IssueEmptyCheckpoint() {
Thread* self = Thread::Current();
ThreadList* thread_list = Runtime::Current()->GetThreadList();
- Barrier* barrier = thread_list->EmptyCheckpointBarrier();
- barrier->Init(self, 0);
- std::vector<uint32_t> runnable_thread_ids; // Used in debug build only
- size_t barrier_count = thread_list->RunEmptyCheckpoint(runnable_thread_ids);
- // If there are no threads to wait which implys that all the checkpoint functions are finished,
- // then no need to release the mutator lock.
- if (barrier_count == 0) {
- return;
- }
// Release locks then wait for all mutator threads to pass the barrier.
Locks::mutator_lock_->SharedUnlock(self);
- {
- ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
- if (kIsDebugBuild) {
- static constexpr uint64_t kEmptyCheckpointTimeoutMs = 600 * 1000; // 10 minutes.
- bool timed_out = barrier->Increment(self, barrier_count, kEmptyCheckpointTimeoutMs);
- if (timed_out) {
- std::ostringstream ss;
- ss << "Empty checkpoint timeout\n";
- ss << "Barrier count " << barrier->GetCount(self) << "\n";
- ss << "Runnable thread IDs";
- for (uint32_t tid : runnable_thread_ids) {
- ss << " " << tid;
- }
- ss << "\n";
- Locks::mutator_lock_->Dump(ss);
- ss << "\n";
- LOG(FATAL_WITHOUT_ABORT) << ss.str();
- // Some threads in 'runnable_thread_ids' are probably stuck. Try to dump their stacks.
- // Avoid using ThreadList::Dump() initially because it is likely to get stuck as well.
- {
- ScopedObjectAccess soa(self);
- MutexLock mu1(self, *Locks::thread_list_lock_);
- for (Thread* thread : thread_list->GetList()) {
- uint32_t tid = thread->GetThreadId();
- bool is_in_runnable_thread_ids =
- std::find(runnable_thread_ids.begin(), runnable_thread_ids.end(), tid) !=
- runnable_thread_ids.end();
- if (is_in_runnable_thread_ids &&
- thread->ReadFlag(kEmptyCheckpointRequest)) {
- // Found a runnable thread that hasn't responded to the empty checkpoint request.
- // Assume it's stuck and safe to dump its stack.
- thread->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT),
- /*dump_native_stack*/ true,
- /*backtrace_map*/ nullptr,
- /*force_dump_stack*/ true);
- }
- }
- }
- LOG(FATAL_WITHOUT_ABORT)
- << "Dumped runnable threads that haven't responded to empty checkpoint.";
- // Now use ThreadList::Dump() to dump more threads, noting it may get stuck.
- thread_list->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT));
- LOG(FATAL) << "Dumped all threads.";
- }
- } else {
- barrier->Increment(self, barrier_count);
- }
- }
+ thread_list->RunEmptyCheckpoint();
Locks::mutator_lock_->SharedLock(self);
}
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index c154836..86b1522 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -104,7 +104,7 @@
}
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
condition_.WaitHoldingLocks(self);
}
return reference->GetReferent();
@@ -292,7 +292,7 @@
(kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
condition_.WaitHoldingLocks(self);
}
}
diff --git a/runtime/gc/system_weak.h b/runtime/gc/system_weak.h
index e5cddfc..60105f4 100644
--- a/runtime/gc/system_weak.h
+++ b/runtime/gc/system_weak.h
@@ -82,7 +82,7 @@
(kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(&allow_disallow_lock_);
new_weak_condition_.WaitHoldingLocks(self);
}
}
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index e0f28ad..a341cdb 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -572,7 +572,7 @@
while (!kUseReadBarrier && UNLIKELY(!MayAccessWeakGlobals(self))) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::jni_weak_globals_lock_);
weak_globals_add_condition_.WaitHoldingLocks(self);
}
IndirectRef ref = weak_globals_.Add(kIRTFirstSegment, obj);
@@ -706,7 +706,7 @@
while (UNLIKELY(!MayAccessWeakGlobals(self))) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::jni_weak_globals_lock_);
weak_globals_add_condition_.WaitHoldingLocks(self);
}
return weak_globals_.Get(ref);
@@ -731,7 +731,7 @@
while (UNLIKELY(!MayAccessWeakGlobals(self))) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(Locks::jni_weak_globals_lock_);
weak_globals_add_condition_.WaitHoldingLocks(self);
}
// When just checking a weak ref has been cleared, avoid triggering the read barrier in decode
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index a32003e..f3cb0df 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1380,7 +1380,7 @@
while (!kUseReadBarrier && UNLIKELY(!allow_new_monitors_)) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpoint();
+ self->CheckEmptyCheckpointFromWeakRefAccess(&monitor_list_lock_);
monitor_add_condition_.WaitHoldingLocks(self);
}
list_.push_front(m);
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index c92305f..8d94626 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -80,7 +80,34 @@
}
}
-inline void Thread::CheckEmptyCheckpoint() {
+inline void Thread::CheckEmptyCheckpointFromWeakRefAccess(BaseMutex* cond_var_mutex) {
+ Thread* self = Thread::Current();
+ DCHECK_EQ(self, this);
+ for (;;) {
+ if (ReadFlag(kEmptyCheckpointRequest)) {
+ RunEmptyCheckpoint();
+ // Check we hold only an expected mutex when accessing weak ref.
+ if (kIsDebugBuild) {
+ for (int i = kLockLevelCount - 1; i >= 0; --i) {
+ BaseMutex* held_mutex = self->GetHeldMutex(static_cast<LockLevel>(i));
+ if (held_mutex != nullptr &&
+ held_mutex != Locks::mutator_lock_ &&
+ held_mutex != cond_var_mutex) {
+ std::vector<BaseMutex*>& expected_mutexes = Locks::expected_mutexes_on_weak_ref_access_;
+ CHECK(std::find(expected_mutexes.begin(), expected_mutexes.end(), held_mutex) !=
+ expected_mutexes.end())
+ << "Holding unexpected mutex " << held_mutex->GetName()
+ << " when accessing weak ref";
+ }
+ }
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+inline void Thread::CheckEmptyCheckpointFromMutex() {
DCHECK_EQ(Thread::Current(), this);
for (;;) {
if (ReadFlag(kEmptyCheckpointRequest)) {
diff --git a/runtime/thread.h b/runtime/thread.h
index 3a1b7da..a46e799 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -176,7 +176,8 @@
void CheckSuspend() REQUIRES_SHARED(Locks::mutator_lock_);
// Process a pending empty checkpoint if pending.
- void CheckEmptyCheckpoint() REQUIRES_SHARED(Locks::mutator_lock_);
+ void CheckEmptyCheckpointFromWeakRefAccess(BaseMutex* cond_var_mutex);
+ void CheckEmptyCheckpointFromMutex();
static Thread* FromManagedThread(const ScopedObjectAccessAlreadyRunnable& ts,
ObjPtr<mirror::Object> thread_peer)
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index df8acc3..caed369 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -379,13 +379,15 @@
return count;
}
-size_t ThreadList::RunEmptyCheckpoint(std::vector<uint32_t>& runnable_thread_ids) {
+void ThreadList::RunEmptyCheckpoint() {
Thread* self = Thread::Current();
Locks::mutator_lock_->AssertNotExclusiveHeld(self);
Locks::thread_list_lock_->AssertNotHeld(self);
Locks::thread_suspend_count_lock_->AssertNotHeld(self);
-
+ std::vector<uint32_t> runnable_thread_ids;
size_t count = 0;
+ Barrier* barrier = empty_checkpoint_barrier_.get();
+ barrier->Init(self, 0);
{
MutexLock mu(self, *Locks::thread_list_lock_);
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
@@ -415,8 +417,72 @@
// checkpoint request. Otherwise we will hang as they are blocking in the kRunnable state.
Runtime::Current()->GetHeap()->GetReferenceProcessor()->BroadcastForSlowPath(self);
Runtime::Current()->BroadcastForNewSystemWeaks(/*broadcast_for_checkpoint*/true);
-
- return count;
+ {
+ ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+ uint64_t total_wait_time = 0;
+ bool first_iter = true;
+ while (true) {
+ // Wake up the runnable threads blocked on the mutexes that another thread, which is blocked
+ // on a weak ref access, holds (indirectly blocking for weak ref access through another thread
+ // and a mutex.) This needs to be done periodically because the thread may be preempted
+ // between the CheckEmptyCheckpointFromMutex call and the subsequent futex wait in
+ // Mutex::ExclusiveLock, etc. when the wakeup via WakeupToRespondToEmptyCheckpoint
+ // arrives. This could cause a *very rare* deadlock, if not repeated. Most of the cases are
+ // handled in the first iteration.
+ for (BaseMutex* mutex : Locks::expected_mutexes_on_weak_ref_access_) {
+ mutex->WakeupToRespondToEmptyCheckpoint();
+ }
+ static constexpr uint64_t kEmptyCheckpointPeriodicTimeoutMs = 100; // 100ms
+ static constexpr uint64_t kEmptyCheckpointTotalTimeoutMs = 600 * 1000; // 10 minutes.
+ size_t barrier_count = first_iter ? count : 0;
+ first_iter = false; // Don't add to the barrier count from the second iteration on.
+ bool timed_out = barrier->Increment(self, barrier_count, kEmptyCheckpointPeriodicTimeoutMs);
+ if (!timed_out) {
+ break; // Success
+ }
+ // This is a very rare case.
+ total_wait_time += kEmptyCheckpointPeriodicTimeoutMs;
+ if (kIsDebugBuild && total_wait_time > kEmptyCheckpointTotalTimeoutMs) {
+ std::ostringstream ss;
+ ss << "Empty checkpoint timeout\n";
+ ss << "Barrier count " << barrier->GetCount(self) << "\n";
+ ss << "Runnable thread IDs";
+ for (uint32_t tid : runnable_thread_ids) {
+ ss << " " << tid;
+ }
+ ss << "\n";
+ Locks::mutator_lock_->Dump(ss);
+ ss << "\n";
+ LOG(FATAL_WITHOUT_ABORT) << ss.str();
+ // Some threads in 'runnable_thread_ids' are probably stuck. Try to dump their stacks.
+ // Avoid using ThreadList::Dump() initially because it is likely to get stuck as well.
+ {
+ ScopedObjectAccess soa(self);
+ MutexLock mu1(self, *Locks::thread_list_lock_);
+ for (Thread* thread : GetList()) {
+ uint32_t tid = thread->GetThreadId();
+ bool is_in_runnable_thread_ids =
+ std::find(runnable_thread_ids.begin(), runnable_thread_ids.end(), tid) !=
+ runnable_thread_ids.end();
+ if (is_in_runnable_thread_ids &&
+ thread->ReadFlag(kEmptyCheckpointRequest)) {
+ // Found a runnable thread that hasn't responded to the empty checkpoint request.
+ // Assume it's stuck and safe to dump its stack.
+ thread->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT),
+ /*dump_native_stack*/ true,
+ /*backtrace_map*/ nullptr,
+ /*force_dump_stack*/ true);
+ }
+ }
+ }
+ LOG(FATAL_WITHOUT_ABORT)
+ << "Dumped runnable threads that haven't responded to empty checkpoint.";
+ // Now use ThreadList::Dump() to dump more threads, noting it may get stuck.
+ Dump(LOG_STREAM(FATAL_WITHOUT_ABORT));
+ LOG(FATAL) << "Dumped all threads.";
+ }
+ }
+ }
}
// Request that a checkpoint function be run on all active (non-suspended)
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index b60fca1..70917eb 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -109,9 +109,7 @@
// in-flight mutator heap access (eg. a read barrier.) Runnable threads will respond by
// decrementing the empty checkpoint barrier count. This works even when the weak ref access is
// disabled. Only one concurrent use is currently supported.
- // In debug build, runnable_thread_ids will be populated with the thread IDS of the runnable
- // thread to wait for.
- size_t RunEmptyCheckpoint(std::vector<uint32_t>& runnable_thread_ids)
+ void RunEmptyCheckpoint()
REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_);
size_t RunCheckpointOnRunnableThreads(Closure* checkpoint_function)