Reduce VM aborts during high CPU stress. The VM has some timeouts that are meant to kill the current process if something gets stuck (e.g. a thread grabs a lock and then manages to die while the rest of the process continues on). These were tripping a little too easily during some high-load situations. This changes the order of operations so that we now unlock the "thread suspend" lock before sending a wakeup broadcast to the condition variable that threads sleep on. This should make it less likely for a thread to be running for an extended period while the lock is held. (Relates to internal bug 1664687.) This also wraps a couple of things (pthread_create, dlopen) with a state change to VMWAIT. During high load situations these can take a while to complete, and we would (with the K-Means Visualizer load generator running) very occasionally time out. Augmented the debug output in a couple of minor ways. Updated comments.

commit: 2aa43610c391868eb6ef80bf3b1f947776defcca [log] [tgz]
author: Andy McFadden <fadden@android.com> Wed Jun 17 16:29:30 2009 -0700
committer: Andy McFadden <fadden@android.com> Wed Jun 17 16:55:40 2009 -0700
tree: 3a212711da1c0177b47059d66994b6cffbe3e568
parent: eba51c25771a07993224e5ae74e8cac715985ba6 [diff] [blame]
diff --git a/vm/Thread.c b/vm/Thread.c
index 6420533..ae81230 100644
--- a/vm/Thread.c
+++ b/vm/Thread.c

@@ -527,9 +527,15 @@
         if (cc != 0) {
             if (!dvmCheckSuspendPending(NULL)) {
                 /*
-                 * Could be we hit the window as the suspend or resume
-                 * was started (i.e. the lock has been grabbed but the
-                 * other thread hasn't yet set our "please suspend" flag).
+                 * Could be that a resume-all is in progress, and something
+                 * grabbed the CPU when the wakeup was broadcast.  The thread
+                 * performing the resume hasn't had a chance to release the
+                 * thread suspend lock.  (Should no longer be an issue --
+                 * we now release before broadcast.)
+                 *
+                 * Could be we hit the window as a suspend was started,
+                 * and the lock has been grabbed but the suspend counts
+                 * haven't been incremented yet.
                  *
                  * Could be an unusual JNI thread-attach thing.
                  *
@@ -551,6 +557,7 @@
                 LOGE("threadid=%d: couldn't get thread-suspend lock (%s:%s),"
                      " bailing\n",
                     dvmThreadSelf()->threadId, who, getSuspendCauseStr(why));
+                /* threads are not suspended, thread dump could crash */
                 dvmDumpAllThreads(false);
                 dvmAbort();
             }
@@ -1260,9 +1267,13 @@
      */
     dvmUnlockThreadList();
 
-    if (pthread_create(&threadHandle, &threadAttr, interpThreadStart,
-            newThread) != 0)
-    {
+    int cc, oldStatus;
+    oldStatus = dvmChangeStatus(self, THREAD_VMWAIT);
+    cc = pthread_create(&threadHandle, &threadAttr, interpThreadStart,
+            newThread);
+    oldStatus = dvmChangeStatus(self, oldStatus);
+
+    if (cc != 0) {
         /*
          * Failure generally indicates that we have exceeded system
          * resource limits.  VirtualMachineError is probably too severe,
@@ -1307,7 +1318,8 @@
      *
      * The easiest way to deal with this is to prevent the new thread from
      * running until the parent says it's okay.  This results in the
-     * following sequence of events for a "badly timed" GC:
+     * following (correct) sequence of events for a "badly timed" GC
+     * (where '-' is us, 'o' is the child, and '+' is some other thread):
      *
      *  - call pthread_create()
      *  - lock thread list
@@ -2359,6 +2371,7 @@
 {
     const int kMaxRetries = 10;
     int spinSleepTime = FIRST_SLEEP;
+    bool complained = false;
 
     int sleepIter = 0;
     int retryCount = 0;
@@ -2384,6 +2397,7 @@
                 self->threadId, (int)self->handle,
                 thread->threadId, (int)thread->handle);
             dumpWedgedThread(thread);
+            complained = true;
 
             // keep going; could be slow due to valgrind
             sleepIter = 0;
@@ -2397,6 +2411,11 @@
             }
         }
     }
+
+    if (complained) {
+        LOGW("threadid=%d: spin on suspend resolved\n", self->threadId);
+        //dvmDumpThread(thread, false);   /* suspended, so dump is safe */
+    }
 }
 
 /*
@@ -2533,7 +2552,9 @@
         /* debugger events don't suspend JDWP thread */
         if ((why == SUSPEND_FOR_DEBUG || why == SUSPEND_FOR_DEBUG_EVENT) &&
             thread->handle == dvmJdwpGetDebugThread(gDvm.jdwpState))
+        {
             continue;
+        }
 
         if (thread->suspendCount > 0) {
             dvmAddToThreadSuspendCount(&thread->suspendCount, -1);
@@ -2548,6 +2569,43 @@
     dvmUnlockThreadList();
 
     /*
+     * In some ways it makes sense to continue to hold the thread-suspend
+     * lock while we issue the wakeup broadcast.  It allows us to complete
+     * one operation before moving on to the next, which simplifies the
+     * thread activity debug traces.
+     *
+     * This approach caused us some difficulty under Linux, because the
+     * condition variable broadcast not only made the threads runnable,
+     * but actually caused them to execute, and it was a while before
+     * the thread performing the wakeup had an opportunity to release the
+     * thread-suspend lock.
+     *
+     * This is a problem because, when a thread tries to acquire that
+     * lock, it times out after 3 seconds.  If at some point the thread
+     * is told to suspend, the clock resets; but since the VM is still
+     * theoretically mid-resume, there's no suspend pending.  If, for
+     * example, the GC was waking threads up while the SIGQUIT handler
+     * was trying to acquire the lock, we would occasionally time out on
+     * a busy system and SignalCatcher would abort.
+     *
+     * We now perform the unlock before the wakeup broadcast.  The next
+     * suspend can't actually start until the broadcast completes and
+     * returns, because we're holding the thread-suspend-count lock, but the
+     * suspending thread is now able to make progress and we avoid the abort.
+     *
+     * (Technically there is a narrow window between when we release
+     * the thread-suspend lock and grab the thread-suspend-count lock.
+     * This could cause us to send a broadcast to threads with nonzero
+     * suspend counts, but this is expected and they'll all just fall
+     * right back to sleep.  It's probably safe to grab the suspend-count
+     * lock before releasing thread-suspend, since we're still following
+     * the correct order of acquisition, but it feels weird.)
+     */
+
+    LOG_THREAD("threadid=%d: ResumeAll waking others\n", self->threadId);
+    unlockThreadSuspend();
+
+    /*
      * Broadcast a notification to all suspended threads, some or all of
      * which may choose to wake up.  No need to wait for them.
      */
@@ -2556,8 +2614,6 @@
     assert(cc == 0);
     unlockThreadSuspendCount();
 
-    unlockThreadSuspend();
-
     LOG_THREAD("threadid=%d: ResumeAll complete\n", self->threadId);
 }
 
@@ -3026,9 +3082,9 @@
         threadName, isDaemon ? " daemon" : "",
         priority, thread->threadId, kStatusNames[thread->status]);
     dvmPrintDebugMessage(target,
-        "  | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p\n",
+        "  | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p self=%p\n",
         groupName, thread->suspendCount, thread->dbgSuspendCount,
-        thread->isSuspended ? 'Y' : 'N', thread->threadObj);
+        thread->isSuspended ? 'Y' : 'N', thread->threadObj, thread);
     dvmPrintDebugMessage(target,
         "  | sysTid=%d nice=%d sched=%d/%d handle=%d\n",
         thread->systemTid, getpriority(PRIO_PROCESS, thread->systemTid),
commit	2aa43610c391868eb6ef80bf3b1f947776defcca	[log] [tgz]
author	Andy McFadden <fadden@android.com>	Wed Jun 17 16:29:30 2009 -0700
committer	Andy McFadden <fadden@android.com>	Wed Jun 17 16:55:40 2009 -0700
tree	3a212711da1c0177b47059d66994b6cffbe3e568
parent	eba51c25771a07993224e5ae74e8cac715985ba6 [diff] [blame]