Merge change 5967 into donut
* changes:
Modifies OpenSSLSocketImpl to use a different lock for the instance count. It was using the same lock when use around native methods meaning that the finalizer could be blocked unnecessarily resulting in a VM crash.
diff --git a/libcore/luni-kernel/src/main/native/java_lang_ProcessManager.c b/libcore/luni-kernel/src/main/native/java_lang_ProcessManager.c
index ee2fc58..fcff1df 100644
--- a/libcore/luni-kernel/src/main/native/java_lang_ProcessManager.c
+++ b/libcore/luni-kernel/src/main/native/java_lang_ProcessManager.c
@@ -16,6 +16,7 @@
#define LOG_TAG "ProcessManager"
+#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
@@ -24,7 +25,6 @@
#include <stdlib.h>
#include <string.h>
#include <errno.h>
-#include <dirent.h>
#include "jni.h"
#include "JNIHelp.h"
@@ -155,31 +155,21 @@
}
}
-/** Close all open fds > 2 (i.e. everything but stdin/out/err). */
+/** Close all open fds > 2 (i.e. everything but stdin/out/err), != skipFd. */
static void closeNonStandardFds(int skipFd) {
- DIR* dir = opendir("/proc/self/fd");
+ struct rlimit rlimit;
+ getrlimit(RLIMIT_NOFILE, &rlimit);
- if (dir == NULL) {
- // Print message to standard err. The parent process can read this
- // from Process.getErrorStream().
- perror("opendir");
- return;
- }
-
- struct dirent* entry;
- int dirFd = dirfd(dir);
- while ((entry = readdir(dir)) != NULL) {
- int fd = atoi(entry->d_name);
- if (fd > 2 && fd != dirFd && fd != skipFd
+ int fd;
+ for (fd = 3; fd < rlimit.rlim_max; fd++) {
+ if (fd != skipFd
#ifdef ANDROID
&& fd != androidSystemPropertiesFd
#endif
) {
close(fd);
- }
+ }
}
-
- closedir(dir);
}
#define PIPE_COUNT (4) // number of pipes used to communicate with child proc
@@ -233,6 +223,12 @@
// If this is the child process...
if (childPid == 0) {
+ /*
+ * Note: We cannot malloc() or free() after this point!
+ * A no-longer-running thread may be holding on to the heap lock, and
+ * an attempt to malloc() or free() would result in deadlock.
+ */
+
// Replace stdin, out, and err with pipes.
dup2(stdinIn, 0);
dup2(stdoutOut, 1);
diff --git a/vm/Debugger.c b/vm/Debugger.c
index c667893..3affa97 100644
--- a/vm/Debugger.c
+++ b/vm/Debugger.c
@@ -2697,10 +2697,10 @@
dvmUnlockThreadList();
/*
- * We change our thread status (which should be THREAD_RUNNING) so the
- * VM can suspend for a GC if the invoke request causes us to run out
- * of memory. It's also a good idea to change it before locking the
- * invokeReq mutex, although that should never be held for long.
+ * We change our (JDWP thread) status, which should be THREAD_RUNNING,
+ * so the VM can suspend for a GC if the invoke request causes us to
+ * run out of memory. It's also a good idea to change it before locking
+ * the invokeReq mutex, although that should never be held for long.
*/
Thread* self = dvmThreadSelf();
int oldStatus = dvmChangeStatus(self, THREAD_VMWAIT);
@@ -2774,18 +2774,25 @@
/*
* Execute the method described by "*pReq".
+ *
+ * We're currently in VMWAIT, because we're stopped on a breakpoint. We
+ * want to switch to RUNNING while we execute.
*/
void dvmDbgExecuteMethod(DebugInvokeReq* pReq)
{
Thread* self = dvmThreadSelf();
const Method* meth;
Object* oldExcept;
+ int oldStatus;
/*
* We can be called while an exception is pending in the VM. We need
* to preserve that across the method invocation.
*/
oldExcept = dvmGetException(self);
+ dvmClearException(self);
+
+ oldStatus = dvmChangeStatus(self, THREAD_RUNNING);
/*
* Translate the method through the vtable, unless we're calling a
@@ -2832,6 +2839,7 @@
if (oldExcept != NULL)
dvmSetException(self, oldExcept);
+ dvmChangeStatus(self, oldStatus);
}
// for dvmAddressSetForLine
diff --git a/vm/Globals.h b/vm/Globals.h
index 79b9e91..a2de271 100644
--- a/vm/Globals.h
+++ b/vm/Globals.h
@@ -321,7 +321,7 @@
/*
* The thread code grabs this before suspending all threads. There
- * are four things that can cause a "suspend all":
+ * are a few things that can cause a "suspend all":
* (1) the GC is starting;
* (2) the debugger has sent a "suspend all" request;
* (3) a thread has hit a breakpoint or exception that the debugger
@@ -332,9 +332,6 @@
* do a blocking "lock" call on this mutex -- if it has been acquired,
* somebody is probably trying to put you to sleep. The leading '_' is
* intended as a reminder that this lock is special.
- *
- * This lock is also held while attaching an externally-created thread
- * through JNI. That way we can correctly set the initial suspend state.
*/
pthread_mutex_t _threadSuspendLock;
diff --git a/vm/Native.c b/vm/Native.c
index 7a153d6..dcf68b6 100644
--- a/vm/Native.c
+++ b/vm/Native.c
@@ -423,9 +423,21 @@
* - check to see if the library is valid
* - check config/prelink-linux-arm.map to ensure that the library
* is listed and is not being overrun by the previous entry (if
- * loading suddenly stops working, this is a good one to check)
+ * loading suddenly stops working on a prelinked library, this is
+ * a good one to check)
+ * - write a trivial app that calls sleep() then dlopen(), attach
+ * to it with "strace -p <pid>" while it sleeps, and watch for
+ * attempts to open nonexistent dependent shared libs
+ *
+ * This can execute slowly for a large library on a busy system, so we
+ * want to switch from RUNNING to VMWAIT while it executes. This allows
+ * the GC to ignore us.
*/
+ Thread* self = dvmThreadSelf();
+ int oldStatus = dvmChangeStatus(self, THREAD_VMWAIT);
handle = dlopen(pathName, RTLD_LAZY);
+ dvmChangeStatus(self, oldStatus);
+
if (handle == NULL) {
LOGI("Unable to dlopen(%s): %s\n", pathName, dlerror());
return false;
@@ -461,9 +473,9 @@
Object* prevOverride = self->classLoaderOverride;
self->classLoaderOverride = classLoader;
- dvmChangeStatus(NULL, THREAD_NATIVE);
+ oldStatus = dvmChangeStatus(self, THREAD_NATIVE);
version = (*func)(gDvm.vmList, NULL);
- dvmChangeStatus(NULL, THREAD_RUNNING);
+ dvmChangeStatus(self, oldStatus);
self->classLoaderOverride = prevOverride;
if (version != JNI_VERSION_1_2 && version != JNI_VERSION_1_4 &&
diff --git a/vm/Thread.c b/vm/Thread.c
index 0adde38..ee195eb 100644
--- a/vm/Thread.c
+++ b/vm/Thread.c
@@ -481,6 +481,20 @@
assert(cc == 0);
}
+/*
+ * Convert SuspendCause to a string.
+ */
+static const char* getSuspendCauseStr(SuspendCause why)
+{
+ switch (why) {
+ case SUSPEND_NOT: return "NOT?";
+ case SUSPEND_FOR_GC: return "gc";
+ case SUSPEND_FOR_DEBUG: return "debug";
+ case SUSPEND_FOR_DEBUG_EVENT: return "debug-event";
+ case SUSPEND_FOR_STACK_DUMP: return "stack-dump";
+ default: return "UNKNOWN";
+ }
+}
/*
* Grab the "thread suspend" lock. This is required to prevent the
@@ -492,7 +506,6 @@
*/
static void lockThreadSuspend(const char* who, SuspendCause why)
{
- const int kMaxRetries = 10;
const int kSpinSleepTime = 3*1000*1000; /* 3s */
u8 startWhen = 0; // init req'd to placate gcc
int sleepIter = 0;
@@ -503,23 +516,37 @@
if (cc != 0) {
if (!dvmCheckSuspendPending(NULL)) {
/*
- * Could be unusual JNI-attach thing, could be we hit
- * the window as the suspend or resume was started. Could
- * also be the debugger telling us to resume at roughly
+ * Could be that a resume-all is in progress, and something
+ * grabbed the CPU when the wakeup was broadcast. The thread
+ * performing the resume hasn't had a chance to release the
+ * thread suspend lock. (Should no longer be an issue --
+ * we now release before broadcast.)
+ *
+ * Could be we hit the window as a suspend was started,
+ * and the lock has been grabbed but the suspend counts
+ * haven't been incremented yet.
+ *
+ * Could be an unusual JNI thread-attach thing.
+ *
+ * Could be the debugger telling us to resume at roughly
* the same time we're posting an event.
*/
- LOGI("threadid=%d ODD: thread-suspend lock held (%s:%d)"
- " but suspend not pending\n",
- dvmThreadSelf()->threadId, who, why);
+ LOGI("threadid=%d ODD: want thread-suspend lock (%s:%s),"
+ " it's held, no suspend pending\n",
+ dvmThreadSelf()->threadId, who, getSuspendCauseStr(why));
+ } else {
+ /* we suspended; reset timeout */
+ sleepIter = 0;
}
/* give the lock-holder a chance to do some work */
if (sleepIter == 0)
startWhen = dvmGetRelativeTimeUsec();
if (!dvmIterativeSleep(sleepIter++, kSpinSleepTime, startWhen)) {
- LOGE("threadid=%d: couldn't get thread-suspend lock (%s:%d),"
+ LOGE("threadid=%d: couldn't get thread-suspend lock (%s:%s),"
" bailing\n",
- dvmThreadSelf()->threadId, who, why);
+ dvmThreadSelf()->threadId, who, getSuspendCauseStr(why));
+ /* threads are not suspended, thread dump could crash */
dvmDumpAllThreads(false);
dvmAbort();
}
@@ -1224,9 +1251,13 @@
*/
dvmUnlockThreadList();
- if (pthread_create(&threadHandle, &threadAttr, interpThreadStart,
- newThread) != 0)
- {
+ int cc, oldStatus;
+ oldStatus = dvmChangeStatus(self, THREAD_VMWAIT);
+ cc = pthread_create(&threadHandle, &threadAttr, interpThreadStart,
+ newThread);
+ oldStatus = dvmChangeStatus(self, oldStatus);
+
+ if (cc != 0) {
/*
* Failure generally indicates that we have exceeded system
* resource limits. VirtualMachineError is probably too severe,
@@ -1271,7 +1302,8 @@
*
* The easiest way to deal with this is to prevent the new thread from
* running until the parent says it's okay. This results in the
- * following sequence of events for a "badly timed" GC:
+ * following (correct) sequence of events for a "badly timed" GC
+ * (where '-' is us, 'o' is the child, and '+' is some other thread):
*
* - call pthread_create()
* - lock thread list
@@ -2202,8 +2234,15 @@
&gDvm.threadSuspendCountLock);
assert(cc == 0);
if (self->suspendCount != 0) {
- LOGD("threadid=%d: still suspended after undo (s=%d d=%d)\n",
- self->threadId, self->suspendCount, self->dbgSuspendCount);
+ /*
+ * The condition was signaled but we're still suspended. This
+ * can happen if the debugger lets go while a SIGQUIT thread
+ * dump event is pending (assuming SignalCatcher was resumed for
+ * just long enough to try to grab the thread-suspend lock).
+ */
+ LOGD("threadid=%d: still suspended after undo (sc=%d dc=%d s=%c)\n",
+ self->threadId, self->suspendCount, self->dbgSuspendCount,
+ self->isSuspended ? 'Y' : 'N');
}
}
assert(self->suspendCount == 0 && self->dbgSuspendCount == 0);
@@ -2308,6 +2347,7 @@
{
const int kMaxRetries = 10;
const int kSpinSleepTime = 750*1000; /* 0.75s */
+ bool complained = false;
int sleepIter = 0;
int retryCount = 0;
@@ -2322,6 +2362,7 @@
self->threadId, (int)self->handle,
thread->threadId, (int)thread->handle);
dumpWedgedThread(thread);
+ complained = true;
// keep going; could be slow due to valgrind
sleepIter = 0;
@@ -2334,6 +2375,11 @@
}
}
}
+
+ if (complained) {
+ LOGW("threadid=%d: spin on suspend resolved\n", self->threadId);
+ //dvmDumpThread(thread, false); /* suspended, so dump is safe */
+ }
}
/*
@@ -2470,7 +2516,9 @@
/* debugger events don't suspend JDWP thread */
if ((why == SUSPEND_FOR_DEBUG || why == SUSPEND_FOR_DEBUG_EVENT) &&
thread->handle == dvmJdwpGetDebugThread(gDvm.jdwpState))
+ {
continue;
+ }
if (thread->suspendCount > 0) {
thread->suspendCount--;
@@ -2485,6 +2533,43 @@
dvmUnlockThreadList();
/*
+ * In some ways it makes sense to continue to hold the thread-suspend
+ * lock while we issue the wakeup broadcast. It allows us to complete
+ * one operation before moving on to the next, which simplifies the
+ * thread activity debug traces.
+ *
+ * This approach caused us some difficulty under Linux, because the
+ * condition variable broadcast not only made the threads runnable,
+ * but actually caused them to execute, and it was a while before
+ * the thread performing the wakeup had an opportunity to release the
+ * thread-suspend lock.
+ *
+ * This is a problem because, when a thread tries to acquire that
+ * lock, it times out after 3 seconds. If at some point the thread
+ * is told to suspend, the clock resets; but since the VM is still
+ * theoretically mid-resume, there's no suspend pending. If, for
+ * example, the GC was waking threads up while the SIGQUIT handler
+ * was trying to acquire the lock, we would occasionally time out on
+ * a busy system and SignalCatcher would abort.
+ *
+ * We now perform the unlock before the wakeup broadcast. The next
+ * suspend can't actually start until the broadcast completes and
+ * returns, because we're holding the thread-suspend-count lock, but the
+ * suspending thread is now able to make progress and we avoid the abort.
+ *
+ * (Technically there is a narrow window between when we release
+ * the thread-suspend lock and grab the thread-suspend-count lock.
+ * This could cause us to send a broadcast to threads with nonzero
+ * suspend counts, but this is expected and they'll all just fall
+ * right back to sleep. It's probably safe to grab the suspend-count
+ * lock before releasing thread-suspend, since we're still following
+ * the correct order of acquisition, but it feels weird.)
+ */
+
+ LOG_THREAD("threadid=%d: ResumeAll waking others\n", self->threadId);
+ unlockThreadSuspend();
+
+ /*
* Broadcast a notification to all suspended threads, some or all of
* which may choose to wake up. No need to wait for them.
*/
@@ -2493,8 +2578,6 @@
assert(cc == 0);
unlockThreadSuspendCount();
- unlockThreadSuspend();
-
LOG_THREAD("threadid=%d: ResumeAll complete\n", self->threadId);
}
@@ -2962,9 +3045,9 @@
threadName, isDaemon ? " daemon" : "",
priority, thread->threadId, kStatusNames[thread->status]);
dvmPrintDebugMessage(target,
- " | group=\"%s\" sCount=%d dsCount=%d s=%d obj=%p\n",
+ " | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p self=%p\n",
groupName, thread->suspendCount, thread->dbgSuspendCount,
- thread->isSuspended, thread->threadObj);
+ thread->isSuspended ? 'Y' : 'N', thread->threadObj, thread);
dvmPrintDebugMessage(target,
" | sysTid=%d nice=%d sched=%d/%d handle=%d\n",
thread->systemTid, getpriority(PRIO_PROCESS, thread->systemTid),
diff --git a/vm/interp/Stack.c b/vm/interp/Stack.c
index 103d2b4..d5c5fe9 100644
--- a/vm/interp/Stack.c
+++ b/vm/interp/Stack.c
@@ -351,7 +351,8 @@
#ifndef NDEBUG
if (self->status != THREAD_RUNNING) {
- LOGW("Status=%d on call to %s.%s -\n", self->status,
+ LOGW("threadid=%d: status=%d on call to %s.%s -\n",
+ self->threadId, self->status,
method->clazz->descriptor, method->name);
}
#endif