system_server : trigger runtime restart when we're close to the soft FD limit.
We arbitrarily define the high water mark as 12 descriptors below the soft
limit (1024). When we reach that mark, we dump the list of open descriptors
and restart the system_server.
The list of file descriptors is dumped to a file named /data/anr/anr_fd_xxxxx.
This might be construed as a hack, but it allows us to take advantage of
existing code in dumpstate that will capture this file and add it to any
bugreport that's collected afterwards.
Test: Manual
Bug: 63004717
Change-Id: I4052625574a3ab2df9ddf591f281a412e7b4b511
diff --git a/services/core/java/com/android/server/Watchdog.java b/services/core/java/com/android/server/Watchdog.java
index 6a81d32..8d46d1e 100644
--- a/services/core/java/com/android/server/Watchdog.java
+++ b/services/core/java/com/android/server/Watchdog.java
@@ -18,7 +18,11 @@
import android.app.IActivityController;
import android.os.Binder;
+import android.os.Build;
import android.os.RemoteException;
+import android.system.ErrnoException;
+import android.system.OsConstants;
+import android.system.StructRlimit;
import com.android.internal.os.ZygoteConnectionConstants;
import com.android.server.am.ActivityManagerService;
@@ -45,6 +49,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
@@ -107,6 +112,7 @@
int mPhonePid;
IActivityController mController;
boolean mAllowRestart = true;
+ final OpenFdMonitor mOpenFdMonitor;
/**
* Used for checking status of handle threads and scheduling monitor callbacks.
@@ -269,6 +275,8 @@
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());
+ mOpenFdMonitor = OpenFdMonitor.create();
+
// See the notes on DEFAULT_TIMEOUT.
assert DB ||
DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
@@ -358,7 +366,7 @@
return checkers;
}
- private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
+ private String describeCheckersLocked(List<HandlerChecker> checkers) {
StringBuilder builder = new StringBuilder(128);
for (int i=0; i<checkers.size(); i++) {
if (builder.length() > 0) {
@@ -410,7 +418,7 @@
public void run() {
boolean waitedHalf = false;
while (true) {
- final ArrayList<HandlerChecker> blockedCheckers;
+ final List<HandlerChecker> blockedCheckers;
final String subject;
final boolean allowRestart;
int debuggerWasConnected = 0;
@@ -447,30 +455,40 @@
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
- final int waitState = evaluateCheckerCompletionLocked();
- if (waitState == COMPLETED) {
- // The monitors have returned; reset
- waitedHalf = false;
- continue;
- } else if (waitState == WAITING) {
- // still waiting but within their configured intervals; back off and recheck
- continue;
- } else if (waitState == WAITED_HALF) {
- if (!waitedHalf) {
- // We've waited half the deadlock-detection interval. Pull a stack
- // trace and wait another half.
- ArrayList<Integer> pids = new ArrayList<Integer>();
- pids.add(Process.myPid());
- ActivityManagerService.dumpStackTraces(true, pids, null, null,
- getInterestingNativePids());
- waitedHalf = true;
- }
- continue;
+ boolean fdLimitTriggered = false;
+ if (mOpenFdMonitor != null) {
+ fdLimitTriggered = mOpenFdMonitor.monitor();
}
- // something is overdue!
- blockedCheckers = getBlockedCheckersLocked();
- subject = describeCheckersLocked(blockedCheckers);
+ if (!fdLimitTriggered) {
+ final int waitState = evaluateCheckerCompletionLocked();
+ if (waitState == COMPLETED) {
+ // The monitors have returned; reset
+ waitedHalf = false;
+ continue;
+ } else if (waitState == WAITING) {
+ // still waiting but within their configured intervals; back off and recheck
+ continue;
+ } else if (waitState == WAITED_HALF) {
+ if (!waitedHalf) {
+ // We've waited half the deadlock-detection interval. Pull a stack
+ // trace and wait another half.
+ ArrayList<Integer> pids = new ArrayList<Integer>();
+ pids.add(Process.myPid());
+ ActivityManagerService.dumpStackTraces(true, pids, null, null,
+ getInterestingNativePids());
+ waitedHalf = true;
+ }
+ continue;
+ }
+
+ // something is overdue!
+ blockedCheckers = getBlockedCheckersLocked();
+ subject = describeCheckersLocked(blockedCheckers);
+ } else {
+ blockedCheckers = Collections.emptyList();
+ subject = "Open FD high water mark reached";
+ }
allowRestart = mAllowRestart;
}
@@ -584,4 +602,87 @@
}
private native void native_dumpKernelStacks(String tracesPath);
+
+    /**
+     * Detects when this process is close to its soft limit on open file descriptors.
+     *
+     * <p>Instead of counting the entries of /proc/self/fd, the monitor checks whether one
+     * specific descriptor number, {@code FD_HIGH_WATER_MARK} below the soft limit, is currently
+     * open. When it is, a best-effort dump of the open descriptors is written to the dump
+     * directory and {@link #monitor()} reports the breach to its caller.
+     */
+    public static final class OpenFdMonitor {
+        /**
+         * Number of FDs below the soft limit that we trigger a runtime restart at. This was
+         * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number
+         * of FDs in reserve to complete a dump.
+         */
+        private static final int FD_HIGH_WATER_MARK = 12;
+
+        /** Directory that descriptor dumps are written to. */
+        private final File mDumpDir;
+
+        /** The /proc/self/fd entry whose existence means the high water mark was reached. */
+        private final File mFdHighWaterMark;
+
+        /**
+         * Creates a monitor for the current process.
+         *
+         * @return a new monitor, or {@code null} if the build is not debuggable, the
+         *         {@code dalvik.vm.stack-trace-dir} property is empty, or the descriptor limit
+         *         could not be read.
+         */
+        public static OpenFdMonitor create() {
+            // Only run the FD monitor on debuggable builds (such as userdebug and eng builds).
+            if (!Build.IS_DEBUGGABLE) {
+                return null;
+            }
+
+            // Don't run the FD monitor on builds that have a global ANR trace file. We're using
+            // the ANR trace directory as a quick hack in order to get these traces in bugreports
+            // and we wouldn't want to overwrite something important.
+            final String dumpDirStr = SystemProperties.get("dalvik.vm.stack-trace-dir", "");
+            if (dumpDirStr.isEmpty()) {
+                return null;
+            }
+
+            final StructRlimit rlimit;
+            try {
+                rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE);
+            } catch (ErrnoException errno) {
+                Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno);
+                return null;
+            }
+
+            // The assumption we're making here is that FD numbers are allocated (more or less)
+            // sequentially, which is currently (and historically) true since open is currently
+            // specified to always return the lowest-numbered non-open file descriptor for the
+            // current process.
+            //
+            // We do this to avoid having to enumerate the contents of /proc/self/fd in order to
+            // count the number of descriptors open in the process.
+            final File fdThreshold =
+                    new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK));
+            return new OpenFdMonitor(new File(dumpDirStr), fdThreshold);
+        }
+
+        /**
+         * @param dumpDir directory that descriptor dumps are written to.
+         * @param fdThreshold file whose existence signals that the high water mark was reached.
+         */
+        OpenFdMonitor(File dumpDir, File fdThreshold) {
+            mDumpDir = dumpDir;
+            mFdHighWaterMark = fdThreshold;
+        }
+
+        /**
+         * Runs {@code lsof} on this process and redirects its output (stdout and stderr) to a
+         * new {@code anr_fd_*} file in the dump directory. This is best effort: failures are
+         * logged and never propagated to the caller.
+         */
+        private void dumpOpenDescriptors() {
+            // Tracked outside the try block so the failure path below can clean it up.
+            File dumpFile = null;
+            try {
+                dumpFile = File.createTempFile("anr_fd_", "", mDumpDir);
+                java.lang.Process proc = new ProcessBuilder()
+                    .command("/system/bin/lsof", "-p", String.valueOf(Process.myPid()))
+                    .redirectErrorStream(true)
+                    .redirectOutput(dumpFile)
+                    .start();
+
+                int returnCode = proc.waitFor();
+                if (returnCode != 0) {
+                    Slog.w(TAG, "Unable to dump open descriptors, lsof return code: "
+                        + returnCode);
+                    dumpFile.delete();
+                }
+            } catch (IOException ex) {
+                Slog.w(TAG, "Unable to dump open descriptors: " + ex);
+                // An IOException here means either the temp file could not be created or lsof
+                // could not be started. In the latter case the temp file already exists but
+                // will never receive any output, so don't leave an empty dump behind.
+                if (dumpFile != null) {
+                    dumpFile.delete();
+                }
+            } catch (InterruptedException ex) {
+                // lsof was started and may still be writing to the file, so keep it.
+                Slog.w(TAG, "Unable to dump open descriptors: " + ex);
+            }
+        }
+
+        /**
+         * Checks whether the high water mark has been reached and, if so, attempts to dump the
+         * list of open descriptors.
+         *
+         * @return {@code true} if the high water mark was breached (a dump is attempted, but is
+         *         not guaranteed to have been written), {@code false} otherwise.
+         */
+        public boolean monitor() {
+            if (mFdHighWaterMark.exists()) {
+                dumpOpenDescriptors();
+                return true;
+            }
+
+            return false;
+        }
+    }
}