Dumping stack traces to proto.

Bug: 72177715
Test: flash device and check incident.proto output
Change-Id: Id2a15e0fc62b66efe875949af97f0eb651c7e322
diff --git a/Android.bp b/Android.bp
index 1b9210c..1caa497 100644
--- a/Android.bp
+++ b/Android.bp
@@ -813,6 +813,7 @@
     ],
 
     srcs: [
+        "core/proto/android/os/backtrace.proto",
         "core/proto/android/os/batterytype.proto",
         "core/proto/android/os/cpufreq.proto",
         "core/proto/android/os/cpuinfo.proto",
diff --git a/cmds/incidentd/Android.mk b/cmds/incidentd/Android.mk
index 3a47fe1..008a1bf 100644
--- a/cmds/incidentd/Android.mk
+++ b/cmds/incidentd/Android.mk
@@ -15,8 +15,10 @@
 LOCAL_PATH:= $(call my-dir)
 
 # proto files used in incidentd to generate cppstream proto headers.
-PROTO_FILES:= frameworks/base/core/proto/android/util/log.proto \
-        frameworks/base/core/proto/android/os/data.proto
+PROTO_FILES:= \
+        frameworks/base/core/proto/android/os/backtrace.proto \
+        frameworks/base/core/proto/android/os/data.proto \
+        frameworks/base/core/proto/android/util/log.proto
 
 # ========= #
 # incidentd #
@@ -46,6 +48,8 @@
         libbase \
         libbinder \
         libcutils \
+        libdebuggerd_client \
+        libdumputils \
         libincident \
         liblog \
         libprotobuf-cpp-lite \
@@ -119,6 +123,8 @@
     libbase \
     libbinder \
     libcutils \
+    libdebuggerd_client \
+    libdumputils \
     libincident \
     liblog \
     libprotobuf-cpp-lite \
diff --git a/cmds/incidentd/incidentd.rc b/cmds/incidentd/incidentd.rc
index 6dd8114..9c16a1c 100644
--- a/cmds/incidentd/incidentd.rc
+++ b/cmds/incidentd/incidentd.rc
@@ -16,6 +16,7 @@
     class main
     user incidentd
     group incidentd log readproc
+    capabilities KILL SYS_PTRACE
 
 on post-fs-data
     # Create directory for incidentd
diff --git a/cmds/incidentd/src/FdBuffer.cpp b/cmds/incidentd/src/FdBuffer.cpp
index 64da677..3570144 100644
--- a/cmds/incidentd/src/FdBuffer.cpp
+++ b/cmds/incidentd/src/FdBuffer.cpp
@@ -87,6 +87,39 @@
     return NO_ERROR;
 }
 
+// Unlike read(), this takes no timeout: it keeps calling ::read on 'fd' until EOF, a read
+// error, or until the buffer hits its size cap (in which case mTruncated is set).
+status_t FdBuffer::readFully(int fd) {
+    mStartTime = uptimeMillis();
+
+    while (true) {
+        if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) {
+            // Don't let it get too big.
+            mTruncated = true;
+            VLOG("Truncating data");
+            break;
+        }
+        if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
+
+        ssize_t amt =
+                TEMP_FAILURE_RETRY(::read(fd, mBuffer.writeBuffer(), mBuffer.currentToWrite()));
+        if (amt < 0) {
+            // Save errno before logging; the logging call may overwrite it.
+            const int readErrno = errno;
+            VLOG("Fail to read %d: %s", fd, strerror(readErrno));
+            return -readErrno;
+        } else if (amt == 0) {
+            VLOG("Done reading %zu bytes", mBuffer.size());
+            // We're done.
+            break;
+        }
+        mBuffer.wp()->move(amt);
+    }
+
+    mFinishTime = uptimeMillis();
+    return NO_ERROR;
+}
+
 status_t FdBuffer::readProcessedDataInStream(int fd, int toFd, int fromFd, int64_t timeoutMs,
                                              const bool isSysfs) {
     struct pollfd pfds[] = {
diff --git a/cmds/incidentd/src/FdBuffer.h b/cmds/incidentd/src/FdBuffer.h
index 66a3de1..34ebcf5 100644
--- a/cmds/incidentd/src/FdBuffer.h
+++ b/cmds/incidentd/src/FdBuffer.h
@@ -41,6 +41,12 @@
     status_t read(int fd, int64_t timeoutMs);
 
     /**
+     * Read the data until we hit eof or the buffer reaches its maximum size (the data is
+     * then marked as truncated). Returns NO_ERROR if there were no errors.
+     */
+    status_t readFully(int fd);
+
+    /**
      * Read processed results by streaming data to a parsing process, e.g. incident helper.
      * The parsing process provides IO fds which are 'toFd' and 'fromFd'. The function
      * reads original data in 'fd' and writes to parsing process through 'toFd', then it reads
diff --git a/cmds/incidentd/src/Section.cpp b/cmds/incidentd/src/Section.cpp
index 334d77c..509ba0d 100644
--- a/cmds/incidentd/src/Section.cpp
+++ b/cmds/incidentd/src/Section.cpp
@@ -18,13 +18,19 @@
 
 #include "Section.h"
 
+#include <dirent.h>
+#include <errno.h>
 #include <wait.h>
 
 #include <mutex>
+#include <set>
 
 #include <android-base/file.h>
+#include <android-base/stringprintf.h>
 #include <android/util/protobuf.h>
 #include <binder/IServiceManager.h>
+#include <debuggerd/client.h>
+#include <dumputils/dump_utils.h>
 #include <log/log_event_list.h>
 #include <log/log_read.h>
 #include <log/logprint.h>
@@ -33,6 +39,7 @@
 #include "FdBuffer.h"
 #include "Privacy.h"
 #include "PrivacyBuffer.h"
+#include "frameworks/base/core/proto/android/os/backtrace.proto.h"
 #include "frameworks/base/core/proto/android/os/data.proto.h"
 #include "frameworks/base/core/proto/android/util/log.proto.h"
 #include "incidentd_util.h"
@@ -95,6 +102,7 @@
     return WriteFully(fd, buf, p - buf) ? NO_ERROR : -errno;
 }
 
+// Reads data from FdBuffer and writes it to the requests file descriptor.
 static status_t write_report_requests(const int id, const FdBuffer& buffer,
                                       ReportRequestSet* requests) {
     status_t err = -EBADF;
@@ -387,6 +395,7 @@
 
     return NO_ERROR;
 }
+
 // ================================================================================
 struct WorkerThreadData : public virtual RefBase {
     const WorkerThreadSection* section;
@@ -413,7 +422,8 @@
 WorkerThreadData::~WorkerThreadData() {}
 
 // ================================================================================
-WorkerThreadSection::WorkerThreadSection(int id) : Section(id) {}
+WorkerThreadSection::WorkerThreadSection(int id, const int64_t timeoutMs)
+    : Section(id, timeoutMs) {}
 
 WorkerThreadSection::~WorkerThreadSection() {}
 
@@ -594,7 +604,7 @@
         return readStatus;
     }
 
-    // TODO: wait for command here has one trade-off: the failed status of command won't be detected
-    // until buffer timeout, but it has advatage on starting the data stream earlier.
+    // Waiting for command here has one trade-off: the failed status of command won't be detected
+    // until buffer timeout, but it has the advantage of starting the data stream earlier.
     status_t cmdStatus = wait_child(cmdPid);
     status_t ihStatus = wait_child(ihPid);
@@ -694,7 +704,6 @@
 }
 
 status_t LogSection::BlockingCall(int pipeWriteFd) const {
-    status_t err = NO_ERROR;
     // Open log buffer and getting logs since last retrieved time if any.
     unique_ptr<logger_list, void (*)(logger_list*)> loggers(
             gLastLogsRetrieved.find(mLogID) == gLastLogsRetrieved.end()
@@ -705,15 +714,16 @@
 
     if (android_logger_open(loggers.get(), mLogID) == NULL) {
         ALOGW("LogSection %s: Can't get logger.", this->name.string());
-        return err;
+        return NO_ERROR;
     }
 
     log_msg msg;
     log_time lastTimestamp(0);
 
+    status_t err = NO_ERROR;
     ProtoOutputStream proto;
     while (true) {  // keeps reading until logd buffer is fully read.
-        status_t err = android_logger_list_read(loggers.get(), &msg);
+        err = android_logger_list_read(loggers.get(), &msg);
         // err = 0 - no content, unexpected connection drop or EOF.
         // err = +ive number - size of retrieved data from logger
         // err = -ive number, OS supplied error _except_ for -EAGAIN
@@ -814,3 +824,148 @@
     proto.flush(pipeWriteFd);
     return err;
 }
+
+// ================================================================================
+
+TombstoneSection::TombstoneSection(int id, const char* type, const int64_t timeoutMs)
+    : WorkerThreadSection(id, timeoutMs), mType(type) {
+    name += "tombstone ";
+    name += type;
+}
+
+TombstoneSection::~TombstoneSection() {}
+
+// Walks /proc and, for every process matching mType ("java", "native" or "hal"), forks a
+// child that dumps the process' backtrace into a pipe. Each dump is appended to a
+// BackTraceProto, which is flushed to pipeWriteFd once the walk ends.
+// Returns NO_ERROR, or an error status if the walk had to stop early; stacks collected
+// before the failure are still flushed.
+status_t TombstoneSection::BlockingCall(int pipeWriteFd) const {
+    std::unique_ptr<DIR, decltype(&closedir)> proc(opendir("/proc"), closedir);
+    if (proc.get() == nullptr) {
+        // Save errno before logging; the logging call may overwrite it.
+        const int savedErrno = errno;
+        ALOGE("opendir /proc failed: %s\n", strerror(savedErrno));
+        return -savedErrno;
+    }
+
+    const std::set<int> hal_pids = get_interesting_hal_pids();
+
+    ProtoOutputStream proto;
+    struct dirent* d;
+    status_t err = NO_ERROR;
+    while ((d = readdir(proc.get()))) {
+        int pid = atoi(d->d_name);
+        if (pid <= 0) {
+            continue;
+        }
+
+        const std::string link_name = android::base::StringPrintf("/proc/%d/exe", pid);
+        std::string exe;
+        if (!android::base::Readlink(link_name, &exe)) {
+            ALOGE("Can't read '%s': %s\n", link_name.c_str(), strerror(errno));
+            continue;
+        }
+
+        bool is_java_process;
+        if (exe == "/system/bin/app_process32" || exe == "/system/bin/app_process64") {
+            if (mType != "java") continue;
+            // Don't bother dumping backtraces for the zygote.
+            if (IsZygote(pid)) {
+                VLOG("Skipping Zygote");
+                continue;
+            }
+
+            is_java_process = true;
+        } else if (should_dump_native_traces(exe.c_str())) {
+            if (mType != "native") continue;
+            is_java_process = false;
+        } else if (hal_pids.find(pid) != hal_pids.end()) {
+            if (mType != "hal") continue;
+            is_java_process = false;
+        } else {
+            // Probably a native process we don't care about, continue.
+            VLOG("Skipping %d", pid);
+            continue;
+        }
+
+        Fpipe dumpPipe;
+        if (!dumpPipe.init()) {
+            err = -errno;
+            ALOGW("TombstoneSection '%s' failed to setup dump pipe", this->name.string());
+            break;
+        }
+
+        const uint64_t start = Nanotime();
+        pid_t child = fork();
+        if (child < 0) {
+            // Report the failure instead of returning a silently truncated section.
+            err = -errno;
+            ALOGE("Failed to fork child process");
+            break;
+        } else if (child == 0) {
+            // This is the child process.
+            close(dumpPipe.readFd());
+            const int ret = dump_backtrace_to_file_timeout(
+                    pid, is_java_process ? kDebuggerdJavaBacktrace : kDebuggerdNativeBacktrace,
+                    is_java_process ? 5 : 20, dumpPipe.writeFd());
+            if (ret == -1) {
+                if (errno == 0) {
+                    ALOGW("Dumping failed for pid '%d', likely due to a timeout\n", pid);
+                } else {
+                    ALOGE("Dumping failed for pid '%d': %s\n", pid, strerror(errno));
+                }
+            }
+            if (close(dumpPipe.writeFd()) != 0) {
+                ALOGW("TombstoneSection '%s' failed to close dump pipe writeFd: %d",
+                      this->name.string(), errno);
+                _exit(EXIT_FAILURE);
+            }
+
+            _exit(EXIT_SUCCESS);
+        }
+        close(dumpPipe.writeFd());
+        // Parent process.
+        // Read from the pipe concurrently to avoid blocking the child.
+        FdBuffer buffer;
+        err = buffer.readFully(dumpPipe.readFd());
+        // Reap the child on every path so it does not linger as a zombie process.
+        const status_t childStatus = wait_child(child);
+        if (err != NO_ERROR) {
+            ALOGW("TombstoneSection '%s' failed to read stack dump: %d", this->name.string(), err);
+            if (close(dumpPipe.readFd()) != 0) {
+                ALOGW("TombstoneSection '%s' failed to close dump pipe readFd: %s",
+                      this->name.string(), strerror(errno));
+            }
+            break;
+        }
+        if (childStatus != NO_ERROR) {
+            ALOGW("TombstoneSection '%s' dump child for pid %d reported status %d",
+                  this->name.string(), pid, childStatus);
+        }
+
+        auto dump = std::make_unique<char[]>(buffer.size());
+        auto iterator = buffer.data();
+        int i = 0;
+        while (iterator.hasNext()) {
+            dump[i] = iterator.next();
+            i++;
+        }
+        long long token = proto.start(android::os::BackTraceProto::TRACES);
+        proto.write(android::os::BackTraceProto::Stack::PID, pid);
+        proto.write(android::os::BackTraceProto::Stack::DUMP, dump.get(), i);
+        proto.write(android::os::BackTraceProto::Stack::DUMP_DURATION_NS,
+                    static_cast<long long>(Nanotime() - start));
+        proto.end(token);
+
+        if (close(dumpPipe.readFd()) != 0) {
+            err = -errno;
+            ALOGW("TombstoneSection '%s' failed to close dump pipe readFd: %d", this->name.string(),
+                  -err);
+            break;
+        }
+    }
+
+    proto.flush(pipeWriteFd);
+    return err;
+}
diff --git a/cmds/incidentd/src/Section.h b/cmds/incidentd/src/Section.h
index 8294be1..19ef7ee 100644
--- a/cmds/incidentd/src/Section.h
+++ b/cmds/incidentd/src/Section.h
@@ -103,7 +103,7 @@
  */
 class WorkerThreadSection : public Section {
 public:
-    WorkerThreadSection(int id);
+    WorkerThreadSection(int id, const int64_t timeoutMs = REMOTE_CALL_TIMEOUT_MS);
     virtual ~WorkerThreadSection();
 
     virtual status_t Execute(ReportRequestSet* requests) const;
@@ -161,4 +161,18 @@
     bool mBinary;
 };
 
+/**
+ * Section that gets data from tombstoned.
+ */
+class TombstoneSection : public WorkerThreadSection {
+public:
+    TombstoneSection(int id, const char* type, const int64_t timeoutMs = 30000 /* 30 seconds */);
+    virtual ~TombstoneSection();
+
+    virtual status_t BlockingCall(int pipeWriteFd) const;
+
+private:
+    std::string mType;
+};
+
 #endif  // SECTIONS_H
diff --git a/cmds/incidentd/src/incidentd_util.cpp b/cmds/incidentd/src/incidentd_util.cpp
index c095f2b..c869c7a 100644
--- a/cmds/incidentd/src/incidentd_util.cpp
+++ b/cmds/incidentd/src/incidentd_util.cpp
@@ -80,6 +80,7 @@
     close(output->writeFd());
     return pid;
 }
+
 // ================================================================================
 const char** varargs(const char* first, va_list rest) {
     va_list copied_rest;
@@ -101,3 +102,14 @@
     ret[numOfArgs] = NULL;
     return ret;
 }
+
+// ================================================================================
+// Number of nanoseconds in one second.
+const uint64_t NANOS_PER_SEC = 1000000000;
+
+uint64_t Nanotime() {
+    timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    const uint64_t wholeSecondsNs = now.tv_sec * NANOS_PER_SEC;
+    return static_cast<uint64_t>(wholeSecondsNs + now.tv_nsec);
+}
diff --git a/cmds/incidentd/src/incidentd_util.h b/cmds/incidentd/src/incidentd_util.h
index db7ec82..3f7df91 100644
--- a/cmds/incidentd/src/incidentd_util.h
+++ b/cmds/incidentd/src/incidentd_util.h
@@ -60,4 +60,9 @@
  */
 const char** varargs(const char* first, va_list rest);
 
-#endif  // INCIDENTD_UTIL_H
\ No newline at end of file
+/**
+ * Returns the current monotonic clock time in nanoseconds.
+ */
+uint64_t Nanotime();
+
+#endif  // INCIDENTD_UTIL_H
diff --git a/core/proto/android/os/backtrace.proto b/core/proto/android/os/backtrace.proto
new file mode 100644
index 0000000..ba81386
--- /dev/null
+++ b/core/proto/android/os/backtrace.proto
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto2";
+package android.os;
+
+option java_multiple_files = true;
+
+import "frameworks/base/libs/incident/proto/android/privacy.proto";
+
+message BackTraceProto {
+    option (android.msg_privacy).dest = DEST_AUTOMATIC;
+
+    message Stack {
+        option (android.msg_privacy).dest = DEST_AUTOMATIC;
+
+        optional int32 pid = 1;
+        optional string dump = 2;
+        // Time it took to dump the stacktrace.
+        optional int64 dump_duration_ns = 3;
+    }
+    repeated Stack traces = 1;
+}
diff --git a/core/proto/android/os/incident.proto b/core/proto/android/os/incident.proto
index 7326829..bb9568b 100644
--- a/core/proto/android/os/incident.proto
+++ b/core/proto/android/os/incident.proto
@@ -17,6 +17,7 @@
 syntax = "proto2";
 option java_multiple_files = true;
 
+import "frameworks/base/core/proto/android/os/backtrace.proto";
 import "frameworks/base/core/proto/android/os/batterytype.proto";
 import "frameworks/base/core/proto/android/os/cpufreq.proto";
 import "frameworks/base/core/proto/android/os/cpuinfo.proto";
@@ -115,6 +116,22 @@
         (section).args = "LOG_ID_KERNEL"
     ];
 
+    // Stack dumps
+    optional android.os.BackTraceProto native_traces = 1200 [
+        (section).type = SECTION_TOMBSTONE,
+        (section).args = "native"
+    ];
+
+    optional android.os.BackTraceProto hal_traces = 1201 [
+        (section).type = SECTION_TOMBSTONE,
+        (section).args = "hal"
+    ];
+
+    optional android.os.BackTraceProto java_traces = 1202 [
+        (section).type = SECTION_TOMBSTONE,
+        (section).args = "java"
+    ];
+
     // Linux services
     optional ProcrankProto procrank = 2000 [
         (section).type = SECTION_NONE, // disable procrank until figure out permission
diff --git a/libs/incident/proto/android/section.proto b/libs/incident/proto/android/section.proto
index ef6a8ff..b3ed393 100644
--- a/libs/incident/proto/android/section.proto
+++ b/libs/incident/proto/android/section.proto
@@ -43,6 +43,9 @@
 
     // incidentd read file and gzip the data in bytes field
     SECTION_GZIP = 5;
+
+    // incidentd calls tombstoned for annotated field
+    SECTION_TOMBSTONE = 6;
 }
 
 message SectionFlags {
diff --git a/tools/incident_section_gen/main.cpp b/tools/incident_section_gen/main.cpp
index 9183918..36c3b1f 100644
--- a/tools/incident_section_gen/main.cpp
+++ b/tools/incident_section_gen/main.cpp
@@ -415,6 +415,10 @@
                 printf("    new GZipSection(%d,", field->number());
                 splitAndPrint(s.args());
                 printf(" NULL),\n");
+                break;
+            case SECTION_TOMBSTONE:
+                // (section).args is passed through as the TombstoneSection type string.
+                printf("    new TombstoneSection(%d, \"%s\"),\n", field->number(), s.args().c_str());
                 break;
         }
     }