Refine incidentd error handling and logging

Added or changed some error messages and logging for easier debugging.
Log a few section errors to incident metadata in addition to logcat.
Also let incident reporting continue instead of failing when non-fatal
error occurs.

Bug: 119417232
Test: Take an incident report, and logcat.
Change-Id: Id18b7b690100923a8074d99b2312ee80c0bcd760
diff --git a/cmds/incidentd/Android.bp b/cmds/incidentd/Android.bp
index 1e970f4..40da583 100644
--- a/cmds/incidentd/Android.bp
+++ b/cmds/incidentd/Android.bp
@@ -53,6 +53,7 @@
         "libprotoutil",
         "libservices",
         "libutils",
+        "libprotobuf-cpp-lite",
     ],
 
     init_rc: ["incidentd.rc"],
diff --git a/cmds/incidentd/src/FdBuffer.cpp b/cmds/incidentd/src/FdBuffer.cpp
index a8ef831..74cda1d 100644
--- a/cmds/incidentd/src/FdBuffer.cpp
+++ b/cmds/incidentd/src/FdBuffer.cpp
@@ -47,9 +47,13 @@
     while (true) {
         if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) {
             mTruncated = true;
+            VLOG("Truncating data");
             break;
         }
-        if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
+        if (mBuffer.writeBuffer() == NULL) {
+            VLOG("No memory");
+            return NO_MEMORY;
+        }
 
         int64_t remainingTime = (mStartTime + timeout) - uptimeMillis();
         if (remainingTime <= 0) {
@@ -58,7 +62,7 @@
             break;
         }
 
-        int count = poll(&pfds, 1, remainingTime);
+        int count = TEMP_FAILURE_RETRY(poll(&pfds, 1, remainingTime));
         if (count == 0) {
             VLOG("timed out due to block calling poll");
             mTimedOut = true;
@@ -102,7 +106,10 @@
             VLOG("Truncating data");
             break;
         }
-        if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
+        if (mBuffer.writeBuffer() == NULL) {
+            VLOG("No memory");
+            return NO_MEMORY;
+        }
 
         ssize_t amt =
                 TEMP_FAILURE_RETRY(::read(fd, mBuffer.writeBuffer(), mBuffer.currentToWrite()));
@@ -144,10 +151,14 @@
     // This is the buffer used to store processed data
     while (true) {
         if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) {
+            VLOG("Truncating data");
             mTruncated = true;
             break;
         }
-        if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
+        if (mBuffer.writeBuffer() == NULL) {
+            VLOG("No memory");
+            return NO_MEMORY;
+        }
 
         int64_t remainingTime = (mStartTime + timeoutMs) - uptimeMillis();
         if (remainingTime <= 0) {
@@ -157,7 +168,7 @@
         }
 
         // wait for any pfds to be ready to perform IO
-        int count = poll(pfds, 3, remainingTime);
+        int count = TEMP_FAILURE_RETRY(poll(pfds, 3, remainingTime));
         if (count == 0) {
             VLOG("timed out due to block calling poll");
             mTimedOut = true;
diff --git a/cmds/incidentd/src/IncidentService.cpp b/cmds/incidentd/src/IncidentService.cpp
index b23c1ed..3d892ad 100644
--- a/cmds/incidentd/src/IncidentService.cpp
+++ b/cmds/incidentd/src/IncidentService.cpp
@@ -42,13 +42,11 @@
 #define DEFAULT_BYTES_SIZE_LIMIT (20 * 1024 * 1024)        // 20MB
 #define DEFAULT_REFACTORY_PERIOD_MS (24 * 60 * 60 * 1000)  // 1 Day
 
-// Skip logs (1100 - 1108) because they are already in the bug report
-// Skip 1200, 1201, 1202, 3018 because they take too long
-// TODO(120079956): Skip 3008, 3015 because of error
+// Skip these sections for dumpstate only. Dumpstate allows 10s max for each service to dump.
+// Skip logs (1100 - 1108) and traces (1200 - 1202) because they are already in the bug report.
+// Skip 3018 because it takes too long.
 #define SKIPPED_SECTIONS { 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, /* Logs */ \
                            1200, 1201, 1202, /* Native, hal, java traces */ \
-                           3008, /* "package --proto" */ \
-                           3015, /* "activity --proto processes" */ \
                            3018  /* "meminfo -a --proto" */ }
 
 namespace android {
diff --git a/cmds/incidentd/src/PrivacyBuffer.cpp b/cmds/incidentd/src/PrivacyBuffer.cpp
index 7a8ebe3..6967e33 100644
--- a/cmds/incidentd/src/PrivacyBuffer.cpp
+++ b/cmds/incidentd/src/PrivacyBuffer.cpp
@@ -96,7 +96,12 @@
     uint64_t token = mProto.start(encode_field_id(policy));
     while (mData.rp()->pos() - start != msgSize) {
         status_t err = stripField(policy, spec, depth + 1);
-        if (err != NO_ERROR) return err;
+        if (err != NO_ERROR) {
+            VLOG("Bad value when stripping id %d, wiretype %d, tag %#x, depth %d, size %d, "
+                "relative pos %zu, ", fieldId, read_wire_type(fieldTag), fieldTag, depth,
+                msgSize, mData.rp()->pos() - start);
+            return err;
+        }
     }
     mProto.end(token);
     return NO_ERROR;
@@ -117,9 +122,13 @@
     }
     while (mData.hasNext()) {
         status_t err = stripField(mPolicy, spec, 0);
-        if (err != NO_ERROR) return err;
+        if (err != NO_ERROR) return err; // Error logged in stripField.
     }
-    if (mData.bytesRead() != mData.size()) return BAD_VALUE;
+    if (mData.bytesRead() != mData.size()) {
+        VLOG("Buffer corrupted: expect %zu bytes, read %zu bytes",
+            mData.size(), mData.bytesRead());
+        return BAD_VALUE;
+    }
     mSize = mProto.size();
     mData.rp()->rewind();  // rewind the read pointer back to beginning after the strip.
     return NO_ERROR;
diff --git a/cmds/incidentd/src/Reporter.cpp b/cmds/incidentd/src/Reporter.cpp
index b3bab0c..8f62da2 100644
--- a/cmds/incidentd/src/Reporter.cpp
+++ b/cmds/incidentd/src/Reporter.cpp
@@ -129,6 +129,7 @@
     bool needMainFd = false;
     int mainFd = -1;
     int mainDest = -1;
+    int sectionCount = 0;
     HeaderSection headers;
     MetadataSection metadataSection;
     std::string buildType = android::base::GetProperty("ro.build.type", "");
@@ -180,12 +181,12 @@
     for (const Section** section = SECTION_LIST; *section; section++) {
         const int id = (*section)->id;
         if ((*section)->userdebugAndEngOnly && !isUserdebugOrEng) {
-            ALOGD("Skipping incident report section %d '%s' because it's limited to userdebug/eng",
+            VLOG("Skipping incident report section %d '%s' because it's limited to userdebug/eng",
                   id, (*section)->name.string());
             continue;
         }
         if (this->batch.containsSection(id)) {
-            ALOGD("Taking incident report section %d '%s'", id, (*section)->name.string());
+            VLOG("Taking incident report section %d '%s'", id, (*section)->name.string());
             for (ReportRequestSet::iterator it = batch.begin(); it != batch.end(); it++) {
                 if ((*it)->listener != NULL && (*it)->args.containsSection(id)) {
                     (*it)->listener->onReportSectionStatus(
@@ -198,11 +199,12 @@
             int64_t startTime = uptimeMillis();
             err = (*section)->Execute(&batch);
             int64_t endTime = uptimeMillis();
-            stats->set_success(err == NO_ERROR);
             stats->set_exec_duration_ms(endTime - startTime);
             if (err != NO_ERROR) {
                 ALOGW("Incident section %s (%d) failed: %s. Stopping report.",
                       (*section)->name.string(), id, strerror(-err));
+                // Execute() has already recorded this status. Only update if there's new failure.
+                stats->set_success(false);
                 goto DONE;
             }
             (*reportByteSize) += stats->report_size_bytes();
@@ -214,11 +216,13 @@
                             id, IIncidentReportStatusListener::STATUS_FINISHED);
                 }
             }
-            ALOGD("Finish incident report section %d '%s'", id, (*section)->name.string());
+            VLOG("Finish incident report section %d '%s'", id, (*section)->name.string());
+            sectionCount++;
         }
     }
 
 DONE:
+    ALOGD("Incident reporting took %d sections.", sectionCount);
     // Reports the metdadata when taking the incident report.
     if (!isTest) metadataSection.Execute(&batch);
 
diff --git a/cmds/incidentd/src/Section.cpp b/cmds/incidentd/src/Section.cpp
index cd48af9..10d2268 100644
--- a/cmds/incidentd/src/Section.cpp
+++ b/cmds/incidentd/src/Section.cpp
@@ -75,6 +75,7 @@
     stats->set_dump_duration_ms(buffer.durationMs());
     stats->set_timed_out(buffer.timedOut());
     stats->set_is_truncated(buffer.truncated());
+    stats->set_success(!buffer.timedOut() && !buffer.truncated());
 }
 
 // Reads data from FdBuffer and writes it to the requests file descriptor.
@@ -83,7 +84,8 @@
     status_t err = -EBADF;
     EncodedBuffer::iterator data = buffer.data();
     PrivacyBuffer privacyBuffer(get_privacy_of_section(id), data);
-    int writeable = 0;
+    IncidentMetadata::SectionStats* stats = requests->sectionStats(id);
+    int nPassed = 0;
 
     // The streaming ones, group requests by spec in order to save unnecessary strip operations
     map<PrivacySpec, vector<sp<ReportRequest>>> requestsBySpec;
@@ -99,7 +101,18 @@
     for (auto mit = requestsBySpec.begin(); mit != requestsBySpec.end(); mit++) {
         PrivacySpec spec = mit->first;
         err = privacyBuffer.strip(spec);
-        if (err != NO_ERROR) return err;  // it means the privacyBuffer data is corrupted.
+        if (err != NO_ERROR) {
+            // Privacy Buffer is corrupted, probably due to an ill-formatted proto. This is a
+            // non-fatal error. The whole report is still valid. So just log the failure.
+            ALOGW("Failed to strip section %d with spec %d: %s",
+                id, spec.dest, strerror(-err));
+            stats->set_success(false);
+            stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably "
+                "due to an ill-formatted proto");
+            nPassed++;
+            continue;
+        }
+
         if (privacyBuffer.size() == 0) continue;
 
         for (auto it = mit->second.begin(); it != mit->second.end(); it++) {
@@ -114,7 +127,7 @@
                 request->err = err;
                 continue;
             }
-            writeable++;
+            nPassed++;
             VLOG("Section %d flushed %zu bytes to fd %d with spec %d", id, privacyBuffer.size(),
                  request->fd, spec.dest);
         }
@@ -125,7 +138,15 @@
     if (requests->mainFd() >= 0) {
         PrivacySpec spec = PrivacySpec::new_spec(requests->mainDest());
         err = privacyBuffer.strip(spec);
-        if (err != NO_ERROR) return err;  // the buffer data is corrupted.
+        if (err != NO_ERROR) {
+            ALOGW("Failed to strip section %d with spec %d for dropbox: %s",
+                id, spec.dest, strerror(-err));
+            stats->set_success(false);
+            stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably "
+                "due to an ill-formatted proto");
+            nPassed++;
+            goto DONE;
+        }
         if (privacyBuffer.size() == 0) goto DONE;
 
         err = write_section_header(requests->mainFd(), id, privacyBuffer.size());
@@ -138,7 +159,7 @@
             requests->setMainFd(-1);
             goto DONE;
         }
-        writeable++;
+        nPassed++;
         VLOG("Section %d flushed %zu bytes to dropbox %d with spec %d", id, privacyBuffer.size(),
              requests->mainFd(), spec.dest);
         // Reports bytes of the section uploaded via dropbox after filtering.
@@ -147,7 +168,7 @@
 
 DONE:
     // only returns error if there is no fd to write to.
-    return writeable > 0 ? NO_ERROR : err;
+    return nPassed > 0 ? NO_ERROR : err;
 }
 
 // ================================================================================
@@ -213,6 +234,8 @@
                     stats.timed_out());
         proto.write(FIELD_TYPE_BOOL | IncidentMetadata::SectionStats::kIsTruncatedFieldNumber,
                     stats.is_truncated());
+        proto.write(FIELD_TYPE_STRING | IncidentMetadata::SectionStats::kErrorMsgFieldNumber,
+                    stats.error_msg());
         proto.end(token);
     }
 
@@ -439,8 +462,9 @@
     status_t err = NO_ERROR;
     pthread_t thread;
     pthread_attr_t attr;
-    bool timedOut = false;
+    bool workerDone = false;
     FdBuffer buffer;
+    IncidentMetadata::SectionStats* stats = requests->sectionStats(this->id);
 
     // Data shared between this thread and the worker thread.
     sp<WorkerThreadData> data = new WorkerThreadData(this);
@@ -475,8 +499,7 @@
     // Loop reading until either the timeout or the worker side is done (i.e. eof).
     err = buffer.read(data->pipe.readFd().get(), this->timeoutMs);
     if (err != NO_ERROR) {
-        // TODO: Log this error into the incident report.
-        ALOGW("[%s] reader failed with error '%s'", this->name.string(), strerror(-err));
+        ALOGE("[%s] reader failed with error '%s'", this->name.string(), strerror(-err));
     }
 
     // Done with the read fd. The worker thread closes the write one so
@@ -484,31 +507,32 @@
     data->pipe.readFd().reset();
 
     // If the worker side is finished, then return its error (which may overwrite
-    // our possible error -- but it's more interesting anyway).  If not, then we timed out.
+    // our possible error -- but it's more interesting anyway). If not, then we timed out.
     {
         unique_lock<mutex> lock(data->lock);
-        if (!data->workerDone) {
-            // We timed out
-            timedOut = true;
-        } else {
-            if (data->workerError != NO_ERROR) {
-                err = data->workerError;
-                // TODO: Log this error into the incident report.
-                ALOGW("[%s] worker failed with error '%s'", this->name.string(), strerror(-err));
-            }
+        if (data->workerError != NO_ERROR) {
+            err = data->workerError;
+            ALOGE("[%s] worker failed with error '%s'", this->name.string(), strerror(-err));
         }
+        workerDone = data->workerDone;
     }
 
-    write_section_stats(requests->sectionStats(this->id), buffer);
-    if (timedOut || buffer.timedOut()) {
-        ALOGW("[%s] timed out", this->name.string());
+    write_section_stats(stats, buffer);
+    if (err != NO_ERROR) {
+        char errMsg[128];
+        snprintf(errMsg, 128, "[%s] failed with error '%s'",
+            this->name.string(), strerror(-err));
+        stats->set_success(false);
+        stats->set_error_msg(errMsg);
         return NO_ERROR;
     }
-
-    // TODO: There was an error with the command or buffering. Report that.  For now
-    // just exit with a log messasge.
-    if (err != NO_ERROR) {
-        ALOGW("[%s] failed with error '%s'", this->name.string(), strerror(-err));
+    if (buffer.truncated()) {
+        ALOGW("[%s] too large, truncating", this->name.string());
+        // Do not write a truncated section. It won't pass through the PrivacyBuffer.
+        return NO_ERROR;
+    }
+    if (!workerDone || buffer.timedOut()) {
+        ALOGW("[%s] timed out", this->name.string());
         return NO_ERROR;
     }
 
@@ -617,14 +641,8 @@
     sp<IBinder> service = defaultServiceManager()->checkService(mService);
 
     if (service == NULL) {
-        // Returning an error interrupts the entire incident report, so just
-        // log the failure.
-        // TODO: have a meta record inside the report that would log this
-        // failure inside the report, because the fact that we can't find
-        // the service is good data in and of itself. This is running in
-        // another thread so lock that carefully...
         ALOGW("DumpsysSection: Can't lookup service: %s", String8(mService).string());
-        return NO_ERROR;
+        return NAME_NOT_FOUND;
     }
 
     service->dump(pipeWriteFd, mArgs);
diff --git a/libs/incident/proto/android/os/metadata.proto b/libs/incident/proto/android/os/metadata.proto
index f8f4e36..3b0e9c9 100644
--- a/libs/incident/proto/android/os/metadata.proto
+++ b/libs/incident/proto/android/os/metadata.proto
@@ -61,8 +61,10 @@
         optional bool timed_out = 7;
         // true if the section is truncated.
         optional bool is_truncated = 8;
+        // message for debugging if there is an error.
+        optional string error_msg = 9;
 
-        // Next Tag: 9
+        // Next Tag: 10;
     }
     repeated SectionStats sections = 6;