Add more accurate logging for stats log loss.

+ Only record eventual failure after the retries to get a accurate log loss count.
+ Record all types of failures which lead to log loss
+ Change the timestamp from elapsedRealtime to wallclock time for easier debugging server side
+ Also log the count too.

Bug: 80538532
Test: manually tested

Change-Id: I4fcccae3fa39c9e280a842e27c6432bb0a090b85
diff --git a/cmds/statsd/src/guardrail/StatsdStats.cpp b/cmds/statsd/src/guardrail/StatsdStats.cpp
index c37d0cf..33f3917 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.cpp
+++ b/cmds/statsd/src/guardrail/StatsdStats.cpp
@@ -50,7 +50,7 @@
 // const int FIELD_ID_PULLED_ATOM_STATS = 10; // The proto is written in stats_log_util.cpp
 const int FIELD_ID_LOGGER_ERROR_STATS = 11;
 const int FIELD_ID_PERIODIC_ALARM_STATS = 12;
-const int FIELD_ID_LOG_LOSS_STATS = 14;
+// const int FIELD_ID_LOG_LOSS_STATS = 14;
 const int FIELD_ID_SYSTEM_SERVER_RESTART = 15;
 
 const int FIELD_ID_ATOM_STATS_TAG = 1;
@@ -180,12 +180,12 @@
     noteConfigResetInternalLocked(key);
 }
 
-void StatsdStats::noteLogLost(int64_t timestampNs, int32_t count) {
+void StatsdStats::noteLogLost(int32_t wallClockTimeSec, int32_t count) {
     lock_guard<std::mutex> lock(mLock);
-    if (mLogLossTimestampNs.size() == kMaxLoggerErrors) {
-        mLogLossTimestampNs.pop_front();
+    if (mLogLossStats.size() == kMaxLoggerErrors) {
+        mLogLossStats.pop_front();
     }
-    mLogLossTimestampNs.push_back(std::make_pair(timestampNs, count));
+    mLogLossStats.push_back(std::make_pair(wallClockTimeSec, count));
 }
 
 void StatsdStats::noteBroadcastSent(const ConfigKey& key) {
@@ -365,15 +365,6 @@
     mSystemServerRestartSec.push_back(timeSec);
 }
 
-void StatsdStats::noteLoggerError(int error) {
-    lock_guard<std::mutex> lock(mLock);
-    // grows strictly one at a time. so it won't > kMaxLoggerErrors
-    if (mLoggerErrors.size() == kMaxLoggerErrors) {
-        mLoggerErrors.pop_front();
-    }
-    mLoggerErrors.push_back(std::make_pair(getWallClockSec(), error));
-}
-
 void StatsdStats::reset() {
     lock_guard<std::mutex> lock(mLock);
     resetInternalLocked();
@@ -386,9 +377,8 @@
     std::fill(mPushedAtomStats.begin(), mPushedAtomStats.end(), 0);
     mAnomalyAlarmRegisteredStats = 0;
     mPeriodicAlarmRegisteredStats = 0;
-    mLoggerErrors.clear();
     mSystemServerRestartSec.clear();
-    mLogLossTimestampNs.clear();
+    mLogLossStats.clear();
     for (auto& config : mConfigStats) {
         config.second->broadcast_sent_time_sec.clear();
         config.second->data_drop_time_sec.clear();
@@ -515,21 +505,13 @@
             mUidMapStats.bytes_used, mUidMapStats.changes, mUidMapStats.deleted_apps,
             mUidMapStats.dropped_changes);
 
-    for (const auto& error : mLoggerErrors) {
-        time_t error_time = error.first;
-        struct tm* error_tm = localtime(&error_time);
-        char buffer[80];
-        strftime(buffer, sizeof(buffer), "%Y-%m-%d %I:%M%p\n", error_tm);
-        fprintf(out, "Logger error %d at %s\n", error.second, buffer);
-    }
-
     for (const auto& restart : mSystemServerRestartSec) {
         fprintf(out, "System server restarts at %s(%lld)\n",
             buildTimeString(restart).c_str(), (long long)restart);
     }
 
-    for (const auto& loss : mLogLossTimestampNs) {
-        fprintf(out, "Log loss: %lld (elapsedRealtimeNs) - %d (count)\n", (long long)loss.first,
+    for (const auto& loss : mLogLossStats) {
+        fprintf(out, "Log loss: %lld (wall clock sec) - %d (count)\n", (long long)loss.first,
                 loss.second);
     }
 }
@@ -678,7 +660,10 @@
     proto.write(FIELD_TYPE_INT32 | FIELD_ID_UID_MAP_DELETED_APPS, mUidMapStats.deleted_apps);
     proto.end(uidMapToken);
 
-    for (const auto& error : mLoggerErrors) {
+    for (const auto& error : mLogLossStats) {
+        // The logger error stats are not used anymore since we move away from logd.
+        // Temporarily use this field to log the log loss timestamp and count
+        // TODO(b/80538532) Add a dedicated field in stats_log for this.
         uint64_t token = proto.start(FIELD_TYPE_MESSAGE | FIELD_ID_LOGGER_ERROR_STATS |
                                       FIELD_COUNT_REPEATED);
         proto.write(FIELD_TYPE_INT32 | FIELD_ID_LOGGER_STATS_TIME, error.first);
@@ -686,11 +671,6 @@
         proto.end(token);
     }
 
-    for (const auto& loss : mLogLossTimestampNs) {
-        proto.write(FIELD_TYPE_INT64 | FIELD_ID_LOG_LOSS_STATS | FIELD_COUNT_REPEATED,
-                    (long long)loss.first);
-    }
-
     for (const auto& restart : mSystemServerRestartSec) {
         proto.write(FIELD_TYPE_INT32 | FIELD_ID_SYSTEM_SERVER_RESTART | FIELD_COUNT_REPEATED,
                     restart);
diff --git a/cmds/statsd/src/guardrail/StatsdStats.h b/cmds/statsd/src/guardrail/StatsdStats.h
index daea027..b5156da 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.h
+++ b/cmds/statsd/src/guardrail/StatsdStats.h
@@ -271,11 +271,6 @@
     // Notify pull request for an atom served from cached data
     void notePullFromCache(int pullAtomId);
 
-    /**
-     * Records statsd met an error while reading from logd.
-     */
-    void noteLoggerError(int error);
-
     /*
     * Records when system server restarts.
     */
@@ -284,7 +279,7 @@
     /**
      * Records statsd skipped an event.
      */
-    void noteLogLost(int64_t timestamp, int32_t count);
+    void noteLogLost(int32_t wallClockTimeSec, int32_t count);
 
     /**
      * Reset the historical stats. Including all stats in icebox, and the tracked stats about
@@ -338,11 +333,8 @@
     // Maps PullAtomId to its stats. The size is capped by the puller atom counts.
     std::map<int, PulledAtomStats> mPulledAtomStats;
 
-    // Logd errors. Size capped by kMaxLoggerErrors.
-    std::list<const std::pair<int, int>> mLoggerErrors;
-
     // Timestamps when we detect log loss, and the number of logs lost.
-    std::list<std::pair<int64_t, int32_t>> mLogLossTimestampNs;
+    std::list<std::pair<int32_t, int32_t>> mLogLossStats;
 
     std::list<int32_t> mSystemServerRestartSec;
 
diff --git a/cmds/statsd/src/socket/StatsSocketListener.cpp b/cmds/statsd/src/socket/StatsSocketListener.cpp
index 4041da7..9b0691b 100755
--- a/cmds/statsd/src/socket/StatsSocketListener.cpp
+++ b/cmds/statsd/src/socket/StatsSocketListener.cpp
@@ -111,7 +111,8 @@
         android_log_event_int_t* int_event = reinterpret_cast<android_log_event_int_t*>(ptr);
         if (int_event->header.tag == kLibLogTag && int_event->payload.type == EVENT_TYPE_INT) {
             ALOGE("Found dropped events: %d", int_event->payload.data);
-            StatsdStats::getInstance().noteLogLost(getElapsedRealtimeNs(), int_event->payload.data);
+            StatsdStats::getInstance().noteLogLost((int32_t)getWallClockSec(),
+                                                   int_event->payload.data);
             return true;
         }
     }
diff --git a/tools/stats_log_api_gen/main.cpp b/tools/stats_log_api_gen/main.cpp
index e519909..8038a3a 100644
--- a/tools/stats_log_api_gen/main.cpp
+++ b/tools/stats_log_api_gen/main.cpp
@@ -349,8 +349,7 @@
            argIndex++;
        }
        fprintf(out, ");\n");
-       fprintf(out, "      if (ret >= 0) { return retry; }\n");
-
+       fprintf(out, "      if (ret >= 0) { break; }\n");
 
        fprintf(out, "      {\n");
        fprintf(out, "          std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -360,6 +359,9 @@
        fprintf(out, "      }\n");
        fprintf(out, "      std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
        fprintf(out, "  }\n");
+       fprintf(out, "  if (ret < 0) {\n");
+       fprintf(out, "      note_log_drop();\n");
+       fprintf(out, "  }\n");
        fprintf(out, "  return ret;\n");
        fprintf(out, "}\n");
        fprintf(out, "\n");
@@ -439,7 +441,7 @@
            argIndex++;
        }
        fprintf(out, ");\n");
-       fprintf(out, "      if (ret >= 0) { return retry; }\n");
+       fprintf(out, "      if (ret >= 0) { break; }\n");
 
        fprintf(out, "      {\n");
        fprintf(out, "          std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -450,7 +452,10 @@
 
        fprintf(out, "      std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
        fprintf(out, "  }\n");
-       fprintf(out, "  return ret;\n");
+       fprintf(out, "  if (ret < 0) {\n");
+       fprintf(out, "      note_log_drop();\n");
+       fprintf(out, "  }\n");
+       fprintf(out, "  return ret;\n\n");
        fprintf(out, "}\n");
 
        fprintf(out, "\n");