Add more accurate logging for stats log loss.
+ Only record eventual failure after the retries to get a accurate log loss count.
+ Record all types of failures which lead to log loss
+ Change the timestamp from elapsedRealtime to wallclock time for easier debugging server side
+ Also log the count too.
Bug: 80538532
Test: manually tested
Change-Id: I4fcccae3fa39c9e280a842e27c6432bb0a090b85
diff --git a/cmds/statsd/src/guardrail/StatsdStats.cpp b/cmds/statsd/src/guardrail/StatsdStats.cpp
index c37d0cf..33f3917 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.cpp
+++ b/cmds/statsd/src/guardrail/StatsdStats.cpp
@@ -50,7 +50,7 @@
// const int FIELD_ID_PULLED_ATOM_STATS = 10; // The proto is written in stats_log_util.cpp
const int FIELD_ID_LOGGER_ERROR_STATS = 11;
const int FIELD_ID_PERIODIC_ALARM_STATS = 12;
-const int FIELD_ID_LOG_LOSS_STATS = 14;
+// const int FIELD_ID_LOG_LOSS_STATS = 14;
const int FIELD_ID_SYSTEM_SERVER_RESTART = 15;
const int FIELD_ID_ATOM_STATS_TAG = 1;
@@ -180,12 +180,12 @@
noteConfigResetInternalLocked(key);
}
-void StatsdStats::noteLogLost(int64_t timestampNs, int32_t count) {
+void StatsdStats::noteLogLost(int32_t wallClockTimeSec, int32_t count) {
lock_guard<std::mutex> lock(mLock);
- if (mLogLossTimestampNs.size() == kMaxLoggerErrors) {
- mLogLossTimestampNs.pop_front();
+ if (mLogLossStats.size() == kMaxLoggerErrors) {
+ mLogLossStats.pop_front();
}
- mLogLossTimestampNs.push_back(std::make_pair(timestampNs, count));
+ mLogLossStats.push_back(std::make_pair(wallClockTimeSec, count));
}
void StatsdStats::noteBroadcastSent(const ConfigKey& key) {
@@ -365,15 +365,6 @@
mSystemServerRestartSec.push_back(timeSec);
}
-void StatsdStats::noteLoggerError(int error) {
- lock_guard<std::mutex> lock(mLock);
- // grows strictly one at a time. so it won't > kMaxLoggerErrors
- if (mLoggerErrors.size() == kMaxLoggerErrors) {
- mLoggerErrors.pop_front();
- }
- mLoggerErrors.push_back(std::make_pair(getWallClockSec(), error));
-}
-
void StatsdStats::reset() {
lock_guard<std::mutex> lock(mLock);
resetInternalLocked();
@@ -386,9 +377,8 @@
std::fill(mPushedAtomStats.begin(), mPushedAtomStats.end(), 0);
mAnomalyAlarmRegisteredStats = 0;
mPeriodicAlarmRegisteredStats = 0;
- mLoggerErrors.clear();
mSystemServerRestartSec.clear();
- mLogLossTimestampNs.clear();
+ mLogLossStats.clear();
for (auto& config : mConfigStats) {
config.second->broadcast_sent_time_sec.clear();
config.second->data_drop_time_sec.clear();
@@ -515,21 +505,13 @@
mUidMapStats.bytes_used, mUidMapStats.changes, mUidMapStats.deleted_apps,
mUidMapStats.dropped_changes);
- for (const auto& error : mLoggerErrors) {
- time_t error_time = error.first;
- struct tm* error_tm = localtime(&error_time);
- char buffer[80];
- strftime(buffer, sizeof(buffer), "%Y-%m-%d %I:%M%p\n", error_tm);
- fprintf(out, "Logger error %d at %s\n", error.second, buffer);
- }
-
for (const auto& restart : mSystemServerRestartSec) {
fprintf(out, "System server restarts at %s(%lld)\n",
buildTimeString(restart).c_str(), (long long)restart);
}
- for (const auto& loss : mLogLossTimestampNs) {
- fprintf(out, "Log loss: %lld (elapsedRealtimeNs) - %d (count)\n", (long long)loss.first,
+ for (const auto& loss : mLogLossStats) {
+ fprintf(out, "Log loss: %lld (wall clock sec) - %d (count)\n", (long long)loss.first,
loss.second);
}
}
@@ -678,7 +660,10 @@
proto.write(FIELD_TYPE_INT32 | FIELD_ID_UID_MAP_DELETED_APPS, mUidMapStats.deleted_apps);
proto.end(uidMapToken);
- for (const auto& error : mLoggerErrors) {
+ for (const auto& error : mLogLossStats) {
+ // The logger error stats are not used anymore since we move away from logd.
+ // Temporarily use this field to log the log loss timestamp and count
+ // TODO(b/80538532) Add a dedicated field in stats_log for this.
uint64_t token = proto.start(FIELD_TYPE_MESSAGE | FIELD_ID_LOGGER_ERROR_STATS |
FIELD_COUNT_REPEATED);
proto.write(FIELD_TYPE_INT32 | FIELD_ID_LOGGER_STATS_TIME, error.first);
@@ -686,11 +671,6 @@
proto.end(token);
}
- for (const auto& loss : mLogLossTimestampNs) {
- proto.write(FIELD_TYPE_INT64 | FIELD_ID_LOG_LOSS_STATS | FIELD_COUNT_REPEATED,
- (long long)loss.first);
- }
-
for (const auto& restart : mSystemServerRestartSec) {
proto.write(FIELD_TYPE_INT32 | FIELD_ID_SYSTEM_SERVER_RESTART | FIELD_COUNT_REPEATED,
restart);
diff --git a/cmds/statsd/src/guardrail/StatsdStats.h b/cmds/statsd/src/guardrail/StatsdStats.h
index daea027..b5156da 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.h
+++ b/cmds/statsd/src/guardrail/StatsdStats.h
@@ -271,11 +271,6 @@
// Notify pull request for an atom served from cached data
void notePullFromCache(int pullAtomId);
- /**
- * Records statsd met an error while reading from logd.
- */
- void noteLoggerError(int error);
-
/*
* Records when system server restarts.
*/
@@ -284,7 +279,7 @@
/**
* Records statsd skipped an event.
*/
- void noteLogLost(int64_t timestamp, int32_t count);
+ void noteLogLost(int32_t wallClockTimeSec, int32_t count);
/**
* Reset the historical stats. Including all stats in icebox, and the tracked stats about
@@ -338,11 +333,8 @@
// Maps PullAtomId to its stats. The size is capped by the puller atom counts.
std::map<int, PulledAtomStats> mPulledAtomStats;
- // Logd errors. Size capped by kMaxLoggerErrors.
- std::list<const std::pair<int, int>> mLoggerErrors;
-
// Timestamps when we detect log loss, and the number of logs lost.
- std::list<std::pair<int64_t, int32_t>> mLogLossTimestampNs;
+ std::list<std::pair<int32_t, int32_t>> mLogLossStats;
std::list<int32_t> mSystemServerRestartSec;
diff --git a/cmds/statsd/src/socket/StatsSocketListener.cpp b/cmds/statsd/src/socket/StatsSocketListener.cpp
index 4041da7..9b0691b 100755
--- a/cmds/statsd/src/socket/StatsSocketListener.cpp
+++ b/cmds/statsd/src/socket/StatsSocketListener.cpp
@@ -111,7 +111,8 @@
android_log_event_int_t* int_event = reinterpret_cast<android_log_event_int_t*>(ptr);
if (int_event->header.tag == kLibLogTag && int_event->payload.type == EVENT_TYPE_INT) {
ALOGE("Found dropped events: %d", int_event->payload.data);
- StatsdStats::getInstance().noteLogLost(getElapsedRealtimeNs(), int_event->payload.data);
+ StatsdStats::getInstance().noteLogLost((int32_t)getWallClockSec(),
+ int_event->payload.data);
return true;
}
}
diff --git a/tools/stats_log_api_gen/main.cpp b/tools/stats_log_api_gen/main.cpp
index e519909..8038a3a 100644
--- a/tools/stats_log_api_gen/main.cpp
+++ b/tools/stats_log_api_gen/main.cpp
@@ -349,8 +349,7 @@
argIndex++;
}
fprintf(out, ");\n");
- fprintf(out, " if (ret >= 0) { return retry; }\n");
-
+ fprintf(out, " if (ret >= 0) { break; }\n");
fprintf(out, " {\n");
fprintf(out, " std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -360,6 +359,9 @@
fprintf(out, " }\n");
fprintf(out, " std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
fprintf(out, " }\n");
+ fprintf(out, " if (ret < 0) {\n");
+ fprintf(out, " note_log_drop();\n");
+ fprintf(out, " }\n");
fprintf(out, " return ret;\n");
fprintf(out, "}\n");
fprintf(out, "\n");
@@ -439,7 +441,7 @@
argIndex++;
}
fprintf(out, ");\n");
- fprintf(out, " if (ret >= 0) { return retry; }\n");
+ fprintf(out, " if (ret >= 0) { break; }\n");
fprintf(out, " {\n");
fprintf(out, " std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -450,7 +452,10 @@
fprintf(out, " std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
fprintf(out, " }\n");
- fprintf(out, " return ret;\n");
+ fprintf(out, " if (ret < 0) {\n");
+ fprintf(out, " note_log_drop();\n");
+ fprintf(out, " }\n");
+ fprintf(out, " return ret;\n\n");
fprintf(out, "}\n");
fprintf(out, "\n");