blob: bae5766be0affd9faeb45e2c91b3401d09ac118b [file] [log] [blame]
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -08001/**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "carwatchdogd"
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080018
19#include "IoPerfCollection.h"
20
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080021#include <android-base/file.h>
Lakshman Annadorai19bf2752020-03-05 17:45:43 -080022#include <android-base/parseint.h>
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080023#include <android-base/stringprintf.h>
24#include <binder/IServiceManager.h>
25#include <cutils/android_filesystem_config.h>
26#include <inttypes.h>
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080027#include <log/log.h>
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080028#include <processgroup/sched_policy.h>
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080029#include <pwd.h>
30
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080031#include <iomanip>
32#include <limits>
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080033#include <string>
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080034#include <thread>
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080035#include <unordered_map>
36#include <unordered_set>
37#include <vector>
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080038
39namespace android {
40namespace automotive {
41namespace watchdog {
42
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080043using android::defaultServiceManager;
44using android::IBinder;
45using android::IServiceManager;
46using android::sp;
47using android::String16;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080048using android::base::Error;
Lakshman Annadorai19bf2752020-03-05 17:45:43 -080049using android::base::ParseUint;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080050using android::base::Result;
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080051using android::base::StringAppendF;
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080052using android::base::WriteStringToFd;
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080053using android::content::pm::IPackageManagerNative;
54
55namespace {
56
// Horizontal rule (100 dashes followed by a newline) that separates the major dump sections.
const std::string kDumpMajorDelimiter = std::string(100, '-').append("\n");
58
// Returns |numer| expressed as a percentage of |denom|. A zero |denom| yields 0.0 rather than a
// division by zero.
double percentage(uint64_t numer, uint64_t denom) {
    if (denom == 0) {
        return 0.0;
    }
    const double ratio = static_cast<double>(numer) / static_cast<double>(denom);
    return ratio * 100.0;
}
62
// Process stats aggregated for a single UID.
struct UidProcessStats {
    // UID that the aggregated values belong to.
    uint64_t uid{0};
    // Number of threads owned by the UID that are in the "D" state.
    uint32_t ioBlockedTasksCnt{0};
    // Total number of threads owned by the UID.
    uint32_t totalTasksCnt{0};
    // Sum of the top-level major page fault counts of the UID's processes.
    uint64_t majorFaults{0};
};
69
70std::unordered_map<uint32_t, UidProcessStats> getUidProcessStats(
71 const std::vector<ProcessStats>& processStats) {
72 std::unordered_map<uint32_t, UidProcessStats> uidProcessStats;
73 for (const auto& stats : processStats) {
74 if (stats.uid < 0) {
75 continue;
76 }
77 uint32_t uid = static_cast<uint32_t>(stats.uid);
78 if (uidProcessStats.find(uid) == uidProcessStats.end()) {
79 uidProcessStats[uid] = UidProcessStats{.uid = uid};
80 }
81 auto& curUidProcessStats = uidProcessStats[uid];
82 // Top-level process stats has the aggregated major page faults count and this should be
83 // persistent across thread creation/termination. Thus use the value from this field.
84 curUidProcessStats.majorFaults += stats.process.majorFaults;
85 curUidProcessStats.totalTasksCnt += stats.threads.size();
86 // The process state is the same as the main thread state. Thus to avoid double counting
87 // ignore the process state.
88 for (const auto& threadStat : stats.threads) {
89 curUidProcessStats.ioBlockedTasksCnt += threadStat.second.state == "D" ? 1 : 0;
90 }
91 }
92 return uidProcessStats;
93}
94
Lakshman Annadorai19bf2752020-03-05 17:45:43 -080095Result<std::chrono::seconds> parseSecondsFlag(Vector<String16> args, size_t pos) {
96 if (args.size() < pos) {
97 return Error() << "Value not provided";
98 }
99
100 uint64_t value;
101 std::string strValue = std::string(String8(args[pos]).string());
102 if (!ParseUint(strValue, &value)) {
103 return Error() << "Invalid value " << args[pos].string() << ", must be an integer";
104 }
105 return std::chrono::seconds(value);
106}
107
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800108} // namespace
109
110std::string toString(const UidIoPerfData& data) {
111 std::string buffer;
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800112 if (data.topNReads.size() > 0) {
113 StringAppendF(&buffer, "\nTop N Reads:\n%s\n", std::string(12, '-').c_str());
114 StringAppendF(&buffer,
115 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
116 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
117 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
118 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800119 for (const auto& stat : data.topNReads) {
120 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
121 for (int i = 0; i < UID_STATES; ++i) {
122 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800123 percentage(stat.bytes[i], data.total[READ_BYTES][i]), stat.fsync[i],
124 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800125 }
126 StringAppendF(&buffer, "\n");
127 }
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800128 if (data.topNWrites.size() > 0) {
129 StringAppendF(&buffer, "\nTop N Writes:\n%s\n", std::string(13, '-').c_str());
130 StringAppendF(&buffer,
131 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
132 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
133 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
134 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800135 for (const auto& stat : data.topNWrites) {
136 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
137 for (int i = 0; i < UID_STATES; ++i) {
138 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800139 percentage(stat.bytes[i], data.total[WRITE_BYTES][i]), stat.fsync[i],
140 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800141 }
142 StringAppendF(&buffer, "\n");
143 }
144 return buffer;
145}
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800146
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800147std::string toString(const SystemIoPerfData& data) {
148 std::string buffer;
149 StringAppendF(&buffer, "CPU I/O wait time/percent: %" PRIu64 " / %.2f%%\n", data.cpuIoWaitTime,
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800150 percentage(data.cpuIoWaitTime, data.totalCpuTime));
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800151 StringAppendF(&buffer, "Number of I/O blocked processes/percent: %" PRIu32 " / %.2f%%\n",
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800152 data.ioBlockedProcessesCnt,
153 percentage(data.ioBlockedProcessesCnt, data.totalProcessesCnt));
154 return buffer;
155}
156
157std::string toString(const ProcessIoPerfData& data) {
158 std::string buffer;
159 StringAppendF(&buffer, "Number of major page faults since last collection: %" PRIu64 "\n",
160 data.totalMajorFaults);
161 StringAppendF(&buffer,
162 "Percentage of change in major page faults since last collection: %.2f%%\n",
163 data.majorFaultsPercentChange);
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800164 if (data.topNMajorFaults.size() > 0) {
165 StringAppendF(&buffer, "\nTop N major page faults:\n%s\n", std::string(24, '-').c_str());
166 StringAppendF(&buffer,
167 "Android User ID, Package Name, Number of major page faults, "
168 "Percentage of total major page faults\n");
169 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800170 for (const auto& stat : data.topNMajorFaults) {
171 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", stat.userId,
172 stat.packageName.c_str(), stat.count,
173 percentage(stat.count, data.totalMajorFaults));
174 }
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800175 if (data.topNIoBlockedUids.size() > 0) {
176 StringAppendF(&buffer, "\nTop N I/O waiting UIDs:\n%s\n", std::string(23, '-').c_str());
177 StringAppendF(&buffer,
178 "Android User ID, Package Name, Number of owned tasks waiting for I/O, "
179 "Percentage of owned tasks waiting for I/O\n");
180 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800181 for (size_t i = 0; i < data.topNIoBlockedUids.size(); ++i) {
182 const auto& stat = data.topNIoBlockedUids[i];
183 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", stat.userId,
184 stat.packageName.c_str(), stat.count,
185 percentage(stat.count, data.topNIoBlockedUidsTotalTaskCnt[i]));
186 }
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800187 return buffer;
188}
189
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800190std::string toString(const IoPerfRecord& record) {
191 std::string buffer;
192 StringAppendF(&buffer, "%s%s%s", toString(record.systemIoPerfData).c_str(),
193 toString(record.processIoPerfData).c_str(),
194 toString(record.uidIoPerfData).c_str());
195 return buffer;
196}
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800197
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800198std::string toString(const CollectionInfo& collectionInfo) {
199 std::string buffer;
200 StringAppendF(&buffer, "Number of collections: %zu\n", collectionInfo.records.size());
201 auto interval =
202 std::chrono::duration_cast<std::chrono::seconds>(collectionInfo.interval).count();
203 StringAppendF(&buffer, "Collection interval: %lld second%s\n", interval,
204 ((interval > 1) ? "s" : ""));
205 for (size_t i = 0; i < collectionInfo.records.size(); ++i) {
206 const auto& record = collectionInfo.records[i];
207 std::stringstream timestamp;
208 timestamp << std::put_time(std::localtime(&record.time), "%c %Z");
209 StringAppendF(&buffer, "Collection %zu: <%s>\n%s\n%s\n", i, timestamp.str().c_str(),
210 std::string(45, '=').c_str(), toString(record).c_str());
211 }
212 return buffer;
213}
214
215Result<void> IoPerfCollection::start() {
216 {
217 Mutex::Autolock lock(mMutex);
218 if (mCurrCollectionEvent != CollectionEvent::INIT || mCollectionThread.joinable()) {
Lakshman Annadorai2c0b0d12020-03-04 11:14:59 -0800219 return Error(INVALID_OPERATION)
220 << "Cannot start I/O performance collection more than once";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800221 }
222
223 // TODO(b/148489461): Once |kTopNStatsPerCategory|, |kBoottimeCollectionInterval| and
224 // |kPeriodicCollectionInterval| constants are moved to read-only persistent properties,
225 // read and store them in the collection infos.
226
227 mBoottimeCollection = {
228 .interval = kBoottimeCollectionInterval,
229 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
230 .lastCollectionUptime = 0,
231 .records = {},
232 };
233 mPeriodicCollection = {
234 .interval = kPeriodicCollectionInterval,
235 .maxCacheSize = kPeriodicCollectionBufferSize,
236 .lastCollectionUptime = 0,
237 .records = {},
238 };
239 }
240
241 mCollectionThread = std::thread([&]() {
242 {
243 Mutex::Autolock lock(mMutex);
244 if (mCurrCollectionEvent != CollectionEvent::INIT) {
245 ALOGE("Skipping I/O performance data collection as the current collection event "
246 "%s != %s",
247 toString(mCurrCollectionEvent).c_str(),
248 toString(CollectionEvent::INIT).c_str());
249 return;
250 }
251 mCurrCollectionEvent = CollectionEvent::BOOT_TIME;
252 mBoottimeCollection.lastCollectionUptime = mHandlerLooper->now();
253 mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
254 mHandlerLooper->sendMessage(this, CollectionEvent::BOOT_TIME);
255 }
256 if (set_sched_policy(0, SP_BACKGROUND) != 0) {
257 ALOGW("Failed to set background scheduling priority to I/O performance data collection "
258 "thread");
259 }
260 bool isCollectionActive = true;
261 // Loop until the collection is not active -- I/O perf collection runs on this thread in a
262 // handler.
263 while (isCollectionActive) {
264 mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
265 Mutex::Autolock lock(mMutex);
266 isCollectionActive = mCurrCollectionEvent != CollectionEvent::TERMINATED;
267 }
268 });
269 return {};
270}
271
272void IoPerfCollection::terminate() {
273 {
274 Mutex::Autolock lock(mMutex);
275 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
276 ALOGE("I/O performance data collection was terminated already");
277 return;
278 }
279 ALOGE("Terminating I/O performance data collection");
280 mCurrCollectionEvent = CollectionEvent::TERMINATED;
281 }
282 if (mCollectionThread.joinable()) {
283 mHandlerLooper->removeMessages(this);
284 mHandlerLooper->wake();
285 mCollectionThread.join();
286 }
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800287}
288
289Result<void> IoPerfCollection::onBootFinished() {
290 Mutex::Autolock lock(mMutex);
291 if (mCurrCollectionEvent != CollectionEvent::BOOT_TIME) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700292 return Error(INVALID_OPERATION)
293 << "Current I/O performance data collection event "
294 << toString(mCurrCollectionEvent) << " != " << toString(CollectionEvent::BOOT_TIME)
295 << " collection event";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800296 }
297 mHandlerLooper->removeMessages(this);
298 mCurrCollectionEvent = CollectionEvent::PERIODIC;
299 mPeriodicCollection.lastCollectionUptime = mHandlerLooper->now();
300 mHandlerLooper->sendMessage(this, CollectionEvent::PERIODIC);
301 return {};
302}
303
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700304Result<void> IoPerfCollection::dump(int fd, const Vector<String16>& args) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800305 if (args.empty()) {
306 const auto& ret = dumpCollection(fd);
307 if (!ret) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700308 return ret;
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800309 }
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700310 return {};
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800311 }
312
313 if (args[0] == String16(kStartCustomCollectionFlag)) {
314 if (args.size() > 5) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700315 return Error(INVALID_OPERATION) << "Number of arguments to start custom "
316 << "I/O performance data collection cannot exceed 5";
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800317 }
318 std::chrono::nanoseconds interval = kCustomCollectionInterval;
319 std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
320 for (size_t i = 1; i < args.size(); ++i) {
321 if (args[i] == String16(kIntervalFlag)) {
322 const auto& ret = parseSecondsFlag(args, i + 1);
323 if (!ret) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700324 return Error(FAILED_TRANSACTION)
325 << "Failed to parse " << kIntervalFlag << ": " << ret.error();
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800326 }
327 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*ret);
328 ++i;
329 continue;
330 }
331 if (args[i] == String16(kMaxDurationFlag)) {
332 const auto& ret = parseSecondsFlag(args, i + 1);
333 if (!ret) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700334 return Error(FAILED_TRANSACTION)
335 << "Failed to parse " << kMaxDurationFlag << ": " << ret.error();
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800336 }
337 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*ret);
338 ++i;
339 continue;
340 }
341 ALOGW("Unknown flag %s provided to start custom I/O performance data collection",
342 String8(args[i]).string());
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700343 return Error(INVALID_OPERATION) << "Unknown flag " << String8(args[i]).string()
344 << " provided to start custom I/O performance data "
345 << "collection";
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800346 }
347 const auto& ret = startCustomCollection(interval, maxDuration);
348 if (!ret) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700349 return ret;
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800350 }
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700351 return {};
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800352 }
353
354 if (args[0] == String16(kEndCustomCollectionFlag)) {
355 if (args.size() != 1) {
356 ALOGW("Number of arguments to end custom I/O performance data collection cannot "
357 "exceed 1");
358 }
359 const auto& ret = endCustomCollection(fd);
360 if (!ret) {
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700361 return ret;
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800362 }
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700363 return {};
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800364 }
365
Lakshman Annadorai0dfeeeb2020-03-13 16:57:12 -0700366 return Error(INVALID_OPERATION)
367 << "Dump arguments start neither with " << kStartCustomCollectionFlag << " nor with "
368 << kEndCustomCollectionFlag << " flags";
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800369}
370
371Result<void> IoPerfCollection::dumpCollection(int fd) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800372 Mutex::Autolock lock(mMutex);
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800373 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
374 ALOGW("I/O performance data collection not active. Dumping cached data");
375 if (!WriteStringToFd("I/O performance data collection not active. Dumping cached data.",
376 fd)) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800377 return Error(FAILED_TRANSACTION) << "Failed to write I/O performance collection status";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800378 }
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800379 }
380
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800381 const auto& ret = dumpCollectorsStatusLocked(fd);
382 if (!ret) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800383 return Error(FAILED_TRANSACTION) << ret.error();
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800384 }
385
386 if (!WriteStringToFd(StringPrintf("%sI/O performance data reports:\n%sBoot-time collection "
387 "report:\n%s\n",
388 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str(),
389 std::string(28, '=').c_str()),
390 fd) ||
391 !WriteStringToFd(toString(mBoottimeCollection), fd) ||
392 !WriteStringToFd(StringPrintf("%s\nPeriodic collection report:\n%s\n",
393 std::string(75, '-').c_str(), std::string(27, '=').c_str()),
394 fd) ||
395 !WriteStringToFd(toString(mPeriodicCollection), fd) ||
396 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800397 return Error(FAILED_TRANSACTION)
398 << "Failed to dump the boot-time and periodic collection reports.";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800399 }
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800400 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800401}
402
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800403Result<void> IoPerfCollection::dumpCollectorsStatusLocked(int fd) {
404 if (!mUidIoStats->enabled() &&
405 !WriteStringToFd(StringPrintf("UidIoStats collector failed to access the file %s",
406 mUidIoStats->filePath().c_str()),
407 fd)) {
408 return Error() << "Failed to write UidIoStats collector status";
409 }
410 if (!mProcStat->enabled() &&
411 !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
412 mProcStat->filePath().c_str()),
413 fd)) {
414 return Error() << "Failed to write ProcStat collector status";
415 }
416 if (!mProcPidStat->enabled() &&
417 !WriteStringToFd(StringPrintf("ProcPidStat collector failed to access the directory %s",
418 mProcPidStat->dirPath().c_str()),
419 fd)) {
420 return Error() << "Failed to write ProcPidStat collector status";
421 }
422 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800423}
424
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800425Result<void> IoPerfCollection::startCustomCollection(std::chrono::nanoseconds interval,
426 std::chrono::nanoseconds maxDuration) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800427 if (interval < kMinCollectionInterval || maxDuration < kMinCollectionInterval) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800428 return Error(INVALID_OPERATION)
429 << "Collection interval and maximum duration must be >= "
430 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinCollectionInterval)
431 .count()
432 << " milliseconds.";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800433 }
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800434 Mutex::Autolock lock(mMutex);
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800435 if (mCurrCollectionEvent != CollectionEvent::PERIODIC) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800436 return Error(INVALID_OPERATION)
437 << "Cannot start a custom collection when "
438 << "the current collection event " << toString(mCurrCollectionEvent)
439 << " != " << toString(CollectionEvent::PERIODIC) << " collection event";
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800440 }
441
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800442 mCustomCollection = {
443 .interval = interval,
444 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
445 .lastCollectionUptime = mHandlerLooper->now(),
446 .records = {},
447 };
448
449 mHandlerLooper->removeMessages(this);
450 nsecs_t uptime = mHandlerLooper->now() + maxDuration.count();
451 mHandlerLooper->sendMessageAtTime(uptime, this, SwitchEvent::END_CUSTOM_COLLECTION);
452 mCurrCollectionEvent = CollectionEvent::CUSTOM;
453 mHandlerLooper->sendMessage(this, CollectionEvent::CUSTOM);
454 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800455}
456
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800457Result<void> IoPerfCollection::endCustomCollection(int fd) {
458 Mutex::Autolock lock(mMutex);
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800459 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800460 return Error(INVALID_OPERATION) << "No custom collection is running";
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800461 }
462
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800463 mHandlerLooper->removeMessages(this);
464 mHandlerLooper->sendMessage(this, SwitchEvent::END_CUSTOM_COLLECTION);
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800465
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800466 const auto& ret = dumpCollectorsStatusLocked(fd);
467 if (!ret) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800468 return Error(FAILED_TRANSACTION) << ret.error();
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800469 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800470
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800471 if (!WriteStringToFd(StringPrintf("%sI/O performance data report for custom collection:\n%s",
472 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
473 fd) ||
474 !WriteStringToFd(toString(mCustomCollection), fd) ||
475 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
Lakshman Annadorai19bf2752020-03-05 17:45:43 -0800476 return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800477 }
478
479 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800480}
481
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800482void IoPerfCollection::handleMessage(const Message& message) {
483 Result<void> result;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800484
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800485 switch (message.what) {
486 case static_cast<int>(CollectionEvent::BOOT_TIME):
487 result = processCollectionEvent(CollectionEvent::BOOT_TIME, &mBoottimeCollection);
488 break;
489 case static_cast<int>(CollectionEvent::PERIODIC):
490 result = processCollectionEvent(CollectionEvent::PERIODIC, &mPeriodicCollection);
491 break;
492 case static_cast<int>(CollectionEvent::CUSTOM):
493 result = processCollectionEvent(CollectionEvent::CUSTOM, &mCustomCollection);
494 break;
495 case static_cast<int>(SwitchEvent::END_CUSTOM_COLLECTION): {
496 Mutex::Autolock lock(mMutex);
497 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
498 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
499 toString(mCurrCollectionEvent).c_str(),
500 toString(CollectionEvent::CUSTOM).c_str());
501 return;
502 }
503 mCustomCollection = {};
504 mHandlerLooper->removeMessages(this);
505 mCurrCollectionEvent = CollectionEvent::PERIODIC;
506 mPeriodicCollection.lastCollectionUptime = mHandlerLooper->now();
507 mHandlerLooper->sendMessage(this, CollectionEvent::PERIODIC);
508 return;
509 }
510 default:
511 result = Error() << "Unknown message: " << message.what;
512 }
513
514 if (!result) {
515 Mutex::Autolock lock(mMutex);
516 ALOGE("Terminating I/O performance data collection: %s", result.error().message().c_str());
517 // DO NOT CALL terminate() as it tries to join the collection thread but this code is
518 // executed on the collection thread. Thus it will result in a deadlock.
519 mCurrCollectionEvent = CollectionEvent::TERMINATED;
520 mHandlerLooper->removeMessages(this);
521 mHandlerLooper->wake();
522 }
523}
524
525Result<void> IoPerfCollection::processCollectionEvent(CollectionEvent event, CollectionInfo* info) {
526 Mutex::Autolock lock(mMutex);
527 // Messages sent to the looper are intrinsically racy such that a message from the previous
528 // collection event may land in the looper after the current collection has already begun. Thus
529 // verify the current collection event before starting the collection.
530 if (mCurrCollectionEvent != event) {
531 ALOGW("Skipping %s collection message on collection event %s", toString(event).c_str(),
532 toString(mCurrCollectionEvent).c_str());
533 return {};
534 }
535 if (info->maxCacheSize == 0) {
536 return Error() << "Maximum cache size for " << toString(event) << " collection cannot be 0";
537 }
538 if (info->interval < kMinCollectionInterval) {
539 return Error()
540 << "Collection interval of "
541 << std::chrono::duration_cast<std::chrono::seconds>(info->interval).count()
542 << " seconds for " << toString(event) << " collection cannot be less than "
543 << std::chrono::duration_cast<std::chrono::seconds>(kMinCollectionInterval).count()
544 << " seconds";
545 }
546 auto ret = collectLocked(info);
547 if (!ret) {
548 return Error() << toString(event) << " collection failed: " << ret.error();
549 }
550 info->lastCollectionUptime += info->interval.count();
551 mHandlerLooper->sendMessageAtTime(info->lastCollectionUptime, this, event);
552 return {};
553}
554
555Result<void> IoPerfCollection::collectLocked(CollectionInfo* collectionInfo) {
556 if (!mUidIoStats->enabled() && !mProcStat->enabled() && !mProcPidStat->enabled()) {
557 return Error() << "No collectors enabled";
558 }
559 IoPerfRecord record{
560 .time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()),
561 };
562 auto ret = collectSystemIoPerfDataLocked(&record.systemIoPerfData);
563 if (!ret) {
564 return ret;
565 }
566 ret = collectProcessIoPerfDataLocked(&record.processIoPerfData);
567 if (!ret) {
568 return ret;
569 }
570 ret = collectUidIoPerfDataLocked(&record.uidIoPerfData);
571 if (!ret) {
572 return ret;
573 }
574 if (collectionInfo->records.size() > collectionInfo->maxCacheSize) {
575 collectionInfo->records.erase(collectionInfo->records.begin()); // Erase the oldest record.
576 }
577 collectionInfo->records.emplace_back(record);
578 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800579}
580
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800581Result<void> IoPerfCollection::collectUidIoPerfDataLocked(UidIoPerfData* uidIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800582 if (!mUidIoStats->enabled()) {
583 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
584 // collectors.
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800585 return {};
586 }
587
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800588 const Result<std::unordered_map<uint32_t, UidIoUsage>>& usage = mUidIoStats->collect();
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800589 if (!usage) {
590 return Error() << "Failed to collect uid I/O usage: " << usage.error();
591 }
592
593 // Fetch only the top N reads and writes from the usage records.
594 UidIoUsage tempUsage = {};
595 std::vector<const UidIoUsage*> topNReads(mTopNStatsPerCategory, &tempUsage);
596 std::vector<const UidIoUsage*> topNWrites(mTopNStatsPerCategory, &tempUsage);
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800597 std::unordered_set<uint32_t> unmappedUids;
598
599 for (const auto& uIt : *usage) {
600 const UidIoUsage& curUsage = uIt.second;
601 if (curUsage.ios.isZero()) {
602 continue;
603 }
604 if (mUidToPackageNameMapping.find(curUsage.uid) == mUidToPackageNameMapping.end()) {
605 unmappedUids.insert(curUsage.uid);
606 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800607 uidIoPerfData->total[READ_BYTES][FOREGROUND] +=
608 curUsage.ios.metrics[READ_BYTES][FOREGROUND];
609 uidIoPerfData->total[READ_BYTES][BACKGROUND] +=
610 curUsage.ios.metrics[READ_BYTES][BACKGROUND];
611 uidIoPerfData->total[WRITE_BYTES][FOREGROUND] +=
612 curUsage.ios.metrics[WRITE_BYTES][FOREGROUND];
613 uidIoPerfData->total[WRITE_BYTES][BACKGROUND] +=
614 curUsage.ios.metrics[WRITE_BYTES][BACKGROUND];
615 uidIoPerfData->total[FSYNC_COUNT][FOREGROUND] +=
616 curUsage.ios.metrics[FSYNC_COUNT][FOREGROUND];
617 uidIoPerfData->total[FSYNC_COUNT][BACKGROUND] +=
618 curUsage.ios.metrics[FSYNC_COUNT][BACKGROUND];
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800619
620 for (auto it = topNReads.begin(); it != topNReads.end(); ++it) {
621 const UidIoUsage* curRead = *it;
622 if (curRead->ios.sumReadBytes() > curUsage.ios.sumReadBytes()) {
623 continue;
624 }
625 topNReads.erase(topNReads.end() - 1);
626 topNReads.emplace(it, &curUsage);
627 break;
628 }
629 for (auto it = topNWrites.begin(); it != topNWrites.end(); ++it) {
630 const UidIoUsage* curWrite = *it;
631 if (curWrite->ios.sumWriteBytes() > curUsage.ios.sumWriteBytes()) {
632 continue;
633 }
634 topNWrites.erase(topNWrites.end() - 1);
635 topNWrites.emplace(it, &curUsage);
636 break;
637 }
638 }
639
640 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
641 if (!ret) {
642 ALOGW("%s", ret.error().message().c_str());
643 }
644
645 // Convert the top N I/O usage to UidIoPerfData.
646 for (const auto& usage : topNReads) {
647 if (usage->ios.isZero()) {
648 // End of non-zero usage records. This case occurs when the number of UIDs with active
649 // I/O operations is < |kTopNStatsPerCategory|.
650 break;
651 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800652 UidIoPerfData::Stats stats = {
653 .userId = multiuser_get_user_id(usage->uid),
654 .packageName = std::to_string(usage->uid),
655 .bytes = {usage->ios.metrics[READ_BYTES][FOREGROUND],
656 usage->ios.metrics[READ_BYTES][BACKGROUND]},
657 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
658 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800659 };
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800660 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
661 stats.packageName = mUidToPackageNameMapping[usage->uid];
662 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800663 uidIoPerfData->topNReads.emplace_back(stats);
664 }
665
666 for (const auto& usage : topNWrites) {
667 if (usage->ios.isZero()) {
668 // End of non-zero usage records. This case occurs when the number of UIDs with active
669 // I/O operations is < |kTopNStatsPerCategory|.
670 break;
671 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800672 UidIoPerfData::Stats stats = {
673 .userId = multiuser_get_user_id(usage->uid),
674 .packageName = std::to_string(usage->uid),
675 .bytes = {usage->ios.metrics[WRITE_BYTES][FOREGROUND],
676 usage->ios.metrics[WRITE_BYTES][BACKGROUND]},
677 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
678 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800679 };
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800680 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
681 stats.packageName = mUidToPackageNameMapping[usage->uid];
682 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800683 uidIoPerfData->topNWrites.emplace_back(stats);
684 }
685 return {};
686}
687
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800688Result<void> IoPerfCollection::collectSystemIoPerfDataLocked(SystemIoPerfData* systemIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800689 if (!mProcStat->enabled()) {
690 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
691 // collectors.
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800692 return {};
693 }
694
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800695 const Result<ProcStatInfo>& procStatInfo = mProcStat->collect();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800696 if (!procStatInfo) {
697 return Error() << "Failed to collect proc stats: " << procStatInfo.error();
698 }
699
700 systemIoPerfData->cpuIoWaitTime = procStatInfo->cpuStats.ioWaitTime;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800701 systemIoPerfData->totalCpuTime = procStatInfo->totalCpuTime();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800702 systemIoPerfData->ioBlockedProcessesCnt = procStatInfo->ioBlockedProcessesCnt;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800703 systemIoPerfData->totalProcessesCnt = procStatInfo->totalProcessesCnt();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800704 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800705}
706
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800707Result<void> IoPerfCollection::collectProcessIoPerfDataLocked(
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800708 ProcessIoPerfData* processIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800709 if (!mProcPidStat->enabled()) {
710 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
711 // collectors.
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800712 return {};
713 }
714
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800715 const Result<std::vector<ProcessStats>>& processStats = mProcPidStat->collect();
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800716 if (!processStats) {
717 return Error() << "Failed to collect process stats: " << processStats.error();
718 }
719
720 const auto& uidProcessStats = getUidProcessStats(*processStats);
721
722 std::unordered_set<uint32_t> unmappedUids;
723 // Fetch only the top N I/O blocked UIDs and UIDs with most major page faults.
724 UidProcessStats temp = {};
725 std::vector<const UidProcessStats*> topNIoBlockedUids(mTopNStatsPerCategory, &temp);
726 std::vector<const UidProcessStats*> topNMajorFaults(mTopNStatsPerCategory, &temp);
727 processIoPerfData->totalMajorFaults = 0;
728 for (const auto& it : uidProcessStats) {
729 const UidProcessStats& curStats = it.second;
730 if (mUidToPackageNameMapping.find(curStats.uid) == mUidToPackageNameMapping.end()) {
731 unmappedUids.insert(curStats.uid);
732 }
733 processIoPerfData->totalMajorFaults += curStats.majorFaults;
734 for (auto it = topNIoBlockedUids.begin(); it != topNIoBlockedUids.end(); ++it) {
735 const UidProcessStats* topStats = *it;
736 if (topStats->ioBlockedTasksCnt > curStats.ioBlockedTasksCnt) {
737 continue;
738 }
739 topNIoBlockedUids.erase(topNIoBlockedUids.end() - 1);
740 topNIoBlockedUids.emplace(it, &curStats);
741 break;
742 }
743 for (auto it = topNMajorFaults.begin(); it != topNMajorFaults.end(); ++it) {
744 const UidProcessStats* topStats = *it;
745 if (topStats->majorFaults > curStats.majorFaults) {
746 continue;
747 }
748 topNMajorFaults.erase(topNMajorFaults.end() - 1);
749 topNMajorFaults.emplace(it, &curStats);
750 break;
751 }
752 }
753
754 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
755 if (!ret) {
756 ALOGW("%s", ret.error().message().c_str());
757 }
758
759 // Convert the top N uid process stats to ProcessIoPerfData.
760 for (const auto& it : topNIoBlockedUids) {
761 if (it->ioBlockedTasksCnt == 0) {
762 // End of non-zero elements. This case occurs when the number of UIDs with I/O blocked
763 // processes is < |kTopNStatsPerCategory|.
764 break;
765 }
766 ProcessIoPerfData::Stats stats = {
767 .userId = multiuser_get_user_id(it->uid),
768 .packageName = std::to_string(it->uid),
769 .count = it->ioBlockedTasksCnt,
770 };
771 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
772 stats.packageName = mUidToPackageNameMapping[it->uid];
773 }
774 processIoPerfData->topNIoBlockedUids.emplace_back(stats);
775 processIoPerfData->topNIoBlockedUidsTotalTaskCnt.emplace_back(it->totalTasksCnt);
776 }
777 for (const auto& it : topNMajorFaults) {
778 if (it->majorFaults == 0) {
779 // End of non-zero elements. This case occurs when the number of UIDs with major faults
780 // is < |kTopNStatsPerCategory|.
781 break;
782 }
783 ProcessIoPerfData::Stats stats = {
784 .userId = multiuser_get_user_id(it->uid),
785 .packageName = std::to_string(it->uid),
786 .count = it->majorFaults,
787 };
788 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
789 stats.packageName = mUidToPackageNameMapping[it->uid];
790 }
791 processIoPerfData->topNMajorFaults.emplace_back(stats);
792 }
793 if (mLastMajorFaults == 0) {
794 processIoPerfData->majorFaultsPercentChange = 0;
795 } else {
796 int64_t increase = processIoPerfData->totalMajorFaults - mLastMajorFaults;
797 processIoPerfData->majorFaultsPercentChange =
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800798 (static_cast<double>(increase) / static_cast<double>(mLastMajorFaults)) * 100.0;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800799 }
800 mLastMajorFaults = processIoPerfData->totalMajorFaults;
801 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800802}
803
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800804Result<void> IoPerfCollection::updateUidToPackageNameMapping(
805 const std::unordered_set<uint32_t>& uids) {
806 std::vector<int32_t> appUids;
807
808 for (const auto& uid : uids) {
809 if (uid >= AID_APP_START) {
810 appUids.emplace_back(static_cast<int32_t>(uid));
811 continue;
812 }
813 // System/native UIDs.
814 passwd* usrpwd = getpwuid(uid);
815 if (!usrpwd) {
816 continue;
817 }
818 mUidToPackageNameMapping[uid] = std::string(usrpwd->pw_name);
819 }
820
821 if (appUids.empty()) {
822 return {};
823 }
824
825 if (mPackageManager == nullptr) {
826 auto ret = retrievePackageManager();
827 if (!ret) {
828 return Error() << "Failed to retrieve package manager: " << ret.error();
829 }
830 }
831
832 std::vector<std::string> packageNames;
833 const binder::Status& status = mPackageManager->getNamesForUids(appUids, &packageNames);
834 if (!status.isOk()) {
835 return Error() << "package_native::getNamesForUids failed: " << status.exceptionMessage();
836 }
837
838 for (uint32_t i = 0; i < appUids.size(); i++) {
839 if (!packageNames[i].empty()) {
840 mUidToPackageNameMapping[appUids[i]] = packageNames[i];
841 }
842 }
843
844 return {};
845}
846
847Result<void> IoPerfCollection::retrievePackageManager() {
848 const sp<IServiceManager> sm = defaultServiceManager();
849 if (sm == nullptr) {
850 return Error() << "Failed to retrieve defaultServiceManager";
851 }
852
853 sp<IBinder> binder = sm->getService(String16("package_native"));
854 if (binder == nullptr) {
855 return Error() << "Failed to get service package_native";
856 }
857 mPackageManager = interface_cast<IPackageManagerNative>(binder);
858 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800859}
860
861} // namespace watchdog
862} // namespace automotive
863} // namespace android