blob: ebb9b4e27332b4ad705b00927ba2ca17bcd18217 [file] [log] [blame]
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -08001/**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "carwatchdogd"
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080018
19#include "IoPerfCollection.h"
20
#include <android-base/file.h>
#include <android-base/stringprintf.h>
#include <binder/IServiceManager.h>
#include <cutils/android_filesystem_config.h>
#include <inttypes.h>
#include <log/log.h>
#include <processgroup/sched_policy.h>
#include <pwd.h>
#include <time.h>

#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080037
38namespace android {
39namespace automotive {
40namespace watchdog {
41
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080042using android::defaultServiceManager;
43using android::IBinder;
44using android::IServiceManager;
45using android::sp;
46using android::String16;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080047using android::base::Error;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -080048using android::base::Result;
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080049using android::base::StringAppendF;
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080050using android::base::WriteStringToFd;
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080051using android::content::pm::IPackageManagerNative;
52
53namespace {
54
// Horizontal rule (100 dashes terminated by a newline) written between the major dump sections.
const std::string kDumpMajorDelimiter = std::string(100, '-') + '\n';
56
// Returns |numer| expressed as a percentage of |denom|. A zero denominator yields 0.0 instead of
// dividing by zero.
double percentage(uint64_t numer, uint64_t denom) {
    if (denom == 0) {
        return 0.0;
    }
    const double ratio = static_cast<double>(numer) / static_cast<double>(denom);
    return ratio * 100.0;
}
60
// Process stats aggregated over every process that belongs to a single UID.
struct UidProcessStats {
    // UID that owns the aggregated processes.
    uint64_t uid{0};
    // Number of the UID's threads whose state is "D".
    uint32_t ioBlockedTasksCnt{0};
    // Total number of threads owned by the UID.
    uint32_t totalTasksCnt{0};
    // Sum of the major page faults reported by the UID's top-level processes.
    uint64_t majorFaults{0};
};
67
68std::unordered_map<uint32_t, UidProcessStats> getUidProcessStats(
69 const std::vector<ProcessStats>& processStats) {
70 std::unordered_map<uint32_t, UidProcessStats> uidProcessStats;
71 for (const auto& stats : processStats) {
72 if (stats.uid < 0) {
73 continue;
74 }
75 uint32_t uid = static_cast<uint32_t>(stats.uid);
76 if (uidProcessStats.find(uid) == uidProcessStats.end()) {
77 uidProcessStats[uid] = UidProcessStats{.uid = uid};
78 }
79 auto& curUidProcessStats = uidProcessStats[uid];
80 // Top-level process stats has the aggregated major page faults count and this should be
81 // persistent across thread creation/termination. Thus use the value from this field.
82 curUidProcessStats.majorFaults += stats.process.majorFaults;
83 curUidProcessStats.totalTasksCnt += stats.threads.size();
84 // The process state is the same as the main thread state. Thus to avoid double counting
85 // ignore the process state.
86 for (const auto& threadStat : stats.threads) {
87 curUidProcessStats.ioBlockedTasksCnt += threadStat.second.state == "D" ? 1 : 0;
88 }
89 }
90 return uidProcessStats;
91}
92
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -080093} // namespace
94
95std::string toString(const UidIoPerfData& data) {
96 std::string buffer;
Lakshman Annadoraif2855b22020-03-03 14:13:10 -080097 if (data.topNReads.size() > 0) {
98 StringAppendF(&buffer, "\nTop N Reads:\n%s\n", std::string(12, '-').c_str());
99 StringAppendF(&buffer,
100 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
101 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
102 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
103 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800104 for (const auto& stat : data.topNReads) {
105 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
106 for (int i = 0; i < UID_STATES; ++i) {
107 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800108 percentage(stat.bytes[i], data.total[READ_BYTES][i]), stat.fsync[i],
109 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800110 }
111 StringAppendF(&buffer, "\n");
112 }
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800113 if (data.topNWrites.size() > 0) {
114 StringAppendF(&buffer, "\nTop N Writes:\n%s\n", std::string(13, '-').c_str());
115 StringAppendF(&buffer,
116 "Android User ID, Package Name, Foreground Bytes, Foreground Bytes %%, "
117 "Foreground Fsync, Foreground Fsync %%, Background Bytes, "
118 "Background Bytes %%, Background Fsync, Background Fsync %%\n");
119 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800120 for (const auto& stat : data.topNWrites) {
121 StringAppendF(&buffer, "%" PRIu32 ", %s", stat.userId, stat.packageName.c_str());
122 for (int i = 0; i < UID_STATES; ++i) {
123 StringAppendF(&buffer, ", %" PRIu64 ", %.2f%%, %" PRIu64 ", %.2f%%", stat.bytes[i],
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800124 percentage(stat.bytes[i], data.total[WRITE_BYTES][i]), stat.fsync[i],
125 percentage(stat.fsync[i], data.total[FSYNC_COUNT][i]));
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800126 }
127 StringAppendF(&buffer, "\n");
128 }
129 return buffer;
130}
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800131
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800132std::string toString(const SystemIoPerfData& data) {
133 std::string buffer;
134 StringAppendF(&buffer, "CPU I/O wait time/percent: %" PRIu64 " / %.2f%%\n", data.cpuIoWaitTime,
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800135 percentage(data.cpuIoWaitTime, data.totalCpuTime));
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800136 StringAppendF(&buffer, "Number of I/O blocked processes/percent: %" PRIu32 " / %.2f%%\n",
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800137 data.ioBlockedProcessesCnt,
138 percentage(data.ioBlockedProcessesCnt, data.totalProcessesCnt));
139 return buffer;
140}
141
142std::string toString(const ProcessIoPerfData& data) {
143 std::string buffer;
144 StringAppendF(&buffer, "Number of major page faults since last collection: %" PRIu64 "\n",
145 data.totalMajorFaults);
146 StringAppendF(&buffer,
147 "Percentage of change in major page faults since last collection: %.2f%%\n",
148 data.majorFaultsPercentChange);
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800149 if (data.topNMajorFaults.size() > 0) {
150 StringAppendF(&buffer, "\nTop N major page faults:\n%s\n", std::string(24, '-').c_str());
151 StringAppendF(&buffer,
152 "Android User ID, Package Name, Number of major page faults, "
153 "Percentage of total major page faults\n");
154 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800155 for (const auto& stat : data.topNMajorFaults) {
156 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", stat.userId,
157 stat.packageName.c_str(), stat.count,
158 percentage(stat.count, data.totalMajorFaults));
159 }
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800160 if (data.topNIoBlockedUids.size() > 0) {
161 StringAppendF(&buffer, "\nTop N I/O waiting UIDs:\n%s\n", std::string(23, '-').c_str());
162 StringAppendF(&buffer,
163 "Android User ID, Package Name, Number of owned tasks waiting for I/O, "
164 "Percentage of owned tasks waiting for I/O\n");
165 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800166 for (size_t i = 0; i < data.topNIoBlockedUids.size(); ++i) {
167 const auto& stat = data.topNIoBlockedUids[i];
168 StringAppendF(&buffer, "%" PRIu32 ", %s, %" PRIu64 ", %.2f%%\n", stat.userId,
169 stat.packageName.c_str(), stat.count,
170 percentage(stat.count, data.topNIoBlockedUidsTotalTaskCnt[i]));
171 }
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800172 return buffer;
173}
174
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800175std::string toString(const IoPerfRecord& record) {
176 std::string buffer;
177 StringAppendF(&buffer, "%s%s%s", toString(record.systemIoPerfData).c_str(),
178 toString(record.processIoPerfData).c_str(),
179 toString(record.uidIoPerfData).c_str());
180 return buffer;
181}
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800182
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800183std::string toString(const CollectionInfo& collectionInfo) {
184 std::string buffer;
185 StringAppendF(&buffer, "Number of collections: %zu\n", collectionInfo.records.size());
186 auto interval =
187 std::chrono::duration_cast<std::chrono::seconds>(collectionInfo.interval).count();
188 StringAppendF(&buffer, "Collection interval: %lld second%s\n", interval,
189 ((interval > 1) ? "s" : ""));
190 for (size_t i = 0; i < collectionInfo.records.size(); ++i) {
191 const auto& record = collectionInfo.records[i];
192 std::stringstream timestamp;
193 timestamp << std::put_time(std::localtime(&record.time), "%c %Z");
194 StringAppendF(&buffer, "Collection %zu: <%s>\n%s\n%s\n", i, timestamp.str().c_str(),
195 std::string(45, '=').c_str(), toString(record).c_str());
196 }
197 return buffer;
198}
199
200Result<void> IoPerfCollection::start() {
201 {
202 Mutex::Autolock lock(mMutex);
203 if (mCurrCollectionEvent != CollectionEvent::INIT || mCollectionThread.joinable()) {
204 return Error() << "Cannot start I/O performance collection more than once";
205 }
206
207 // TODO(b/148489461): Once |kTopNStatsPerCategory|, |kBoottimeCollectionInterval| and
208 // |kPeriodicCollectionInterval| constants are moved to read-only persistent properties,
209 // read and store them in the collection infos.
210
211 mBoottimeCollection = {
212 .interval = kBoottimeCollectionInterval,
213 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
214 .lastCollectionUptime = 0,
215 .records = {},
216 };
217 mPeriodicCollection = {
218 .interval = kPeriodicCollectionInterval,
219 .maxCacheSize = kPeriodicCollectionBufferSize,
220 .lastCollectionUptime = 0,
221 .records = {},
222 };
223 }
224
225 mCollectionThread = std::thread([&]() {
226 {
227 Mutex::Autolock lock(mMutex);
228 if (mCurrCollectionEvent != CollectionEvent::INIT) {
229 ALOGE("Skipping I/O performance data collection as the current collection event "
230 "%s != %s",
231 toString(mCurrCollectionEvent).c_str(),
232 toString(CollectionEvent::INIT).c_str());
233 return;
234 }
235 mCurrCollectionEvent = CollectionEvent::BOOT_TIME;
236 mBoottimeCollection.lastCollectionUptime = mHandlerLooper->now();
237 mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
238 mHandlerLooper->sendMessage(this, CollectionEvent::BOOT_TIME);
239 }
240 if (set_sched_policy(0, SP_BACKGROUND) != 0) {
241 ALOGW("Failed to set background scheduling priority to I/O performance data collection "
242 "thread");
243 }
244 bool isCollectionActive = true;
245 // Loop until the collection is not active -- I/O perf collection runs on this thread in a
246 // handler.
247 while (isCollectionActive) {
248 mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
249 Mutex::Autolock lock(mMutex);
250 isCollectionActive = mCurrCollectionEvent != CollectionEvent::TERMINATED;
251 }
252 });
253 return {};
254}
255
256void IoPerfCollection::terminate() {
257 {
258 Mutex::Autolock lock(mMutex);
259 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
260 ALOGE("I/O performance data collection was terminated already");
261 return;
262 }
263 ALOGE("Terminating I/O performance data collection");
264 mCurrCollectionEvent = CollectionEvent::TERMINATED;
265 }
266 if (mCollectionThread.joinable()) {
267 mHandlerLooper->removeMessages(this);
268 mHandlerLooper->wake();
269 mCollectionThread.join();
270 }
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800271}
272
273Result<void> IoPerfCollection::onBootFinished() {
274 Mutex::Autolock lock(mMutex);
275 if (mCurrCollectionEvent != CollectionEvent::BOOT_TIME) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800276 return Error() << "Current I/O performance data collection event "
277 << toString(mCurrCollectionEvent)
278 << " != " << toString(CollectionEvent::BOOT_TIME) << " collection event";
279 }
280 mHandlerLooper->removeMessages(this);
281 mCurrCollectionEvent = CollectionEvent::PERIODIC;
282 mPeriodicCollection.lastCollectionUptime = mHandlerLooper->now();
283 mHandlerLooper->sendMessage(this, CollectionEvent::PERIODIC);
284 return {};
285}
286
287status_t IoPerfCollection::dump(int fd, const Vector<String16>& /*args*/) {
288 Mutex::Autolock lock(mMutex);
289
290 // TODO(b/148489461): Parse the arguments and figure out whether to start/end custom collection
291 // or dump the boot-time/periodic collection records.
292
293 if (mCurrCollectionEvent == CollectionEvent::TERMINATED) {
294 ALOGW("I/O performance data collection not active. Dumping cached data");
295 if (!WriteStringToFd("I/O performance data collection not active. Dumping cached data.",
296 fd)) {
297 ALOGW("Failed to write I/O performance collection status");
298 return FAILED_TRANSACTION;
299 }
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800300 }
301
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800302 const auto& ret = dumpCollectorsStatusLocked(fd);
303 if (!ret) {
304 ALOGW("%s", ret.error().message().c_str());
305 return FAILED_TRANSACTION;
306 }
307
308 if (!WriteStringToFd(StringPrintf("%sI/O performance data reports:\n%sBoot-time collection "
309 "report:\n%s\n",
310 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str(),
311 std::string(28, '=').c_str()),
312 fd) ||
313 !WriteStringToFd(toString(mBoottimeCollection), fd) ||
314 !WriteStringToFd(StringPrintf("%s\nPeriodic collection report:\n%s\n",
315 std::string(75, '-').c_str(), std::string(27, '=').c_str()),
316 fd) ||
317 !WriteStringToFd(toString(mPeriodicCollection), fd) ||
318 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
319 ALOGE("Failed to dump the boot-time and periodic collection reports.");
320 return FAILED_TRANSACTION;
321 }
322
323 return OK;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800324}
325
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800326Result<void> IoPerfCollection::dumpCollectorsStatusLocked(int fd) {
327 if (!mUidIoStats->enabled() &&
328 !WriteStringToFd(StringPrintf("UidIoStats collector failed to access the file %s",
329 mUidIoStats->filePath().c_str()),
330 fd)) {
331 return Error() << "Failed to write UidIoStats collector status";
332 }
333 if (!mProcStat->enabled() &&
334 !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
335 mProcStat->filePath().c_str()),
336 fd)) {
337 return Error() << "Failed to write ProcStat collector status";
338 }
339 if (!mProcPidStat->enabled() &&
340 !WriteStringToFd(StringPrintf("ProcPidStat collector failed to access the directory %s",
341 mProcPidStat->dirPath().c_str()),
342 fd)) {
343 return Error() << "Failed to write ProcPidStat collector status";
344 }
345 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800346}
347
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800348Result<void> IoPerfCollection::startCustomCollectionLocked(std::chrono::nanoseconds interval,
349 std::chrono::nanoseconds maxDuration) {
350 if (interval < kMinCollectionInterval || maxDuration < kMinCollectionInterval) {
351 return Error() << "Collection interval and maximum duration must be >= "
352 << std::chrono::duration_cast<std::chrono::milliseconds>(
353 kMinCollectionInterval)
354 .count()
355 << " milliseconds.";
356 }
357
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800358 if (mCurrCollectionEvent != CollectionEvent::PERIODIC) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800359 return Error() << "Cannot start a custom collection when the current collection event "
360 << toString(mCurrCollectionEvent)
361 << " != " << toString(CollectionEvent::PERIODIC) << "collection event";
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800362 }
363
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800364 mCustomCollection = {
365 .interval = interval,
366 .maxCacheSize = std::numeric_limits<std::size_t>::max(),
367 .lastCollectionUptime = mHandlerLooper->now(),
368 .records = {},
369 };
370
371 mHandlerLooper->removeMessages(this);
372 nsecs_t uptime = mHandlerLooper->now() + maxDuration.count();
373 mHandlerLooper->sendMessageAtTime(uptime, this, SwitchEvent::END_CUSTOM_COLLECTION);
374 mCurrCollectionEvent = CollectionEvent::CUSTOM;
375 mHandlerLooper->sendMessage(this, CollectionEvent::CUSTOM);
376 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800377}
378
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800379Result<void> IoPerfCollection::endCustomCollectionLocked(int fd) {
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800380 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800381 return Error() << "No custom collection is running";
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800382 }
383
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800384 mHandlerLooper->removeMessages(this);
385 mHandlerLooper->sendMessage(this, SwitchEvent::END_CUSTOM_COLLECTION);
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800386
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800387 const auto& ret = dumpCollectorsStatusLocked(fd);
388 if (!ret) {
389 return ret;
390 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800391
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800392 if (!WriteStringToFd(StringPrintf("%sI/O performance data report for custom collection:\n%s",
393 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
394 fd) ||
395 !WriteStringToFd(toString(mCustomCollection), fd) ||
396 !WriteStringToFd(kDumpMajorDelimiter, fd)) {
397 return Error() << "Failed to write custom collection report.";
398 }
399
400 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800401}
402
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800403void IoPerfCollection::handleMessage(const Message& message) {
404 Result<void> result;
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800405
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800406 switch (message.what) {
407 case static_cast<int>(CollectionEvent::BOOT_TIME):
408 result = processCollectionEvent(CollectionEvent::BOOT_TIME, &mBoottimeCollection);
409 break;
410 case static_cast<int>(CollectionEvent::PERIODIC):
411 result = processCollectionEvent(CollectionEvent::PERIODIC, &mPeriodicCollection);
412 break;
413 case static_cast<int>(CollectionEvent::CUSTOM):
414 result = processCollectionEvent(CollectionEvent::CUSTOM, &mCustomCollection);
415 break;
416 case static_cast<int>(SwitchEvent::END_CUSTOM_COLLECTION): {
417 Mutex::Autolock lock(mMutex);
418 if (mCurrCollectionEvent != CollectionEvent::CUSTOM) {
419 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
420 toString(mCurrCollectionEvent).c_str(),
421 toString(CollectionEvent::CUSTOM).c_str());
422 return;
423 }
424 mCustomCollection = {};
425 mHandlerLooper->removeMessages(this);
426 mCurrCollectionEvent = CollectionEvent::PERIODIC;
427 mPeriodicCollection.lastCollectionUptime = mHandlerLooper->now();
428 mHandlerLooper->sendMessage(this, CollectionEvent::PERIODIC);
429 return;
430 }
431 default:
432 result = Error() << "Unknown message: " << message.what;
433 }
434
435 if (!result) {
436 Mutex::Autolock lock(mMutex);
437 ALOGE("Terminating I/O performance data collection: %s", result.error().message().c_str());
438 // DO NOT CALL terminate() as it tries to join the collection thread but this code is
439 // executed on the collection thread. Thus it will result in a deadlock.
440 mCurrCollectionEvent = CollectionEvent::TERMINATED;
441 mHandlerLooper->removeMessages(this);
442 mHandlerLooper->wake();
443 }
444}
445
446Result<void> IoPerfCollection::processCollectionEvent(CollectionEvent event, CollectionInfo* info) {
447 Mutex::Autolock lock(mMutex);
448 // Messages sent to the looper are intrinsically racy such that a message from the previous
449 // collection event may land in the looper after the current collection has already begun. Thus
450 // verify the current collection event before starting the collection.
451 if (mCurrCollectionEvent != event) {
452 ALOGW("Skipping %s collection message on collection event %s", toString(event).c_str(),
453 toString(mCurrCollectionEvent).c_str());
454 return {};
455 }
456 if (info->maxCacheSize == 0) {
457 return Error() << "Maximum cache size for " << toString(event) << " collection cannot be 0";
458 }
459 if (info->interval < kMinCollectionInterval) {
460 return Error()
461 << "Collection interval of "
462 << std::chrono::duration_cast<std::chrono::seconds>(info->interval).count()
463 << " seconds for " << toString(event) << " collection cannot be less than "
464 << std::chrono::duration_cast<std::chrono::seconds>(kMinCollectionInterval).count()
465 << " seconds";
466 }
467 auto ret = collectLocked(info);
468 if (!ret) {
469 return Error() << toString(event) << " collection failed: " << ret.error();
470 }
471 info->lastCollectionUptime += info->interval.count();
472 mHandlerLooper->sendMessageAtTime(info->lastCollectionUptime, this, event);
473 return {};
474}
475
476Result<void> IoPerfCollection::collectLocked(CollectionInfo* collectionInfo) {
477 if (!mUidIoStats->enabled() && !mProcStat->enabled() && !mProcPidStat->enabled()) {
478 return Error() << "No collectors enabled";
479 }
480 IoPerfRecord record{
481 .time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()),
482 };
483 auto ret = collectSystemIoPerfDataLocked(&record.systemIoPerfData);
484 if (!ret) {
485 return ret;
486 }
487 ret = collectProcessIoPerfDataLocked(&record.processIoPerfData);
488 if (!ret) {
489 return ret;
490 }
491 ret = collectUidIoPerfDataLocked(&record.uidIoPerfData);
492 if (!ret) {
493 return ret;
494 }
495 if (collectionInfo->records.size() > collectionInfo->maxCacheSize) {
496 collectionInfo->records.erase(collectionInfo->records.begin()); // Erase the oldest record.
497 }
498 collectionInfo->records.emplace_back(record);
499 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800500}
501
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800502Result<void> IoPerfCollection::collectUidIoPerfDataLocked(UidIoPerfData* uidIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800503 if (!mUidIoStats->enabled()) {
504 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
505 // collectors.
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800506 return {};
507 }
508
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800509 const Result<std::unordered_map<uint32_t, UidIoUsage>>& usage = mUidIoStats->collect();
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800510 if (!usage) {
511 return Error() << "Failed to collect uid I/O usage: " << usage.error();
512 }
513
514 // Fetch only the top N reads and writes from the usage records.
515 UidIoUsage tempUsage = {};
516 std::vector<const UidIoUsage*> topNReads(mTopNStatsPerCategory, &tempUsage);
517 std::vector<const UidIoUsage*> topNWrites(mTopNStatsPerCategory, &tempUsage);
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800518 std::unordered_set<uint32_t> unmappedUids;
519
520 for (const auto& uIt : *usage) {
521 const UidIoUsage& curUsage = uIt.second;
522 if (curUsage.ios.isZero()) {
523 continue;
524 }
525 if (mUidToPackageNameMapping.find(curUsage.uid) == mUidToPackageNameMapping.end()) {
526 unmappedUids.insert(curUsage.uid);
527 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800528 uidIoPerfData->total[READ_BYTES][FOREGROUND] +=
529 curUsage.ios.metrics[READ_BYTES][FOREGROUND];
530 uidIoPerfData->total[READ_BYTES][BACKGROUND] +=
531 curUsage.ios.metrics[READ_BYTES][BACKGROUND];
532 uidIoPerfData->total[WRITE_BYTES][FOREGROUND] +=
533 curUsage.ios.metrics[WRITE_BYTES][FOREGROUND];
534 uidIoPerfData->total[WRITE_BYTES][BACKGROUND] +=
535 curUsage.ios.metrics[WRITE_BYTES][BACKGROUND];
536 uidIoPerfData->total[FSYNC_COUNT][FOREGROUND] +=
537 curUsage.ios.metrics[FSYNC_COUNT][FOREGROUND];
538 uidIoPerfData->total[FSYNC_COUNT][BACKGROUND] +=
539 curUsage.ios.metrics[FSYNC_COUNT][BACKGROUND];
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800540
541 for (auto it = topNReads.begin(); it != topNReads.end(); ++it) {
542 const UidIoUsage* curRead = *it;
543 if (curRead->ios.sumReadBytes() > curUsage.ios.sumReadBytes()) {
544 continue;
545 }
546 topNReads.erase(topNReads.end() - 1);
547 topNReads.emplace(it, &curUsage);
548 break;
549 }
550 for (auto it = topNWrites.begin(); it != topNWrites.end(); ++it) {
551 const UidIoUsage* curWrite = *it;
552 if (curWrite->ios.sumWriteBytes() > curUsage.ios.sumWriteBytes()) {
553 continue;
554 }
555 topNWrites.erase(topNWrites.end() - 1);
556 topNWrites.emplace(it, &curUsage);
557 break;
558 }
559 }
560
561 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
562 if (!ret) {
563 ALOGW("%s", ret.error().message().c_str());
564 }
565
566 // Convert the top N I/O usage to UidIoPerfData.
567 for (const auto& usage : topNReads) {
568 if (usage->ios.isZero()) {
569 // End of non-zero usage records. This case occurs when the number of UIDs with active
570 // I/O operations is < |kTopNStatsPerCategory|.
571 break;
572 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800573 UidIoPerfData::Stats stats = {
574 .userId = multiuser_get_user_id(usage->uid),
575 .packageName = std::to_string(usage->uid),
576 .bytes = {usage->ios.metrics[READ_BYTES][FOREGROUND],
577 usage->ios.metrics[READ_BYTES][BACKGROUND]},
578 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
579 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800580 };
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800581 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
582 stats.packageName = mUidToPackageNameMapping[usage->uid];
583 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800584 uidIoPerfData->topNReads.emplace_back(stats);
585 }
586
587 for (const auto& usage : topNWrites) {
588 if (usage->ios.isZero()) {
589 // End of non-zero usage records. This case occurs when the number of UIDs with active
590 // I/O operations is < |kTopNStatsPerCategory|.
591 break;
592 }
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800593 UidIoPerfData::Stats stats = {
594 .userId = multiuser_get_user_id(usage->uid),
595 .packageName = std::to_string(usage->uid),
596 .bytes = {usage->ios.metrics[WRITE_BYTES][FOREGROUND],
597 usage->ios.metrics[WRITE_BYTES][BACKGROUND]},
598 .fsync = {usage->ios.metrics[FSYNC_COUNT][FOREGROUND],
599 usage->ios.metrics[FSYNC_COUNT][BACKGROUND]},
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800600 };
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800601 if (mUidToPackageNameMapping.find(usage->uid) != mUidToPackageNameMapping.end()) {
602 stats.packageName = mUidToPackageNameMapping[usage->uid];
603 }
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800604 uidIoPerfData->topNWrites.emplace_back(stats);
605 }
606 return {};
607}
608
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800609Result<void> IoPerfCollection::collectSystemIoPerfDataLocked(SystemIoPerfData* systemIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800610 if (!mProcStat->enabled()) {
611 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
612 // collectors.
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800613 return {};
614 }
615
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800616 const Result<ProcStatInfo>& procStatInfo = mProcStat->collect();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800617 if (!procStatInfo) {
618 return Error() << "Failed to collect proc stats: " << procStatInfo.error();
619 }
620
621 systemIoPerfData->cpuIoWaitTime = procStatInfo->cpuStats.ioWaitTime;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800622 systemIoPerfData->totalCpuTime = procStatInfo->totalCpuTime();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800623 systemIoPerfData->ioBlockedProcessesCnt = procStatInfo->ioBlockedProcessesCnt;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800624 systemIoPerfData->totalProcessesCnt = procStatInfo->totalProcessesCnt();
Lakshman Annadoraif9b47c22020-02-10 16:45:18 -0800625 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800626}
627
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800628Result<void> IoPerfCollection::collectProcessIoPerfDataLocked(
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800629 ProcessIoPerfData* processIoPerfData) {
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800630 if (!mProcPidStat->enabled()) {
631 // Don't return an error to avoid pre-mature termination. Instead, fetch data from other
632 // collectors.
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800633 return {};
634 }
635
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800636 const Result<std::vector<ProcessStats>>& processStats = mProcPidStat->collect();
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800637 if (!processStats) {
638 return Error() << "Failed to collect process stats: " << processStats.error();
639 }
640
641 const auto& uidProcessStats = getUidProcessStats(*processStats);
642
643 std::unordered_set<uint32_t> unmappedUids;
644 // Fetch only the top N I/O blocked UIDs and UIDs with most major page faults.
645 UidProcessStats temp = {};
646 std::vector<const UidProcessStats*> topNIoBlockedUids(mTopNStatsPerCategory, &temp);
647 std::vector<const UidProcessStats*> topNMajorFaults(mTopNStatsPerCategory, &temp);
648 processIoPerfData->totalMajorFaults = 0;
649 for (const auto& it : uidProcessStats) {
650 const UidProcessStats& curStats = it.second;
651 if (mUidToPackageNameMapping.find(curStats.uid) == mUidToPackageNameMapping.end()) {
652 unmappedUids.insert(curStats.uid);
653 }
654 processIoPerfData->totalMajorFaults += curStats.majorFaults;
655 for (auto it = topNIoBlockedUids.begin(); it != topNIoBlockedUids.end(); ++it) {
656 const UidProcessStats* topStats = *it;
657 if (topStats->ioBlockedTasksCnt > curStats.ioBlockedTasksCnt) {
658 continue;
659 }
660 topNIoBlockedUids.erase(topNIoBlockedUids.end() - 1);
661 topNIoBlockedUids.emplace(it, &curStats);
662 break;
663 }
664 for (auto it = topNMajorFaults.begin(); it != topNMajorFaults.end(); ++it) {
665 const UidProcessStats* topStats = *it;
666 if (topStats->majorFaults > curStats.majorFaults) {
667 continue;
668 }
669 topNMajorFaults.erase(topNMajorFaults.end() - 1);
670 topNMajorFaults.emplace(it, &curStats);
671 break;
672 }
673 }
674
675 const auto& ret = updateUidToPackageNameMapping(unmappedUids);
676 if (!ret) {
677 ALOGW("%s", ret.error().message().c_str());
678 }
679
680 // Convert the top N uid process stats to ProcessIoPerfData.
681 for (const auto& it : topNIoBlockedUids) {
682 if (it->ioBlockedTasksCnt == 0) {
683 // End of non-zero elements. This case occurs when the number of UIDs with I/O blocked
684 // processes is < |kTopNStatsPerCategory|.
685 break;
686 }
687 ProcessIoPerfData::Stats stats = {
688 .userId = multiuser_get_user_id(it->uid),
689 .packageName = std::to_string(it->uid),
690 .count = it->ioBlockedTasksCnt,
691 };
692 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
693 stats.packageName = mUidToPackageNameMapping[it->uid];
694 }
695 processIoPerfData->topNIoBlockedUids.emplace_back(stats);
696 processIoPerfData->topNIoBlockedUidsTotalTaskCnt.emplace_back(it->totalTasksCnt);
697 }
698 for (const auto& it : topNMajorFaults) {
699 if (it->majorFaults == 0) {
700 // End of non-zero elements. This case occurs when the number of UIDs with major faults
701 // is < |kTopNStatsPerCategory|.
702 break;
703 }
704 ProcessIoPerfData::Stats stats = {
705 .userId = multiuser_get_user_id(it->uid),
706 .packageName = std::to_string(it->uid),
707 .count = it->majorFaults,
708 };
709 if (mUidToPackageNameMapping.find(it->uid) != mUidToPackageNameMapping.end()) {
710 stats.packageName = mUidToPackageNameMapping[it->uid];
711 }
712 processIoPerfData->topNMajorFaults.emplace_back(stats);
713 }
714 if (mLastMajorFaults == 0) {
715 processIoPerfData->majorFaultsPercentChange = 0;
716 } else {
717 int64_t increase = processIoPerfData->totalMajorFaults - mLastMajorFaults;
718 processIoPerfData->majorFaultsPercentChange =
Lakshman Annadoraif2855b22020-03-03 14:13:10 -0800719 (static_cast<double>(increase) / static_cast<double>(mLastMajorFaults)) * 100.0;
Lakshman Annadorai325e9652020-02-20 17:27:11 -0800720 }
721 mLastMajorFaults = processIoPerfData->totalMajorFaults;
722 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800723}
724
Lakshman Annadoraiab4d3fd2020-02-06 11:24:56 -0800725Result<void> IoPerfCollection::updateUidToPackageNameMapping(
726 const std::unordered_set<uint32_t>& uids) {
727 std::vector<int32_t> appUids;
728
729 for (const auto& uid : uids) {
730 if (uid >= AID_APP_START) {
731 appUids.emplace_back(static_cast<int32_t>(uid));
732 continue;
733 }
734 // System/native UIDs.
735 passwd* usrpwd = getpwuid(uid);
736 if (!usrpwd) {
737 continue;
738 }
739 mUidToPackageNameMapping[uid] = std::string(usrpwd->pw_name);
740 }
741
742 if (appUids.empty()) {
743 return {};
744 }
745
746 if (mPackageManager == nullptr) {
747 auto ret = retrievePackageManager();
748 if (!ret) {
749 return Error() << "Failed to retrieve package manager: " << ret.error();
750 }
751 }
752
753 std::vector<std::string> packageNames;
754 const binder::Status& status = mPackageManager->getNamesForUids(appUids, &packageNames);
755 if (!status.isOk()) {
756 return Error() << "package_native::getNamesForUids failed: " << status.exceptionMessage();
757 }
758
759 for (uint32_t i = 0; i < appUids.size(); i++) {
760 if (!packageNames[i].empty()) {
761 mUidToPackageNameMapping[appUids[i]] = packageNames[i];
762 }
763 }
764
765 return {};
766}
767
768Result<void> IoPerfCollection::retrievePackageManager() {
769 const sp<IServiceManager> sm = defaultServiceManager();
770 if (sm == nullptr) {
771 return Error() << "Failed to retrieve defaultServiceManager";
772 }
773
774 sp<IBinder> binder = sm->getService(String16("package_native"));
775 if (binder == nullptr) {
776 return Error() << "Failed to get service package_native";
777 }
778 mPackageManager = interface_cast<IPackageManagerNative>(binder);
779 return {};
Lakshman Annadorai6094e9a2020-01-31 10:03:33 -0800780}
781
782} // namespace watchdog
783} // namespace automotive
784} // namespace android