Lakshman Annadorai | 2fce001 | 2020-02-18 14:05:22 -0800 | [diff] [blame] | 1 | /** |
| 2 | * Copyright (c) 2020, The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #define LOG_TAG "carwatchdogd" |
| 18 | |
| 19 | #include "ProcPidStat.h" |
| 20 | |
| 21 | #include <android-base/file.h> |
| 22 | #include <android-base/parseint.h> |
| 23 | #include <android-base/strings.h> |
| 24 | #include <dirent.h> |
| 25 | #include <log/log.h> |
| 26 | |
| 27 | #include <string> |
| 28 | #include <unordered_map> |
| 29 | #include <vector> |
| 30 | |
| 31 | namespace android { |
| 32 | namespace automotive { |
| 33 | namespace watchdog { |
| 34 | |
| 35 | using android::base::EndsWith; |
| 36 | using android::base::Error; |
| 37 | using android::base::ParseInt; |
| 38 | using android::base::ParseUint; |
| 39 | using android::base::ReadFileToString; |
| 40 | using android::base::Result; |
| 41 | using android::base::Split; |
| 42 | |
| 43 | namespace { |
| 44 | |
// Error codes attached to the Result errors returned by the procfs readers in this file.
// Kept as an unscoped enum because the values are passed to Error() and compared against
// error().code() as plain integers.
enum ReadError {
    // The file was read but its contents did not have the expected format.
    ERR_INVALID_FILE,
    // The file could not be opened or read.
    ERR_FILE_OPEN_READ,
    // Count of the error codes listed above.
    NUM_ERRORS,
};
| 50 | |
| 51 | // /proc/PID/stat or /proc/PID/task/TID/stat format: |
| 52 | // <pid> <comm> <state> <ppid> <pgrp ID> <session ID> <tty_nr> <tpgid> <flags> <minor faults> |
| 53 | // <children minor faults> <major faults> <children major faults> <user mode time> |
| 54 | // <system mode time> <children user mode time> <children kernel mode time> <priority> <nice value> |
| 55 | // <num threads> <start time since boot> <virtual memory size> <resident set size> <rss soft limit> |
| 56 | // <start code addr> <end code addr> <start stack addr> <ESP value> <EIP> <bitmap of pending sigs> |
| 57 | // <bitmap of blocked sigs> <bitmap of ignored sigs> <waiting channel> <num pages swapped> |
| 58 | // <cumulative pages swapped> <exit signal> <processor #> <real-time prio> <agg block I/O delays> |
| 59 | // <guest time> <children guest time> <start data addr> <end data addr> <start break addr> |
| 60 | // <cmd line args start addr> <amd line args end addr> <env start addr> <env end addr> <exit code> |
| 61 | // Example line: 1 (init) S 0 0 0 0 0 0 0 0 220 0 0 0 0 0 0 0 2 0 0 ...etc... |
| 62 | bool parsePidStatLine(const std::string& line, PidStat* pidStat) { |
| 63 | std::vector<std::string> fields = Split(line, " "); |
| 64 | |
| 65 | // Note: Regex parsing for the below logic increased the time taken to run the |
| 66 | // ProcPidStatTest#TestProcPidStatContentsFromDevice from 151.7ms to 1.3 seconds. |
| 67 | |
| 68 | // Comm string is enclosed with ( ) brackets and may contain space(s). Thus calculate the |
| 69 | // commEndOffset based on the field that contains the closing bracket. |
| 70 | size_t commEndOffset = 0; |
| 71 | for (size_t i = 1; i < fields.size(); ++i) { |
| 72 | pidStat->comm += fields[i]; |
| 73 | if (EndsWith(fields[i], ")")) { |
| 74 | commEndOffset = i - 1; |
| 75 | break; |
| 76 | } |
| 77 | pidStat->comm += " "; |
| 78 | } |
| 79 | |
| 80 | if (pidStat->comm.front() != '(' || pidStat->comm.back() != ')') { |
| 81 | ALOGW("Comm string `%s` not enclosed in brackets", pidStat->comm.c_str()); |
| 82 | return false; |
| 83 | } |
| 84 | pidStat->comm.erase(pidStat->comm.begin()); |
| 85 | pidStat->comm.erase(pidStat->comm.end() - 1); |
| 86 | |
| 87 | // The required data is in the first 22 + |commEndOffset| fields so make sure there are at least |
| 88 | // these many fields in the file. |
| 89 | if (fields.size() < 22 + commEndOffset || !ParseUint(fields[0], &pidStat->pid) || |
| 90 | !ParseUint(fields[3 + commEndOffset], &pidStat->ppid) || |
| 91 | !ParseUint(fields[11 + commEndOffset], &pidStat->majorFaults) || |
| 92 | !ParseUint(fields[19 + commEndOffset], &pidStat->numThreads) || |
| 93 | !ParseUint(fields[21 + commEndOffset], &pidStat->startTime)) { |
| 94 | ALOGW("Invalid proc pid stat contents: \"%s\"", line.c_str()); |
| 95 | return false; |
| 96 | } |
| 97 | pidStat->state = fields[2 + commEndOffset]; |
| 98 | return true; |
| 99 | } |
| 100 | |
| 101 | Result<void> readPidStatFile(const std::string& path, PidStat* pidStat) { |
| 102 | std::string buffer; |
| 103 | if (!ReadFileToString(path, &buffer)) { |
| 104 | return Error(ERR_FILE_OPEN_READ) << "ReadFileToString failed for " << path; |
| 105 | } |
| 106 | std::vector<std::string> lines = Split(std::move(buffer), "\n"); |
| 107 | if (lines.size() != 1 && (lines.size() != 2 || !lines[1].empty())) { |
| 108 | return Error(ERR_INVALID_FILE) << path << " contains " << lines.size() << " lines != 1"; |
| 109 | } |
| 110 | if (!parsePidStatLine(std::move(lines[0]), pidStat)) { |
| 111 | return Error(ERR_INVALID_FILE) << "Failed to parse the contents of " << path; |
| 112 | } |
| 113 | return {}; |
| 114 | } |
| 115 | |
| 116 | } // namespace |
| 117 | |
| 118 | Result<std::vector<ProcessStats>> ProcPidStat::collect() { |
| 119 | if (!mEnabled) { |
| 120 | return Error() << "Can not access PID stat files under " << kProcDirPath; |
| 121 | } |
| 122 | |
| 123 | Mutex::Autolock lock(mMutex); |
| 124 | const auto& processStats = getProcessStatsLocked(); |
| 125 | if (!processStats) { |
| 126 | return Error() << processStats.error(); |
| 127 | } |
| 128 | |
| 129 | std::vector<ProcessStats> delta; |
| 130 | for (const auto& it : *processStats) { |
| 131 | const ProcessStats& curStats = it.second; |
| 132 | const auto& cachedIt = mLastProcessStats.find(it.first); |
| 133 | if (cachedIt == mLastProcessStats.end() || |
| 134 | cachedIt->second.process.startTime != curStats.process.startTime) { |
| 135 | // New/reused PID so don't calculate the delta. |
| 136 | delta.emplace_back(curStats); |
| 137 | continue; |
| 138 | } |
| 139 | |
| 140 | ProcessStats deltaStats = curStats; |
| 141 | const ProcessStats& cachedStats = cachedIt->second; |
| 142 | deltaStats.process.majorFaults -= cachedStats.process.majorFaults; |
| 143 | for (auto& deltaThread : deltaStats.threads) { |
| 144 | const auto& cachedThread = cachedStats.threads.find(deltaThread.first); |
| 145 | if (cachedThread == cachedStats.threads.end() || |
| 146 | cachedThread->second.startTime != deltaThread.second.startTime) { |
| 147 | // New TID or TID reused by the same PID so don't calculate the delta. |
| 148 | continue; |
| 149 | } |
| 150 | deltaThread.second.majorFaults -= cachedThread->second.majorFaults; |
| 151 | } |
| 152 | delta.emplace_back(deltaStats); |
| 153 | } |
| 154 | mLastProcessStats = *processStats; |
| 155 | return delta; |
| 156 | } |
| 157 | |
| 158 | Result<std::unordered_map<uint32_t, ProcessStats>> ProcPidStat::getProcessStatsLocked() const { |
| 159 | std::unordered_map<uint32_t, ProcessStats> processStats; |
| 160 | auto procDirp = std::unique_ptr<DIR, int (*)(DIR*)>(opendir(mPath.c_str()), closedir); |
| 161 | if (!procDirp) { |
| 162 | return Error() << "Failed to open " << mPath << " directory"; |
| 163 | } |
| 164 | dirent* pidDir = nullptr; |
| 165 | while ((pidDir = readdir(procDirp.get())) != nullptr) { |
| 166 | // 1. Read top-level pid stats. |
| 167 | uint32_t pid = 0; |
| 168 | if (pidDir->d_type != DT_DIR || !ParseUint(pidDir->d_name, &pid)) { |
| 169 | continue; |
| 170 | } |
| 171 | ProcessStats curStats; |
| 172 | std::string path = StringPrintf((mPath + kStatFileFormat).c_str(), pid); |
| 173 | const auto& ret = readPidStatFile(path, &curStats.process); |
| 174 | if (!ret) { |
| 175 | // PID may disappear between scanning the directory and parsing the stat file. |
| 176 | // Thus treat ERR_FILE_OPEN_READ errors as soft errors. |
| 177 | if (ret.error().code() != ERR_FILE_OPEN_READ) { |
| 178 | return Error() << "Failed to read top-level per-process stat file: " |
| 179 | << ret.error().message().c_str(); |
| 180 | } |
| 181 | ALOGW("Failed to read top-level per-process stat file %s: %s", path.c_str(), |
| 182 | ret.error().message().c_str()); |
| 183 | continue; |
| 184 | } |
| 185 | |
| 186 | // 2. When not found in the cache, fetch tgid/UID as soon as possible because processes |
| 187 | // may terminate during scanning. |
| 188 | const auto& it = mLastProcessStats.find(curStats.process.pid); |
| 189 | if (it == mLastProcessStats.end() || |
| 190 | it->second.process.startTime != curStats.process.startTime || it->second.tgid == -1 || |
| 191 | it->second.uid == -1) { |
| 192 | const auto& ret = getPidStatusLocked(&curStats); |
| 193 | if (!ret) { |
| 194 | if (ret.error().code() != ERR_FILE_OPEN_READ) { |
| 195 | return Error() << "Failed to read pid status for pid " << curStats.process.pid |
| 196 | << ": " << ret.error().message().c_str(); |
| 197 | } |
| 198 | ALOGW("Failed to read pid status for pid %" PRIu32 ": %s", curStats.process.pid, |
| 199 | ret.error().message().c_str()); |
| 200 | // Default tgid and uid values are -1 (aka unknown). |
| 201 | } |
| 202 | } else { |
| 203 | // Fetch from cache. |
| 204 | curStats.tgid = it->second.tgid; |
| 205 | curStats.uid = it->second.uid; |
| 206 | } |
| 207 | |
| 208 | if (curStats.tgid != -1 && curStats.tgid != curStats.process.pid) { |
| 209 | ALOGW("Skipping non-process (i.e., Tgid != PID) entry for PID %" PRIu32, |
| 210 | curStats.process.pid); |
| 211 | continue; |
| 212 | } |
| 213 | |
| 214 | // 3. Fetch per-thread stats. |
| 215 | std::string taskDir = StringPrintf((mPath + kTaskDirFormat).c_str(), pid); |
| 216 | auto taskDirp = std::unique_ptr<DIR, int (*)(DIR*)>(opendir(taskDir.c_str()), closedir); |
| 217 | if (!taskDirp) { |
| 218 | // Treat this as a soft error so at least the process stats will be collected. |
| 219 | ALOGW("Failed to open %s directory", taskDir.c_str()); |
| 220 | } |
| 221 | dirent* tidDir = nullptr; |
| 222 | bool didReadMainThread = false; |
| 223 | while (taskDirp != nullptr && (tidDir = readdir(taskDirp.get())) != nullptr) { |
| 224 | uint32_t tid = 0; |
| 225 | if (tidDir->d_type != DT_DIR || !ParseUint(tidDir->d_name, &tid)) { |
| 226 | continue; |
| 227 | } |
| 228 | if (processStats.find(tid) != processStats.end()) { |
| 229 | return Error() << "Process stats already exists for TID " << tid |
| 230 | << ". Stats will be double counted"; |
| 231 | } |
| 232 | |
| 233 | PidStat curThreadStat = {}; |
| 234 | path = StringPrintf((taskDir + kStatFileFormat).c_str(), tid); |
| 235 | const auto& ret = readPidStatFile(path, &curThreadStat); |
| 236 | if (!ret) { |
| 237 | if (ret.error().code() != ERR_FILE_OPEN_READ) { |
| 238 | return Error() << "Failed to read per-thread stat file: " |
| 239 | << ret.error().message().c_str(); |
| 240 | } |
| 241 | // Maybe the thread terminated before reading the file so skip this thread and |
| 242 | // continue with scanning the next thread's stat. |
| 243 | ALOGW("Failed to read per-thread stat file %s: %s", path.c_str(), |
| 244 | ret.error().message().c_str()); |
| 245 | continue; |
| 246 | } |
| 247 | if (curThreadStat.pid == curStats.process.pid) { |
| 248 | didReadMainThread = true; |
| 249 | } |
| 250 | curStats.threads[curThreadStat.pid] = curThreadStat; |
| 251 | } |
| 252 | if (!didReadMainThread) { |
| 253 | // In the event of failure to read main-thread info (mostly because the process |
| 254 | // terminated during scanning/parsing), fill out the stat that are common between main |
| 255 | // thread and the process. |
| 256 | curStats.threads[curStats.process.pid] = PidStat{ |
| 257 | .pid = curStats.process.pid, |
| 258 | .comm = curStats.process.comm, |
| 259 | .state = curStats.process.state, |
| 260 | .ppid = curStats.process.ppid, |
| 261 | .numThreads = curStats.process.numThreads, |
| 262 | .startTime = curStats.process.startTime, |
| 263 | }; |
| 264 | } |
| 265 | processStats[curStats.process.pid] = curStats; |
| 266 | } |
| 267 | return processStats; |
| 268 | } |
| 269 | |
| 270 | Result<void> ProcPidStat::getPidStatusLocked(ProcessStats* processStats) const { |
| 271 | std::string buffer; |
| 272 | std::string path = StringPrintf((mPath + kStatusFileFormat).c_str(), processStats->process.pid); |
| 273 | if (!ReadFileToString(path, &buffer)) { |
| 274 | return Error(ERR_FILE_OPEN_READ) << "ReadFileToString failed for " << path; |
| 275 | } |
| 276 | std::vector<std::string> lines = Split(std::move(buffer), "\n"); |
| 277 | bool didReadUid = false; |
| 278 | bool didReadTgid = false; |
| 279 | for (size_t i = 0; i < lines.size(); ++i) { |
| 280 | if (lines[i].empty()) { |
| 281 | continue; |
| 282 | } |
| 283 | if (!lines[i].compare(0, 4, "Uid:")) { |
| 284 | if (didReadUid) { |
| 285 | return Error(ERR_INVALID_FILE) |
| 286 | << "Duplicate UID line: \"" << lines[i] << "\" in file " << path; |
| 287 | } |
| 288 | std::vector<std::string> fields = Split(lines[i], "\t"); |
| 289 | if (fields.size() < 2 || !ParseInt(fields[1], &processStats->uid)) { |
| 290 | return Error(ERR_INVALID_FILE) |
| 291 | << "Invalid UID line: \"" << lines[i] << "\" in file " << path; |
| 292 | } |
| 293 | didReadUid = true; |
| 294 | } else if (!lines[i].compare(0, 5, "Tgid:")) { |
| 295 | if (didReadTgid) { |
| 296 | return Error(ERR_INVALID_FILE) |
| 297 | << "Duplicate Tgid line: \"" << lines[i] << "\" in file" << path; |
| 298 | } |
| 299 | std::vector<std::string> fields = Split(lines[i], "\t"); |
| 300 | if (fields.size() != 2 || !ParseInt(fields[1], &processStats->tgid)) { |
| 301 | return Error(ERR_INVALID_FILE) |
| 302 | << "Invalid tgid line: \"" << lines[i] << "\" in file" << path; |
| 303 | } |
| 304 | didReadTgid = true; |
| 305 | } |
| 306 | } |
| 307 | if (!didReadUid || !didReadTgid) { |
| 308 | return Error(ERR_INVALID_FILE) << "Incomplete file " << mPath + kStatusFileFormat; |
| 309 | } |
| 310 | return {}; |
| 311 | } |
| 312 | |
| 313 | } // namespace watchdog |
| 314 | } // namespace automotive |
| 315 | } // namespace android |