blob: 749ed30136d7e10365de2d7817703c9776d1c019 [file] [log] [blame]
/**
* Copyright (c) 2020, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "carwatchdogd"
#define DEBUG true // TODO(b/151474489): stop ship if true.
#include "WatchdogProcessService.h"
#include <android-base/chrono_utils.h>
#include <android-base/file.h>
#include <android-base/properties.h>
#include <android-base/stringprintf.h>
#include <binder/IPCThreadState.h>
namespace android {
namespace automotive {
namespace watchdog {
using std::literals::chrono_literals::operator""s;
using android::base::Error;
using android::base::GetProperty;
using android::base::Result;
using android::base::StringAppendF;
using android::base::StringPrintf;
using android::base::WriteStringToFd;
using android::binder::Status;
namespace {
// Every timeout class handled by this service. The constructor creates one client list and one
// pinged-client map per entry, and the lookup/unregister helpers iterate over this list when a
// client may be registered under any timeout.
const std::vector<TimeoutLength> kTimeouts = {TimeoutLength::TIMEOUT_CRITICAL,
TimeoutLength::TIMEOUT_MODERATE,
TimeoutLength::TIMEOUT_NORMAL};
/**
 * Maps a timeout class to the delay used between two health checks of that class.
 *
 * @param timeout Timeout class to convert.
 * @return The delay, including the buffer time noted on each case below.
 */
std::chrono::nanoseconds timeoutToDurationNs(const TimeoutLength& timeout) {
    // No `default:` label on purpose, so the compiler can still warn when a new enumerator is
    // added without a matching case.
    switch (timeout) {
        case TimeoutLength::TIMEOUT_CRITICAL:
            return 3s;  // 3s and no buffer time.
        case TimeoutLength::TIMEOUT_MODERATE:
            return 6s;  // 5s + 1s as buffer time.
        case TimeoutLength::TIMEOUT_NORMAL:
            return 12s;  // 10s + 2s as buffer time.
    }
    // A value outside the enumerators (e.g. produced by a cast from int) used to flow off the end
    // of this non-void function, which is undefined behavior. Fall back to the most lenient
    // timeout so that such a value can never cause clients to be checked more aggressively.
    return 12s;
}
/**
 * Formats a list of process IDs as a comma-separated string, e.g. "12, 345, 6789".
 *
 * @param pids Process IDs to format.
 * @return The joined string, or an empty string when pids is empty.
 */
std::string pidArrayToString(const std::vector<int32_t>& pids) {
    std::string buffer;
    // The separator is empty for the first element only; iterating by range avoids the
    // signed/unsigned comparison of an int index against size().
    const char* separator = "";
    for (const int32_t pid : pids) {
        buffer += separator;
        buffer += std::to_string(pid);
        separator = ", ";
    }
    return buffer;
}
/**
 * Reports whether the "sys.powerctl" property requests a reboot or a shutdown.
 *
 * Only the part of the property value before the first ',' is compared.
 */
bool isSystemShuttingDown() {
    const std::string powerCtl = GetProperty("sys.powerctl", "");
    // find() yields npos when there is no ',', in which case substr() keeps the whole value.
    const std::string command = powerCtl.substr(0, powerCtl.find(','));
    return command == "reboot" || command == "shutdown";
}
} // namespace
/**
 * Creates the service.
 *
 * The watchdog starts enabled, and an empty client list and an empty pinged-client map are
 * created for every timeout in kTimeouts.
 *
 * @param handlerLooper Looper on which health-check messages are posted.
 */
WatchdogProcessService::WatchdogProcessService(const sp<Looper>& handlerLooper) :
      mHandlerLooper(handlerLooper), mLastSessionId(0) {
    mMessageHandler = new MessageHandlerImpl(this);
    mWatchdogEnabled = true;
    for (const TimeoutLength timeout : kTimeouts) {
        mClients.emplace(timeout, std::vector<ClientInfo>());
        mPingedClients.emplace(timeout, PingedClientMap());
    }
}
// Registers a regular client under the given timeout class.
// Acquires mMutex and forwards to registerClientLocked() with ClientType::Regular; the returned
// Status is whatever registerClientLocked() produces.
Status WatchdogProcessService::registerClient(const sp<ICarWatchdogClient>& client,
TimeoutLength timeout) {
Mutex::Autolock lock(mMutex);
return registerClientLocked(client, timeout, ClientType::Regular);
}
// Unregisters a regular client.
// Acquires mMutex and forwards to unregisterClientLocked() with ClientType::Regular.
Status WatchdogProcessService::unregisterClient(const sp<ICarWatchdogClient>& client) {
Mutex::Autolock lock(mMutex);
sp<IBinder> binder = BnCarWatchdog::asBinder(client);
// kTimeouts lists every timeout class (CRITICAL, MODERATE, NORMAL), so the client is searched
// for under all of them.
return unregisterClientLocked(kTimeouts, binder, ClientType::Regular);
}
// Registers a mediator.
// Acquires mMutex and forwards to registerClientLocked() with ClientType::Mediator.
Status WatchdogProcessService::registerMediator(const sp<ICarWatchdogClient>& mediator) {
Mutex::Autolock lock(mMutex);
// Mediator's timeout is always TIMEOUT_CRITICAL.
return registerClientLocked(mediator, TimeoutLength::TIMEOUT_CRITICAL, ClientType::Mediator);
}
// Unregisters a mediator.
// Only the TIMEOUT_CRITICAL list is searched, because registerMediator() always registers
// mediators under that timeout. Acquires mMutex before touching the client lists.
Status WatchdogProcessService::unregisterMediator(const sp<ICarWatchdogClient>& mediator) {
std::vector<TimeoutLength> timeouts = {TimeoutLength::TIMEOUT_CRITICAL};
sp<IBinder> binder = BnCarWatchdog::asBinder(mediator);
Mutex::Autolock lock(mMutex);
return unregisterClientLocked(timeouts, binder, ClientType::Mediator);
}
// Registers the monitor that is asked to dump and kill non-responding processes (see
// dumpAndKillAllProcesses()).
// Returns Status::ok() when the monitor is stored or was already the registered one, and
// EX_ILLEGAL_STATE when linkToDeath() on the monitor's binder fails.
Status WatchdogProcessService::registerMonitor(const sp<ICarWatchdogMonitor>& monitor) {
Mutex::Autolock lock(mMutex);
sp<IBinder> binder = BnCarWatchdog::asBinder(monitor);
// Same underlying binder as the current monitor: nothing to do.
if (mMonitor != nullptr && binder == BnCarWatchdog::asBinder(mMonitor)) {
return Status::ok();
}
// Link this service as the death recipient before storing the monitor (see binderDied()).
status_t ret = binder->linkToDeath(this);
if (ret != OK) {
ALOGW("Cannot register the monitor. The monitor is dead.");
return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE, "The monitor is dead.");
}
// NOTE(review): if a different monitor was already registered it is replaced here without
// unlinkToDeath() on its binder - confirm this is intended.
mMonitor = monitor;
if (DEBUG) {
ALOGD("Car watchdog monitor is registered");
}
return Status::ok();
}
/**
 * Unregisters the monitor.
 *
 * @param monitor The monitor to unregister; it must wrap the same binder as the registered one.
 * @return Status::ok() on success, or EX_ILLEGAL_ARGUMENT when no monitor is registered or the
 *         given monitor is not the registered one.
 */
Status WatchdogProcessService::unregisterMonitor(const sp<ICarWatchdogMonitor>& monitor) {
    Mutex::Autolock lock(mMutex);
    // Identify the monitor by its underlying binder, exactly as registerMonitor() and
    // tellDumpFinished() do. Comparing the interface objects themselves can reject the registered
    // monitor when the caller's interface object is a different wrapper around the same binder.
    // The null checks also keep a null monitor from reaching unlinkToDeath() below.
    sp<IBinder> binder = BnCarWatchdog::asBinder(monitor);
    if (mMonitor == nullptr || binder == nullptr ||
        binder != BnCarWatchdog::asBinder(mMonitor)) {
        ALOGW("Cannot unregister the monitor. The monitor has not been registered.");
        return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
                                         "The monitor has not been registered.");
    }
    binder->unlinkToDeath(this);
    mMonitor = nullptr;
    if (DEBUG) {
        ALOGD("Car watchdog monitor is unregistered");
    }
    return Status::ok();
}
// Records a client's response to a health-check ping identified by sessionId.
// Acquires mMutex and forwards to tellClientAliveLocked().
Status WatchdogProcessService::tellClientAlive(const sp<ICarWatchdogClient>& client,
int32_t sessionId) {
Mutex::Autolock lock(mMutex);
return tellClientAliveLocked(client, sessionId);
}
/**
 * Records a mediator's response to a health-check ping and, when the response is accepted,
 * requests dumping and killing of the processes the mediator reported as not responding.
 *
 * @param mediator The responding mediator.
 * @param clientsNotResponding Process IDs reported by the mediator as not responding.
 * @param sessionId Session ID of the ping being answered.
 * @return The status of tellClientAliveLocked() for the mediator.
 */
Status WatchdogProcessService::tellMediatorAlive(const sp<ICarWatchdogClient>& mediator,
                                                 const std::vector<int32_t>& clientsNotResponding,
                                                 int32_t sessionId) {
    Status status;
    {
        Mutex::Autolock lock(mMutex);
        // Reuse the file-local formatter instead of a second hand-written join loop (which
        // compared a signed index against size()).
        if (DEBUG && !clientsNotResponding.empty()) {
            ALOGD("Mediator(session: %d) responded with non-responding clients: %s", sessionId,
                  pidArrayToString(clientsNotResponding).c_str());
        }
        status = tellClientAliveLocked(mediator, sessionId);
    }
    // dumpAndKillAllProcesses() takes mMutex itself, so it is called after the lock is released.
    if (status.isOk()) {
        dumpAndKillAllProcesses(clientsNotResponding);
    }
    return status;
}
// Called by the monitor to report that the process with the given pid has been dumped and
// killed. The report is only logged.
// Returns EX_ILLEGAL_ARGUMENT when no monitor is registered, the given monitor is null, or its
// binder differs from the registered monitor's binder.
Status WatchdogProcessService::tellDumpFinished(const sp<ICarWatchdogMonitor>& monitor,
int32_t pid) {
Mutex::Autolock lock(mMutex);
if (mMonitor == nullptr || monitor == nullptr ||
BnCarWatchdog::asBinder(monitor) != BnCarWatchdog::asBinder(mMonitor)) {
return Status::
fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
"The monitor is not registered or an invalid monitor is given");
}
ALOGI("Process(pid: %d) has been dumped and killed", pid);
return Status::ok();
}
/**
 * Handles a power cycle notification.
 *
 * SHUTDOWN and SUSPEND disable the watchdog. RESUME enables it and restarts health checking for
 * every timeout in kTimeouts.
 *
 * @param cycle The reported power cycle.
 * @return Status::ok() on success, or EX_ILLEGAL_ARGUMENT for an unsupported power cycle.
 */
Status WatchdogProcessService::notifyPowerCycleChange(PowerCycle cycle) {
    std::string buffer;
    Mutex::Autolock lock(mMutex);
    bool oldStatus = mWatchdogEnabled;
    switch (cycle) {
        case PowerCycle::POWER_CYCLE_SHUTDOWN:
            mWatchdogEnabled = false;
            buffer = "SHUTDOWN power cycle";
            break;
        case PowerCycle::POWER_CYCLE_SUSPEND:
            mWatchdogEnabled = false;
            buffer = "SUSPEND power cycle";
            break;
        case PowerCycle::POWER_CYCLE_RESUME:
            mWatchdogEnabled = true;
            for (const auto& timeout : kTimeouts) {
                startHealthCheckingLocked(timeout);
            }
            buffer = "RESUME power cycle";
            break;
        default:
            // A scoped enum is not promoted to int when passed through a printf-style variadic
            // argument, so convert explicitly to match the %d conversion.
            ALOGW("Unsupported power cycle: %d", static_cast<int>(cycle));
            return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
                                             "Unsupported power cycle");
    }
    ALOGI("Received %s", buffer.c_str());
    // Log the enabled/disabled state only when it actually changed.
    if (oldStatus != mWatchdogEnabled) {
        ALOGI("Car watchdog is %s", mWatchdogEnabled ? "enabled" : "disabled");
    }
    return Status::ok();
}
/**
 * Handles a user state notification by maintaining the set of stopped users.
 *
 * A started user is removed from mStoppedUserId and a stopped user is added to it.
 *
 * @param userId The user whose state changed.
 * @param state The new user state.
 * @return Status::ok() on success, or EX_ILLEGAL_ARGUMENT for an unsupported user state.
 */
Status WatchdogProcessService::notifyUserStateChange(userid_t userId, UserState state) {
    std::string buffer;
    Mutex::Autolock lock(mMutex);
    switch (state) {
        case UserState::USER_STATE_STARTED:
            mStoppedUserId.erase(userId);
            buffer = StringPrintf("user(%d) is started", userId);
            break;
        case UserState::USER_STATE_STOPPED:
            mStoppedUserId.insert(userId);
            buffer = StringPrintf("user(%d) is stopped", userId);
            break;
        default:
            // A scoped enum is not promoted to int when passed through a printf-style variadic
            // argument, so convert explicitly to match the %d conversion.
            ALOGW("Unsupported user state: %d", static_cast<int>(state));
            return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT, "Unsupported user state");
    }
    ALOGI("Received user state change: %s", buffer.c_str());
    return Status::ok();
}
/**
 * Writes a human-readable summary of the service state to the given file descriptor: whether
 * the watchdog is enabled, every registered client, whether a monitor is registered, whether
 * the system is shutting down, and the stopped users.
 *
 * @param fd File descriptor to write to.
 * @return Always an empty (successful) result.
 */
Result<void> WatchdogProcessService::dump(int fd, const Vector<String16>& /*args*/) {
    Mutex::Autolock lock(mMutex);
    const char* indent = " ";
    const char* doubleIndent = " ";

    WriteStringToFd("CAR WATCHDOG PROCESS SERVICE\n", fd);

    const char* enabledText = mWatchdogEnabled ? "true" : "false";
    WriteStringToFd(StringPrintf("%sWatchdog enabled: %s\n", indent, enabledText), fd);

    // Clients are numbered consecutively across all timeout classes, starting at 1.
    WriteStringToFd(StringPrintf("%sRegistered clients\n", indent), fd);
    int clientNumber = 0;
    for (const auto& timeout : kTimeouts) {
        for (auto& clientInfo : mClients[timeout]) {
            ++clientNumber;
            const std::string description = clientInfo.toString();
            WriteStringToFd(StringPrintf("%sClient #%d: %s\n", doubleIndent, clientNumber,
                                         description.c_str()),
                            fd);
        }
    }

    const char* monitorText = mMonitor == nullptr ? "false" : "true";
    WriteStringToFd(StringPrintf("%sMonitor registered: %s\n", indent, monitorText), fd);

    const char* shutdownText = isSystemShuttingDown() ? "true" : "false";
    WriteStringToFd(StringPrintf("%sisSystemShuttingDown: %s\n", indent, shutdownText), fd);

    // Comma-separated stopped user IDs, or "none" when the set is empty.
    std::string stoppedUsers;
    for (const auto& userId : mStoppedUserId) {
        if (!stoppedUsers.empty()) {
            stoppedUsers += ", ";
        }
        StringAppendF(&stoppedUsers, "%d", userId);
    }
    if (stoppedUsers.empty()) {
        stoppedUsers = "none";
    }
    WriteStringToFd(StringPrintf("%sStopped users: %s\n", indent, stoppedUsers.c_str()), fd);
    return {};
}
// Runs one health-check round for the timeout class encoded in `what`:
//   1. drops any pending message for this class,
//   2. dumps and kills the clients that did not answer the previous round,
//   3. pings every client currently registered under this class, and
//   4. schedules the next round if there was at least one client to check.
// Nothing is done beyond step 1 while the watchdog is disabled.
void WatchdogProcessService::doHealthCheck(int what) {
mHandlerLooper->removeMessages(mMessageHandler, what);
if (!isWatchdogEnabled()) {
return;
}
const TimeoutLength timeout = static_cast<TimeoutLength>(what);
// Clients still present in the pinged map at this point did not answer the previous ping.
dumpAndKillClientsIfNotResponding(timeout);
/* Generates a temporary/local vector containing clients.
* Using a local copy may send unnecessary ping messages to clients after they are unregistered.
* Clients should be able to handle them.
*/
std::vector<ClientInfo> clientsToCheck;
// NOTE(review): this reference is obtained without holding mMutex; the map's entries are only
// created in the constructor as far as this file shows - confirm no other writer exists.
PingedClientMap& pingedClients = mPingedClients[timeout];
{
Mutex::Autolock lock(mMutex);
pingedClients.clear();
clientsToCheck = mClients[timeout];
for (auto& clientInfo : clientsToCheck) {
// Clients of stopped users are not tracked as pinged, so they cannot be reported as
// non-responding in the next round.
if (mStoppedUserId.count(clientInfo.userId) > 0) {
continue;
}
int sessionId = getNewSessionId();
clientInfo.sessionId = sessionId;
pingedClients.insert(std::make_pair(sessionId, clientInfo));
}
}
// The pings are sent without holding mMutex.
// NOTE(review): clients of stopped users were skipped above but are still pinged here with
// whatever sessionId their copied ClientInfo carries - confirm this is intended.
for (const auto& clientInfo : clientsToCheck) {
Status status = clientInfo.client->checkIfAlive(clientInfo.sessionId, timeout);
if (!status.isOk()) {
ALOGW("Sending a ping message to client(pid: %d) failed: %s", clientInfo.pid,
status.exceptionMessage().c_str());
{
// A client that could not be pinged is not expected to answer, so stop tracking it.
Mutex::Autolock lock(mMutex);
pingedClients.erase(clientInfo.sessionId);
}
}
}
// Though the size of pingedClients is a more specific measure, clientsToCheck is used as a
// conservative approach.
if (clientsToCheck.size() > 0) {
auto durationNs = timeoutToDurationNs(timeout);
mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
}
}
/**
 * Drops every registered client: for each timeout class, unlinks this service as the death
 * recipient of each client's binder and empties the client list.
 */
void WatchdogProcessService::terminate() {
    Mutex::Autolock lock(mMutex);
    for (const auto& timeout : kTimeouts) {
        std::vector<ClientInfo>& registered = mClients[timeout];
        for (const ClientInfo& info : registered) {
            BnCarWatchdog::asBinder(info.client)->unlinkToDeath(this);
        }
        registered.clear();
    }
}
// Death-recipient callback for the binders this service linked to with linkToDeath(this).
// If the dead binder belongs to the monitor, the monitor is cleared; otherwise the matching
// client, if any, is removed from its client list.
void WatchdogProcessService::binderDied(const wp<IBinder>& who) {
Mutex::Autolock lock(mMutex);
// The raw pointer is used for identity comparison against the stored binders.
IBinder* binder = who.unsafe_get();
// Check if dead binder is monitor.
sp<IBinder> monitor = BnCarWatchdog::asBinder(mMonitor);
if (monitor == binder) {
mMonitor = nullptr;
ALOGW("The monitor has died.");
return;
}
// Not the monitor: search every timeout class and erase the client that owns this binder.
findClientAndProcessLocked(kTimeouts, binder,
[&](std::vector<ClientInfo>& clients,
std::vector<ClientInfo>::const_iterator it) {
ALOGW("Client(pid: %d) died", it->pid);
clients.erase(it);
});
}
// Returns true if a client with the same underlying binder is registered under any timeout in
// kTimeouts. A null processor is passed, so the lookup has no side effect on the client lists.
// The "Locked" suffix suggests the caller must hold mMutex; the public callers visible in this
// file do.
bool WatchdogProcessService::isRegisteredLocked(const sp<ICarWatchdogClient>& client) {
sp<IBinder> binder = BnCarWatchdog::asBinder(client);
return findClientAndProcessLocked(kTimeouts, binder, nullptr);
}
/**
 * Registers a client or mediator under the given timeout class.
 *
 * The caller's pid and uid are taken from the current binder call. Health checking for the
 * timeout class is started when its client list becomes non-empty.
 *
 * @param client The client to register.
 * @param timeout Timeout class to register the client under.
 * @param clientType Whether the client is a regular client or a mediator.
 * @return Status::ok() on success or when the client is already registered, or EX_ILLEGAL_STATE
 *         when linking to the client's death notification fails.
 */
Status WatchdogProcessService::registerClientLocked(const sp<ICarWatchdogClient>& client,
                                                    TimeoutLength timeout, ClientType clientType) {
    const char* clientName = clientType == ClientType::Regular ? "client" : "mediator";
    if (isRegisteredLocked(client)) {
        ALOGW("Cannot register the %s: the %s is already registered.", clientName, clientName);
        return Status::ok();
    }
    sp<IBinder> binder = BnCarWatchdog::asBinder(client);
    status_t status = binder->linkToDeath(this);
    if (status != OK) {
        std::string errorStr = StringPrintf("The %s is dead", clientName);
        const char* errorCause = errorStr.c_str();
        ALOGW("Cannot register the %s: %s", clientName, errorCause);
        return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE, errorCause);
    }
    std::vector<ClientInfo>& clients = mClients[timeout];
    pid_t callingPid = IPCThreadState::self()->getCallingPid();
    uid_t callingUid = IPCThreadState::self()->getCallingUid();
    clients.push_back(ClientInfo(client, callingPid, callingUid, clientType));
    // If the client array becomes non-empty, start health checking.
    if (clients.size() == 1) {
        startHealthCheckingLocked(timeout);
    }
    if (DEBUG) {
        // A scoped enum is not promoted to int when passed through a printf-style variadic
        // argument, so convert explicitly to match the %d conversion.
        ALOGD("Car watchdog %s(pid: %d, timeout: %d) is registered", clientName, callingPid,
              static_cast<int>(timeout));
    }
    return Status::ok();
}
// Removes the client that owns `binder` from the client lists of the given timeout classes and
// unlinks this service as the binder's death recipient.
// Returns Status::ok() when a client was found and removed, or EX_ILLEGAL_ARGUMENT when no
// client with that binder is registered under the given timeouts.
// clientType is only used to choose the wording ("client" or "mediator") of the log and error
// messages.
Status WatchdogProcessService::unregisterClientLocked(const std::vector<TimeoutLength>& timeouts,
sp<IBinder> binder, ClientType clientType) {
const char* clientName = clientType == ClientType::Regular ? "client" : "mediator";
// The processor runs only for the first matching client.
bool result = findClientAndProcessLocked(timeouts, binder,
[&](std::vector<ClientInfo>& clients,
std::vector<ClientInfo>::const_iterator it) {
binder->unlinkToDeath(this);
clients.erase(it);
});
if (!result) {
std::string errorStr = StringPrintf("The %s has not been registered", clientName);
const char* errorCause = errorStr.c_str();
ALOGW("Cannot unregister the %s: %s", clientName, errorCause);
return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT, errorCause);
}
if (DEBUG) {
ALOGD("Car watchdog %s is unregistered", clientName);
}
return Status::ok();
}
/**
 * Marks a pinged client as alive by removing its entry from the pinged-client map.
 *
 * Every timeout class is searched for an entry whose session ID equals sessionId and whose
 * client wraps the same binder as the caller.
 *
 * @param client The responding client.
 * @param sessionId Session ID of the ping being answered.
 * @return Status::ok() when a matching entry was removed, EX_ILLEGAL_ARGUMENT otherwise.
 */
Status WatchdogProcessService::tellClientAliveLocked(const sp<ICarWatchdogClient>& client,
                                                     int32_t sessionId) {
    const sp<IBinder> callerBinder = BnCarWatchdog::asBinder(client);
    for (const auto& timeout : kTimeouts) {
        PingedClientMap& pinged = mPingedClients[timeout];
        const auto match = pinged.find(sessionId);
        const bool answered = match != pinged.end() &&
                callerBinder == BnCarWatchdog::asBinder(match->second.client);
        if (answered) {
            pinged.erase(match);
            return Status::ok();
        }
    }
    return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
                                     "The client is not registered or the session ID is not found");
}
/**
 * Looks up the first registered client that wraps the given binder, searching the given
 * timeout classes in order, and optionally runs a processor on it.
 *
 * @param timeouts Timeout classes whose client lists are searched.
 * @param binder Binder identifying the client.
 * @param processor Callback invoked with the containing list and an iterator to the client;
 *                  may be null to only test for presence.
 * @return true if a matching client was found, false otherwise.
 */
bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength> timeouts,
                                                        const sp<IBinder> binder,
                                                        const Processor& processor) {
    for (const auto& timeout : timeouts) {
        std::vector<ClientInfo>& candidates = mClients[timeout];
        for (auto pos = candidates.begin(); pos != candidates.end(); ++pos) {
            if (BnCarWatchdog::asBinder(pos->client) == binder) {
                // The processor may erase `pos`, so the search stops right after calling it.
                if (processor != nullptr) {
                    processor(candidates, pos);
                }
                return true;
            }
        }
    }
    return false;
}
/**
 * Starts health checking for a timeout class: forgets the clients pinged so far and posts a
 * delayed health-check message for that class on the handler looper.
 *
 * @param timeout Timeout class to start checking.
 * @return Always an empty (successful) result.
 */
Result<void> WatchdogProcessService::startHealthCheckingLocked(TimeoutLength timeout) {
    mPingedClients[timeout].clear();
    // The timeout class doubles as the message code handled by MessageHandlerImpl.
    const auto delay = timeoutToDurationNs(timeout);
    mHandlerLooper->sendMessageDelayed(delay.count(), mMessageHandler,
                                       Message(static_cast<int>(timeout)));
    return {};
}
/**
 * Handles the clients of a timeout class that are still in the pinged-client map, i.e. that
 * did not answer their ping.
 *
 * Each such client that is still registered is removed from the client list. Unless its user
 * is stopped, it is then told to prepare for termination and its pid is handed to
 * dumpAndKillAllProcesses().
 *
 * @param timeout Timeout class to process.
 * @return The result of dumpAndKillAllProcesses().
 */
Result<void> WatchdogProcessService::dumpAndKillClientsIfNotResponding(TimeoutLength timeout) {
    std::vector<int32_t> unresponsivePids;
    std::vector<sp<ICarWatchdogClient>> unresponsiveClients;
    {
        Mutex::Autolock lock(mMutex);
        const std::vector<TimeoutLength> timeouts = {timeout};
        for (const auto& pingedEntry : mPingedClients[timeout]) {
            pid_t pid = -1;
            userid_t userId = -1;
            sp<ICarWatchdogClient> client = pingedEntry.second.client;
            sp<IBinder> binder = BnCarWatchdog::asBinder(client);
            findClientAndProcessLocked(timeouts, binder,
                                       [&](std::vector<ClientInfo>& registered,
                                           std::vector<ClientInfo>::const_iterator pos) {
                                           pid = pos->pid;
                                           userId = pos->userId;
                                           registered.erase(pos);
                                       });
            // pid stays -1 when the client is no longer registered.
            if (pid == -1 || mStoppedUserId.count(userId) != 0) {
                continue;
            }
            unresponsiveClients.push_back(client);
            unresponsivePids.push_back(pid);
        }
    }
    // The clients are notified after mMutex has been released.
    for (auto&& client : unresponsiveClients) {
        client->prepareProcessTermination();
    }
    return dumpAndKillAllProcesses(unresponsivePids);
}
/**
 * Asks the registered monitor to handle the given non-responding processes.
 *
 * Nothing is requested when the list is empty or the system is shutting down.
 *
 * @param processesNotResponding Process IDs to report to the monitor.
 * @return An error when no monitor is registered, otherwise an empty (successful) result.
 */
Result<void> WatchdogProcessService::dumpAndKillAllProcesses(
        const std::vector<int32_t>& processesNotResponding) {
    if (processesNotResponding.empty()) {
        return {};
    }
    const std::string pidString = pidArrayToString(processesNotResponding);
    // Copy the monitor under the lock; it is called after the lock has been released.
    sp<ICarWatchdogMonitor> monitor;
    {
        Mutex::Autolock lock(mMutex);
        monitor = mMonitor;
    }
    if (monitor == nullptr) {
        std::string errorMsg =
                StringPrintf("Cannot dump and kill processes(pid = %s): Monitor is not set",
                             pidString.c_str());
        ALOGW("%s", errorMsg.c_str());
        return Error() << errorMsg;
    }
    if (isSystemShuttingDown()) {
        ALOGI("Skip dumping and killing processes(%s): The system is shutting down",
              pidString.c_str());
        return {};
    }
    monitor->onClientsNotResponding(processesNotResponding);
    if (DEBUG) {
        ALOGD("Dumping and killing processes is requested: %s", pidString.c_str());
    }
    return {};
}
/**
 * Produces the next session ID.
 *
 * @return A positive session ID; the sequence restarts at 1 after reaching INT32_MAX.
 */
int32_t WatchdogProcessService::getNewSessionId() {
    // Make sure that session id is always positive number.
    // The wrap-around is decided before incrementing: incrementing a signed counter that already
    // holds its maximum value is undefined behavior, so the old "increment, then test <= 0"
    // sequence could not be relied upon to restart at 1.
    if (mLastSessionId < 0 || mLastSessionId >= INT32_MAX) {
        mLastSessionId = 1;
    } else {
        ++mLastSessionId;
    }
    return mLastSessionId;
}
// Returns whether the watchdog is currently enabled. Acquires mMutex itself, so it must not be
// called while mMutex is already held unless the mutex is recursive (not visible from here).
bool WatchdogProcessService::isWatchdogEnabled() {
Mutex::Autolock lock(mMutex);
return mWatchdogEnabled;
}
std::string WatchdogProcessService::ClientInfo::toString() {
std::string buffer;
StringAppendF(&buffer, "pid = %d, userId = %d, type = %s", pid, userId,
type == Regular ? "Regular" : "Mediator");
return buffer;
}
// Creates a message handler that forwards health-check messages to the given service.
// The service is stored in mService and used by handleMessage().
WatchdogProcessService::MessageHandlerImpl::MessageHandlerImpl(
const sp<WatchdogProcessService>& service) :
mService(service) {}
/**
 * Dispatches a looper message.
 *
 * A message whose code equals one of the timeout classes triggers a health check for that
 * class; any other code is logged and ignored.
 */
void WatchdogProcessService::MessageHandlerImpl::handleMessage(const Message& message) {
    const int what = message.what;
    const bool isHealthCheckMessage =
            what == static_cast<int>(TimeoutLength::TIMEOUT_CRITICAL) ||
            what == static_cast<int>(TimeoutLength::TIMEOUT_MODERATE) ||
            what == static_cast<int>(TimeoutLength::TIMEOUT_NORMAL);
    if (!isHealthCheckMessage) {
        ALOGW("Unknown message: %d", what);
        return;
    }
    mService->doHealthCheck(what);
}
} // namespace watchdog
} // namespace automotive
} // namespace android