userspace reboot: stop post-data services and wait for them to be killed
* Refactored code around stopping services a little bit to reuse it
between full reboot and userspace reboot.
* Add a scope_guard to fall back to full reboot in case userspace reboot
fails.
* In case of userspace reboot, init will also wait for services to be
terminated/killed and log the ones that didn't react to
SIGTERM/SIGKILL in time.
* If some of the services didn't react to SIGKILL, fail userspace reboot.
Test: adb reboot userspace
Bug: 135984674
Change-Id: I820c7bc406169333b0f929f0eea028d8384eb2ac
diff --git a/init/reboot.cpp b/init/reboot.cpp
index 41965a1..4b892b7 100644
--- a/init/reboot.cpp
+++ b/init/reboot.cpp
@@ -22,6 +22,7 @@
#include <linux/loop.h>
#include <mntent.h>
#include <semaphore.h>
+#include <stdlib.h>
#include <sys/cdefs.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
@@ -31,6 +32,7 @@
#include <sys/types.h>
#include <sys/wait.h>
+#include <chrono>
#include <memory>
#include <set>
#include <thread>
@@ -41,6 +43,7 @@
#include <android-base/logging.h>
#include <android-base/macros.h>
#include <android-base/properties.h>
+#include <android-base/scopeguard.h>
#include <android-base/strings.h>
#include <android-base/unique_fd.h>
#include <bootloader_message/bootloader_message.h>
@@ -59,6 +62,7 @@
#include "service.h"
#include "service_list.h"
#include "sigchld_handler.h"
+#include "util.h"
#define PROC_SYSRQ "/proc/sysrq-trigger"
@@ -75,6 +79,19 @@
static bool shutting_down = false;
+static const std::set<std::string> kDebuggingServices{"tombstoned", "logd", "adbd", "console"};
+
+static std::vector<Service*> GetDebuggingServices(bool only_post_data) {
+ std::vector<Service*> ret;  // raw pointers taken via s.get() below
+ ret.reserve(kDebuggingServices.size());  // capacity follows the name set, not the service list
+ for (const auto& s : ServiceList::GetInstance()) {  // filter by name, and by is_post_data() if asked
+ if (kDebuggingServices.count(s->name()) && (!only_post_data || s->is_post_data())) {
+ ret.push_back(s.get());
+ }
+ }
+ return ret;
+}
+
// represents umount status during reboot / shutdown.
enum UmountStat {
/* umount succeeded. */
@@ -446,6 +463,49 @@
LOG(INFO) << "zram_backing_dev: `" << backing_dev << "` is cleared successfully.";
}
+// Stops given services and, when |timeout| > 0ms, waits up to |timeout| for them to be stopped.
+// If |terminate| is true, then SIGTERM is sent to services, otherwise SIGKILL is sent.
+static void StopServices(const std::vector<Service*>& services, std::chrono::milliseconds timeout,
+ bool terminate) {
+ LOG(INFO) << "Stopping " << services.size() << " services by sending "
+ << (terminate ? "SIGTERM" : "SIGKILL");
+ std::vector<pid_t> pids;
+ pids.reserve(services.size());
+ for (const auto& s : services) {
+ if (s->pid() > 0) {  // remember the pid before the service is signalled
+ pids.push_back(s->pid());
+ }
+ if (terminate) {
+ s->Terminate();
+ } else {
+ s->Stop();
+ }
+ }
+ if (timeout > 0ms) {
+ WaitToBeReaped(pids, timeout);
+ } else {
+ // Even if we don't wait for services to stop, we still optimistically reap zombies.
+ ReapAnyOutstandingChildren();
+ }
+}
+
+// Like StopServices, but also logs all the services that failed to stop after the provided timeout.
+// Returns the number of violators, i.e. services for which IsRunning() is still true.
+static int StopServicesAndLogViolations(const std::vector<Service*>& services,
+ std::chrono::milliseconds timeout, bool terminate) {
+ StopServices(services, timeout, terminate);
+ int still_running = 0;
+ for (const auto& s : services) {
+ if (s->IsRunning()) {
+ LOG(ERROR) << "[service-misbehaving] : service '" << s->name() << "' is still running "
+ << timeout.count() << "ms after receiving "
+ << (terminate ? "SIGTERM" : "SIGKILL");
+ still_running++;
+ }
+ }
+ return still_running;
+}
+
//* Reboot / shutdown the system.
// cmd ANDROID_RB_* as defined in android_reboot.h
// reason Reason string like "reboot", "shutdown,userrequested"
@@ -510,12 +570,13 @@
// Start reboot monitor thread
sem_post(&reboot_semaphore);
- // keep debugging tools until non critical ones are all gone.
- const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
// watchdogd is a vendor specific component but should be alive to complete shutdown safely.
const std::set<std::string> to_starts{"watchdogd"};
+ std::vector<Service*> stop_first;
+ stop_first.reserve(ServiceList::GetInstance().services().size());
for (const auto& s : ServiceList::GetInstance()) {
- if (kill_after_apps.count(s->name())) {
+ if (kDebuggingServices.count(s->name())) {
+ // keep debugging tools until non critical ones are all gone.
s->SetShutdownCritical();
} else if (to_starts.count(s->name())) {
if (auto result = s->Start(); !result) {
@@ -529,6 +590,8 @@
LOG(ERROR) << "Could not start shutdown critical service '" << s->name()
<< "': " << result.error();
}
+ } else {
+ stop_first.push_back(s.get());
}
}
@@ -571,49 +634,12 @@
// optional shutdown step
// 1. terminate all services except shutdown critical ones. wait for delay to finish
if (shutdown_timeout > 0ms) {
- LOG(INFO) << "terminating init services";
-
- // Ask all services to terminate except shutdown critical ones.
- for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
- if (!s->IsShutdownCritical()) s->Terminate();
- }
-
- int service_count = 0;
- // Only wait up to half of timeout here
- auto termination_wait_timeout = shutdown_timeout / 2;
- while (t.duration() < termination_wait_timeout) {
- ReapAnyOutstandingChildren();
-
- service_count = 0;
- for (const auto& s : ServiceList::GetInstance()) {
- // Count the number of services running except shutdown critical.
- // Exclude the console as it will ignore the SIGTERM signal
- // and not exit.
- // Note: SVC_CONSOLE actually means "requires console" but
- // it is only used by the shell.
- if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) {
- service_count++;
- }
- }
-
- if (service_count == 0) {
- // All terminable services terminated. We can exit early.
- break;
- }
-
- // Wait a bit before recounting the number or running services.
- std::this_thread::sleep_for(50ms);
- }
- LOG(INFO) << "Terminating running services took " << t
- << " with remaining services:" << service_count;
+ StopServicesAndLogViolations(stop_first, shutdown_timeout / 2, true /* SIGTERM */);
}
-
- // minimum safety steps before restarting
- // 2. kill all services except ones that are necessary for the shutdown sequence.
- for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
- if (!s->IsShutdownCritical()) s->Stop();
- }
+ // Send SIGKILL to ones that didn't terminate cleanly.
+ StopServicesAndLogViolations(stop_first, 0ms, false /* SIGKILL */);
SubcontextTerminate();
+ // Reap subcontext pids.
ReapAnyOutstandingChildren();
// 3. send volume shutdown to vold
@@ -625,9 +651,7 @@
LOG(INFO) << "vold not running, skipping vold shutdown";
}
// logcat stopped here
- for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
- if (kill_after_apps.count(s->name())) s->Stop();
- }
+ StopServices(GetDebuggingServices(false /* only_post_data */), 0ms, false /* SIGKILL */);
// 4. sync, try umount, and optionally run fsck for user shutdown
{
Timer sync_timer;
@@ -660,6 +684,7 @@
}
static void EnterShutdown() {
+ LOG(INFO) << "Entering shutdown mode";
shutting_down = true;
// Skip wait for prop if it is in progress
ResetWaitForProp();
@@ -675,21 +700,61 @@
}
static void LeaveShutdown() {
+ LOG(INFO) << "Leaving shutdown mode";
shutting_down = false;
SendStartSendingMessagesMessage();
}
-static void DoUserspaceReboot() {
+static Result<void> DoUserspaceReboot() {
+ LOG(INFO) << "Userspace reboot initiated";
+ auto guard = android::base::make_scope_guard([] {
+ // Leave shutdown so that we can handle a full reboot.
+ LeaveShutdown();
+ property_set("sys.powerctl", "reboot,abort-userspace-reboot");
+ });
// Triggering userspace-reboot-requested will result in a bunch of set_prop
// actions. We should make sure, that all of them are propagated before
// proceeding with userspace reboot.
// TODO(b/135984674): implement proper synchronization logic.
std::this_thread::sleep_for(500ms);
EnterShutdown();
- // TODO(b/135984674): tear down post-data services
- LeaveShutdown();
+ std::vector<Service*> stop_first;
+ // Remember the services that were enabled. We will need to manually enable them again otherwise
+ // triggers like class_start won't restart them.
+ std::vector<Service*> were_enabled;
+ stop_first.reserve(ServiceList::GetInstance().services().size());
+ for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
+ if (s->is_post_data() && !kDebuggingServices.count(s->name())) {
+ stop_first.push_back(s);
+ }
+ if (s->is_post_data() && s->IsEnabled()) {
+ were_enabled.push_back(s);
+ }
+ }
+ // TODO(b/135984674): do we need shutdown animation for userspace reboot?
+ // TODO(b/135984674): control userspace timeout via read-only property?
+ StopServicesAndLogViolations(stop_first, 10s, true /* SIGTERM */);  // survivors get SIGKILL next
+ if (int r = StopServicesAndLogViolations(stop_first, 20s, false /* SIGKILL */); r > 0) {
+ // TODO(b/135984674): store information about offending services for debugging.
+ return Error() << r << " post-data services are still running";  // guard still armed here
+ }
// TODO(b/135984674): remount userdata
+ if (int r = StopServicesAndLogViolations(GetDebuggingServices(true /* only_post_data */), 5s,
+ false /* SIGKILL */);
+ r > 0) {
+ // TODO(b/135984674): store information about offending services for debugging.
+ return Error() << r << " debugging services are still running";  // guard still armed here
+ }
+ // TODO(b/135984674): deactivate APEX modules and switch back to bootstrap namespace.
+ // Re-enable services
+ for (const auto& s : were_enabled) {
+ LOG(INFO) << "Re-enabling service '" << s->name() << "'";
+ s->Enable();
+ }
+ LeaveShutdown();
ActionManager::GetInstance().QueueEventTrigger("userspace-reboot-resume");
+ guard.Disable(); // Go on with userspace reboot.
+ return {};
}
static void HandleUserspaceReboot() {
@@ -697,10 +762,7 @@
auto& am = ActionManager::GetInstance();
am.ClearQueue();
am.QueueEventTrigger("userspace-reboot-requested");
- auto handler = [](const BuiltinArguments&) {
- DoUserspaceReboot();
- return Result<void>{};
- };
+ auto handler = [](const BuiltinArguments&) { return DoUserspaceReboot(); };
am.QueueBuiltinAction(handler, "userspace-reboot");
}