Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2018 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 17 | #include <sys/types.h> |
| 18 | #include <sys/wait.h> |
| 19 | |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 20 | #include <assert.h> |
| 21 | #include <errno.h> |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 22 | #include <signal.h> |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 23 | #include <stdio.h> |
| 24 | |
| 25 | #include <map> |
| 26 | |
| 27 | #include <glog/logging.h> |
| 28 | |
| 29 | #include "common/libs/fs/shared_select.h" |
Cody Schuffelen | 147b88e | 2019-09-09 16:00:11 -0700 | [diff] [blame] | 30 | #include "host/commands/run_cvd/process_monitor.h" |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 31 | |
| 32 | namespace cvd { |
| 33 | |
| 34 | namespace { |
| 35 | |
| 36 | void NotifyThread(SharedFD fd) { |
| 37 | // The restarter thread is (likely) blocked on a call to select, to make it |
| 38 | // wake up and do some work we write something (anything, the content is not |
| 39 | // important) into the main side of the socket pair so that the call to select |
| 40 | // returns and the notification fd (restarter side of the socket pair) is |
| 41 | // marked as ready to read. |
| 42 | char buffer = 'a'; |
| 43 | fd->Write(&buffer, sizeof(buffer)); |
| 44 | } |
| 45 | |
| 46 | void ConsumeNotifications(SharedFD fd) { |
| 47 | // Once the starter thread is waken up due to a notification, the calls to |
| 48 | // select will continue to return immediately unless we read what was written |
| 49 | // on the main side of the socket pair. More than one notification can |
| 50 | // accumulate before the restarter thread consumes them, so we attempt to read |
| 51 | // more than it's written to consume them all at once. In the unlikely case of |
| 52 | // more than 8 notifications acummulating we simply read the first 8 and have |
| 53 | // another iteration on the restarter thread loop. |
| 54 | char buffer[8]; |
| 55 | fd->Read(buffer, sizeof(buffer)); |
| 56 | } |
| 57 | |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 58 | } // namespace |
| 59 | |
| 60 | ProcessMonitor::ProcessMonitor() { |
| 61 | if (!SharedFD::SocketPair(AF_LOCAL, SOCK_STREAM, 0, &thread_comm_main_, |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 62 | &thread_comm_monitor_)) { |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 63 | LOG(ERROR) << "Unable to create restarter communication socket pair: " |
| 64 | << strerror(errno); |
| 65 | return; |
| 66 | } |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 67 | monitor_thread_ = std::thread([this]() { MonitorRoutine(); }); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 68 | } |
| 69 | |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 70 | void ProcessMonitor::StartSubprocess(Command cmd, OnSocketReadyCb callback) { |
Cody Schuffelen | e567087 | 2019-12-10 15:04:59 -0800 | [diff] [blame^] | 71 | cmd.SetWithControlSocket(true); |
| 72 | auto proc = cmd.StartInGroup(); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 73 | if (!proc.Started()) { |
| 74 | LOG(ERROR) << "Failed to start process"; |
| 75 | return; |
| 76 | } |
Jorge E. Moreira | 734e1d1 | 2018-11-07 20:30:38 -0800 | [diff] [blame] | 77 | MonitorExistingSubprocess(std::move(cmd), std::move(proc), callback); |
| 78 | } |
| 79 | |
| 80 | void ProcessMonitor::MonitorExistingSubprocess(Command cmd, Subprocess proc, |
| 81 | OnSocketReadyCb callback) { |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 82 | { |
| 83 | std::lock_guard<std::mutex> lock(processes_mutex_); |
| 84 | monitored_processes_.push_back(MonitorEntry()); |
| 85 | auto& entry = monitored_processes_.back(); |
| 86 | entry.cmd.reset(new Command(std::move(cmd))); |
| 87 | entry.proc.reset(new Subprocess(std::move(proc))); |
| 88 | entry.on_control_socket_ready_cb = callback; |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 89 | } |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 90 | // Wake the restarter thread up so that it starts monitoring this subprocess |
| 91 | // Do this after releasing the lock so that the restarter thread is free to |
| 92 | // begin work as soon as select returns. |
| 93 | NotifyThread(thread_comm_main_); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 94 | } |
| 95 | |
Jorge E. Moreira | 7123e2e | 2019-09-05 13:47:29 -0700 | [diff] [blame] | 96 | bool ProcessMonitor::StopMonitoredProcesses() { |
| 97 | // Because the mutex is held while this function executes, the restarter |
| 98 | // thread is kept blocked and by the time it resumes execution there are no |
| 99 | // more processes to monitor |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 100 | std::lock_guard<std::mutex> lock(processes_mutex_); |
Jorge E. Moreira | 7123e2e | 2019-09-05 13:47:29 -0700 | [diff] [blame] | 101 | bool result = true; |
| 102 | // Processes were started in the order they appear in the vector, stop them in |
| 103 | // reverse order for symmetry. |
| 104 | for (auto entry_it = monitored_processes_.rbegin(); |
| 105 | entry_it != monitored_processes_.rend(); ++entry_it) { |
| 106 | auto& entry = *entry_it; |
| 107 | result = result && entry.proc->Stop(); |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 108 | } |
Jorge E. Moreira | 7123e2e | 2019-09-05 13:47:29 -0700 | [diff] [blame] | 109 | // Wait for all processes to actually exit. |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 110 | for (auto& entry : monitored_processes_) { |
Jorge E. Moreira | 7123e2e | 2019-09-05 13:47:29 -0700 | [diff] [blame] | 111 | // Most processes are being killed by signals, calling Wait(void) would be |
| 112 | // too verbose on the logs. |
| 113 | int wstatus; |
| 114 | auto ret = entry.proc->Wait(&wstatus, 0); |
| 115 | if (ret < 0) { |
| 116 | LOG(WARNING) << "Failed to wait for process " |
| 117 | << entry.cmd->GetShortName(); |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 118 | } |
| 119 | } |
| 120 | // Clear the list to ensure they are not started again |
| 121 | monitored_processes_.clear(); |
Jorge E. Moreira | 7123e2e | 2019-09-05 13:47:29 -0700 | [diff] [blame] | 122 | return result; |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 123 | } |
| 124 | |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 125 | bool ProcessMonitor::RestartOnExitCb(MonitorEntry* entry) { |
| 126 | // Make sure the process actually exited |
| 127 | char buffer[16]; |
| 128 | auto bytes_read = entry->proc->control_socket()->Read(buffer, sizeof(buffer)); |
| 129 | if (bytes_read > 0) { |
| 130 | LOG(WARNING) << "Subprocess " << entry->cmd->GetShortName() << " wrote " |
| 131 | << bytes_read |
| 132 | << " bytes on the control socket, this is unexpected"; |
| 133 | // The process may not have exited, continue monitoring without restarting |
| 134 | return true; |
| 135 | } |
| 136 | |
| 137 | LOG(INFO) << "Detected exit of monitored subprocess"; |
| 138 | // Make sure the subprocess isn't left in a zombie state, and that the |
| 139 | // pid is logged |
| 140 | int wstatus; |
| 141 | auto wait_ret = TEMP_FAILURE_RETRY(entry->proc->Wait(&wstatus, 0)); |
| 142 | // None of the error conditions specified on waitpid(2) apply |
| 143 | assert(wait_ret > 0); |
| 144 | if (WIFEXITED(wstatus)) { |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 145 | LOG(INFO) << "Subprocess " << entry->cmd->GetShortName() << " (" << wait_ret |
| 146 | << ") has exited with exit code " << WEXITSTATUS(wstatus); |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 147 | } else if (WIFSIGNALED(wstatus)) { |
| 148 | LOG(ERROR) << "Subprocess " << entry->cmd->GetShortName() << " (" |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 149 | << wait_ret |
| 150 | << ") was interrupted by a signal: " << WTERMSIG(wstatus); |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 151 | } else { |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 152 | LOG(INFO) << "subprocess " << entry->cmd->GetShortName() << " (" << wait_ret |
| 153 | << ") has exited for unknown reasons"; |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 154 | } |
Cody Schuffelen | e567087 | 2019-12-10 15:04:59 -0800 | [diff] [blame^] | 155 | entry->proc.reset(new Subprocess(entry->cmd->Start())); |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 156 | return true; |
| 157 | } |
| 158 | |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 159 | bool ProcessMonitor::DoNotMonitorCb(MonitorEntry*) { return false; } |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 160 | |
| 161 | void ProcessMonitor::MonitorRoutine() { |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 162 | LOG(INFO) << "Started monitoring subprocesses"; |
| 163 | do { |
| 164 | SharedFDSet read_set; |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 165 | read_set.Set(thread_comm_monitor_); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 166 | { |
| 167 | std::lock_guard<std::mutex> lock(processes_mutex_); |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 168 | for (auto& monitored_process : monitored_processes_) { |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 169 | auto control_socket = monitored_process.proc->control_socket(); |
Jorge E. Moreira | 2be306d | 2019-08-30 11:54:32 -0700 | [diff] [blame] | 170 | if (!control_socket->IsOpen()) { |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 171 | LOG(ERROR) << "The control socket for " |
| 172 | << monitored_process.cmd->GetShortName() |
| 173 | << " is closed, it's effectively NOT being monitored"; |
| 174 | } |
| 175 | read_set.Set(control_socket); |
| 176 | } |
| 177 | } |
| 178 | // We can't call select while holding the lock as it would lead to a |
| 179 | // deadlock (restarter thread waiting for notifications from main thread, |
| 180 | // main thread waiting for the lock) |
| 181 | int num_fds = cvd::Select(&read_set, nullptr, nullptr, nullptr); |
| 182 | if (num_fds < 0) { |
| 183 | LOG(ERROR) << "Select call returned error on restarter thread: " |
| 184 | << strerror(errno); |
| 185 | } |
| 186 | if (num_fds > 0) { |
| 187 | // Try the communication fd, it's the most likely to be set |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 188 | if (read_set.IsSet(thread_comm_monitor_)) { |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 189 | --num_fds; |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 190 | ConsumeNotifications(thread_comm_monitor_); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 191 | } |
| 192 | } |
| 193 | { |
| 194 | std::lock_guard<std::mutex> lock(processes_mutex_); |
| 195 | // Keep track of the number of file descriptors ready for read, chances |
| 196 | // are we don't need to go over the entire list of subprocesses |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 197 | auto it = monitored_processes_.begin(); |
| 198 | while (it != monitored_processes_.end()) { |
| 199 | auto control_socket = it->proc->control_socket(); |
| 200 | bool keep_monitoring = true; |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 201 | if (read_set.IsSet(control_socket)) { |
| 202 | --num_fds; |
Jorge E. Moreira | 3d955ae | 2018-11-07 23:06:44 -0800 | [diff] [blame] | 203 | keep_monitoring = it->on_control_socket_ready_cb(&(*it)); |
| 204 | } |
| 205 | if (keep_monitoring) { |
| 206 | ++it; |
| 207 | } else { |
| 208 | it = monitored_processes_.erase(it); |
Jorge E. Moreira | 1a62e76 | 2018-11-05 22:05:57 -0800 | [diff] [blame] | 209 | } |
| 210 | } |
| 211 | } |
| 212 | assert(num_fds == 0); |
| 213 | } while (true); |
| 214 | } |
| 215 | |
| 216 | } // namespace cvd |