beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 1 | #pylint: disable-msg=C0111 |
| 2 | |
| 3 | """ |
| 4 | Pidfile monitor. |
| 5 | """ |
| 6 | |
Alex Miller | 502d162 | 2014-01-29 17:06:35 -0800 | [diff] [blame] | 7 | import logging |
Aviv Keshet | 82c508c | 2017-01-03 14:10:17 -0800 | [diff] [blame] | 8 | import time |
| 9 | import traceback |
| 10 | |
| 11 | import common |
| 12 | |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame] | 13 | from autotest_lib.client.common_lib import utils |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 14 | from autotest_lib.client.common_lib import global_config |
Aviv Keshet | 82c508c | 2017-01-03 14:10:17 -0800 | [diff] [blame] | 15 | from autotest_lib.scheduler import drone_manager |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 16 | from autotest_lib.scheduler import scheduler_config |
| 17 | |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame] | 18 | try: |
| 19 | from chromite.lib import metrics |
| 20 | except ImportError: |
| 21 | metrics = utils.metrics_mock |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 22 | |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 23 | |
def _get_pidfile_timeout_secs():
    """@returns Seconds to wait for autoserv to write its pidfile."""
    config = global_config.global_config
    # The setting is stored in minutes; callers compare against time.time()
    # deltas, so convert to seconds here.
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    seconds_per_minute = 60
    return timeout_mins * seconds_per_minute
| 29 | |
| 30 | |
class PidfileRunMonitor(object):
    """
    Tracks a process through the pidfile it writes.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        # Set once the process is declared lost; after that the pidfile is no
        # longer re-read (see _get_pidfile_info_helper).
        self.lost_process = False
        self._start_time = None
        self.pidfile_id = None
        # True once kill() has asked the drone manager to kill the process.
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """
        Prefix a command with `nice -n <nice_level>`.

        @param command: Command as a list of arguments.
        @param nice_level: Niceness to apply; any falsy value (None, 0)
                leaves the command untouched.

        @returns The (possibly prefixed) command list.
        """
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        """Record the current time as the start of the pidfile wait."""
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """
        Start a new process through the drone manager and monitor it.

        @param command: Command to execute, as a list of arguments.
        @param working_directory: Passed through to execute_command().
        @param num_processes: Passed through to execute_command().
        @param nice_level: If not None, the command is wrapped in
                `nice -n <nice_level>`.
        @param log_file: Passed through to execute_command().
        @param pidfile_name: Passed through to execute_command().
        @param paired_with_pidfile: Passed through to execute_command().
        @param username: Passed through to execute_command().
        @param drone_hostnames_allowed: Passed through to execute_command().
        """
        assert command is not None
        # Note: unlike _add_nice_command(), an explicit nice_level of 0 still
        # wraps the command here; kept as-is to avoid changing the command
        # sent to the drone.
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
                command, working_directory, pidfile_name=pidfile_name,
                num_processes=num_processes, log_file=log_file,
                paired_with_pidfile=paired_with_pidfile, username=username,
                drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """
        Monitor a process that was started elsewhere.

        @param execution_path: Path used to look up the pidfile id.
        @param pidfile_name: Name of the pidfile to look up.
        @param num_processes: If not None, declared to the drone manager as
                the process count for this pidfile.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
                execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the monitored process, if the pidfile reports one."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns True if, after refreshing state, a process is known."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """@returns The monitored process; asserts that one is known."""
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """
        Fetch the pidfile contents from the drone manager into self._state.

        @param use_second_read: Passed through to get_pidfile_contents().

        @raises _PidfileException: if the contents are invalid; self._state
                is reset to empty contents first.
        """
        assert self.pidfile_id is not None, (
                'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
                self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """
        Report a pidfile problem and mark the process as lost.

        @param error: Short description of what went wrong.
        @param message: Optional extra detail (e.g. a formatted traceback).
        """
        # Surface the details instead of dropping them; otherwise the only
        # trace of the failure is the synthesized exit status.
        if message:
            logging.warning('%s for pidfile %s:\n%s',
                            error, self.pidfile_id, message)
        else:
            logging.warning('%s for pidfile %s', error, self.pidfile_id)
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """
        Refresh self._state from the pidfile.

        @raises _PidfileException: propagated from _read_pidfile().
        """
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                        'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # format_exc() must be called while the exception is being
            # handled, so build the message here.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The exit status after refreshing state; may be None."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """
        Copy results on the drone, if a process is known.

        @param kwargs: Passed through to copy_results_on_drone().
        """
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """
        Copy source to the results repository, if a process is known.

        @param source: Passed through to copy_to_results_repository().
        @param kwargs: Passed through to copy_to_results_repository().
        """
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
| 210 | |