| #pylint: disable-msg=C0111 |
| |
| """ |
| Pidfile monitor. |
| """ |
| |
| import logging |
| import time |
| import traceback |
| |
| import common |
| |
| from autotest_lib.client.common_lib import utils |
| from autotest_lib.client.common_lib import global_config |
| from autotest_lib.scheduler import drone_manager |
| from autotest_lib.scheduler import scheduler_config |
| |
| try: |
| from chromite.lib import metrics |
| except ImportError: |
| metrics = utils.metrics_mock |
| |
| |
def _get_pidfile_timeout_secs():
    """
    Look up the pidfile timeout and convert it from minutes to seconds.

    The value is read as an int from the 'pidfile_timeout_mins' option in the
    scheduler section of the global config.

    @returns How long to wait for autoserv to write pidfile, in seconds.
    """
    config = global_config.global_config
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION,
            'pidfile_timeout_mins',
            type=int)
    timeout_secs = timeout_mins * 60
    return timeout_secs
| |
| |
class PidfileRunMonitor(object):
    """
    Tracks a process through the pidfile reported by the drone manager.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        # Set by on_lost_process(); once True the pidfile is no longer re-read.
        self.lost_process = False
        # time.time() recorded by run() / attach_to_existing_process().
        self._start_time = None
        # Identifier handed back by the drone manager; None until started.
        self.pidfile_id = None
        # Records that kill() asked the drone manager to kill the process.
        self._killed = False
        # Most recently read pidfile contents.
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """
        Prefix a command with 'nice -n <nice_level>'.

        @param command: Command as a list of arguments.
        @param nice_level: Niceness to apply; a falsy value (None or 0)
                leaves the command untouched.

        @returns The command list, prefixed with nice when requested.
        """
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        """Record the current time as the start of the pidfile wait."""
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """
        Start a new process through the drone manager and monitor it.

        All arguments other than nice_level are forwarded to the drone
        manager's execute_command(); the pidfile id it returns is stored in
        self.pidfile_id.

        @param command: Command as a list of arguments; must not be None.
        @param nice_level: If not None, the command is run under
                'nice -n <nice_level>'.
        """
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
                command, working_directory, pidfile_name=pidfile_name,
                num_processes=num_processes, log_file=log_file,
                paired_with_pidfile=paired_with_pidfile, username=username,
                drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """
        Monitor a process that was started elsewhere.

        @param execution_path: Path passed to the drone manager to locate
                the pidfile.
        @param pidfile_name: Name of the pidfile to look for.
        @param num_processes: If not None, declared to the drone manager as
                the process count for this pidfile.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
                execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the monitored process, if the pidfile names one."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            # NOTE(review): assumes _killed is only recorded when a kill was
            # actually issued -- confirm against the intended nesting.
            self._killed = True


    def has_process(self):
        """@returns True if the refreshed pidfile state names a process."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """
        @returns The process from the refreshed pidfile state. Asserts that
                one is present; check has_process() first.
        """
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """
        Fetch the pidfile contents from the drone manager into self._state.

        @param use_second_read: Forwarded to get_pidfile_contents().

        @raises _PidfileException: if the contents report themselves as
                invalid; self._state is reset to empty contents first.
        """
        assert self.pidfile_id is not None, (
                'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
                self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """
        Record a pidfile problem and mark the process as lost.

        @param error: Short description of what went wrong.
        @param message: Optional extra detail (e.g. a formatted traceback).
        """
        # Log the details so the reason for declaring the process lost is
        # not silently discarded.
        logging.warning('%s: process=%s pidfile=%s\n%s', error,
                        self._state.process, self.pidfile_id, message)
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """
        Refresh self._state from the pidfile unless the process is already
        marked lost.

        @raises _PidfileException: propagated from _read_pidfile().
        """
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                        'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # Must be called inside the handler so format_exc() sees the
            # active exception.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The exit status from the refreshed pidfile state."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """
        Ask the drone manager to copy results on the drone, if there is a
        process. kwargs are forwarded to copy_results_on_drone().
        """
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """
        Ask the drone manager to copy source to the results repository, if
        there is a process. kwargs are forwarded to
        copy_to_results_repository().
        """
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
| |