# pylint: disable=missing-docstring
"""
Pidfile monitor.
"""
import logging
import time
import traceback
import common
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import global_config
from autotest_lib.scheduler import drone_manager
from autotest_lib.scheduler import scheduler_config
try:
from chromite.lib import metrics
except ImportError:
metrics = utils.metrics_mock
def _get_pidfile_timeout_secs():
"""@returns How long to wait for autoserv to write pidfile."""
pidfile_timeout_mins = global_config.global_config.get_config_value(
scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
return pidfile_timeout_mins * 60
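# An illustrative global_config entry for the value read above; the section
# name comes from scheduler_config.CONFIG_SECTION and the number shown here
# is only an example:
#
#   [SCHEDULER]
#   pidfile_timeout_mins: 300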
class PidfileRunMonitor(object):
"""
Client must call either run() to start a new process or
attach_to_existing_process().
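
    A rough usage sketch (the command, results path and username below are
    made-up examples, and drone_manager.instance() must already have been
    initialized by the scheduler):

        monitor = PidfileRunMonitor()
        monitor.run(['autoserv', '--some-args'], '/results/123-debug',
                    num_processes=1, username='autotest')
        # ... later, poll for completion:
        if monitor.exit_code() is not None:
            logging.info('%d test(s) failed', monitor.num_tests_failed())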
"""
class _PidfileException(Exception):
"""
Raised when there's some unexpected behavior with the pid file, but only
used internally (never allowed to escape this class).
"""
def __init__(self):
self._drone_manager = drone_manager.instance()
self.lost_process = False
self._start_time = None
self.pidfile_id = None
self._killed = False
self._state = drone_manager.PidfileContents()
def _add_nice_command(self, command, nice_level):
if not nice_level:
return command
return ['nice', '-n', str(nice_level)] + command
def _set_start_time(self):
self._start_time = time.time()
    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """Start a new process through the drone manager and monitor its
        pidfile."""
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
self._set_start_time()
self.pidfile_id = self._drone_manager.execute_command(
command, working_directory, pidfile_name=pidfile_name,
num_processes=num_processes, log_file=log_file,
paired_with_pidfile=paired_with_pidfile, username=username,
drone_hostnames_allowed=drone_hostnames_allowed)
    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """Monitor a process that is already running, via its existing
        pidfile."""
        self._set_start_time()
self.pidfile_id = self._drone_manager.get_pidfile_id_from(
execution_path, pidfile_name=pidfile_name)
if num_processes is not None:
self._drone_manager.declare_process_count(self.pidfile_id, num_processes)
    def kill(self):
        """Kill the monitored process, if one is running, and remember we
        did so."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True
def has_process(self):
self._get_pidfile_info()
return self._state.process is not None
def get_process(self):
self._get_pidfile_info()
assert self._state.process is not None
return self._state.process
def _read_pidfile(self, use_second_read=False):
assert self.pidfile_id is not None, (
'You must call run() or attach_to_existing_process()')
contents = self._drone_manager.get_pidfile_contents(
self.pidfile_id, use_second_read=use_second_read)
if contents.is_invalid():
self._state = drone_manager.PidfileContents()
raise self._PidfileException(contents)
self._state = contents
    def _handle_pidfile_error(self, error, message=''):
        # Log the error before treating the process as lost.
        logging.error('%s: %s', error, message)
        self.on_lost_process(self._state.process)
def _get_pidfile_info_helper(self):
if self.lost_process:
return
self._read_pidfile()
if self._state.process is None:
self._handle_no_process()
return
if self._state.exit_status is None:
# double check whether or not autoserv is running
if self._drone_manager.is_process_running(self._state.process):
return
# pid but no running process - maybe process *just* exited
self._read_pidfile(use_second_read=True)
if self._state.exit_status is None:
# autoserv exited without writing an exit code
# to the pidfile
self._handle_pidfile_error(
'autoserv died without writing exit code')
def _get_pidfile_info(self):
"""\
After completion, self._state will contain:
pid=None, exit_status=None if autoserv has not yet run
pid!=None, exit_status=None if autoserv is running
pid!=None, exit_status!=None if autoserv has completed
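
        For example (an informal sketch; `monitor` stands for an attached
        PidfileRunMonitor), a caller typically checks:

            exit_status = monitor.exit_code()        # calls _get_pidfile_info()
            if exit_status is None:
                pass                                 # not started, or running
            else:
                failed = monitor.num_tests_failed()  # completed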
"""
try:
self._get_pidfile_info_helper()
        except self._PidfileException:
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())
def _handle_no_process(self):
"""\
Called when no pidfile is found or no pid is in the pidfile.
"""
if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # Only treat this as an error if we did not kill the process
            # ourselves; a process we aborted is expected to die without
            # writing a pidfile.
if not self._killed:
metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
).increment()
else:
logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
self.on_lost_process()
def on_lost_process(self, process=None):
"""\
Called when autoserv has exited without writing an exit status,
or we've timed out waiting for autoserv to write a pid to the
pidfile. In either case, we just return failure and the caller
should signal some kind of warning.
process is unimportant here, as it shouldn't be used by anyone.
"""
self.lost_process = True
self._state.process = process
self._state.exit_status = 1
self._state.num_tests_failed = 0
    def exit_code(self):
        """@returns autoserv's exit status, or None if it has not exited."""
        self._get_pidfile_info()
        return self._state.exit_status
def num_tests_failed(self):
"""@returns The number of tests that failed or -1 if unknown."""
self._get_pidfile_info()
if self._state.num_tests_failed is None:
return -1
return self._state.num_tests_failed
    def try_copy_results_on_drone(self, **kwargs):
        """Copy results on the drone, if a process was recorded."""
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(), **kwargs)
    def try_copy_to_results_repository(self, source, **kwargs):
        """Copy results to the results repository, if a process was recorded."""
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)