blob: 1b61e1caecb10aac7a4dead1ecb7fd3d43405be2 [file] [log] [blame]
beeps5e2bb4a2013-10-28 11:26:45 -07001#pylint: disable-msg=C0111
2
3"""
4Pidfile monitor.
5"""
6
Alex Miller502d1622014-01-29 17:06:35 -08007import logging
Aviv Keshet82c508c2017-01-03 14:10:17 -08008import time
9import traceback
10
11import common
12
Dan Shi5e2efb72017-02-07 11:40:23 -080013from autotest_lib.client.common_lib import utils
beeps5e2bb4a2013-10-28 11:26:45 -070014from autotest_lib.client.common_lib import global_config
Aviv Keshet82c508c2017-01-03 14:10:17 -080015from autotest_lib.scheduler import drone_manager
beeps5e2bb4a2013-10-28 11:26:45 -070016from autotest_lib.scheduler import scheduler_config
17
Dan Shi5e2efb72017-02-07 11:40:23 -080018try:
19 from chromite.lib import metrics
20except ImportError:
21 metrics = utils.metrics_mock
beeps5e2bb4a2013-10-28 11:26:45 -070022
beeps5e2bb4a2013-10-28 11:26:45 -070023
def _get_pidfile_timeout_secs():
    """@returns Seconds to wait for autoserv to write its pidfile."""
    # The setting is stored in minutes in the scheduler config section.
    config = global_config.global_config
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return timeout_mins * 60
29
30
class PidfileRunMonitor(object):
    """
    Monitors a process through the contents of its pidfile, as reported by
    the drone manager.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        # Becomes True in on_lost_process(); once set, the pidfile is no
        # longer re-read (see _get_pidfile_info_helper).
        self.lost_process = False
        # time.time() recorded by run() / attach_to_existing_process().
        self._start_time = None
        # Set by run() / attach_to_existing_process(); None until then.
        self.pidfile_id = None
        # True once kill() has asked the drone manager to kill the process.
        self._killed = False
        # Last pidfile contents read (or synthesized by on_lost_process()).
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """
        Prefix a command with `nice -n <nice_level>`.

        Note that run() builds its prefix inline and, unlike this helper,
        also adds the prefix when nice_level is 0.

        @param command: List of command arguments.
        @param nice_level: Niceness to apply; any falsy value (None, 0)
                leaves the command untouched.
        @returns The command list, prefixed if nice_level is truthy.
        """
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        """Record the current time as the start of pidfile monitoring."""
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """
        Start a new process through the drone manager and monitor it.

        @param command: List of command arguments; must not be None.
        @param nice_level: If not None, the command is run under
                `nice -n <nice_level>`.

        All remaining arguments are forwarded unchanged to
        drone_manager's execute_command(). Its return value is stored as
        self.pidfile_id.
        """
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
                command, working_directory, pidfile_name=pidfile_name,
                num_processes=num_processes, log_file=log_file,
                paired_with_pidfile=paired_with_pidfile, username=username,
                drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """
        Monitor a process that was started elsewhere.

        @param execution_path: Passed to drone_manager's
                get_pidfile_id_from() to look up the pidfile id.
        @param pidfile_name: Name of the pidfile to look up.
        @param num_processes: If not None, declared to the drone manager
                as the process count for this pidfile.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
                execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(
                    self.pidfile_id, num_processes)


    def kill(self):
        """Kill the monitored process, if the pidfile names one."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            # Remembered so that a missing pidfile after the kill is not
            # counted as an error in _handle_no_process().
            self._killed = True


    def has_process(self):
        """@returns True if a process is known after refreshing state."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """
        @returns The process recorded in the current pidfile state.

        Asserts that a process is known; check has_process() first.
        """
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """
        Load the pidfile contents from the drone manager into self._state.

        @param use_second_read: Forwarded to get_pidfile_contents().
        @raises _PidfileException: if the contents report themselves as
                invalid; self._state is reset to empty contents first.
        """
        assert self.pidfile_id is not None, (
                'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
                self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """
        Report a pidfile problem and mark the process as lost.

        @param error: Short description of what went wrong.
        @param message: Optional details (e.g. a formatted traceback).
        """
        # Surface the details instead of dropping them. This runs at most
        # once per monitor because on_lost_process() sets lost_process,
        # which short-circuits later refreshes.
        logging.warning('%s (pidfile %s); treating process as lost. %s',
                        error, self.pidfile_id, message)
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """
        Refresh self._state from the pidfile.

        May raise _PidfileException (from _read_pidfile); callers should
        go through _get_pidfile_info() instead.
        """
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                        'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # Never let the internal exception escape this class.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """
        Called when no pidfile is found or no pid is in the pidfile.

        Does nothing until the configured pidfile timeout has elapsed
        since the start time; after that the process is declared lost.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The exit status from the refreshed state, or None."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """
        Ask the drone manager to copy results on the drone, if a process
        is known. kwargs are forwarded to copy_results_on_drone().
        """
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(
                    self.get_process(), **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """
        Ask the drone manager to copy source to the results repository,
        if a process is known. kwargs are forwarded unchanged.
        """
        if self.has_process():
            self._drone_manager.copy_to_results_repository(
                    self.get_process(), source, **kwargs)
210