blob: c064a110df7d3593c5eb6c2018cf30518fbc6717 [file] [log] [blame]
beeps5e2bb4a2013-10-28 11:26:45 -07001#pylint: disable-msg=C0111
2
3"""
4Pidfile monitor.
5"""
6
Alex Miller502d1622014-01-29 17:06:35 -08007import logging
beeps5e2bb4a2013-10-28 11:26:45 -07008import time, traceback
9from autotest_lib.client.common_lib import global_config
Dan Shi80f7c532015-08-25 10:23:14 -070010from autotest_lib.client.common_lib.cros.graphite import autotest_stats
beeps5e2bb4a2013-10-28 11:26:45 -070011from autotest_lib.scheduler import drone_manager, email_manager
12from autotest_lib.scheduler import scheduler_config
13
14
beeps5e2bb4a2013-10-28 11:26:45 -070015
def _get_pidfile_timeout_secs():
    """@returns How long to wait for autoserv to write pidfile."""
    timeout_mins = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    # Config value is in minutes; callers want seconds.
    return 60 * timeout_mins
21
22
class PidfileRunMonitor(object):
    """
    Monitor a process through the pidfile autoserv writes.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        # True once we've given up on the process (see on_lost_process()).
        self.lost_process = False
        # Recorded by run()/attach_to_existing_process(); used to decide when
        # to time out waiting for the pidfile to appear.
        self._start_time = None
        self.pidfile_id = None
        # Set by kill(); lets _handle_no_process() distinguish a deliberate
        # abort from a process that genuinely failed to write its pidfile.
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """Prepend 'nice -n <nice_level>' to command, if nice_level is set."""
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """Start a new process via the drone manager and monitor its pidfile.

        @param command: Command (list of args) to execute.
        @param working_directory: Directory the pidfile is written under.
        @param num_processes: Process count to declare for this command.
        @param nice_level: Optional niceness to run the command at.
        @param log_file: Optional file to log output to.
        @param pidfile_name: Optional name of the pidfile to monitor.
        @param paired_with_pidfile: Optional pidfile id to pair with.
        @param username: Optional user to run the command as.
        @param drone_hostnames_allowed: Optional iterable of allowed drones.
        """
        assert command is not None
        # Fix: use the existing helper instead of duplicating its logic
        # inline.  (For nice_level=0 the helper skips the prefix, but
        # 'nice -n 0' is a no-op, so behavior is unchanged.)
        command = self._add_nice_command(command, nice_level)
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """Begin monitoring an already-running process via its pidfile.

        @param execution_path: Results path the pidfile lives under.
        @param pidfile_name: Name of the pidfile to monitor.
        @param num_processes: If not None, declare this process count to the
                drone manager.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the monitored process, if one is running."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns True if the monitored process has a pid on record."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """@returns The monitored process; only valid when has_process()."""
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """Refresh self._state from the pidfile contents.

        @param use_second_read: Passed to the drone manager to force a fresh
                read when double-checking a process's fate.
        @raises _PidfileException: If the pidfile contents are invalid.
        """
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            # Reset to an empty state so stale data isn't reused.
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """Record an unexpected pidfile state and give up on the process.

        @param error: Short description of what went wrong.
        @param message: Optional detail, e.g. a formatted traceback.
        """
        # Fix: both arguments used to be silently ignored, which made pidfile
        # failures (and the traceback passed by _get_pidfile_info) impossible
        # to diagnose from the logs.
        logging.error('Pidfile error (pidfile_id=%s): %s\n%s',
                      self.pidfile_id, error, message)
        metadata = {'_type': 'scheduler_error',
                    'error': 'autoserv died without writing exit code',
                    'process': str(self._state.process),
                    'pidfile_id': str(self.pidfile_id)}
        autotest_stats.Counter('autoserv_died_without_writing_exit_code',
                               metadata=metadata).increment()
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """Read the pidfile and resolve ambiguous states (may raise)."""
        if self.lost_process:
            # Already gave up; self._state holds the synthetic failure result.
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
            pid=None, exit_status=None if autoserv has not yet run
            pid!=None, exit_status=None if autoserv is running
            pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # The caught exception was previously bound (with deprecated
            # 'except X, exc' syntax) but never used; the traceback is
            # forwarded to the error handler instead.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        message = 'No pid found at %s' % self.pidfile_id
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                email_manager.manager.enqueue_notify_email(
                    'Process has failed to write pidfile', message)
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile. In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        # Synthesize a failing exit status so callers see a completed process.
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The process exit status, or None if still running."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """Copy results logs into the normal place for job results, if we
        have a process to copy from."""
        if self.has_process():
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """Copy source into the results repository, if we have a process."""
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
209