beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 1 | #pylint: disable-msg=C0111 |
| 2 | |
| 3 | """ |
| 4 | Pidfile monitor. |
| 5 | """ |
| 6 | |
Alex Miller | 502d162 | 2014-01-29 17:06:35 -0800 | [diff] [blame] | 7 | import logging |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 8 | import time, traceback |
| 9 | from autotest_lib.client.common_lib import global_config |
Dan Shi | 80f7c53 | 2015-08-25 10:23:14 -0700 | [diff] [blame] | 10 | from autotest_lib.client.common_lib.cros.graphite import autotest_stats |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 11 | from autotest_lib.scheduler import drone_manager, email_manager |
| 12 | from autotest_lib.scheduler import scheduler_config |
| 13 | |
| 14 | |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 15 | |
def _get_pidfile_timeout_secs():
    """
    Look up how long to wait for autoserv to write its pidfile.

    The value is read from the 'pidfile_timeout_mins' option of the scheduler
    config section, which is expressed in minutes.

    @returns The timeout converted to seconds.
    """
    config = global_config.global_config
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    seconds_per_minute = 60
    return timeout_mins * seconds_per_minute
| 21 | |
| 22 | |
class PidfileRunMonitor(object):
    """
    Monitors a process through its pidfile.

    Before using a monitor, the client has to bind it to a process, either by
    calling run() to start a new one or by calling
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Signals unexpected behavior of the pid file. It is internal to the
        monitor and is never allowed to escape this class.
        """
| 34 | |
| 35 | |
| 36 | def __init__(self): |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 37 | self._drone_manager = drone_manager.instance() |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 38 | self.lost_process = False |
| 39 | self._start_time = None |
| 40 | self.pidfile_id = None |
Alex Miller | 502d162 | 2014-01-29 17:06:35 -0800 | [diff] [blame] | 41 | self._killed = False |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 42 | self._state = drone_manager.PidfileContents() |
| 43 | |
| 44 | |
| 45 | def _add_nice_command(self, command, nice_level): |
| 46 | if not nice_level: |
| 47 | return command |
| 48 | return ['nice', '-n', str(nice_level)] + command |
| 49 | |
| 50 | |
| 51 | def _set_start_time(self): |
| 52 | self._start_time = time.time() |
| 53 | |
| 54 | |
| 55 | def run(self, command, working_directory, num_processes, nice_level=None, |
| 56 | log_file=None, pidfile_name=None, paired_with_pidfile=None, |
| 57 | username=None, drone_hostnames_allowed=None): |
| 58 | assert command is not None |
| 59 | if nice_level is not None: |
| 60 | command = ['nice', '-n', str(nice_level)] + command |
| 61 | self._set_start_time() |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 62 | self.pidfile_id = self._drone_manager.execute_command( |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 63 | command, working_directory, pidfile_name=pidfile_name, |
| 64 | num_processes=num_processes, log_file=log_file, |
| 65 | paired_with_pidfile=paired_with_pidfile, username=username, |
| 66 | drone_hostnames_allowed=drone_hostnames_allowed) |
| 67 | |
| 68 | |
| 69 | def attach_to_existing_process(self, execution_path, |
| 70 | pidfile_name=drone_manager.AUTOSERV_PID_FILE, |
| 71 | num_processes=None): |
| 72 | self._set_start_time() |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 73 | self.pidfile_id = self._drone_manager.get_pidfile_id_from( |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 74 | execution_path, pidfile_name=pidfile_name) |
| 75 | if num_processes is not None: |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 76 | self._drone_manager.declare_process_count(self.pidfile_id, num_processes) |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 77 | |
| 78 | |
| 79 | def kill(self): |
| 80 | if self.has_process(): |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 81 | self._drone_manager.kill_process(self.get_process()) |
Alex Miller | 502d162 | 2014-01-29 17:06:35 -0800 | [diff] [blame] | 82 | self._killed = True |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 83 | |
| 84 | |
| 85 | def has_process(self): |
| 86 | self._get_pidfile_info() |
| 87 | return self._state.process is not None |
| 88 | |
| 89 | |
| 90 | def get_process(self): |
| 91 | self._get_pidfile_info() |
| 92 | assert self._state.process is not None |
| 93 | return self._state.process |
| 94 | |
| 95 | |
| 96 | def _read_pidfile(self, use_second_read=False): |
| 97 | assert self.pidfile_id is not None, ( |
| 98 | 'You must call run() or attach_to_existing_process()') |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 99 | contents = self._drone_manager.get_pidfile_contents( |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 100 | self.pidfile_id, use_second_read=use_second_read) |
| 101 | if contents.is_invalid(): |
| 102 | self._state = drone_manager.PidfileContents() |
| 103 | raise self._PidfileException(contents) |
| 104 | self._state = contents |
| 105 | |
| 106 | |
| 107 | def _handle_pidfile_error(self, error, message=''): |
Dan Shi | 80f7c53 | 2015-08-25 10:23:14 -0700 | [diff] [blame] | 108 | metadata = {'_type': 'scheduler_error', |
| 109 | 'error': 'autoserv died without writing exit code', |
| 110 | 'process': str(self._state.process), |
| 111 | 'pidfile_id': str(self.pidfile_id)} |
| 112 | autotest_stats.Counter('autoserv_died_without_writing_exit_code', |
| 113 | metadata=metadata).increment() |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 114 | self.on_lost_process(self._state.process) |
| 115 | |
| 116 | |
| 117 | def _get_pidfile_info_helper(self): |
| 118 | if self.lost_process: |
| 119 | return |
| 120 | |
| 121 | self._read_pidfile() |
| 122 | |
| 123 | if self._state.process is None: |
| 124 | self._handle_no_process() |
| 125 | return |
| 126 | |
| 127 | if self._state.exit_status is None: |
| 128 | # double check whether or not autoserv is running |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 129 | if self._drone_manager.is_process_running(self._state.process): |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 130 | return |
| 131 | |
| 132 | # pid but no running process - maybe process *just* exited |
| 133 | self._read_pidfile(use_second_read=True) |
| 134 | if self._state.exit_status is None: |
| 135 | # autoserv exited without writing an exit code |
| 136 | # to the pidfile |
| 137 | self._handle_pidfile_error( |
| 138 | 'autoserv died without writing exit code') |
| 139 | |
| 140 | |
| 141 | def _get_pidfile_info(self): |
| 142 | """\ |
| 143 | After completion, self._state will contain: |
| 144 | pid=None, exit_status=None if autoserv has not yet run |
| 145 | pid!=None, exit_status=None if autoserv is running |
| 146 | pid!=None, exit_status!=None if autoserv has completed |
| 147 | """ |
| 148 | try: |
| 149 | self._get_pidfile_info_helper() |
| 150 | except self._PidfileException, exc: |
| 151 | self._handle_pidfile_error('Pidfile error', traceback.format_exc()) |
| 152 | |
| 153 | |
| 154 | def _handle_no_process(self): |
| 155 | """\ |
| 156 | Called when no pidfile is found or no pid is in the pidfile. |
| 157 | """ |
| 158 | message = 'No pid found at %s' % self.pidfile_id |
| 159 | if time.time() - self._start_time > _get_pidfile_timeout_secs(): |
Alex Miller | 502d162 | 2014-01-29 17:06:35 -0800 | [diff] [blame] | 160 | # If we aborted the process, and we find that it has exited without |
| 161 | # writing a pidfile, then it's because we killed it, and thus this |
| 162 | # isn't a surprising situation. |
| 163 | if not self._killed: |
| 164 | email_manager.manager.enqueue_notify_email( |
| 165 | 'Process has failed to write pidfile', message) |
| 166 | else: |
| 167 | logging.warning("%s didn't exit after SIGTERM", self.pidfile_id) |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 168 | self.on_lost_process() |
| 169 | |
| 170 | |
| 171 | def on_lost_process(self, process=None): |
| 172 | """\ |
| 173 | Called when autoserv has exited without writing an exit status, |
| 174 | or we've timed out waiting for autoserv to write a pid to the |
| 175 | pidfile. In either case, we just return failure and the caller |
| 176 | should signal some kind of warning. |
| 177 | |
| 178 | process is unimportant here, as it shouldn't be used by anyone. |
| 179 | """ |
| 180 | self.lost_process = True |
| 181 | self._state.process = process |
| 182 | self._state.exit_status = 1 |
| 183 | self._state.num_tests_failed = 0 |
| 184 | |
| 185 | |
| 186 | def exit_code(self): |
| 187 | self._get_pidfile_info() |
| 188 | return self._state.exit_status |
| 189 | |
| 190 | |
| 191 | def num_tests_failed(self): |
| 192 | """@returns The number of tests that failed or -1 if unknown.""" |
| 193 | self._get_pidfile_info() |
| 194 | if self._state.num_tests_failed is None: |
| 195 | return -1 |
| 196 | return self._state.num_tests_failed |
| 197 | |
| 198 | |
| 199 | def try_copy_results_on_drone(self, **kwargs): |
| 200 | if self.has_process(): |
| 201 | # copy results logs into the normal place for job results |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 202 | self._drone_manager.copy_results_on_drone(self.get_process(), **kwargs) |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 203 | |
| 204 | |
| 205 | def try_copy_to_results_repository(self, source, **kwargs): |
| 206 | if self.has_process(): |
Jakob Jülich | 36accc6 | 2014-07-23 10:26:55 -0700 | [diff] [blame] | 207 | self._drone_manager.copy_to_results_repository(self.get_process(), |
beeps | 5e2bb4a | 2013-10-28 11:26:45 -0700 | [diff] [blame] | 208 | source, **kwargs) |
| 209 | |