Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 1 | import collections |
| 2 | import heapq |
| 3 | import os |
| 4 | import Queue |
| 5 | import time |
| 6 | import threading |
| 7 | import traceback |
| 8 | import logging |
Michael Liang | da8c60a | 2014-06-03 13:24:51 -0700 | [diff] [blame] | 9 | |
| 10 | import common |
Simran Basi | cced309 | 2012-08-02 15:09:23 -0700 | [diff] [blame] | 11 | from autotest_lib.client.common_lib import error, global_config, utils |
Michael Liang | da8c60a | 2014-06-03 13:24:51 -0700 | [diff] [blame] | 12 | from autotest_lib.client.common_lib.cros.graphite import stats |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 13 | from autotest_lib.scheduler import email_manager, drone_utility, drones |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 14 | from autotest_lib.scheduler import scheduler_config |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 15 | from autotest_lib.scheduler import thread_lib |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 16 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 17 | |
# results on drones will be placed under the drone_installation_directory in a
# directory with this name
_DRONE_RESULTS_DIR_SUFFIX = 'results'

# A unique object() instance, so it can only ever match itself by identity.
# NOTE(review): presumably used as a placeholder argument value; see
# execute_command() for how it is interpreted.
WORKING_DIRECTORY = object() # see execute_command()


# File names of the pidfiles, one per kind of process named in the constant
# (autoserv, crashinfo collection, parser, archiver).
AUTOSERV_PID_FILE = '.autoserv_execute'
CRASHINFO_PID_FILE = '.collect_crashinfo_execute'
PARSER_PID_FILE = '.parser_execute'
ARCHIVER_PID_FILE = '.archiver_execute'

# Tuple of every pidfile name defined above.
ALL_PIDFILE_NAMES = (AUTOSERV_PID_FILE, CRASHINFO_PID_FILE, PARSER_PID_FILE,
                     ARCHIVER_PID_FILE)
| 32 | |
| 33 | |
class DroneManagerError(Exception):
    """Error raised by the drone manager itself (e.g. no usable drones)."""
| 36 | |
| 37 | |
class CustomEquals(object):
    """Base class providing equality and hashing derived from _id().

    Subclasses implement _id() to return a hashable value; two instances
    compare equal when their _id() values are equal.
    """
    def _id(self):
        """Return the value that identifies this object; must be overridden."""
        raise NotImplementedError


    def __eq__(self, other):
        # Only objects of our own type (or a subclass of it) are comparable;
        # for anything else let Python try the reflected comparison.
        if isinstance(other, type(self)):
            return self._id() == other._id()
        return NotImplemented


    def __ne__(self, other):
        if self == other:
            return False
        return True


    def __hash__(self):
        # Hash on the same key used for equality so equal objects collide.
        identity = self._id()
        return hash(identity)
| 55 | |
| 56 | |
class Process(CustomEquals):
    """A process on a drone, identified by its (hostname, pid) pair.

    ppid is stored but is not part of the identity used for ==/hash.
    """
    def __init__(self, hostname, pid, ppid=None):
        self.hostname, self.pid, self.ppid = hostname, pid, ppid


    def _id(self):
        return (self.hostname, self.pid)


    def __str__(self):
        return '%s/%s' % (self.hostname, self.pid)


    def __repr__(self):
        # Default object repr followed by the readable hostname/pid form.
        default_repr = super(Process, self).__repr__()
        return default_repr + '<%s>' % self
| 73 | |
| 74 | |
class PidfileId(CustomEquals):
    """Identifier for a pidfile; identity (==/hash) is the pidfile path."""
    def __init__(self, path):
        self.path = path


    def _id(self):
        pidfile_path = self.path
        return pidfile_path


    def __str__(self):
        pidfile_path = self.path
        return str(pidfile_path)
| 86 | |
| 87 | |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 88 | class _PidfileInfo(object): |
| 89 | age = 0 |
| 90 | num_processes = None |
| 91 | |
| 92 | |
class PidfileContents(object):
    """Parsed contents of a pidfile.

    All attributes default to None, which is what an empty (not yet written)
    pidfile yields.
    """
    # Process object for the pid recorded in the pidfile.
    process = None
    # Exit status recorded in the pidfile; None until one has been parsed.
    exit_status = None
    # Failed-test count recorded in the pidfile; None until parsed.
    num_tests_failed = None


    def is_invalid(self):
        """@returns: Always False; InvalidPidfile is the invalid counterpart."""
        return False


    def is_running(self):
        """Tell whether the pidfile describes a process that has not exited.

        @returns: True if a process is recorded and no exit status has been
                recorded yet, False otherwise.
        """
        # Compare against None explicitly: an exit status of 0 (success) is
        # falsy, so a plain truthiness test would report a cleanly finished
        # process as still running.
        return bool(self.process) and self.exit_status is None
| 104 | |
| 105 | |
class InvalidPidfile(object):
    """Result object for a pidfile whose contents could not be parsed.

    Exposes the same attributes and query methods as PidfileContents so
    callers can treat both uniformly; the attributes always stay None.
    """
    process = None
    exit_status = None
    num_tests_failed = None


    def __init__(self, error):
        # Description of what was wrong with the pidfile.
        self.error = error


    def is_invalid(self):
        return True


    def is_running(self):
        return False


    def __str__(self):
        message = self.error
        return message
| 126 | |
| 127 | |
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 128 | class _DroneHeapWrapper(object): |
| 129 | """Wrapper to compare drones based on used_capacity(). |
| 130 | |
| 131 | These objects can be used to keep a heap of drones by capacity. |
| 132 | """ |
| 133 | def __init__(self, drone): |
| 134 | self.drone = drone |
| 135 | |
| 136 | |
| 137 | def __cmp__(self, other): |
| 138 | assert isinstance(other, _DroneHeapWrapper) |
| 139 | return cmp(self.drone.used_capacity(), other.drone.used_capacity()) |
| 140 | |
| 141 | |
Simran Basi | cced309 | 2012-08-02 15:09:23 -0700 | [diff] [blame] | 142 | class BaseDroneManager(object): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 143 | """ |
| 144 | This class acts as an interface from the scheduler to drones, whether it be |
| 145 | only a single "drone" for localhost or multiple remote drones. |
| 146 | |
| 147 | All paths going into and out of this class are relative to the full results |
| 148 | directory, except for those returns by absolute_path(). |
| 149 | """ |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 150 | |
| 151 | |
| 152 | # Minimum time to wait before next email |
| 153 | # about a drone hitting process limit is sent. |
| 154 | NOTIFY_INTERVAL = 60 * 60 * 24 # one day |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 155 | _STATS_KEY = 'drone_manager' |
| 156 | _timer = stats.Timer(_STATS_KEY) |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 157 | |
| 158 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 159 | def __init__(self): |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 160 | # absolute path of base results dir |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 161 | self._results_dir = None |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 162 | # holds Process objects |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 163 | self._process_set = set() |
Alex Miller | e76e225 | 2013-08-15 09:24:27 -0700 | [diff] [blame] | 164 | # holds the list of all processes running on all drones |
| 165 | self._all_processes = {} |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 166 | # maps PidfileId to PidfileContents |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 167 | self._pidfiles = {} |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 168 | # same as _pidfiles |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 169 | self._pidfiles_second_read = {} |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 170 | # maps PidfileId to _PidfileInfo |
| 171 | self._registered_pidfile_info = {} |
| 172 | # used to generate unique temporary paths |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 173 | self._temporary_path_counter = 0 |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 174 | # maps hostname to Drone object |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 175 | self._drones = {} |
| 176 | self._results_drone = None |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 177 | # maps results dir to dict mapping file path to contents |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 178 | self._attached_files = {} |
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 179 | # heapq of _DroneHeapWrappers |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 180 | self._drone_queue = [] |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 181 | # map drone hostname to time stamp of email that |
| 182 | # has been sent about the drone hitting process limit. |
| 183 | self._notify_record = {} |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 184 | # A threaded task queue used to refresh drones asynchronously. |
| 185 | self._refresh_task_queue = thread_lib.ThreadedTaskQueue( |
| 186 | name='%s.refresh_queue' % self._STATS_KEY) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 187 | |
| 188 | |
| 189 | def initialize(self, base_results_dir, drone_hostnames, |
| 190 | results_repository_hostname): |
| 191 | self._results_dir = base_results_dir |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 192 | |
| 193 | for hostname in drone_hostnames: |
Eric Li | 861b2d5 | 2011-02-04 14:50:35 -0800 | [diff] [blame] | 194 | self._add_drone(hostname) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 195 | |
| 196 | if not self._drones: |
| 197 | # all drones failed to initialize |
| 198 | raise DroneManagerError('No valid drones found') |
| 199 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 200 | self.refresh_drone_configs() |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 201 | |
showard | 4460ee8 | 2009-07-07 20:54:29 +0000 | [diff] [blame] | 202 | logging.info('Using results repository on %s', |
showard | b18134f | 2009-03-20 20:52:18 +0000 | [diff] [blame] | 203 | results_repository_hostname) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 204 | self._results_drone = drones.get_drone(results_repository_hostname) |
showard | ac5b000 | 2009-10-19 18:34:00 +0000 | [diff] [blame] | 205 | results_installation_dir = global_config.global_config.get_config_value( |
| 206 | scheduler_config.CONFIG_SECTION, |
| 207 | 'results_host_installation_directory', default=None) |
| 208 | if results_installation_dir: |
| 209 | self._results_drone.set_autotest_install_dir( |
| 210 | results_installation_dir) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 211 | # don't initialize() the results drone - we don't want to clear out any |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 212 | # directories and we don't need to kill any processes |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 213 | |
| 214 | |
| 215 | def reinitialize_drones(self): |
| 216 | self._call_all_drones('initialize', self._results_dir) |
| 217 | |
| 218 | |
| 219 | def shutdown(self): |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 220 | for drone in self.get_drones(): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 221 | drone.shutdown() |
| 222 | |
| 223 | |
showard | 8d3dbca | 2009-09-25 20:29:38 +0000 | [diff] [blame] | 224 | def _get_max_pidfile_refreshes(self): |
| 225 | """ |
| 226 | Normally refresh() is called on every monitor_db.Dispatcher.tick(). |
| 227 | |
| 228 | @returns: The number of refresh() calls before we forget a pidfile. |
| 229 | """ |
| 230 | pidfile_timeout = global_config.global_config.get_config_value( |
| 231 | scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes', |
| 232 | type=int, default=2000) |
| 233 | return pidfile_timeout |
| 234 | |
| 235 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 236 | def _add_drone(self, hostname): |
showard | b18134f | 2009-03-20 20:52:18 +0000 | [diff] [blame] | 237 | logging.info('Adding drone %s' % hostname) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 238 | drone = drones.get_drone(hostname) |
Eric Li | 861b2d5 | 2011-02-04 14:50:35 -0800 | [diff] [blame] | 239 | if drone: |
| 240 | self._drones[drone.hostname] = drone |
| 241 | drone.call('initialize', self.absolute_path('')) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 242 | |
| 243 | |
| 244 | def _remove_drone(self, hostname): |
| 245 | self._drones.pop(hostname, None) |
| 246 | |
| 247 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 248 | def refresh_drone_configs(self): |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 249 | """ |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 250 | Reread global config options for all drones. |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 251 | """ |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 252 | config = global_config.global_config |
| 253 | section = scheduler_config.CONFIG_SECTION |
| 254 | config.parse_config_file() |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 255 | for hostname, drone in self._drones.iteritems(): |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 256 | disabled = config.get_config_value( |
showard | 9bb960b | 2009-11-19 01:02:11 +0000 | [diff] [blame] | 257 | section, '%s_disabled' % hostname, default='') |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 258 | drone.enabled = not bool(disabled) |
| 259 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 260 | drone.max_processes = config.get_config_value( |
showard | 9bb960b | 2009-11-19 01:02:11 +0000 | [diff] [blame] | 261 | section, '%s_max_processes' % hostname, type=int, |
| 262 | default=scheduler_config.config.max_processes_per_drone) |
| 263 | |
| 264 | allowed_users = config.get_config_value( |
| 265 | section, '%s_users' % hostname, default=None) |
| 266 | if allowed_users is not None: |
showard | 1b7142d | 2010-01-15 00:21:37 +0000 | [diff] [blame] | 267 | allowed_users = set(allowed_users.split()) |
| 268 | drone.allowed_users = allowed_users |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 269 | |
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 270 | self._reorder_drone_queue() # max_processes may have changed |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 271 | # Clear notification record about reaching max_processes limit. |
| 272 | self._notify_record = {} |
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 273 | |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 274 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 275 | def get_drones(self): |
| 276 | return self._drones.itervalues() |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 277 | |
| 278 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 279 | def _get_drone_for_process(self, process): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 280 | return self._drones[process.hostname] |
| 281 | |
| 282 | |
| 283 | def _get_drone_for_pidfile_id(self, pidfile_id): |
| 284 | pidfile_contents = self.get_pidfile_contents(pidfile_id) |
| 285 | assert pidfile_contents.process is not None |
| 286 | return self._get_drone_for_process(pidfile_contents.process) |
| 287 | |
| 288 | |
| 289 | def _drop_old_pidfiles(self): |
showard | d349624 | 2009-12-10 21:41:43 +0000 | [diff] [blame] | 290 | # use items() since the dict is modified in unregister_pidfile() |
| 291 | for pidfile_id, info in self._registered_pidfile_info.items(): |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 292 | if info.age > self._get_max_pidfile_refreshes(): |
showard | f85a0b7 | 2009-10-07 20:48:45 +0000 | [diff] [blame] | 293 | logging.warning('dropping leaked pidfile %s', pidfile_id) |
| 294 | self.unregister_pidfile(pidfile_id) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 295 | else: |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 296 | info.age += 1 |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 297 | |
| 298 | |
| 299 | def _reset(self): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 300 | self._process_set = set() |
Alex Miller | e76e225 | 2013-08-15 09:24:27 -0700 | [diff] [blame] | 301 | self._all_processes = {} |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 302 | self._pidfiles = {} |
| 303 | self._pidfiles_second_read = {} |
| 304 | self._drone_queue = [] |
| 305 | |
| 306 | |
| 307 | def _call_all_drones(self, method, *args, **kwargs): |
| 308 | all_results = {} |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 309 | for drone in self.get_drones(): |
Prashanth B | f78a6fb | 2014-06-10 16:09:40 -0700 | [diff] [blame] | 310 | with self._timer.get_client( |
| 311 | '%s.%s' % (drone.hostname.replace('.', '_'), method)): |
| 312 | all_results[drone] = drone.call(method, *args, **kwargs) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 313 | return all_results |
| 314 | |
| 315 | |
| 316 | def _parse_pidfile(self, drone, raw_contents): |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 317 | """Parse raw pidfile contents. |
| 318 | |
| 319 | @param drone: The drone on which this pidfile was found. |
| 320 | @param raw_contents: The raw contents of a pidfile, eg: |
| 321 | "pid\nexit_staus\nnum_tests_failed\n". |
| 322 | """ |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 323 | contents = PidfileContents() |
| 324 | if not raw_contents: |
| 325 | return contents |
| 326 | lines = raw_contents.splitlines() |
| 327 | if len(lines) > 3: |
| 328 | return InvalidPidfile('Corrupt pid file (%d lines):\n%s' % |
| 329 | (len(lines), lines)) |
| 330 | try: |
| 331 | pid = int(lines[0]) |
| 332 | contents.process = Process(drone.hostname, pid) |
| 333 | # if len(lines) == 2, assume we caught Autoserv between writing |
| 334 | # exit_status and num_failed_tests, so just ignore it and wait for |
| 335 | # the next cycle |
| 336 | if len(lines) == 3: |
| 337 | contents.exit_status = int(lines[1]) |
| 338 | contents.num_tests_failed = int(lines[2]) |
| 339 | except ValueError, exc: |
| 340 | return InvalidPidfile('Corrupt pid file: ' + str(exc.args)) |
| 341 | |
| 342 | return contents |
| 343 | |
| 344 | |
| 345 | def _process_pidfiles(self, drone, pidfiles, store_in_dict): |
| 346 | for pidfile_path, contents in pidfiles.iteritems(): |
| 347 | pidfile_id = PidfileId(pidfile_path) |
| 348 | contents = self._parse_pidfile(drone, contents) |
| 349 | store_in_dict[pidfile_id] = contents |
| 350 | |
| 351 | |
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 352 | def _add_process(self, drone, process_info): |
| 353 | process = Process(drone.hostname, int(process_info['pid']), |
| 354 | int(process_info['ppid'])) |
| 355 | self._process_set.add(process) |
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 356 | |
| 357 | |
| 358 | def _add_autoserv_process(self, drone, process_info): |
| 359 | assert process_info['comm'] == 'autoserv' |
| 360 | # only root autoserv processes have pgid == pid |
| 361 | if process_info['pgid'] != process_info['pid']: |
| 362 | return |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 363 | self._add_process(drone, process_info) |
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 364 | |
| 365 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 366 | def _enqueue_drone(self, drone): |
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 367 | heapq.heappush(self._drone_queue, _DroneHeapWrapper(drone)) |
| 368 | |
| 369 | |
| 370 | def _reorder_drone_queue(self): |
| 371 | heapq.heapify(self._drone_queue) |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 372 | |
| 373 | |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 374 | def _compute_active_processes(self, drone): |
| 375 | drone.active_processes = 0 |
| 376 | for pidfile_id, contents in self._pidfiles.iteritems(): |
| 377 | is_running = contents.exit_status is None |
| 378 | on_this_drone = (contents.process |
| 379 | and contents.process.hostname == drone.hostname) |
| 380 | if is_running and on_this_drone: |
| 381 | info = self._registered_pidfile_info[pidfile_id] |
| 382 | if info.num_processes is not None: |
| 383 | drone.active_processes += info.num_processes |
| 384 | |
| 385 | |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 386 | def _check_drone_process_limit(self, drone): |
| 387 | """ |
| 388 | Notify if the number of processes on |drone| is approaching limit. |
| 389 | |
| 390 | @param drone: A Drone object. |
| 391 | """ |
Alex Miller | da713d9 | 2013-12-06 10:02:43 -0800 | [diff] [blame] | 392 | try: |
| 393 | percent = float(drone.active_processes) / drone.max_processes |
| 394 | except ZeroDivisionError: |
| 395 | percent = 100 |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 396 | max_percent = scheduler_config.config.max_processes_warning_threshold |
| 397 | if percent >= max_percent: |
| 398 | message = ('Drone %s is hitting %s of process limit.' % |
| 399 | (drone.hostname, format(percent, '.2%'))) |
| 400 | logging.warning(message) |
| 401 | last_notified = self._notify_record.get(drone.hostname, 0) |
| 402 | now = time.time() |
| 403 | if last_notified + BaseDroneManager.NOTIFY_INTERVAL < now: |
| 404 | body = ('Active processes/Process limit: %d/%d (%s)' % |
| 405 | (drone.active_processes, drone.max_processes, |
| 406 | format(percent, '.2%'))) |
| 407 | email_manager.manager.enqueue_notify_email(message, body) |
| 408 | self._notify_record[drone.hostname] = now |
| 409 | |
| 410 | |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 411 | def trigger_refresh(self): |
| 412 | """Triggers a drone manager refresh. |
| 413 | |
| 414 | @raises DroneManagerError: If a drone has un-executed calls. |
| 415 | Since they will get clobbered when we queue refresh calls. |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 416 | """ |
| 417 | self._reset() |
showard | bf9695d | 2009-07-06 20:22:24 +0000 | [diff] [blame] | 418 | self._drop_old_pidfiles() |
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 419 | pidfile_paths = [pidfile_id.path |
| 420 | for pidfile_id in self._registered_pidfile_info] |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 421 | drones = list(self.get_drones()) |
| 422 | for drone in drones: |
| 423 | calls = drone.get_calls() |
| 424 | if calls: |
| 425 | raise DroneManagerError('Drone %s has un-executed calls: %s ' |
| 426 | 'which might get corrupted through ' |
| 427 | 'this invocation' % |
| 428 | (drone, [str(call) for call in calls])) |
| 429 | drone.queue_call('refresh', pidfile_paths) |
| 430 | logging.info("Invoking drone refresh.") |
| 431 | with self._timer.get_client('trigger_refresh'): |
| 432 | self._refresh_task_queue.execute(drones, wait=False) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 433 | |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 434 | |
    def sync_refresh(self):
        """Complete the drone refresh started by trigger_refresh.

        Collects the results from the refresh task queue, then refreshes the
        internal data structures (process set, all-processes map, both
        pidfile dicts, per-drone active process counts and the drone queue)
        with the process information each drone returned.
        """

        # This gives us a dictionary like what follows:
        # {drone: [{'pidfiles': (raw contents of pidfile paths),
        #           'all_processes': (all process info from ps),
        #           'parse_processes': (parse process info from ps),
        #           'pidfiles_second_read': (pidfile contents, again),}]
        #  drone2: ...}
        # Those are the keys read below; the drone may return others.
        # The values of each drone are only a list because this adheres to the
        # drone utility interface (each call is executed and its results are
        # placed in a list, but since we never couple the refresh calls with
        # any other call, this list will always contain a single dict).
        with self._timer.get_client('sync_refresh'):
            all_results = self._refresh_task_queue.get_results()
        logging.info("Drones refreshed.")

        # The loop below goes through and parses pidfile contents. Pidfiles
        # are used to track autoserv execution, and will always contain at
        # most 3 lines of the following: pid, exit code, number of failed
        # tests. Each pidfile
        # is identified by a PidfileId object, which contains a unique pidfile
        # path (unique because it contains the job id) making it hashable.
        # All pidfiles are stored in the drone managers _pidfiles dict as:
        #   {pidfile_id: pidfile_contents(Process(drone, pid),
        #                                 exit_code, num_tests_failed)}
        # In handle agents, each agent knows its pidfile_id, and uses this
        # to retrieve the refreshed contents of its pidfile via the
        # PidfileRunMonitor (through its tick) before making decisions. If
        # the agent notices that its process has exited, it unregisters the
        # pidfile from the drone_managers._registered_pidfile_info dict
        # through its epilog.
        for drone, results_list in all_results.iteritems():
            results = results_list[0]
            # Dots are replaced so the hostname can be embedded in the timer
            # names used below.
            drone_hostname = drone.hostname.replace('.', '_')

            with self._timer.get_client('%s.results' % drone_hostname):
                for process_info in results['all_processes']:
                    if process_info['comm'] == 'autoserv':
                        self._add_autoserv_process(drone, process_info)
                    # Every process, autoserv or not, is indexed by
                    # (hostname, pid).
                    drone_pid = drone.hostname, int(process_info['pid'])
                    self._all_processes[drone_pid] = process_info

                for process_info in results['parse_processes']:
                    self._add_process(drone, process_info)

            with self._timer.get_client('%s.pidfiles' % drone_hostname):
                self._process_pidfiles(drone, results['pidfiles'],
                                       self._pidfiles)
            with self._timer.get_client('%s.pidfiles_second' % drone_hostname):
                self._process_pidfiles(drone, results['pidfiles_second_read'],
                                       self._pidfiles_second_read)

            # Must run after self._pidfiles has been filled in for this
            # drone, since the count is derived from it.
            self._compute_active_processes(drone)
            if drone.enabled:
                self._enqueue_drone(drone)
                self._check_drone_process_limit(drone)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 496 | |
| 497 | |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 498 | def refresh(self): |
| 499 | """Refresh all drones.""" |
| 500 | with self._timer.get_client('refresh'): |
| 501 | self.trigger_refresh() |
| 502 | self.sync_refresh() |
| 503 | |
| 504 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 505 | def execute_actions(self): |
| 506 | """ |
| 507 | Called at the end of a scheduler cycle to execute all queued actions |
| 508 | on drones. |
| 509 | """ |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame^] | 510 | # Invoke calls queued on all drones since the last call to execute |
| 511 | # and wait for them to return. |
| 512 | thread_lib.ThreadedTaskQueue( |
| 513 | name='%s.execute_queue' % self._STATS_KEY).execute( |
| 514 | self._drones.values()) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 515 | |
| 516 | try: |
mbligh | 1ef218d | 2009-08-03 16:57:56 +0000 | [diff] [blame] | 517 | self._results_drone.execute_queued_calls() |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 518 | except error.AutoservError: |
| 519 | warning = ('Results repository failed to execute calls:\n' + |
| 520 | traceback.format_exc()) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 521 | email_manager.manager.enqueue_notify_email( |
| 522 | 'Results repository error', warning) |
| 523 | self._results_drone.clear_call_queue() |
| 524 | |
| 525 | |
| 526 | def get_orphaned_autoserv_processes(self): |
| 527 | """ |
showard | d3dc199 | 2009-04-22 21:01:40 +0000 | [diff] [blame] | 528 | Returns a set of Process objects for orphaned processes only. |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 529 | """ |
showard | d3dc199 | 2009-04-22 21:01:40 +0000 | [diff] [blame] | 530 | return set(process for process in self._process_set |
| 531 | if process.ppid == 1) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 532 | |
| 533 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 534 | def kill_process(self, process): |
| 535 | """ |
| 536 | Kill the given process. |
| 537 | """ |
showard | d3dc199 | 2009-04-22 21:01:40 +0000 | [diff] [blame] | 538 | logging.info('killing %s', process) |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 539 | drone = self._get_drone_for_process(process) |
| 540 | drone.queue_call('kill_process', process) |
| 541 | |
| 542 | |
| 543 | def _ensure_directory_exists(self, path): |
| 544 | if not os.path.exists(path): |
| 545 | os.makedirs(path) |
| 546 | |
| 547 | |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 548 | def total_running_processes(self): |
| 549 | return sum(drone.active_processes for drone in self.get_drones()) |
| 550 | |
| 551 | |
jamesren | 76fcf19 | 2010-04-21 20:39:50 +0000 | [diff] [blame] | 552 | def max_runnable_processes(self, username, drone_hostnames_allowed): |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 553 | """ |
| 554 | Return the maximum number of processes that can be run (in a single |
| 555 | execution) given the current load on drones. |
showard | 9bb960b | 2009-11-19 01:02:11 +0000 | [diff] [blame] | 556 | @param username: login of user to run a process. may be None. |
jamesren | 76fcf19 | 2010-04-21 20:39:50 +0000 | [diff] [blame] | 557 | @param drone_hostnames_allowed: list of drones that can be used. May be |
| 558 | None |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 559 | """ |
showard | 1b7142d | 2010-01-15 00:21:37 +0000 | [diff] [blame] | 560 | usable_drone_wrappers = [wrapper for wrapper in self._drone_queue |
jamesren | 76fcf19 | 2010-04-21 20:39:50 +0000 | [diff] [blame] | 561 | if wrapper.drone.usable_by(username) and |
| 562 | (drone_hostnames_allowed is None or |
| 563 | wrapper.drone.hostname in |
| 564 | drone_hostnames_allowed)] |
showard | 1b7142d | 2010-01-15 00:21:37 +0000 | [diff] [blame] | 565 | if not usable_drone_wrappers: |
| 566 | # all drones disabled or inaccessible |
showard | de700d3 | 2009-02-25 00:12:42 +0000 | [diff] [blame] | 567 | return 0 |
jamesren | 37b5045 | 2010-03-25 20:38:56 +0000 | [diff] [blame] | 568 | runnable_processes = [ |
| 569 | wrapper.drone.max_processes - wrapper.drone.active_processes |
| 570 | for wrapper in usable_drone_wrappers] |
| 571 | return max([0] + runnable_processes) |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 572 | |
| 573 | |
def _least_loaded_drone(self, drones):
    """
    Return the drone whose used_capacity() is smallest. On a tie the
    drone appearing earliest in the sequence wins.

    @param drones: non-empty sequence of drones (indexing an empty
            sequence raises IndexError).
    """
    best = drones[0]
    for candidate in drones[1:]:
        best = (candidate
                if candidate.used_capacity() < best.used_capacity()
                else best)
    return best
| 580 | |
| 581 | |
def _choose_drone_for_execution(self, num_processes, username,
                                drone_hostnames_allowed):
    """
    Pick the drone on which num_processes new processes should run.

    Drones are popped off the drone queue in heap order until one is found
    that username may use, that is allowed by drone_hostnames_allowed and
    that has room for num_processes more processes. If no drone has room
    but at least one was otherwise eligible, an error is logged and the
    least loaded eligible drone is used anyway. Every popped drone is put
    back on the queue before returning.

    @param num_processes: number of processes the execution will add.
    @param username: login passed to each drone's usable_by().
    @param drone_hostnames_allowed: hostnames that may be used, or None to
            allow any drone.
    @returns the chosen drone, or None if no drone was eligible.
    """
    popped = []      # everything taken off the heap; re-enqueued below
    eligible = []    # usable by this user and on an allowed host
    chosen = None
    while self._drone_queue:
        candidate = heapq.heappop(self._drone_queue).drone
        popped.append(candidate)
        logging.info('Checking drone %s', candidate.hostname)
        if not candidate.usable_by(username):
            continue

        if (drone_hostnames_allowed is not None
                and candidate.hostname not in drone_hostnames_allowed):
            logging.debug('Drone %s not allowed: ', candidate.hostname)
            continue

        eligible.append(candidate)

        needed = candidate.active_processes + num_processes
        if needed <= candidate.max_processes:
            chosen = candidate
            break
        logging.info('Drone %s has %d active + %s requested > %s max',
                     candidate.hostname, candidate.active_processes,
                     num_processes, candidate.max_processes)

    if not chosen and eligible:
        # nobody has room: overcommit the least loaded eligible drone
        summary_parts = ['%s %s/%s' % (entry.hostname,
                                       entry.active_processes,
                                       entry.max_processes)
                         for entry in eligible]
        drone_summary = ','.join(summary_parts)
        logging.error('No drone has capacity to handle %d processes (%s) '
                      'for user %s', num_processes, drone_summary, username)
        chosen = self._least_loaded_drone(eligible)

    # refill _drone_queue
    for drone in popped:
        self._enqueue_drone(drone)

    return chosen
| 625 | |
| 626 | |
def _substitute_working_directory_into_command(self, command,
                                               working_directory):
    """
    Replace, in place, every item of command that is the
    WORKING_DIRECTORY sentinel with working_directory.

    @param command: mutable argv list; modified in place.
    @param working_directory: value substituted for each sentinel item.
    """
    placeholder_positions = [index for index, item in enumerate(command)
                             if item is WORKING_DIRECTORY]
    for index in placeholder_positions:
        command[index] = working_directory
| 632 | |
| 633 | |
def execute_command(self, command, working_directory, pidfile_name,
                    num_processes, log_file=None, paired_with_pidfile=None,
                    username=None, drone_hostnames_allowed=None):
    """
    Execute the given command, taken as an argv list.

    @param command: command to execute as a list. if any item is
            WORKING_DIRECTORY, the absolute path to the working directory
            will be substituted for it (the list is modified in place).
    @param working_directory: directory in which the pidfile will be written
    @param pidfile_name: name of the pidfile this process will write
    @param num_processes: number of processes to account for from this
            execution
    @param log_file (optional): path (in the results repository) to hold
            command output.
    @param paired_with_pidfile (optional): a PidfileId for an
            already-executed process; the new process will execute on the
            same drone as the previous process.
    @param username (optional): login of the user responsible for this
            process.
    @param drone_hostnames_allowed (optional): hostnames of the drones that
                                               this command is allowed to
                                               execute on
    @returns the PidfileId for pidfile_name inside the absolute working
            directory; it is registered for monitoring before returning.
    @raises DroneManagerError: if no drone could be found for the command.
    """
    abs_working_directory = self.absolute_path(working_directory)
    if not log_file:
        log_file = self.get_temporary_path('execute')
    log_file = self.absolute_path(log_file)

    self._substitute_working_directory_into_command(command,
                                                    abs_working_directory)

    if paired_with_pidfile:
        drone = self._get_drone_for_pidfile_id(paired_with_pidfile)
    else:
        drone = self._choose_drone_for_execution(num_processes, username,
                                                 drone_hostnames_allowed)

    if not drone:
        # Wrap command in a 1-tuple so a tuple-valued command is formatted
        # as a single value instead of being unpacked as format arguments.
        raise DroneManagerError('command failed; no drones available: %s'
                                % (command,))

    # Pass values as logging arguments: formatting is deferred to the
    # logging module and cannot fail on a tuple-valued command.
    logging.info('command = %s', command)
    logging.info('log file = %s:%s', drone.hostname, log_file)
    self._write_attached_files(working_directory, drone)
    drone.queue_call('execute_command', command, abs_working_directory,
                     log_file, pidfile_name)
    # account for the new load, then restore the queue's ordering
    drone.active_processes += num_processes
    self._reorder_drone_queue()

    pidfile_path = os.path.join(abs_working_directory, pidfile_name)
    pidfile_id = PidfileId(pidfile_path)
    self.register_pidfile(pidfile_id)
    self._registered_pidfile_info[pidfile_id].num_processes = num_processes
    return pidfile_id
| 689 | |
| 690 | |
def get_pidfile_id_from(self, execution_tag, pidfile_name):
    """
    Build the PidfileId for pidfile_name located under the absolute path
    of execution_tag.

    @param execution_tag: path passed to absolute_path() to locate the
            directory holding the pidfile.
    @param pidfile_name: name of the pidfile within that directory.
    """
    execution_dir = self.absolute_path(execution_tag)
    return PidfileId(os.path.join(execution_dir, pidfile_name))
| 694 | |
| 695 | |
def register_pidfile(self, pidfile_id):
    """
    Indicate that the DroneManager should look for the given pidfile when
    refreshing.

    A pidfile that is not yet registered gets a fresh _PidfileInfo; in
    either case the pidfile's age is reset.
    """
    registry = self._registered_pidfile_info
    already_registered = pidfile_id in registry
    if not already_registered:
        logging.info('monitoring pidfile %s', pidfile_id)
        registry[pidfile_id] = _PidfileInfo()
    self._reset_pidfile_age(pidfile_id)
| 705 | |
| 706 | |
def _reset_pidfile_age(self, pidfile_id):
    """
    Set the age of the registered info for pidfile_id back to 0. Does
    nothing when the pidfile is not registered.
    """
    registry = self._registered_pidfile_info
    if pidfile_id not in registry:
        return
    registry[pidfile_id].age = 0
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 710 | |
| 711 | |
def unregister_pidfile(self, pidfile_id):
    """
    Stop tracking pidfile_id by removing its registered info. Does
    nothing when the pidfile is not registered.
    """
    registry = self._registered_pidfile_info
    if pidfile_id not in registry:
        return
    logging.info('forgetting pidfile %s', pidfile_id)
    del registry[pidfile_id]
| 716 | |
| 717 | |
def declare_process_count(self, pidfile_id, num_processes):
    """
    Record num_processes on the registered info for pidfile_id.

    @param pidfile_id: a registered pidfile; an unregistered one raises
            KeyError from the registry lookup.
    @param num_processes: process count to store for that pidfile.
    """
    info = self._registered_pidfile_info[pidfile_id]
    info.num_processes = num_processes
showard | f85a0b7 | 2009-10-07 20:48:45 +0000 | [diff] [blame] | 720 | |
| 721 | |
def get_pidfile_contents(self, pidfile_id, use_second_read=False):
    """
    Retrieve a PidfileContents object for the given pidfile_id. If
    use_second_read is True, use results that were read after the processes
    were checked, instead of before.

    Looking a pidfile up also resets its age. A pidfile with no stored
    contents yields a newly constructed PidfileContents.
    """
    self._reset_pidfile_age(pidfile_id)
    pidfile_map = (self._pidfiles_second_read if use_second_read
                   else self._pidfiles)
    fallback = PidfileContents()
    return pidfile_map.get(pidfile_id, fallback)
| 734 | |
| 735 | |
def is_process_running(self, process):
    """
    Check if the given process is in the running process list.

    A process missing from the process set still counts as running when
    its (hostname, pid) pair appears among all known processes; that case
    is logged as an error.
    """
    if process in self._process_set:
        return True

    key = (process.hostname, process.pid)
    if key not in self._all_processes:
        return False

    logging.error('Process %s found, but not an autoserv process. '
                  'Is %s', process, self._all_processes[key])
    return True
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 750 | |
| 751 | |
def get_temporary_path(self, base_name):
    """
    Get a new temporary path guaranteed to be unique across all drones
    for this scheduler execution.

    Each call increments an internal counter and appends it to base_name,
    under drone_utility._TEMPORARY_DIRECTORY.
    """
    self._temporary_path_counter += 1
    unique_name = '%s.%s' % (base_name, self._temporary_path_counter)
    return os.path.join(drone_utility._TEMPORARY_DIRECTORY, unique_name)
| 760 | |
| 761 | |
def absolute_path(self, path, on_results_repository=False):
    """
    Join path onto the appropriate base directory.

    @param path: path to place under the base directory.
    @param on_results_repository: when true the base is the results
            directory of this manager; otherwise it is the drone results
            directory under drones.AUTOTEST_INSTALL_DIR.
    """
    if on_results_repository:
        return os.path.join(self._results_dir, path)
    drone_results_root = os.path.join(drones.AUTOTEST_INSTALL_DIR,
                                      _DRONE_RESULTS_DIR_SUFFIX)
    return os.path.join(drone_results_root, path)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 769 | |
| 770 | |
def _copy_results_helper(self, process, source_path, destination_path,
                         to_results_repository=False):
    """
    Copy results from source_path to destination_path, starting from the
    drone that ran process.

    @param process: process used to look up the source drone.
    @param source_path: path resolved with absolute_path() on the drone.
    @param destination_path: path resolved with absolute_path(), on the
            results repository when to_results_repository is true.
    @param to_results_repository: when true, send the file to the results
            drone (allowed to fail); otherwise queue a copy on the source
            drone itself.
    """
    logging.debug('_copy_results_helper. process: %s, source_path: %s, '
                  'destination_path: %s, to_results_repository: %s',
                  process, source_path, destination_path,
                  to_results_repository)
    source = self.absolute_path(source_path)
    destination = self.absolute_path(
            destination_path, on_results_repository=to_results_repository)
    drone = self._get_drone_for_process(process)
    if not to_results_repository:
        drone.queue_call('copy_file_or_directory', source, destination)
        return
    drone.send_file_to(self._results_drone, source, destination,
                       can_fail=True)
| 787 | |
| 788 | |
def copy_to_results_repository(self, process, source_path,
                               destination_path=None):
    """
    Copy results from the given process at source_path to destination_path
    in the results repository.

    @param destination_path: defaults to source_path when None.
    """
    target_path = (source_path if destination_path is None
                   else destination_path)
    self._copy_results_helper(process, source_path, target_path,
                              to_results_repository=True)
| 799 | |
| 800 | |
def copy_results_on_drone(self, process, source_path, destination_path):
    """
    Copy a results directory from one place to another on the drone.

    The copy happens on the drone that ran process; nothing is sent to
    the results repository.
    """
    copy_on_drone = self._copy_results_helper
    copy_on_drone(process, source_path, destination_path)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 806 | |
| 807 | |
def _write_attached_files(self, results_dir, drone):
    """
    Queue a write on drone for every file attached to results_dir.

    The attachments for results_dir are removed from the pending set as
    they are consumed, so each one is written at most once.

    @param results_dir: key under which files were attached.
    @param drone: drone on which the 'write_to_file' calls are queued.
    """
    attached_files = self._attached_files.pop(results_dir, {})
    # items() rather than iteritems(): same pairs, but also available on
    # Python 3 dicts.
    for file_path, contents in attached_files.items():
        drone.queue_call('write_to_file', self.absolute_path(file_path),
                         contents)
| 813 | |
| 814 | |
def attach_file_to_execution(self, results_dir, file_contents,
                             file_path=None):
    """
    When the process for the results directory is executed, the given file
    contents will be placed in a file on the drone. Returns the path at
    which the file will be placed.

    @param results_dir: results directory the file is attached to.
    @param file_contents: contents to write when the process executes.
    @param file_path (optional): where to place the file; a new temporary
            path is generated when omitted or empty.
    """
    target_path = file_path or self.get_temporary_path('attach')
    pending = self._attached_files.setdefault(results_dir, {})
    # a path may only be attached once per results directory
    assert target_path not in pending
    pending[target_path] = file_contents
    return target_path
| 828 | |
| 829 | |
def write_lines_to_file(self, file_path, lines, paired_with_process=None):
    """
    Write the given lines (as a list of strings) to a file. If
    paired_with_process is given, the file will be written on the drone
    running the given Process. Otherwise, the file will be written to the
    results repository.

    Lines are joined with newlines and a trailing newline is appended.
    """
    file_contents = '\n'.join(lines) + '\n'
    on_results_repository = not paired_with_process
    if on_results_repository:
        drone = self._results_drone
    else:
        drone = self._get_drone_for_process(paired_with_process)
    full_path = self.absolute_path(
            file_path, on_results_repository=on_results_repository)
    drone.queue_call('write_to_file', full_path, file_contents)
jamesren | c44ae99 | 2010-02-19 00:12:54 +0000 | [diff] [blame] | 847 | |
| 848 | |
# Resolve the site-specific drone manager class through
# utils.import_site_class, passing BaseDroneManager as the class to
# resolve against.
SiteDroneManager = utils.import_site_class(
        __file__,
        'autotest_lib.scheduler.site_drone_manager',
        'SiteDroneManager',
        BaseDroneManager)


class DroneManager(SiteDroneManager):
    """Drone manager used by this module; adds nothing to its base."""
| 856 | |
| 857 | |
# Module-wide DroneManager singleton; created on first use by instance().
_the_instance = None


def instance():
    """
    Return the shared DroneManager, creating it on the first call.
    """
    if _the_instance is not None:
        return _the_instance
    _set_instance(DroneManager())
    return _the_instance


def _set_instance(instance):  # usable for testing
    """
    Replace the shared manager returned by instance().

    @param instance: object to install as the singleton.
    """
    global _the_instance
    _the_instance = instance