Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 1 | import heapq |
| 2 | import os |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 3 | import logging |
Michael Liang | da8c60a | 2014-06-03 13:24:51 -0700 | [diff] [blame] | 4 | |
| 5 | import common |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame] | 6 | from autotest_lib.client.common_lib import error |
| 7 | from autotest_lib.client.common_lib import global_config |
| 8 | from autotest_lib.client.common_lib import utils |
MK Ryu | 7911ad5 | 2015-12-18 11:40:04 -0800 | [diff] [blame] | 9 | from autotest_lib.scheduler import drone_task_queue |
Allen Li | 036b3a6 | 2017-02-06 15:19:43 -0800 | [diff] [blame] | 10 | from autotest_lib.scheduler import drone_utility |
| 11 | from autotest_lib.scheduler import drones |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 12 | from autotest_lib.scheduler import scheduler_config |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 13 | from autotest_lib.scheduler import thread_lib |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 14 | |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame] | 15 | try: |
| 16 | from chromite.lib import metrics |
| 17 | except ImportError: |
| 18 | metrics = utils.metrics_mock |
| 19 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 20 | |
# results on drones will be placed under the drone_installation_directory in a
# directory with this name
_DRONE_RESULTS_DIR_SUFFIX = 'results'

# Unique sentinel object, meant to be compared by identity.
WORKING_DIRECTORY = object() # see execute_command()


# File names of the pidfiles. Judging by the names there is one per kind of
# process (autoserv, crashinfo collection, parser, archiver) -- confirm
# against the code that writes them.
AUTOSERV_PID_FILE = '.autoserv_execute'
CRASHINFO_PID_FILE = '.collect_crashinfo_execute'
PARSER_PID_FILE = '.parser_execute'
ARCHIVER_PID_FILE = '.archiver_execute'

# Tuple holding every pidfile name defined above.
ALL_PIDFILE_NAMES = (AUTOSERV_PID_FILE, CRASHINFO_PID_FILE, PARSER_PID_FILE,
                     ARCHIVER_PID_FILE)

# Read once at import time from the scheduler config section; defaults to True.
# DroneManager uses it to choose between thread_lib.ThreadedTaskQueue and
# drone_task_queue.DroneTaskQueue.
_THREADED_DRONE_MANAGER = global_config.global_config.get_config_value(
        scheduler_config.CONFIG_SECTION, 'threaded_drone_manager',
        type=bool, default=True)

HOSTS_JOB_SUBDIR = 'hosts/'
PARSE_LOG = '.parse.log'
# Read once at import time from the scheduler config section; no default is
# passed here.
ENABLE_ARCHIVING = global_config.global_config.get_config_value(
        scheduler_config.CONFIG_SECTION, 'enable_archiving', type=bool)
| 44 | |
jamesren | c44ae99 | 2010-02-19 00:12:54 +0000 | [diff] [blame] | 45 | |
class DroneManagerError(Exception):
    """Error raised by the drone manager."""
| 48 | |
| 49 | |
class CustomEquals(object):
    """Base class deriving ==, != and hash() from the value of _id().

    Subclasses must override _id() and return a hashable value.
    """

    def _id(self):
        """Return the value used for equality and hashing; must be overridden.

        @raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError


    def __hash__(self):
        return hash(self._id())


    def __eq__(self, other):
        # Objects that are not instances of our own type are not comparable
        # here; let Python try the other operand instead.
        if isinstance(other, type(self)):
            return self._id() == other._id()
        return NotImplemented


    def __ne__(self, other):
        if self == other:
            return False
        return True
| 67 | |
| 68 | |
class Process(CustomEquals):
    """A process identified by its (hostname, pid) pair.

    ppid is stored on the object but takes no part in equality or hashing.
    """

    def __init__(self, hostname, pid, ppid=None):
        """
        @param hostname: Hostname the process belongs to.
        @param pid: Process id.
        @param ppid: Parent process id, if known.
        """
        self.hostname, self.pid, self.ppid = hostname, pid, ppid


    def _id(self):
        return (self.hostname, self.pid)


    def __str__(self):
        return '%s/%s' % (self.hostname, self.pid)


    def __repr__(self):
        # Inherited repr followed by the readable form in angle brackets.
        inherited = super(Process, self).__repr__()
        return inherited + '<%s>' % self
| 85 | |
| 86 | |
class PidfileId(CustomEquals):
    """Identifier of a pidfile; equality and hashing are based on its path."""

    def __init__(self, path):
        """@param path: Path of the pidfile."""
        self.path = path


    def __str__(self):
        return str(self.path)


    def _id(self):
        return self.path
| 98 | |
| 99 | |
class _PidfileInfo(object):
    """Per-pidfile bookkeeping record with class-level defaults."""
    # no process count recorded by default
    num_processes = None
    # age counter starts at zero
    age = 0
| 103 | |
| 104 | |
class PidfileContents(object):
    """Parsed contents of a pidfile.

    Every field defaults to None; fields are filled in by whoever parses the
    pidfile.
    """
    process = None
    exit_status = None
    num_tests_failed = None

    def is_invalid(self):
        """@returns: Always False for this class."""
        return False


    def is_running(self):
        """Tell whether the recorded process has not exited yet.

        @returns: True if a process is recorded and no exit status has been
                recorded, False otherwise.
        """
        # Compare against None rather than relying on truthiness: an exit
        # status of 0 belongs to a process that has already finished.
        return bool(self.process) and self.exit_status is None
| 116 | |
| 117 | |
class InvalidPidfile(object):
    """Placeholder for pidfile contents that could not be used.

    Carries the same attribute names as PidfileContents (all None) plus the
    error that explains the problem.
    """
    num_tests_failed = None
    exit_status = None
    process = None


    def __init__(self, error):
        """@param error: Text describing why the pidfile is invalid."""
        self.error = error


    def __str__(self):
        return self.error


    def is_running(self):
        """@returns: Always False."""
        return False


    def is_invalid(self):
        """@returns: Always True."""
        return True
| 138 | |
| 139 | |
class _DroneHeapWrapper(object):
    """Wrapper to compare drones based on used_capacity().

    These objects can be used to keep a heap of drones by capacity.
    """
    def __init__(self, drone):
        """@param drone: Object exposing a used_capacity() method."""
        self.drone = drone


    def __cmp__(self, other):
        """Three-way comparison by used capacity.

        @returns: -1, 0 or 1.
        """
        assert isinstance(other, _DroneHeapWrapper)
        mine = self.drone.used_capacity()
        theirs = other.drone.used_capacity()
        # Same result as cmp(mine, theirs), without needing the builtin.
        return (mine > theirs) - (mine < theirs)


    def __lt__(self, other):
        """Less-than by used capacity; heapq orders items with this."""
        assert isinstance(other, _DroneHeapWrapper)
        return self.drone.used_capacity() < other.drone.used_capacity()
| 152 | |
| 153 | |
Allen Li | 036b3a6 | 2017-02-06 15:19:43 -0800 | [diff] [blame] | 154 | class DroneManager(object): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 155 | """ |
| 156 | This class acts as an interface from the scheduler to drones, whether it be |
| 157 | only a single "drone" for localhost or multiple remote drones. |
| 158 | |
| 159 | All paths going into and out of this class are relative to the full results |
    directory, except for those returned by absolute_path().
| 161 | """ |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 162 | |
| 163 | |
| 164 | # Minimum time to wait before next email |
| 165 | # about a drone hitting process limit is sent. |
| 166 | NOTIFY_INTERVAL = 60 * 60 * 24 # one day |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 167 | _STATS_KEY = 'drone_manager' |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 168 | |
Aviv Keshet | 99e6adb | 2016-07-14 16:35:32 -0700 | [diff] [blame] | 169 | |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 170 | |
def __init__(self):
    """Create empty bookkeeping state and pick the refresh task queue."""
    # absolute path of the base results dir
    self._results_dir = None
    self._results_drone = None

    # set of Process objects
    self._process_set = set()
    # process info reported by the drones, keyed by (hostname, pid)
    self._all_processes = {}

    # PidfileId -> PidfileContents
    self._pidfiles = {}
    # same layout as _pidfiles
    self._pidfiles_second_read = {}
    # PidfileId -> _PidfileInfo
    self._registered_pidfile_info = {}

    # counter used to generate unique temporary paths
    self._temporary_path_counter = 0
    # hostname -> Drone object
    self._drones = {}
    # results dir -> dict mapping file path to contents
    self._attached_files = {}
    # heapq of _DroneHeapWrappers
    self._drone_queue = []

    # Task queue used to refresh the drones; threaded (asynchronous) unless
    # the threaded drone manager is switched off in the config.
    if not _THREADED_DRONE_MANAGER:
        self._refresh_task_queue = drone_task_queue.DroneTaskQueue()
    else:
        self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
                name='%s.refresh_queue' % self._STATS_KEY)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 199 | |
| 200 | |
def initialize(self, base_results_dir, drone_hostnames,
               results_repository_hostname):
    """Add the given drones and set up the results repository drone.

    @param base_results_dir: Stored as the base results directory.
    @param drone_hostnames: Hostnames handed to _add_drone() one by one.
    @param results_repository_hostname: Hostname used to obtain the results
            drone from drones.get_drone().

    @raises DroneManagerError: If no drone ended up being added.
    """
    self._results_dir = base_results_dir

    for drone_hostname in drone_hostnames:
        self._add_drone(drone_hostname)

    if not self._drones:
        # every drone failed to initialize
        raise DroneManagerError('No valid drones found')

    self.refresh_drone_configs()

    logging.info('Using results repository on %s',
                 results_repository_hostname)
    results_drone = drones.get_drone(results_repository_hostname)
    self._results_drone = results_drone
    install_dir = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION,
            'results_host_installation_directory', default=None)
    if install_dir:
        results_drone.set_autotest_install_dir(install_dir)
    # The results drone is deliberately not initialize()d - we don't want to
    # clear out any directories and we don't need to kill any processes.
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 225 | |
| 226 | |
def reinitialize_drones(self):
    """Call 'initialize' on every drone, timing each call per drone."""
    timer_name = ('chromeos/autotest/drone_manager/'
                  'reinitialize_drones_duration')
    for drone in self.get_drones():
        timer_fields = {'drone': drone.hostname}
        with metrics.SecondsTimer(timer_name, fields=timer_fields):
            drone.call('initialize', self._results_dir)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 234 | |
| 235 | |
def shutdown(self):
    """Shut down every drone returned by get_drones()."""
    all_drones = self.get_drones()
    for each_drone in all_drones:
        each_drone.shutdown()
| 239 | |
| 240 | |
def _get_max_pidfile_refreshes(self):
    """
    Look up 'max_pidfile_refreshes' in the scheduler config (default 2000).

    Normally refresh() is called on every monitor_db.Dispatcher.tick().

    @returns: The number of refresh() calls before we forget a pidfile.
    """
    return global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes',
            type=int, default=2000)
| 251 | |
| 252 | |
def _add_drone(self, hostname):
    """
    Add drone.

    Catches AutoservRunError if the drone fails initialization and does not
    add it to the list of usable drones. Nothing is added either when
    drones.get_drone() returns a false value.

    @param hostname: Hostname of the drone we are trying to add.
    """
    # Hand the argument to logging instead of pre-formatting the message,
    # like every other logging call in this file does.
    logging.info('Adding drone %s', hostname)
    drone = drones.get_drone(hostname)
    if not drone:
        return
    try:
        drone.call('initialize', self.absolute_path(''))
    except error.AutoservRunError as e:
        logging.error('Failed to initialize drone %s with error: %s',
                      hostname, e)
        return
    # Registered under the hostname the drone itself reports.
    self._drones[drone.hostname] = drone
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 272 | |
| 273 | |
def _remove_drone(self, hostname):
    """Forget the drone stored under hostname; unknown names are ignored.

    @param hostname: Key of the drone in the drone table.
    """
    if hostname in self._drones:
        del self._drones[hostname]
| 276 | |
| 277 | |
def refresh_drone_configs(self):
    """
    Reread global config options for all drones.

    For every known drone this sets enabled, max_processes and
    allowed_users. The values come from the server database attributes
    when server_manager_utils.use_server_db() is true, otherwise from
    per-hostname options in the scheduler config section. Afterwards the
    drone queue is reordered and the notification record is cleared.
    """
    # Import server_manager_utils is delayed rather than at the beginning of
    # this module. The reason is that test_that imports drone_manager when
    # importing autoserv_utils. The import is done before test_that setup
    # django (test_that only setup django in setup_local_afe, since it's
    # not needed when test_that runs the test in a lab duts through :lab:
    # option. Therefore, if server_manager_utils is imported at the
    # beginning of this module, test_that will fail since django is not
    # setup yet.
    from autotest_lib.site_utils import server_manager_utils
    config = global_config.global_config
    section = scheduler_config.CONFIG_SECTION
    # Re-parse the config file before reading any option.
    config.parse_config_file()
    for hostname, drone in self._drones.iteritems():
        if server_manager_utils.use_server_db():
            # Only the first server returned for this hostname is used.
            server = server_manager_utils.get_servers(hostname=hostname)[0]
            attributes = dict([(a.attribute, a.value)
                               for a in server.attributes.all()])
            # Enabled unless the 'disabled' attribute is a non-zero integer.
            drone.enabled = (
                    int(attributes.get('disabled', 0)) == 0)
            drone.max_processes = int(
                    attributes.get(
                        'max_processes',
                        scheduler_config.config.max_processes_per_drone))
            allowed_users = attributes.get('users', None)
        else:
            # Any non-empty '<hostname>_disabled' value disables the drone.
            disabled = config.get_config_value(
                    section, '%s_disabled' % hostname, default='')
            drone.enabled = not bool(disabled)
            drone.max_processes = config.get_config_value(
                    section, '%s_max_processes' % hostname, type=int,
                    default=scheduler_config.config.max_processes_per_drone)

            allowed_users = config.get_config_value(
                    section, '%s_users' % hostname, default=None)
        # allowed_users is a whitespace-separated string; an empty or missing
        # value results in None.
        if allowed_users:
            drone.allowed_users = set(allowed_users.split())
        else:
            drone.allowed_users = None
        logging.info('Drone %s.max_processes: %s', hostname,
                     drone.max_processes)
        logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
        logging.info('Drone %s.allowed_users: %s', hostname,
                     drone.allowed_users)

    self._reorder_drone_queue() # max_processes may have changed
    # Clear notification record about reaching max_processes limit.
    self._notify_record = {}
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 329 | |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 330 | |
def get_drones(self):
    """@returns: An iterator over the known Drone objects."""
    drone_iterator = self._drones.itervalues()
    return drone_iterator
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 333 | |
| 334 | |
def cleanup_orphaned_containers(self):
    """Queue a 'cleanup_orphaned_containers' call on every known drone.

    The call is only queued here, not executed.
    """
    for target in self._drones.values():
        logging.info('Queue cleanup_orphaned_containers at %s',
                     target.hostname)
        target.queue_call('cleanup_orphaned_containers')
Dan Shi | c458f66 | 2015-04-29 12:12:38 -0700 | [diff] [blame] | 342 | |
| 343 | |
def _get_drone_for_process(self, process):
    """Look up the drone registered under the process's hostname.

    @param process: Object with a hostname attribute.

    @raises KeyError: If no drone is stored under that hostname.
    """
    owner_hostname = process.hostname
    return self._drones[owner_hostname]
| 346 | |
| 347 | |
def _get_drone_for_pidfile_id(self, pidfile_id):
    """Find the drone that runs the process recorded in a pidfile.

    @param pidfile_id: Identifier passed to get_pidfile_contents().

    @raises DroneManagerError: If the pidfile contents hold no process.
    """
    recorded_process = self.get_pidfile_contents(pidfile_id).process
    if recorded_process is not None:
        return self._get_drone_for_process(recorded_process)
    raise DroneManagerError('Fail to get a drone due to empty pidfile')
| 353 | |
| 354 | |
def get_drone_for_pidfile_id(self, pidfile_id):
    """Public API for luciferlib.

    Thin wrapper around _get_drone_for_pidfile_id().

    @param pidfile_id: PidfileId instance.
    """
    found_drone = self._get_drone_for_pidfile_id(pidfile_id)
    return found_drone
| 361 | |
| 362 | |
def _drop_old_pidfiles(self):
    """Age every registered pidfile and unregister those that are too old.

    A pidfile whose age exceeds the value of _get_max_pidfile_refreshes()
    is logged and unregistered; every other pidfile has its age increased
    by one.
    """
    # The limit does not depend on the pidfile, so read it a single time.
    max_age = self._get_max_pidfile_refreshes()
    # Iterate over a snapshot, since the dict is modified in
    # unregister_pidfile().
    for pidfile_id, info in list(self._registered_pidfile_info.items()):
        if info.age > max_age:
            logging.warning('dropping leaked pidfile %s', pidfile_id)
            self.unregister_pidfile(pidfile_id)
        else:
            info.age += 1
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 371 | |
| 372 | |
def _reset(self):
    """Replace the per-refresh state with fresh, empty containers."""
    self._process_set = set()
    self._all_processes = {}
    self._pidfiles, self._pidfiles_second_read = {}, {}
    self._drone_queue = []
| 379 | |
| 380 | |
def _parse_pidfile(self, drone, raw_contents):
    """Parse raw pidfile contents.

    @param drone: The drone on which this pidfile was found.
    @param raw_contents: The raw contents of a pidfile: up to three lines
            holding the pid, the exit status and the number of failed
            tests, in that order.

    @returns: A PidfileContents object, or an InvalidPidfile if the contents
            have more than three lines or hold a non-integer value.
    """
    contents = PidfileContents()
    if not raw_contents:
        # Nothing written yet: every field stays at None.
        return contents
    lines = raw_contents.splitlines()
    if len(lines) > 3:
        return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                              (len(lines), lines))
    try:
        pid = int(lines[0])
        contents.process = Process(drone.hostname, pid)
        # if len(lines) == 2, assume we caught Autoserv between writing
        # exit_status and num_failed_tests, so just ignore it and wait for
        # the next cycle
        if len(lines) == 3:
            contents.exit_status = int(lines[1])
            contents.num_tests_failed = int(lines[2])
    except ValueError as exc:
        return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

    return contents
| 408 | |
| 409 | |
def _process_pidfiles(self, drone, pidfiles, store_in_dict):
    """Parse a batch of raw pidfiles and store the results.

    @param drone: Drone the pidfiles belong to; passed to _parse_pidfile().
    @param pidfiles: Dict mapping pidfile path to raw contents.
    @param store_in_dict: Dict that receives PidfileId -> parsed contents.
    """
    for path, raw_contents in pidfiles.iteritems():
        key = PidfileId(path)
        store_in_dict[key] = self._parse_pidfile(drone, raw_contents)
| 415 | |
| 416 | |
def _add_process(self, drone, process_info):
    """Record a process found on a drone in the process set.

    @param drone: Drone whose hostname identifies the process.
    @param process_info: Dict with 'pid' and 'ppid' entries convertible to
            int.
    """
    hostname = drone.hostname
    pid = int(process_info['pid'])
    ppid = int(process_info['ppid'])
    self._process_set.add(Process(hostname, pid, ppid))
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 421 | |
| 422 | |
def _add_autoserv_process(self, drone, process_info):
    """Record an autoserv process, but only if it is a root autoserv process.

    @param drone: Drone the process was found on.
    @param process_info: Dict with 'comm', 'pgid' and 'pid' entries; 'comm'
            must be 'autoserv'.
    """
    assert process_info['comm'] == 'autoserv'
    # only root autoserv processes have pgid == pid
    is_root_process = process_info['pgid'] == process_info['pid']
    if is_root_process:
        self._add_process(drone, process_info)
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 429 | |
| 430 | |
def _enqueue_drone(self, drone):
    """Push a drone onto the drone heap, wrapped for comparison.

    @param drone: Drone to add to the queue.
    """
    drone_heap = self._drone_queue
    heapq.heappush(drone_heap, _DroneHeapWrapper(drone))
| 433 | |
| 434 | |
def _reorder_drone_queue(self):
    """Restore the heap ordering of the drone queue in place."""
    drone_heap = self._drone_queue
    heapq.heapify(drone_heap)
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 437 | |
| 438 | |
def reorder_drone_queue(self):
    """Reorder drone queue according to modified process counts.

    Public entry point exposed for luciferlib to wrap; it only delegates
    to _reorder_drone_queue().
    """
    self._reorder_drone_queue()
    return None
| 445 | |
| 446 | |
def _compute_active_processes(self, drone):
    """Recount drone.active_processes from the pidfiles and report it.

    A pidfile contributes its registered num_processes when it has no exit
    status and its process is recorded on this drone's hostname. The total
    is then published as a gauge metric.

    @param drone: Drone whose active_processes attribute is recomputed.
    """
    drone.active_processes = 0
    for pidfile_id, contents in self._pidfiles.iteritems():
        if contents.exit_status is not None:
            # the process already exited
            continue
        recorded = contents.process
        if not (recorded and recorded.hostname == drone.hostname):
            continue
        registered = self._registered_pidfile_info[pidfile_id]
        if registered.num_processes is None:
            continue
        drone.active_processes += registered.num_processes

    gauge = metrics.Gauge('chromeos/autotest/drone/active_processes')
    gauge.set(drone.active_processes,
              fields={'drone_hostname': drone.hostname})
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 461 | |
| 462 | |
def _check_drone_process_limit(self, drone):
    """
    Report how much of the process limit of |drone| is in use.

    Publishes active_processes / max_processes as a float metric.

    @param drone: A Drone object.
    """
    try:
        usage = float(drone.active_processes) / drone.max_processes
    except ZeroDivisionError:
        # NOTE(review): the normal value is a ratio, while this fallback is
        # 100 -- confirm which scale the metric is meant to use.
        usage = 100
    usage_metric = metrics.Float(
            'chromeos/autotest/drone/active_process_percentage')
    usage_metric.set(usage, fields={'drone_hostname': drone.hostname})
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 475 | |
def trigger_refresh(self):
    """Start a drone manager refresh without waiting for it to finish.

    Resets the per-refresh state, drops old pidfiles, queues a 'refresh'
    call for the registered pidfile paths on every drone and hands the
    drones to the refresh task queue.

    @raises DroneManagerError: If a drone has un-executed calls.
        Since they will get clobbered when we queue refresh calls.
    """
    self._reset()
    self._drop_old_pidfiles()
    pidfile_paths = []
    for registered_id in self._registered_pidfile_info:
        pidfile_paths.append(registered_id.path)
    drone_list = list(self.get_drones())
    for drone in drone_list:
        pending_calls = drone.get_calls()
        if pending_calls:
            pending_names = [str(call) for call in pending_calls]
            raise DroneManagerError('Drone %s has un-executed calls: %s '
                                    'which might get corrupted through '
                                    'this invocation' %
                                    (drone, pending_names))
        drone.queue_call('refresh', pidfile_paths)
    logging.info("Invoking drone refresh.")
    timer_name = 'chromeos/autotest/drone_manager/trigger_refresh_duration'
    with metrics.SecondsTimer(timer_name):
        self._refresh_task_queue.execute(drone_list, wait=False)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 499 | |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 500 | |
def sync_refresh(self):
    """Complete the drone refresh started by trigger_refresh.

    Waits for all drone threads then refreshes internal datastructures
    with drone process information.
    """

    # This gives us a dictionary like what follows:
    # {drone: [{'pidfiles': (raw contents of pidfile paths),
    #           'autoserv_processes': (autoserv process info from ps),
    #           'all_processes': (all process info from ps),
    #           'parse_processes': (parse process info from ps),
    #           'pidfiles_second_read': (pidfile contents, again),}]
    #  drone2: ...}
    # The values of each drone are only a list because this adheres to the
    # drone utility interface (each call is executed and its results are
    # placed in a list, but since we never couple the refresh calls with
    # any other call, this list will always contain a single dict).
    with metrics.SecondsTimer(
            'chromeos/autotest/drone_manager/sync_refresh_duration'):
        all_results = self._refresh_task_queue.get_results()
    logging.info("Drones refreshed.")

    # The loop below goes through and parses pidfile contents. Pidfiles
    # are used to track autoserv execution, and will always contain at
    # most 3 lines of the following: pid, exit code, number of failed
    # tests. Each pidfile is identified by a PidfileId object, which
    # contains a unique pidfile path (unique because it contains the job
    # id) making it hashable.
    # All pidfiles are stored in the drone managers _pidfiles dict as:
    #   {pidfile_id: pidfile_contents(Process(drone, pid),
    #                                 exit_code, num_tests_failed)}
    # In handle agents, each agent knows its pidfile_id, and uses this
    # to retrieve the refreshed contents of its pidfile via the
    # PidfileRunMonitor (through its tick) before making decisions. If
    # the agent notices that its process has exited, it unregisters the
    # pidfile from the drone_managers._registered_pidfile_info dict
    # through its epilog.
    for drone, results_list in all_results.iteritems():
        # Only the first entry of the list is read.
        results = results_list[0]

        for process_info in results['all_processes']:
            if process_info['comm'] == 'autoserv':
                self._add_autoserv_process(drone, process_info)
            # Every process is indexed by (hostname, pid), autoserv or not.
            drone_pid = drone.hostname, int(process_info['pid'])
            self._all_processes[drone_pid] = process_info

        for process_info in results['parse_processes']:
            self._add_process(drone, process_info)

        self._process_pidfiles(drone, results['pidfiles'], self._pidfiles)
        self._process_pidfiles(drone, results['pidfiles_second_read'],
                               self._pidfiles_second_read)

        # Needs the pidfiles parsed above to count the active processes.
        self._compute_active_processes(drone)
        if drone.enabled:
            self._enqueue_drone(drone)
        self._check_drone_process_limit(drone)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 559 | |
| 560 | |
def refresh(self):
    """Refresh all drones: trigger the refresh, then wait for the results."""
    timer_name = 'chromeos/autotest/drone_manager/refresh_duration'
    with metrics.SecondsTimer(timer_name):
        self.trigger_refresh()
        self.sync_refresh()
| 567 | |
| 568 | |
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/drone_manager/execute_actions_duration')
def execute_actions(self):
    """
    Called at the end of a scheduler cycle to execute all queued actions
    on drones.

    The calls queued on every drone in self._drones are run through a task
    queue (threaded or not, depending on _THREADED_DRONE_MANAGER); then the
    calls queued on the results drone are executed.
    """
    # Pick the task queue implementation, then run everything queued on
    # the drones since the previous call.
    if _THREADED_DRONE_MANAGER:
        queue_name = '%s.execute_queue' % self._STATS_KEY
        task_queue = thread_lib.ThreadedTaskQueue(name=queue_name)
    else:
        task_queue = drone_task_queue.DroneTaskQueue()
    task_queue.execute(self._drones.values())

    try:
        self._results_drone.execute_queued_calls()
    except error.AutoservError:
        # Count the failure and drop the queued calls on the results drone.
        counter_name = 'chromeos/autotest/errors/results_repository_failed'
        failure_fields = {'drone_hostname': self._results_drone.hostname}
        metrics.Counter(counter_name).increment(fields=failure_fields)
        self._results_drone.clear_call_queue()
| 592 | |
| 593 | |
def get_orphaned_autoserv_processes(self):
    """
    Returns a set of Process objects for orphaned processes only.

    A process from self._process_set counts as orphaned when its ppid is 1.
    """
    orphans = set()
    for candidate in self._process_set:
        if candidate.ppid == 1:
            orphans.add(candidate)
    return orphans
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 600 | |
| 601 | |
def kill_process(self, process):
    """
    Kill the given process.

    The kill is queued on the drone returned by _get_drone_for_process().
    """
    logging.info('killing %s', process)
    self._get_drone_for_process(process).queue_kill_process(process)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 609 | |
| 610 | |
def _ensure_directory_exists(self, path):
    """
    Create the directory at path (including parents) unless it exists.

    @param path: directory path to create.

    @raises OSError: if creation fails for any reason other than the path
            already existing.
    """
    import errno
    # Create first and tolerate "already exists" afterwards. Checking
    # os.path.exists() before os.makedirs() leaves a window in which
    # another process can create the path and make makedirs() fail.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
| 614 | |
| 615 | |
def total_running_processes(self):
    """Return the sum of active_processes over all of get_drones()."""
    running = 0
    for drone in self.get_drones():
        running += drone.active_processes
    return running
| 618 | |
| 619 | |
def max_runnable_processes(self, username, drone_hostnames_allowed):
    """
    Return the maximum number of processes that can be run (in a single
    execution) given the current load on drones.

    Only drones in self._drone_queue that are usable by the user and whose
    hostname is allowed are considered; the result is never negative.

    @param username: login of user to run a process. may be None.
    @param drone_hostnames_allowed: list of drones that can be used. May be
                                    None
    """
    # Starts at 0, which is also the answer when every drone is disabled,
    # inaccessible or over its limit.
    most_free_slots = 0
    for wrapper in self._drone_queue:
        drone = wrapper.drone
        if not drone.usable_by(username):
            continue
        if (drone_hostnames_allowed is not None and
                drone.hostname not in drone_hostnames_allowed):
            continue
        free_slots = drone.max_processes - drone.active_processes
        most_free_slots = max(most_free_slots, free_slots)
    return most_free_slots
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 640 | |
| 641 | |
def _least_loaded_drone(self, drones):
    """Return the drone in drones with the smallest used_capacity()."""
    def _load(drone):
        return drone.used_capacity()

    return min(drones, key=_load)
showard | e39ebe9 | 2009-06-18 23:14:48 +0000 | [diff] [blame] | 644 | |
| 645 | |
def pick_drone_to_use(self, num_processes=1):
    """Return a drone to use.

    Delegates to _choose_drone_for_execution() with no restriction on the
    user or on the drone hostnames.

    num_processes is the number of processes the drone is intended
    to run.

    This public API is exposed for luciferlib to wrap.

    Returns a drone instance (see drones.py).
    """
    selection_args = {
        'num_processes': num_processes,
        'username': None,  # Always allow all drones
        'drone_hostnames_allowed': None,  # Always allow all drones
    }
    return self._choose_drone_for_execution(**selection_args)
| 663 | |
| 664 | |
def _choose_drone_for_execution(self, num_processes, username,
                                drone_hostnames_allowed):
    """Choose a drone to execute command.

    Drones are popped from self._drone_queue one at a time until one that
    is usable, allowed and has room for num_processes is found. If no
    eligible drone has room, the least loaded eligible drone is used
    instead. Every popped drone is re-enqueued before returning.

    @param num_processes: Number of processes needed for execution.
    @param username: Name of the user to execute the command.
    @param drone_hostnames_allowed: A list of names of drone allowed.

    @return: A drone object to be used for execution, or None when no
             drone is both usable by the user and allowed.
    """
    # cycle through drones in the order the queue yields them until we
    # find one that can handle these processes
    popped = []
    eligible = []
    chosen = None
    while self._drone_queue:
        candidate = heapq.heappop(self._drone_queue).drone
        popped.append(candidate)
        logging.info('Checking drone %s', candidate.hostname)
        if not candidate.usable_by(username):
            continue

        if (drone_hostnames_allowed is not None
                and candidate.hostname not in drone_hostnames_allowed):
            logging.debug('Drone %s not allowed: ', candidate.hostname)
            continue

        eligible.append(candidate)

        needed = candidate.active_processes + num_processes
        if needed <= candidate.max_processes:
            chosen = candidate
            break
        logging.info('Drone %s has %d active + %s requested > %s max',
                     candidate.hostname, candidate.active_processes,
                     num_processes, candidate.max_processes)

    if eligible and not chosen:
        # Drones are all over loaded, pick the one with least load.
        loads = ['%s %s/%s' % (d.hostname, d.active_processes,
                               d.max_processes)
                 for d in eligible]
        logging.error('No drone has capacity to handle %d processes (%s) '
                      'for user %s', num_processes, ','.join(loads),
                      username)
        chosen = self._least_loaded_drone(eligible)

    # refill _drone_queue
    for candidate in popped:
        self._enqueue_drone(candidate)

    return chosen
| 717 | |
| 718 | |
def _substitute_working_directory_into_command(self, command,
                                               working_directory):
    """
    Replace, in place, every item of command that is the WORKING_DIRECTORY
    sentinel with working_directory.
    """
    placeholder_positions = [index for index, arg in enumerate(command)
                             if arg is WORKING_DIRECTORY]
    for index in placeholder_positions:
        command[index] = working_directory
| 724 | |
| 725 | |
def execute_command(self, command, working_directory, pidfile_name,
                    num_processes, log_file=None, paired_with_pidfile=None,
                    username=None, drone_hostnames_allowed=None):
    """
    Execute the given command, taken as an argv list.

    The call is queued on the selected drone rather than run immediately.
    Note that command is modified in place when it contains
    WORKING_DIRECTORY placeholders.

    @param command: command to execute as a list. if any item is
            WORKING_DIRECTORY, the absolute path to the working directory
            will be substituted for it.
    @param working_directory: directory in which the pidfile will be written
    @param pidfile_name: name of the pidfile this process will write
    @param num_processes: number of processes to account for from this
            execution
    @param log_file (optional): path (in the results repository) to hold
            command output.
    @param paired_with_pidfile (optional): a PidfileId for an
            already-executed process; the new process will execute on the
            same drone as the previous process.
    @param username (optional): login of the user responsible for this
            process.
    @param drone_hostnames_allowed (optional): hostnames of the drones that
                                               this command is allowed to
                                               execute on

    @return: the PidfileId registered for the new process.
    @raises DroneManagerError: if no drone is available for the command.
    """
    abs_working_directory = self.absolute_path(working_directory)
    log_file = self.absolute_path(
            log_file or self.get_temporary_path('execute'))

    self._substitute_working_directory_into_command(command,
                                                    abs_working_directory)

    if paired_with_pidfile:
        target_drone = self._get_drone_for_pidfile_id(paired_with_pidfile)
    else:
        target_drone = self._choose_drone_for_execution(
                num_processes, username, drone_hostnames_allowed)

    if not target_drone:
        raise DroneManagerError('command failed; no drones available: %s'
                                % command)

    logging.info("command = %s", command)
    logging.info('log file = %s:%s', target_drone.hostname, log_file)
    self._write_attached_files(working_directory, target_drone)
    target_drone.queue_call('execute_command', command,
                            abs_working_directory, log_file, pidfile_name)
    # Account for the new processes right away and re-sort the queue.
    target_drone.active_processes += num_processes
    self._reorder_drone_queue()

    pidfile_id = PidfileId(
            os.path.join(abs_working_directory, pidfile_name))
    self.register_pidfile(pidfile_id)
    self._registered_pidfile_info[pidfile_id].num_processes = num_processes
    return pidfile_id
| 781 | |
| 782 | |
def get_pidfile_id_from(self, execution_tag, pidfile_name):
    """
    Return a PidfileId for pidfile_name inside the absolute path of
    execution_tag.
    """
    execution_dir = self.absolute_path(execution_tag)
    return PidfileId(os.path.join(execution_dir, pidfile_name))
| 786 | |
| 787 | |
def register_pidfile(self, pidfile_id):
    """
    Indicate that the DroneManager should look for the given pidfile when
    refreshing.

    A pidfile that is already registered keeps its existing info; in both
    cases its age is reset.
    """
    registry = self._registered_pidfile_info
    already_monitored = pidfile_id in registry
    if not already_monitored:
        logging.info('monitoring pidfile %s', pidfile_id)
        registry[pidfile_id] = _PidfileInfo()
    self._reset_pidfile_age(pidfile_id)
| 797 | |
| 798 | |
def _reset_pidfile_age(self, pidfile_id):
    """Set the age of a registered pidfile to 0; ignore unknown ids."""
    registry = self._registered_pidfile_info
    if pidfile_id not in registry:
        return
    registry[pidfile_id].age = 0
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 802 | |
| 803 | |
def unregister_pidfile(self, pidfile_id):
    """Stop tracking the given pidfile; do nothing if it is not registered."""
    registry = self._registered_pidfile_info
    if pidfile_id not in registry:
        return
    logging.info('forgetting pidfile %s', pidfile_id)
    del registry[pidfile_id]
| 808 | |
| 809 | |
def declare_process_count(self, pidfile_id, num_processes):
    """
    Record num_processes on the info of a registered pidfile.

    Raises KeyError if pidfile_id has not been registered.
    """
    pidfile_info = self._registered_pidfile_info[pidfile_id]
    pidfile_info.num_processes = num_processes
showard | f85a0b7 | 2009-10-07 20:48:45 +0000 | [diff] [blame] | 812 | |
| 813 | |
def get_pidfile_contents(self, pidfile_id, use_second_read=False):
    """
    Retrieve a PidfileContents object for the given pidfile_id. If
    use_second_read is True, use results that were read after the processes
    were checked, instead of before.

    The pidfile's age is reset as a side effect. An empty PidfileContents
    is returned when nothing is recorded for pidfile_id.
    """
    self._reset_pidfile_age(pidfile_id)
    pidfile_map = (self._pidfiles_second_read if use_second_read
                   else self._pidfiles)
    return pidfile_map.get(pidfile_id, PidfileContents())
| 826 | |
| 827 | |
def is_process_running(self, process):
    """
    Check if the given process is in the running process list.

    A process missing from self._process_set is still reported as running
    when its (hostname, pid) pair appears in self._all_processes; that
    case is logged as an error.
    """
    if process in self._process_set:
        return True

    drone_pid = (process.hostname, process.pid)
    if drone_pid not in self._all_processes:
        return False

    logging.error('Process %s found, but not an autoserv process. '
                  'Is %s', process, self._all_processes[drone_pid])
    return True
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 842 | |
| 843 | |
def get_temporary_path(self, base_name):
    """
    Get a new temporary path guaranteed to be unique across all drones
    for this scheduler execution.

    The path is base_name plus an incrementing counter, placed under
    drone_utility._TEMPORARY_DIRECTORY.
    """
    self._temporary_path_counter += 1
    unique_name = '%s.%s' % (base_name, self._temporary_path_counter)
    return os.path.join(drone_utility._TEMPORARY_DIRECTORY, unique_name)
| 852 | |
| 853 | |
def absolute_path(self, path, on_results_repository=False):
    """
    Join path onto a base directory.

    @param path: path to resolve.
    @param on_results_repository: if True, use self._results_dir as the
            base; otherwise use the drone results directory under
            drones.AUTOTEST_INSTALL_DIR.
    """
    if not on_results_repository:
        return os.path.join(drones.AUTOTEST_INSTALL_DIR,
                            _DRONE_RESULTS_DIR_SUFFIX, path)
    return os.path.join(self._results_dir, path)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 861 | |
| 862 | |
def _copy_results_helper(self, process, source_path, destination_path,
                         to_results_repository=False):
    """
    Copy source_path to destination_path from the drone of process.

    @param process: process whose drone holds the source.
    @param source_path: source path, resolved on the drone.
    @param destination_path: destination path, resolved on the results
            repository or on the drone depending on to_results_repository.
    @param to_results_repository: if True, send the file to the results
            drone (the transfer may fail); otherwise queue a copy on the
            source drone itself.
    """
    logging.debug('_copy_results_helper. process: %s, source_path: %s, '
                  'destination_path: %s, to_results_repository: %s',
                  process, source_path, destination_path,
                  to_results_repository)
    full_source = self.absolute_path(source_path)
    full_destination = self.absolute_path(
            destination_path, on_results_repository=to_results_repository)
    source_drone = self._get_drone_for_process(process)
    if not to_results_repository:
        source_drone.queue_call('copy_file_or_directory', full_source,
                                full_destination)
        return
    source_drone.send_file_to(self._results_drone, full_source,
                              full_destination, can_fail=True)
| 879 | |
| 880 | |
def copy_to_results_repository(self, process, source_path,
                               destination_path=None):
    """
    Copy results from the given process at source_path to destination_path
    in the results repository.

    Nothing is copied when ENABLE_ARCHIVING is off.

    Results of Special Agent Tasks (Cleanup, Verify, Repair), i.e. paths
    starting with HOSTS_JOB_SUBDIR, are only copied if the
    copy_task_results_back flag has been set to True inside
    global_config.ini

    Paths ending with PARSE_LOG are only copied if the copy_parse_log_back
    flag in global_config.ini has been set to True.

    @param process: process whose drone holds the results.
    @param source_path: path of the results on the drone.
    @param destination_path: path in the results repository; defaults to
            source_path.
    """
    if not ENABLE_ARCHIVING:
        return

    config = global_config.global_config
    copy_task_results_back = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'copy_task_results_back',
            type=bool)
    copy_parse_log_back = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'copy_parse_log_back',
            type=bool)
    is_special_task = source_path.startswith(HOSTS_JOB_SUBDIR)
    is_parse_log = source_path.endswith(PARSE_LOG)

    if is_special_task and not copy_task_results_back:
        return
    if is_parse_log and not copy_parse_log_back:
        return

    if destination_path is None:
        destination_path = source_path
    self._copy_results_helper(process, source_path, destination_path,
                              to_results_repository=True)
| 911 | |
def _copy_to_results_repository(self, process, source_path,
                                destination_path=None):
    """
    Copy results from the given process at source_path to destination_path
    in the results repository, without special task handling.

    destination_path defaults to source_path.
    """
    target_path = (source_path if destination_path is None
                   else destination_path)
    self._copy_results_helper(process, source_path, target_path,
                              to_results_repository=True)
| 922 | |
| 923 | |
def copy_results_on_drone(self, process, source_path, destination_path):
    """
    Copy a results directory from one place to another on the drone.

    The drone is the one associated with the given process.
    """
    copy_args = (process, source_path, destination_path)
    self._copy_results_helper(*copy_args)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 929 | |
| 930 | |
def _write_attached_files(self, results_dir, drone):
    """
    Queue a write on drone for every file attached to results_dir.

    The attachments are removed from self._attached_files, so each one is
    queued at most once.

    @param results_dir: results directory the files were attached to.
    @param drone: drone on which the 'write_to_file' calls are queued.
    """
    attached_files = self._attached_files.pop(results_dir, {})
    # items() exists on both Python 2 and Python 3 dicts; iteritems() is
    # Python 2 only.
    for file_path, contents in attached_files.items():
        drone.queue_call('write_to_file', self.absolute_path(file_path),
                         contents)
| 936 | |
| 937 | |
def attach_file_to_execution(self, results_dir, file_contents,
                             file_path=None):
    """
    When the process for the results directory is executed, the given file
    contents will be placed in a file on the drone. Returns the path at
    which the file will be placed.

    A temporary path is generated when file_path is not given. Attaching
    the same path twice to one results directory trips an assertion.
    """
    file_path = file_path or self.get_temporary_path('attach')
    pending_files = self._attached_files.setdefault(results_dir, {})
    assert file_path not in pending_files
    pending_files[file_path] = file_contents
    return file_path
| 951 | |
| 952 | |
def write_lines_to_file(self, file_path, lines, paired_with_process=None):
    """
    Write the given lines (as a list of strings) to a file. If
    paired_with_process is given, the file will be written on the drone
    running the given Process. Otherwise, the file will be written to the
    results repository.

    Lines are joined with newlines and a trailing newline is appended. The
    write is queued on the drone.
    """
    file_contents = '\n'.join(lines) + '\n'
    write_on_drone = bool(paired_with_process)
    if write_on_drone:
        target_drone = self._get_drone_for_process(paired_with_process)
    else:
        target_drone = self._results_drone
    full_path = self.absolute_path(
            file_path, on_results_repository=not write_on_drone)
    target_drone.queue_call('write_to_file', full_path, file_contents)
jamesren | c44ae99 | 2010-02-19 00:12:54 +0000 | [diff] [blame] | 970 | |
| 971 | |
# Module-wide DroneManager, created on first use by instance().
_the_instance = None

def instance():
    """Return the shared DroneManager, creating it on the first call."""
    if _the_instance is not None:
        return _the_instance
    _set_instance(DroneManager())
    return _the_instance
| 978 | |
| 979 | |
def _set_instance(instance):  # usable for testing
    """Replace the module-wide object returned by instance()."""
    global _the_instance
    _the_instance = instance