Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 1 | import heapq |
| 2 | import os |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 3 | import logging |
Michael Liang | da8c60a | 2014-06-03 13:24:51 -0700 | [diff] [blame] | 4 | |
| 5 | import common |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame^] | 6 | from autotest_lib.client.common_lib import error |
| 7 | from autotest_lib.client.common_lib import global_config |
| 8 | from autotest_lib.client.common_lib import utils |
| 9 | from autotest_lib.scheduler import drones |
| 10 | from autotest_lib.scheduler import drone_utility |
MK Ryu | 7911ad5 | 2015-12-18 11:40:04 -0800 | [diff] [blame] | 11 | from autotest_lib.scheduler import drone_task_queue |
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 12 | from autotest_lib.scheduler import scheduler_config |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 13 | from autotest_lib.scheduler import thread_lib |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 14 | |
Dan Shi | 5e2efb7 | 2017-02-07 11:40:23 -0800 | [diff] [blame^] | 15 | try: |
| 16 | from chromite.lib import metrics |
| 17 | except ImportError: |
| 18 | metrics = utils.metrics_mock |
| 19 | |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 20 | |
# Name of the directory, created under the drone_installation_directory on a
# drone, in which results are placed.
_DRONE_RESULTS_DIR_SUFFIX = 'results'

# Unique sentinel object; see execute_command().
WORKING_DIRECTORY = object()


# Pidfile names.
AUTOSERV_PID_FILE = '.autoserv_execute'
CRASHINFO_PID_FILE = '.collect_crashinfo_execute'
PARSER_PID_FILE = '.parser_execute'
ARCHIVER_PID_FILE = '.archiver_execute'

ALL_PIDFILE_NAMES = (
    AUTOSERV_PID_FILE,
    CRASHINFO_PID_FILE,
    PARSER_PID_FILE,
    ARCHIVER_PID_FILE,
)

# When true, drone calls go through thread_lib.ThreadedTaskQueue; otherwise
# drone_task_queue.DroneTaskQueue is used.
_THREADED_DRONE_MANAGER = global_config.global_config.get_config_value(
    scheduler_config.CONFIG_SECTION,
    'threaded_drone_manager',
    type=bool,
    default=True)
| 39 | |
jamesren | c44ae99 | 2010-02-19 00:12:54 +0000 | [diff] [blame] | 40 | |
class DroneManagerError(Exception):
    """Error raised by the drone manager."""
| 43 | |
| 44 | |
class CustomEquals(object):
    """Base class deriving equality and hashing from the value of _id().

    Subclasses must override _id() to return a hashable identity value.
    """
    def _id(self):
        """Return the identity value; must be overridden by subclasses."""
        raise NotImplementedError


    def __hash__(self):
        return hash(self._id())


    def __eq__(self, other):
        # Only objects that are instances of this object's own type are
        # compared by identity value; anything else is left to Python's
        # default handling.
        if isinstance(other, type(self)):
            return self._id() == other._id()
        return NotImplemented


    def __ne__(self, other):
        equal = (self == other)
        return not equal
| 62 | |
| 63 | |
class Process(CustomEquals):
    """A process on a host, identified by its (hostname, pid) pair.

    The ppid is stored but takes no part in equality or hashing.
    """
    def __init__(self, hostname, pid, ppid=None):
        self.hostname, self.pid, self.ppid = hostname, pid, ppid


    def _id(self):
        return (self.hostname, self.pid)


    def __str__(self):
        return '%s/%s' % (self.hostname, self.pid)


    def __repr__(self):
        # Default object repr followed by the "<hostname/pid>" form.
        default_repr = super(Process, self).__repr__()
        return default_repr + '<%s>' % self
| 80 | |
| 81 | |
class PidfileId(CustomEquals):
    """Identifier for a pidfile; two ids are equal when their paths are."""
    def __init__(self, path):
        self.path = path


    def __str__(self):
        return str(self.path)


    def _id(self):
        # The path itself is the identity used for equality and hashing.
        return self.path
| 93 | |
| 94 | |
class _PidfileInfo(object):
    """Bookkeeping for one pidfile; class attributes act as defaults."""
    # process count used when summing a drone's active processes;
    # None means it is not counted
    num_processes = None
    # incremented by _drop_old_pidfiles() each time it runs
    age = 0
| 98 | |
| 99 | |
class PidfileContents(object):
    """Parsed contents of a pidfile.

    All fields default to None; _parse_pidfile() fills in whatever the
    pidfile contained.
    """
    process = None
    exit_status = None
    num_tests_failed = None

    def is_invalid(self):
        """Return False; this class represents successfully parsed contents."""
        return False


    def is_running(self):
        """Return True if a process was recorded and it has not exited yet.

        An exit status of 0 means the process finished, so "has not exited"
        must be tested with `is None` rather than by truthiness; this also
        matches the check in _compute_active_processes().
        """
        return self.process is not None and self.exit_status is None
| 111 | |
| 112 | |
class InvalidPidfile(object):
    """Stand-in for pidfile contents that could not be parsed.

    Exposes the same fields and predicates as PidfileContents so callers can
    treat both uniformly; the fields always keep their None defaults here.
    """
    process = None
    exit_status = None
    num_tests_failed = None


    def __init__(self, error):
        # description of what was wrong with the pidfile
        self.error = error


    def __str__(self):
        return self.error


    def is_invalid(self):
        return True


    def is_running(self):
        return False
| 133 | |
| 134 | |
class _DroneHeapWrapper(object):
    """Wrapper to compare drones based on used_capacity().

    These objects can be used to keep a heap of drones by capacity.
    """
    def __init__(self, drone):
        self.drone = drone


    def __lt__(self, other):
        # heapq orders its items with "<". Python 3 never calls __cmp__, so
        # the ordering has to be available as a rich comparison as well.
        assert isinstance(other, _DroneHeapWrapper)
        return self.drone.used_capacity() < other.drone.used_capacity()


    def __cmp__(self, other):
        # Kept for Python 2 callers; written without the cmp() builtin,
        # which no longer exists on Python 3.
        assert isinstance(other, _DroneHeapWrapper)
        mine = self.drone.used_capacity()
        theirs = other.drone.used_capacity()
        return (mine > theirs) - (mine < theirs)
| 147 | |
| 148 | |
Simran Basi | cced309 | 2012-08-02 15:09:23 -0700 | [diff] [blame] | 149 | class BaseDroneManager(object): |
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 150 | """ |
| 151 | This class acts as an interface from the scheduler to drones, whether it be |
| 152 | only a single "drone" for localhost or multiple remote drones. |
| 153 | |
| 154 | All paths going into and out of this class are relative to the full results |
| 155 | directory, except for those returned by absolute_path(). |
| 156 | """ |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 157 | |
| 158 | |
| 159 | # Minimum time to wait before next email |
| 160 | # about a drone hitting process limit is sent. |
| 161 | NOTIFY_INTERVAL = 60 * 60 * 24 # one day |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 162 | _STATS_KEY = 'drone_manager' |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 163 | |
Aviv Keshet | 99e6adb | 2016-07-14 16:35:32 -0700 | [diff] [blame] | 164 | _ACTIVE_PROCESS_GAUGE = metrics.Gauge( |
| 165 | 'chromeos/autotest/drone/active_processes') |
| 166 | |
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 167 | |
def __init__(self):
    """Create empty bookkeeping state and the drone refresh task queue."""
    # absolute path of base results dir
    self._results_dir = None
    # drone used as the results repository; assigned in initialize()
    self._results_drone = None
    # maps hostname to Drone object
    self._drones = {}
    # heapq of _DroneHeapWrappers
    self._drone_queue = []

    # holds Process objects
    self._process_set = set()
    # holds the list of all processes running on all drones
    self._all_processes = {}

    # maps PidfileId to PidfileContents
    self._pidfiles = {}
    # same as _pidfiles
    self._pidfiles_second_read = {}
    # maps PidfileId to _PidfileInfo
    self._registered_pidfile_info = {}

    # used to generate unique temporary paths
    self._temporary_path_counter = 0
    # maps results dir to dict mapping file path to contents
    self._attached_files = {}

    # A task queue used to refresh drones; threaded (asynchronous) when
    # _THREADED_DRONE_MANAGER is set.
    if _THREADED_DRONE_MANAGER:
        queue_name = '%s.refresh_queue' % self._STATS_KEY
        refresh_queue = thread_lib.ThreadedTaskQueue(name=queue_name)
    else:
        refresh_queue = drone_task_queue.DroneTaskQueue()
    self._refresh_task_queue = refresh_queue
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 196 | |
| 197 | |
def initialize(self, base_results_dir, drone_hostnames,
               results_repository_hostname):
    """Add all drones and set up the results repository drone.

    @param base_results_dir: Stored as the base results directory.
    @param drone_hostnames: Hostnames of the drones to add.
    @param results_repository_hostname: Hostname of the results repository.

    @raises DroneManagerError: If no drone could be added.
    """
    self._results_dir = base_results_dir

    for drone_hostname in drone_hostnames:
        self._add_drone(drone_hostname)

    # all drones failed to initialize
    if not self._drones:
        raise DroneManagerError('No valid drones found')

    self.refresh_drone_configs()

    logging.info('Using results repository on %s',
                 results_repository_hostname)
    # don't initialize() the results drone - we don't want to clear out any
    # directories and we don't need to kill any processes
    self._results_drone = drones.get_drone(results_repository_hostname)
    install_dir = global_config.global_config.get_config_value(
        scheduler_config.CONFIG_SECTION,
        'results_host_installation_directory', default=None)
    if install_dir:
        self._results_drone.set_autotest_install_dir(install_dir)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 222 | |
| 223 | |
def reinitialize_drones(self):
    """Call 'initialize' on every drone, timing each call per drone."""
    timer_name = ('chromeos/autotest/drone_manager/'
                  'reinitialize_drones_duration')
    for drone in self.get_drones():
        timer_fields = {'drone': drone.hostname}
        with metrics.SecondsTimer(timer_name, fields=timer_fields):
            drone.call('initialize', self._results_dir)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 231 | |
| 232 | |
def shutdown(self):
    """Shut down every drone returned by get_drones()."""
    for active_drone in self.get_drones():
        active_drone.shutdown()
| 236 | |
| 237 | |
def _get_max_pidfile_refreshes(self):
    """
    Normally refresh() is called on every monitor_db.Dispatcher.tick().

    The value is read from the 'max_pidfile_refreshes' option of the
    scheduler config section and defaults to 2000.

    @returns: The number of refresh() calls before we forget a pidfile.
    """
    return global_config.global_config.get_config_value(
        scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes',
        type=int, default=2000)
| 248 | |
| 249 | |
def _add_drone(self, hostname):
    """Look up the drone for hostname, register it and initialize it.

    Nothing is registered when drones.get_drone() returns a false value.

    @param hostname: Hostname of the drone to add.
    """
    logging.info('Adding drone %s', hostname)
    new_drone = drones.get_drone(hostname)
    if not new_drone:
        return
    self._drones[new_drone.hostname] = new_drone
    new_drone.call('initialize', self.absolute_path(''))
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 256 | |
| 257 | |
def _remove_drone(self, hostname):
    """Forget the drone registered under hostname, if there is one."""
    if hostname in self._drones:
        del self._drones[hostname]
| 260 | |
| 261 | |
def refresh_drone_configs(self):
    """
    Reread global config options for all drones.

    For every registered drone this sets enabled, max_processes and
    allowed_users. The values come from the server database attributes when
    server_manager_utils.use_server_db() is true, and otherwise from the
    '<hostname>_disabled', '<hostname>_max_processes' and '<hostname>_users'
    options of the scheduler config section. Afterwards the drone queue is
    re-heapified and the notification record is cleared.
    """
    # Import server_manager_utils is delayed rather than at the beginning of
    # this module. The reason is that test_that imports drone_manager when
    # importing autoserv_utils. The import is done before test_that sets up
    # django (test_that only sets up django in setup_local_afe, since it's
    # not needed when test_that runs the test on lab duts through the :lab:
    # option). Therefore, if server_manager_utils is imported at the
    # beginning of this module, test_that will fail since django is not
    # set up yet.
    from autotest_lib.site_utils import server_manager_utils
    config = global_config.global_config
    section = scheduler_config.CONFIG_SECTION
    config.parse_config_file()
    # items() instead of iteritems(): the latter exists on Python 2 only.
    for hostname, drone in self._drones.items():
        if server_manager_utils.use_server_db():
            server = server_manager_utils.get_servers(hostname=hostname)[0]
            attributes = dict((a.attribute, a.value)
                              for a in server.attributes.all())
            drone.enabled = (
                    int(attributes.get('disabled', 0)) == 0)
            drone.max_processes = int(
                    attributes.get(
                        'max_processes',
                        scheduler_config.config.max_processes_per_drone))
            allowed_users = attributes.get('users', None)
        else:
            disabled = config.get_config_value(
                    section, '%s_disabled' % hostname, default='')
            drone.enabled = not bool(disabled)
            drone.max_processes = config.get_config_value(
                    section, '%s_max_processes' % hostname, type=int,
                    default=scheduler_config.config.max_processes_per_drone)

            allowed_users = config.get_config_value(
                    section, '%s_users' % hostname, default=None)
        # allowed_users is a whitespace separated string of user names; an
        # empty or missing value is stored as None.
        if allowed_users:
            drone.allowed_users = set(allowed_users.split())
        else:
            drone.allowed_users = None
        logging.info('Drone %s.max_processes: %s', hostname,
                     drone.max_processes)
        logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
        logging.info('Drone %s.allowed_users: %s', hostname,
                     drone.allowed_users)
        logging.info('Drone %s.support_ssp: %s', hostname,
                     drone.support_ssp)

    self._reorder_drone_queue() # max_processes may have changed
    # Clear notification record about reaching max_processes limit.
    self._notify_record = {}
showard | 418785b | 2009-11-23 20:19:59 +0000 | [diff] [blame] | 315 | |
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 316 | |
def get_drones(self):
    """Return an iterator over all registered Drone objects.

    The iterator walks a snapshot of the current drones, so adding or
    removing a drone while iterating does not invalidate it. Unlike
    dict.itervalues(), this also works on Python 3.
    """
    return iter(list(self._drones.values()))
showard | c5afc46 | 2009-01-13 00:09:39 +0000 | [diff] [blame] | 319 | |
| 320 | |
def cleanup_orphaned_containers(self):
    """Queue a cleanup_orphaned_containers call on every drone.

    The call is only queued here, not executed.
    """
    for target in self._drones.values():
        logging.info('Queue cleanup_orphaned_containers at %s',
                     target.hostname)
        target.queue_call('cleanup_orphaned_containers')
Dan Shi | c458f66 | 2015-04-29 12:12:38 -0700 | [diff] [blame] | 328 | |
| 329 | |
def _get_drone_for_process(self, process):
    """Return the drone registered under the process's hostname.

    @raises KeyError: If no drone is registered under that hostname.
    """
    hostname = process.hostname
    return self._drones[hostname]
| 332 | |
| 333 | |
def _get_drone_for_pidfile_id(self, pidfile_id):
    """Return the drone that runs the process recorded in the pidfile.

    The pidfile contents must already contain a process.
    """
    recorded_process = self.get_pidfile_contents(pidfile_id).process
    assert recorded_process is not None
    return self._get_drone_for_process(recorded_process)
| 338 | |
| 339 | |
def _drop_old_pidfiles(self):
    """Age every registered pidfile and unregister the ones that are too old.

    A pidfile whose age already exceeds _get_max_pidfile_refreshes() is
    logged as leaked and unregistered; every other pidfile has its age
    incremented by one.
    """
    # The limit does not change while looping, so read it only once.
    max_refreshes = self._get_max_pidfile_refreshes()
    # Iterate over a copy since the dict is modified in
    # unregister_pidfile(); on Python 3 items() alone is a live view.
    for pidfile_id, info in list(self._registered_pidfile_info.items()):
        if info.age > max_refreshes:
            logging.warning('dropping leaked pidfile %s', pidfile_id)
            self.unregister_pidfile(pidfile_id)
        else:
            info.age += 1
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 348 | |
| 349 | |
def _reset(self):
    """Replace the per-refresh state with fresh, empty containers."""
    self._drone_queue = []
    self._pidfiles = {}
    self._pidfiles_second_read = {}
    self._all_processes = {}
    self._process_set = set()
| 356 | |
| 357 | |
def _parse_pidfile(self, drone, raw_contents):
    """Parse raw pidfile contents.

    @param drone: The drone on which this pidfile was found.
    @param raw_contents: The raw contents of a pidfile: up to three lines
            holding, in order, pid, exit_status and num_tests_failed.

    @returns: A PidfileContents object (with all fields None when
            raw_contents is empty), or an InvalidPidfile when there are
            more than three lines or a line is not an integer.
    """
    contents = PidfileContents()
    if not raw_contents:
        return contents
    lines = raw_contents.splitlines()
    if len(lines) > 3:
        return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                              (len(lines), lines))
    try:
        pid = int(lines[0])
        contents.process = Process(drone.hostname, pid)
        # if len(lines) == 2, assume we caught Autoserv between writing
        # exit_status and num_failed_tests, so just ignore it and wait for
        # the next cycle
        if len(lines) == 3:
            contents.exit_status = int(lines[1])
            contents.num_tests_failed = int(lines[2])
    except ValueError as exc:
        # "except ... as" is valid on Python 2.6+ and on Python 3.
        return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

    return contents
| 385 | |
| 386 | |
def _process_pidfiles(self, drone, pidfiles, store_in_dict):
    """Parse raw pidfile contents and store the results by PidfileId.

    @param drone: The drone the pidfiles were read from.
    @param pidfiles: Dict mapping pidfile path to raw pidfile contents.
    @param store_in_dict: Dict updated in place, mapping PidfileId to the
            result of _parse_pidfile().
    """
    # items() instead of iteritems(): the latter exists on Python 2 only.
    for pidfile_path, raw_contents in pidfiles.items():
        pidfile_id = PidfileId(pidfile_path)
        store_in_dict[pidfile_id] = self._parse_pidfile(drone, raw_contents)
| 392 | |
| 393 | |
def _add_process(self, drone, process_info):
    """Record a Process built from process_info in the process set.

    @param drone: The drone the process runs on.
    @param process_info: Dict whose 'pid' and 'ppid' entries are
            convertible to int.
    """
    hostname = drone.hostname
    pid = int(process_info['pid'])
    ppid = int(process_info['ppid'])
    self._process_set.add(Process(hostname, pid, ppid))
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 398 | |
| 399 | |
def _add_autoserv_process(self, drone, process_info):
    """Record an autoserv process, but only if it is a root one.

    @param drone: The drone the process runs on.
    @param process_info: Process info dict whose 'comm' is 'autoserv'.
    """
    assert process_info['comm'] == 'autoserv'
    # only root autoserv processes have pgid == pid
    if process_info['pgid'] == process_info['pid']:
        self._add_process(drone, process_info)
showard | 0205a3e | 2009-01-16 03:03:50 +0000 | [diff] [blame] | 406 | |
| 407 | |
def _enqueue_drone(self, drone):
    """Push drone, wrapped for capacity ordering, onto the drone heap."""
    queue = self._drone_queue
    heapq.heappush(queue, _DroneHeapWrapper(drone))
| 410 | |
| 411 | |
def _reorder_drone_queue(self):
    """Restore the heap ordering of the drone queue in place."""
    queue = self._drone_queue
    heapq.heapify(queue)
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 414 | |
| 415 | |
def _compute_active_processes(self, drone):
    """Recompute drone.active_processes and report it to the gauge.

    A pidfile counts when it has no exit status yet and its process runs on
    this drone; it contributes the num_processes of its registered
    _PidfileInfo unless that is None.

    @param drone: The Drone object to update.
    """
    drone.active_processes = 0
    # items() instead of iteritems(): the latter exists on Python 2 only.
    for pidfile_id, contents in self._pidfiles.items():
        is_running = contents.exit_status is None
        on_this_drone = (contents.process
                         and contents.process.hostname == drone.hostname)
        if is_running and on_this_drone:
            info = self._registered_pidfile_info[pidfile_id]
            if info.num_processes is not None:
                drone.active_processes += info.num_processes
    self._ACTIVE_PROCESS_GAUGE.set(
            drone.active_processes,
            fields={'drone_hostname': drone.hostname})
showard | d119565 | 2009-12-08 22:21:02 +0000 | [diff] [blame] | 429 | |
| 430 | |
def _check_drone_process_limit(self, drone):
    """
    Report how close |drone| is to its process limit as a metric.

    The reported value is active_processes divided by max_processes, or 100
    when max_processes is zero.

    @param drone: A Drone object.
    """
    # NOTE(review): the normal value is a ratio while the fallback is 100;
    # the units look inconsistent - confirm which one consumers expect.
    try:
        load = float(drone.active_processes) / drone.max_processes
    except ZeroDivisionError:
        load = 100
    load_metric = metrics.Float(
            'chromeos/autotest/drone/active_process_percentage')
    load_metric.set(load, fields={'drone_hostname': drone.hostname})
Fang Deng | 9a0c6c3 | 2013-09-04 15:34:55 -0700 | [diff] [blame] | 443 | |
def trigger_refresh(self):
    """Triggers a drone manager refresh.

    Resets the per-refresh state, drops pidfiles that are too old, queues a
    'refresh' call with all registered pidfile paths on every drone and
    hands the drones to the refresh task queue without waiting for them.

    @raises DroneManagerError: If a drone has un-executed calls.
        Since they will get clobbered when we queue refresh calls.
    """
    self._reset()
    self._drop_old_pidfiles()
    pidfile_paths = []
    for pidfile_id in self._registered_pidfile_info:
        pidfile_paths.append(pidfile_id.path)

    all_drones = list(self.get_drones())
    for drone in all_drones:
        pending_calls = drone.get_calls()
        if pending_calls:
            pending_names = [str(call) for call in pending_calls]
            raise DroneManagerError('Drone %s has un-executed calls: %s '
                                    'which might get corrupted through '
                                    'this invocation' %
                                    (drone, pending_names))
        drone.queue_call('refresh', pidfile_paths)

    logging.info("Invoking drone refresh.")
    timer_name = 'chromeos/autotest/drone_manager/trigger_refresh_duration'
    with metrics.SecondsTimer(timer_name):
        self._refresh_task_queue.execute(all_drones, wait=False)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 467 | |
Prashanth B | 340fd1e | 2014-06-22 12:44:10 -0700 | [diff] [blame] | 468 | |
def sync_refresh(self):
    """Complete the drone refresh started by trigger_refresh.

    Waits for all drone threads then refreshes internal datastructures
    with drone process information.
    """

    # This gives us a dictionary like what follows:
    # {drone: [{'pidfiles': (raw contents of pidfile paths),
    #           'all_processes': (all process info from ps),
    #           'parse_processes': (parse process info from ps),
    #           'pidfiles_second_read': (pidfile contents, again),}]
    #  drone2: ...}
    # Only the keys read below are listed; the dict may hold more.
    # The values of each drone are only a list because this adheres to the
    # drone utility interface (each call is executed and its results are
    # placed in a list, but since we never couple the refresh calls with
    # any other call, this list will always contain a single dict).
    with metrics.SecondsTimer(
            'chromeos/autotest/drone_manager/sync_refresh_duration'):
        all_results = self._refresh_task_queue.get_results()
    logging.info("Drones refreshed.")

    # The loop below goes through and parses pidfile contents. Pidfiles
    # are used to track autoserv execution, and will always contain at
    # most 3 lines of the following: pid, exit code, number of tests
    # failed. Each pidfile is identified by a PidfileId object, which
    # contains a unique pidfile path (unique because it contains the job
    # id) making it hashable.
    # All pidfiles are stored in the drone manager's _pidfiles dict as:
    #   {pidfile_id: pidfile_contents(Process(drone, pid),
    #                                 exit_code, num_tests_failed)}
    # In handle agents, each agent knows its pidfile_id, and uses this
    # to retrieve the refreshed contents of its pidfile via the
    # PidfileRunMonitor (through its tick) before making decisions. If
    # the agent notices that its process has exited, it unregisters the
    # pidfile from the drone manager's _registered_pidfile_info dict
    # through its epilog.
    # items() instead of iteritems(): the latter exists on Python 2 only.
    for drone, results_list in all_results.items():
        results = results_list[0]

        for process_info in results['all_processes']:
            if process_info['comm'] == 'autoserv':
                self._add_autoserv_process(drone, process_info)
            drone_pid = drone.hostname, int(process_info['pid'])
            self._all_processes[drone_pid] = process_info

        for process_info in results['parse_processes']:
            self._add_process(drone, process_info)

        self._process_pidfiles(drone, results['pidfiles'], self._pidfiles)
        self._process_pidfiles(drone, results['pidfiles_second_read'],
                               self._pidfiles_second_read)

        self._compute_active_processes(drone)
        # Only enabled drones are put back on the drone queue.
        if drone.enabled:
            self._enqueue_drone(drone)
        self._check_drone_process_limit(drone)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 527 | |
| 528 | |
def refresh(self):
    """Refresh all drones: trigger the refresh, then sync its results.

    Both steps run under a single duration timer.
    """
    timer_name = 'chromeos/autotest/drone_manager/refresh_duration'
    with metrics.SecondsTimer(timer_name):
        self.trigger_refresh()
        self.sync_refresh()
| 535 | |
| 536 | |
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/drone_manager/execute_actions_duration')
def execute_actions(self):
    """
    Called at the end of a scheduler cycle to execute all queued actions
    on drones.

    If the results drone fails with an AutoservError, a failure counter is
    incremented and the results drone's call queue is cleared.
    """
    # Invoke calls queued on all drones since the last call to execute
    # and wait for them to return.
    if _THREADED_DRONE_MANAGER:
        queue_name = '%s.execute_queue' % self._STATS_KEY
        task_queue = thread_lib.ThreadedTaskQueue(name=queue_name)
    else:
        task_queue = drone_task_queue.DroneTaskQueue()
    task_queue.execute(self._drones.values())

    try:
        self._results_drone.execute_queued_calls()
    except error.AutoservError:
        failure_metric = 'chromeos/autotest/errors/results_repository_failed'
        failure_fields = {'drone_hostname': self._results_drone.hostname}
        metrics.Counter(failure_metric).increment(fields=failure_fields)
        self._results_drone.clear_call_queue()
| 560 | |
| 561 | |
def get_orphaned_autoserv_processes(self):
    """
    Returns a set of Process objects for orphaned processes only.

    A process in the tracked process set is treated as orphaned when its
    ppid is 1.
    """
    orphans = set()
    for candidate in self._process_set:
        if candidate.ppid == 1:
            orphans.add(candidate)
    return orphans
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 568 | |
| 569 | |
def kill_process(self, process):
    """
    Kill the given process.

    The kill is queued as a 'kill_process' call on the drone returned by
    _get_drone_for_process(process); it is not executed here.
    """
    logging.info('killing %s', process)
    owning_drone = self._get_drone_for_process(process)
    owning_drone.queue_call('kill_process', process)
| 577 | |
| 578 | |
def _ensure_directory_exists(self, path):
    """
    Create path (including missing parents) unless something already exists
    there.

    @param path: filesystem path that should exist when this returns.
    @raises OSError: if creation fails and path still does not exist.
    """
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Something else may have created the path between the existence
        # check above and makedirs(); only propagate the error when the
        # path is genuinely still missing.
        if not os.path.exists(path):
            raise
| 582 | |
| 583 | |
def total_running_processes(self):
    """Return the sum of active_processes over the drones from get_drones()."""
    running_total = 0
    for drone in self.get_drones():
        running_total += drone.active_processes
    return running_total
| 586 | |
| 587 | |
def max_runnable_processes(self, username, drone_hostnames_allowed):
    """
    Return the maximum number of processes that can be run (in a single
    execution) given the current load on drones.

    Only drones usable by username and, when a hostname list is given,
    named in that list are considered.  The result is the largest
    (max_processes - active_processes) among them, never below 0; it is 0
    when no drone qualifies.

    @param username: login of user to run a process. may be None.
    @param drone_hostnames_allowed: list of drones that can be used. May be
                                    None
    """
    # Seeding with 0 covers both "no usable drone" and "every usable drone
    # is over its limit".
    spare_capacities = [0]
    for wrapper in self._drone_queue:
        drone = wrapper.drone
        if not drone.usable_by(username):
            continue
        if (drone_hostnames_allowed is not None and
                drone.hostname not in drone_hostnames_allowed):
            continue
        spare_capacities.append(
                drone.max_processes - drone.active_processes)
    return max(spare_capacities)
showard | 324bf81 | 2009-01-20 23:23:38 +0000 | [diff] [blame] | 608 | |
| 609 | |
def _least_loaded_drone(self, drones):
    """
    Return the drone with the smallest used_capacity().

    Ties go to the drone that appears first.  drones must be a non-empty
    sequence.
    """
    least_loaded = drones[0]
    for candidate in drones[1:]:
        if not candidate.used_capacity() < least_loaded.used_capacity():
            continue
        least_loaded = candidate
    return least_loaded
| 616 | |
| 617 | |
def _choose_drone_for_execution(self, num_processes, username,
                                drone_hostnames_allowed,
                                require_ssp=False):
    """Choose a drone to execute command.

    Drones are popped off the drone queue one at a time and the first
    eligible drone with room for num_processes is chosen.  If no eligible
    drone has room, the least loaded eligible drone is used.  If there is
    no eligible drone at all and require_ssp is set, the least loaded drone
    that was rejected only for lacking server-side packaging is used.
    Every drone popped from the queue is re-enqueued before returning.

    @param num_processes: Number of processes needed for execution.
    @param username: Name of the user to execute the command.
    @param drone_hostnames_allowed: A list of names of drone allowed, or
                                    None to allow every drone.
    @param require_ssp: Require server-side packaging to execute the
                        command, default to False.

    @return: A drone object to be used for execution, or None when no
             drone qualifies.
    """
    examined_drones = []
    eligible_drones = []
    # Drones rejected only because they lack server-side packaging; kept
    # as a fallback when nothing else can run the command.
    drones_without_ssp = []
    chosen_drone = None
    while self._drone_queue:
        drone = heapq.heappop(self._drone_queue).drone
        examined_drones.append(drone)
        logging.info('Checking drone %s', drone.hostname)
        if not drone.usable_by(username):
            continue

        if (drone_hostnames_allowed is not None
                and drone.hostname not in drone_hostnames_allowed):
            logging.debug('Drone %s not allowed: ', drone.hostname)
            continue
        if require_ssp and not drone.support_ssp:
            logging.debug('Drone %s does not support server-side '
                          'packaging.', drone.hostname)
            drones_without_ssp.append(drone)
            continue

        eligible_drones.append(drone)

        if drone.active_processes + num_processes <= drone.max_processes:
            chosen_drone = drone
            break
        logging.info('Drone %s has %d active + %s requested > %s max',
                     drone.hostname, drone.active_processes, num_processes,
                     drone.max_processes)

    if not chosen_drone:
        if eligible_drones:
            # Every eligible drone is over capacity; settle for the least
            # loaded one and report the situation.
            summaries = ['%s %s/%s' % (eligible.hostname,
                                       eligible.active_processes,
                                       eligible.max_processes)
                         for eligible in eligible_drones]
            drone_summary = ','.join(summaries)
            logging.error('No drone has capacity to handle %d processes (%s) '
                          'for user %s', num_processes, drone_summary,
                          username)
            chosen_drone = self._least_loaded_drone(eligible_drones)
        elif require_ssp and drones_without_ssp:
            # No drone supports server-side packaging, choose the least
            # loaded of those that were skipped for that reason.
            chosen_drone = self._least_loaded_drone(drones_without_ssp)

    # Put back everything that was popped so the queue is complete again.
    for examined in examined_drones:
        self._enqueue_drone(examined)

    return chosen_drone
| 684 | |
| 685 | |
def _substitute_working_directory_into_command(self, command,
                                               working_directory):
    """
    Replace, in place, every item of command that is the WORKING_DIRECTORY
    sentinel with working_directory.
    """
    placeholder_positions = [position
                             for position, argument in enumerate(command)
                             if argument is WORKING_DIRECTORY]
    for position in placeholder_positions:
        command[position] = working_directory
| 691 | |
| 692 | |
def execute_command(self, command, working_directory, pidfile_name,
                    num_processes, log_file=None, paired_with_pidfile=None,
                    username=None, drone_hostnames_allowed=None):
    """
    Execute the given command, taken as an argv list.

    The command is queued on the selected drone rather than run
    immediately, and its pidfile is registered for monitoring.

    @param command: command to execute as a list. if any item is
            WORKING_DIRECTORY, the absolute path to the working directory
            will be substituted for it.  The list is modified in place.
    @param working_directory: directory in which the pidfile will be written
    @param pidfile_name: name of the pidfile this process will write
    @param num_processes: number of processes to account for from this
            execution
    @param log_file (optional): path (in the results repository) to hold
            command output.
    @param paired_with_pidfile (optional): a PidfileId for an
            already-executed process; the new process will execute on the
            same drone as the previous process.
    @param username (optional): login of the user responsible for this
            process.
    @param drone_hostnames_allowed (optional): hostnames of the drones that
                                               this command is allowed to
                                               execute on

    @return: the PidfileId of the pidfile the command will write.
    @raises DroneManagerError: if no drone is available for the command.
    """
    abs_working_directory = self.absolute_path(working_directory)
    if not log_file:
        log_file = self.get_temporary_path('execute')
    log_file = self.absolute_path(log_file)

    self._substitute_working_directory_into_command(command,
                                                    abs_working_directory)

    require_ssp = False
    if paired_with_pidfile:
        drone = self._get_drone_for_pidfile_id(paired_with_pidfile)
    else:
        require_ssp = '--require-ssp' in command
        drone = self._choose_drone_for_execution(
                num_processes, username, drone_hostnames_allowed,
                require_ssp=require_ssp)

    # Check for a missing drone before reading any drone attribute, so the
    # caller gets DroneManagerError rather than an AttributeError on None.
    if not drone:
        raise DroneManagerError('command failed; no drones available: %s'
                                % (command,))

    # Enable --warn-no-ssp option for autoserv to log a warning and run
    # the command without using server-side packaging.
    if require_ssp and not drone.support_ssp:
        command.append('--warn-no-ssp')

    logging.info("command = %s", command)
    logging.info('log file = %s:%s', drone.hostname, log_file)
    self._write_attached_files(working_directory, drone)
    drone.queue_call('execute_command', command, abs_working_directory,
                     log_file, pidfile_name)
    # Account for the new processes right away and restore queue order.
    drone.active_processes += num_processes
    self._reorder_drone_queue()

    pidfile_path = os.path.join(abs_working_directory, pidfile_name)
    pidfile_id = PidfileId(pidfile_path)
    self.register_pidfile(pidfile_id)
    self._registered_pidfile_info[pidfile_id].num_processes = num_processes
    return pidfile_id
| 754 | |
| 755 | |
def get_pidfile_id_from(self, execution_tag, pidfile_name):
    """
    Return the PidfileId for pidfile_name located under the absolute path
    of execution_tag.
    """
    execution_directory = self.absolute_path(execution_tag)
    return PidfileId(os.path.join(execution_directory, pidfile_name))
| 759 | |
| 760 | |
def register_pidfile(self, pidfile_id):
    """
    Indicate that the DroneManager should look for the given pidfile when
    refreshing.

    A pidfile that is not yet known gets a fresh _PidfileInfo entry; in
    either case its age is reset.
    """
    already_registered = pidfile_id in self._registered_pidfile_info
    if not already_registered:
        logging.info('monitoring pidfile %s', pidfile_id)
        self._registered_pidfile_info[pidfile_id] = _PidfileInfo()
    self._reset_pidfile_age(pidfile_id)
| 770 | |
| 771 | |
def _reset_pidfile_age(self, pidfile_id):
    """Set the age of a registered pidfile to 0; ignore unknown pidfiles."""
    if pidfile_id not in self._registered_pidfile_info:
        return
    self._registered_pidfile_info[pidfile_id].age = 0
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 775 | |
| 776 | |
def unregister_pidfile(self, pidfile_id):
    """
    Stop tracking the given pidfile.  Does nothing if it is not registered.
    """
    if pidfile_id not in self._registered_pidfile_info:
        return
    logging.info('forgetting pidfile %s', pidfile_id)
    del self._registered_pidfile_info[pidfile_id]
| 781 | |
| 782 | |
def declare_process_count(self, pidfile_id, num_processes):
    """
    Record num_processes on the info entry of a registered pidfile.

    The pidfile must already be registered; the entry is looked up
    directly.
    """
    pidfile_info = self._registered_pidfile_info[pidfile_id]
    pidfile_info.num_processes = num_processes
showard | f85a0b7 | 2009-10-07 20:48:45 +0000 | [diff] [blame] | 785 | |
| 786 | |
def get_pidfile_contents(self, pidfile_id, use_second_read=False):
    """
    Retrieve a PidfileContents object for the given pidfile_id. If
    use_second_read is True, use results that were read after the processes
    were checked, instead of before.

    Looking up a pidfile also resets its age.  A pidfile with no recorded
    contents yields a new, default PidfileContents.
    """
    self._reset_pidfile_age(pidfile_id)
    pidfile_map = (self._pidfiles_second_read if use_second_read
                   else self._pidfiles)
    return pidfile_map.get(pidfile_id, PidfileContents())
| 799 | |
| 800 | |
def is_process_running(self, process):
    """
    Check if the given process is in the running process list.

    A process missing from the tracked process set still counts as running
    when its (hostname, pid) pair appears among all known processes; that
    case is logged as an error.
    """
    if process in self._process_set:
        return True

    drone_pid = (process.hostname, process.pid)
    if drone_pid not in self._all_processes:
        return False

    logging.error('Process %s found, but not an autoserv process. '
                  'Is %s', process, self._all_processes[drone_pid])
    return True
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 815 | |
| 816 | |
def get_temporary_path(self, base_name):
    """
    Get a new temporary path guaranteed to be unique across all drones
    for this scheduler execution.

    Each call increments this manager's counter and appends it to
    base_name, under the drone utility's temporary directory.
    """
    self._temporary_path_counter += 1
    unique_name = '%s.%s' % (base_name, self._temporary_path_counter)
    return os.path.join(drone_utility._TEMPORARY_DIRECTORY, unique_name)
| 825 | |
| 826 | |
def absolute_path(self, path, on_results_repository=False):
    """
    Join path onto the appropriate results base directory.

    @param path: path relative to the results base directory.
    @param on_results_repository: if True, use this manager's results
            directory as the base; otherwise use the drone results
            directory under the autotest install directory.
    """
    if on_results_repository:
        return os.path.join(self._results_dir, path)
    drone_results_dir = os.path.join(drones.AUTOTEST_INSTALL_DIR,
                                     _DRONE_RESULTS_DIR_SUFFIX)
    return os.path.join(drone_results_dir, path)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 834 | |
| 835 | |
def _copy_results_helper(self, process, source_path, destination_path,
                         to_results_repository=False):
    """
    Copy results for process from source_path to destination_path.

    @param process: process whose drone holds the source files.
    @param source_path: source, relative to the drone results directory.
    @param destination_path: destination, relative to the results
            repository or to the drone results directory.
    @param to_results_repository: if True, send the files to the results
            drone (allowed to fail); otherwise queue a copy on the source
            drone itself.
    """
    logging.debug('_copy_results_helper. process: %s, source_path: %s, '
                  'destination_path: %s, to_results_repository: %s',
                  process, source_path, destination_path,
                  to_results_repository)
    full_source = self.absolute_path(source_path)
    full_destination = self.absolute_path(
            destination_path, on_results_repository=to_results_repository)
    source_drone = self._get_drone_for_process(process)
    if not to_results_repository:
        source_drone.queue_call('copy_file_or_directory', full_source,
                                full_destination)
        return
    source_drone.send_file_to(self._results_drone, full_source,
                              full_destination, can_fail=True)
| 852 | |
| 853 | |
def copy_to_results_repository(self, process, source_path,
                               destination_path=None):
    """
    Copy results from the given process at source_path to destination_path
    in the results repository.

    When destination_path is None, source_path is reused as the
    destination.
    """
    target_path = (source_path if destination_path is None
                   else destination_path)
    self._copy_results_helper(process, source_path, target_path,
                              to_results_repository=True)
| 864 | |
| 865 | |
def copy_results_on_drone(self, process, source_path, destination_path):
    """
    Copy a results directory from one place to another on the drone.

    Delegates to _copy_results_helper() with its default of not targeting
    the results repository.

    @param process: process used to look up the drone doing the copy.
    @param source_path: source path, relative to the drone results
            directory.
    @param destination_path: destination path, relative to the drone
            results directory.
    """
    self._copy_results_helper(process, source_path, destination_path)
showard | 170873e | 2009-01-07 00:22:26 +0000 | [diff] [blame] | 871 | |
| 872 | |
def _write_attached_files(self, results_dir, drone):
    """
    Queue a write_to_file call on drone for every file attached to
    results_dir.

    The attached files for results_dir are removed from the pending set as
    part of this call.
    """
    pending_files = self._attached_files.pop(results_dir, {})
    for relative_path, file_contents in pending_files.iteritems():
        absolute_file_path = self.absolute_path(relative_path)
        drone.queue_call('write_to_file', absolute_file_path, file_contents)
| 878 | |
| 879 | |
def attach_file_to_execution(self, results_dir, file_contents,
                             file_path=None):
    """
    When the process for the results directory is executed, the given file
    contents will be placed in a file on the drone.  Returns the path at
    which the file will be placed.

    If file_path is not given, a new temporary path is used.  Attaching
    the same path twice to one results directory is not allowed.
    """
    target_path = file_path or self.get_temporary_path('attach')
    files_for_execution = self._attached_files.setdefault(results_dir, {})
    assert target_path not in files_for_execution
    files_for_execution[target_path] = file_contents
    return target_path
| 893 | |
| 894 | |
def write_lines_to_file(self, file_path, lines, paired_with_process=None):
    """
    Write the given lines (as a list of strings) to a file.  If
    paired_with_process is given, the file will be written on the drone
    running the given Process.  Otherwise, the file will be written to the
    results repository.

    The write is queued on the target drone; lines are joined with
    newlines and a trailing newline is appended.
    """
    file_contents = '\n'.join(lines) + '\n'
    write_to_repository = not paired_with_process
    if write_to_repository:
        target_drone = self._results_drone
    else:
        target_drone = self._get_drone_for_process(paired_with_process)
    full_path = self.absolute_path(
            file_path, on_results_repository=write_to_repository)
    target_drone.queue_call('write_to_file', full_path, file_contents)
jamesren | c44ae99 | 2010-02-19 00:12:54 +0000 | [diff] [blame] | 912 | |
| 913 | |
# Base class for DroneManager, obtained through utils.import_site_class()
# from the site_drone_manager module.
# NOTE(review): presumably this resolves to BaseDroneManager when no
# site-specific module is present -- confirm against import_site_class.
SiteDroneManager = utils.import_site_class(
    __file__, 'autotest_lib.scheduler.site_drone_manager',
    'SiteDroneManager', BaseDroneManager)
| 917 | |
| 918 | |
# Concrete drone manager class instantiated by instance(); it adds nothing
# on top of SiteDroneManager.
class DroneManager(SiteDroneManager):
    pass
| 921 | |
| 922 | |
# Module-level DroneManager singleton; read through instance() and replaced
# through _set_instance().
_the_instance = None
| 924 | |
def instance():
    """Return the module-level DroneManager, creating it on first use."""
    if _the_instance is not None:
        return _the_instance
    _set_instance(DroneManager())
    return _the_instance
| 929 | |
| 930 | |
def _set_instance(instance): # usable for testing
    """Install instance as the module-level singleton returned by instance().

    @param instance: object to store in _the_instance, replacing any
            previous value.
    """
    global _the_instance
    _the_instance = instance