Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # |
| 3 | # Copyright (c) 2014 The Chromium OS Authors. All rights reserved. |
| 4 | # Use of this source code is governed by a BSD-style license that can be |
| 5 | # found in the LICENSE file. |
| 6 | |
| 7 | import datetime as datetime_base |
| 8 | import logging |
| 9 | from datetime import datetime |
| 10 | |
| 11 | import common |
| 12 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 13 | from autotest_lib.client.common_lib import global_config |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 14 | from autotest_lib.client.common_lib import time_utils |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 15 | from autotest_lib.server import utils |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 16 | from autotest_lib.server.cros.dynamic_suite import reporting_utils |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 17 | from autotest_lib.server.lib import status_history |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 18 | from autotest_lib.utils import labellib |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 19 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 20 | CONFIG = global_config.global_config |
| 21 | |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 22 | |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 23 | class BoardNotAvailableError(utils.TestLabException): |
| 24 | """Raised when a board is not available in the lab.""" |
| 25 | |
| 26 | |
| 27 | class NotEnoughDutsError(utils.TestLabException): |
| 28 | """Rasied when the lab doesn't have the minimum number of duts.""" |
| 29 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 30 | def __init__(self, board, pool, num_available, num_required, hosts): |
| 31 | """Initialize instance. |
| 32 | |
| 33 | Please pass arguments by keyword. |
| 34 | |
| 35 | @param board: Name of board. |
| 36 | @param pool: Name of pool. |
| 37 | @param num_available: Number of available hosts. |
| 38 | @param num_required: Number of hosts required. |
| 39 | @param hosts: Sequence of Host instances for given board and pool. |
| 40 | """ |
| 41 | self.board = board |
| 42 | self.pool = pool |
| 43 | self.num_available = num_available |
| 44 | self.num_required = num_required |
| 45 | self.hosts = hosts |
| 46 | self.bug_id = None |
| 47 | self.suite_name = None |
| 48 | self.build = None |
| 49 | |
| 50 | |
| 51 | def __repr__(self): |
| 52 | return ( |
| 53 | '<{cls} at 0x{id:x} with' |
| 54 | ' board={this.board!r},' |
| 55 | ' pool={this.pool!r},' |
| 56 | ' num_available={this.num_available!r},' |
| 57 | ' num_required={this.num_required!r},' |
| 58 | ' bug_id={this.bug_id!r},' |
| 59 | ' suite_name={this.suite_name!r},' |
| 60 | ' build={this.build!r}>' |
| 61 | .format(cls=type(self).__name__, id=id(self), this=self) |
| 62 | ) |
| 63 | |
| 64 | |
| 65 | def __str__(self): |
| 66 | msg_parts = [ |
| 67 | 'Not enough DUTs for board: {this.board}, pool: {this.pool};' |
| 68 | ' required: {this.num_required}, found: {this.num_available}' |
| 69 | ] |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 70 | format_dict = {'this': self} |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 71 | if self.bug_id is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 72 | msg_parts.append('bug: {bug_url}') |
| 73 | format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id) |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 74 | if self.suite_name is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 75 | msg_parts.append('suite: {this.suite_name}') |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 76 | if self.build is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 77 | msg_parts.append('build: {this.build}') |
| 78 | return ', '.join(msg_parts).format(**format_dict) |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 79 | |
| 80 | |
| 81 | def add_bug_id(self, bug_id): |
| 82 | """Add crbug id associated with this exception. |
| 83 | |
| 84 | @param bug_id crbug id whose str() value is used in a crbug URL. |
| 85 | """ |
| 86 | self.bug_id = bug_id |
| 87 | |
| 88 | |
| 89 | def add_suite_name(self, suite_name): |
| 90 | """Add name of test suite that needed the DUTs. |
| 91 | |
| 92 | @param suite_name Name of test suite. |
| 93 | """ |
| 94 | self.suite_name = suite_name |
| 95 | |
| 96 | |
| 97 | def add_build(self, build): |
| 98 | """Add name of build of job that needed the DUTs. |
| 99 | |
| 100 | @param build Name of build. |
| 101 | """ |
| 102 | self.build = build |
| 103 | |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 104 | |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 105 | class SimpleTimer(object): |
| 106 | """A simple timer used to periodically check if a deadline has passed.""" |
| 107 | |
| 108 | def _reset(self): |
| 109 | """Reset the deadline.""" |
| 110 | if not self.interval_hours or self.interval_hours < 0: |
| 111 | logging.error('Bad interval %s', self.interval_hours) |
| 112 | self.deadline = None |
| 113 | return |
| 114 | self.deadline = datetime.now() + datetime_base.timedelta( |
| 115 | hours=self.interval_hours) |
| 116 | |
| 117 | |
| 118 | def __init__(self, interval_hours=0.5): |
| 119 | """Initialize a simple periodic deadline timer. |
| 120 | |
| 121 | @param interval_hours: Interval of the deadline. |
| 122 | """ |
| 123 | self.interval_hours = interval_hours |
| 124 | self._reset() |
| 125 | |
| 126 | |
| 127 | def poll(self): |
| 128 | """Poll the timer to see if we've hit the deadline. |
| 129 | |
| 130 | This method resets the deadline if it has passed. If the deadline |
| 131 | hasn't been set, or the current time is less than the deadline, the |
| 132 | method returns False. |
| 133 | |
| 134 | @return: True if the deadline has passed, False otherwise. |
| 135 | """ |
| 136 | if not self.deadline or datetime.now() < self.deadline: |
| 137 | return False |
| 138 | self._reset() |
| 139 | return True |
| 140 | |
| 141 | |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 142 | class JobTimer(object): |
| 143 | """Utility class capable of measuring job timeouts. |
| 144 | """ |
| 145 | |
| 146 | # Format used in datetime - string conversion. |
| 147 | time_format = '%m-%d-%Y [%H:%M:%S]' |
| 148 | |
| 149 | def __init__(self, job_created_time, timeout_mins): |
| 150 | """JobTimer constructor. |
| 151 | |
| 152 | @param job_created_time: float representing the time a job was |
| 153 | created. Eg: time.time() |
| 154 | @param timeout_mins: float representing the timeout in minutes. |
| 155 | """ |
| 156 | self.job_created_time = datetime.fromtimestamp(job_created_time) |
| 157 | self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 158 | self.debug_output_timer = SimpleTimer(interval_hours=0.5) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 159 | self.past_halftime = False |
| 160 | |
| 161 | |
| 162 | @classmethod |
| 163 | def format_time(cls, datetime_obj): |
| 164 | """Get the string formatted version of the datetime object. |
| 165 | |
| 166 | @param datetime_obj: A datetime.datetime object. |
| 167 | Eg: datetime.datetime.now() |
| 168 | |
| 169 | @return: A formatted string containing the date/time of the |
| 170 | input datetime. |
| 171 | """ |
| 172 | return datetime_obj.strftime(cls.time_format) |
| 173 | |
| 174 | |
| 175 | def elapsed_time(self): |
| 176 | """Get the time elapsed since this job was created. |
| 177 | |
| 178 | @return: A timedelta object representing the elapsed time. |
| 179 | """ |
| 180 | return datetime.now() - self.job_created_time |
| 181 | |
| 182 | |
| 183 | def is_suite_timeout(self): |
| 184 | """Check if the suite timed out. |
| 185 | |
| 186 | @return: True if more than timeout_hours has elapsed since the suite job |
| 187 | was created. |
| 188 | """ |
| 189 | if self.elapsed_time() >= self.timeout_hours: |
| 190 | logging.info('Suite timed out. Started on %s, timed out on %s', |
| 191 | self.format_time(self.job_created_time), |
| 192 | self.format_time(datetime.now())) |
| 193 | return True |
| 194 | return False |
| 195 | |
| 196 | |
| 197 | def first_past_halftime(self): |
| 198 | """Check if we just crossed half time. |
| 199 | |
| 200 | This method will only return True once, the first time it is called |
| 201 | after a job's elapsed time is past half its timeout. |
| 202 | |
| 203 | @return True: If this is the first call of the method after halftime. |
| 204 | """ |
| 205 | if (not self.past_halftime and |
| 206 | self.elapsed_time() > self.timeout_hours/2): |
| 207 | self.past_halftime = True |
| 208 | return True |
| 209 | return False |
| 210 | |
| 211 | |
| 212 | class RPCHelper(object): |
| 213 | """A class to help diagnose a suite run through the rpc interface. |
| 214 | """ |
| 215 | |
| 216 | def __init__(self, rpc_interface): |
| 217 | """Constructor for rpc helper class. |
| 218 | |
| 219 | @param rpc_interface: An rpc object, eg: A RetryingAFE instance. |
| 220 | """ |
| 221 | self.rpc_interface = rpc_interface |
| 222 | |
| 223 | |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 224 | def diagnose_pool(self, board, pool, time_delta_hours, limit=10): |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 225 | """Log diagnostic information about a timeout for a board/pool. |
| 226 | |
| 227 | @param board: The board for which the current suite was run. |
| 228 | @param pool: The pool against which the current suite was run. |
| 229 | @param time_delta_hours: The time from which we should log information. |
| 230 | This is a datetime.timedelta object, as stored by the JobTimer. |
| 231 | @param limit: The maximum number of jobs per host, to log. |
| 232 | |
| 233 | @raises proxy.JSONRPCException: For exceptions thrown across the wire. |
| 234 | """ |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 235 | end_time = datetime.now() |
| 236 | start_time = end_time - time_delta_hours |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 237 | labels = labellib.LabelsMapping() |
| 238 | labels['board'] = board |
| 239 | labels['pool'] = pool |
| 240 | host_histories = status_history.HostJobHistory.get_multiple_histories( |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 241 | self.rpc_interface, |
| 242 | time_utils.to_epoch_time(start_time), |
| 243 | time_utils.to_epoch_time(end_time), |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 244 | labels.getlabels(), |
| 245 | ) |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 246 | if not host_histories: |
| 247 | logging.error('No hosts found for board:%s in pool:%s', |
Alex Miller | 8bf5020 | 2014-06-02 04:10:51 -0700 | [diff] [blame] | 248 | board, pool) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 249 | return |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 250 | status_map = { |
| 251 | status_history.UNUSED: 'Unused', |
| 252 | status_history.UNKNOWN: 'No job history', |
| 253 | status_history.WORKING: 'Working', |
| 254 | status_history.BROKEN: 'Failed repair' |
| 255 | } |
| 256 | for history in host_histories: |
| 257 | count = 0 |
| 258 | job_info ='' |
| 259 | for job in history: |
| 260 | start_time = ( |
| 261 | time_utils.epoch_time_to_date_string(job.start_time)) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 262 | job_info += ('%s %s started on: %s status %s\n' % |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 263 | (job.id, job.name, start_time, job.job_status)) |
| 264 | count += 1 |
| 265 | if count >= limit: |
| 266 | break |
| 267 | host = history.host |
| 268 | logging.error('host: %s, status: %s, locked: %s ' |
| 269 | 'diagnosis: %s\n' |
| 270 | 'labels: %s\nLast %s jobs within %s:\n' |
| 271 | '%s', |
| 272 | history.hostname, host.status, host.locked, |
| 273 | status_map[history.last_diagnosis()[0]], |
| 274 | host.labels, limit, time_delta_hours, |
| 275 | job_info) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 276 | |
| 277 | |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 278 | def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False): |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 279 | """Check if DUT availability for a given board and pool is less than |
| 280 | minimum. |
| 281 | |
| 282 | @param board: The board to check DUT availability. |
| 283 | @param pool: The pool to check DUT availability. |
Dan Shi | 8136337 | 2014-06-03 22:27:37 -0700 | [diff] [blame] | 284 | @param minimum_duts: Minimum Number of available machines required to |
| 285 | run the suite. Default is set to 0, which means do |
| 286 | not force the check of available machines before |
| 287 | running the suite. |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 288 | @param skip_duts_check: If True, skip minimum available DUTs check. |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 289 | @raise: NotEnoughDutsError if DUT availability is lower than minimum. |
| 290 | @raise: BoardNotAvailableError if no host found for requested |
| 291 | board/pool. |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 292 | """ |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 293 | if minimum_duts == 0: |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 294 | return |
| 295 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 296 | # TODO(ayatane): Replace label prefixes with constants in |
Xixuan Wu | 93e646c | 2017-12-07 18:36:10 -0800 | [diff] [blame] | 297 | # autotest_lib.server.constants |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 298 | hosts = self.rpc_interface.get_hosts( |
| 299 | invalid=False, |
| 300 | multiple_labels=('pool:%s' % pool, 'board:%s' % board)) |
| 301 | if not hosts: |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 302 | raise BoardNotAvailableError( |
Shuqian Zhao | ade6e7d | 2015-12-07 18:01:11 -0800 | [diff] [blame] | 303 | 'No hosts found for board:%s in pool:%s. The test lab ' |
| 304 | 'currently does not cover test for this board and pool.'% |
Alex Miller | 8bf5020 | 2014-06-02 04:10:51 -0700 | [diff] [blame] | 305 | (board, pool)) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 306 | |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 307 | if skip_duts_check: |
| 308 | # Bypass minimum avilable DUTs check |
| 309 | logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.') |
| 310 | return |
| 311 | |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 312 | if len(hosts) < minimum_duts: |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 313 | logging.debug('The total number of DUTs for %s in pool:%s is %d, ' |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 314 | 'which is less than %d, the required minimum number of' |
| 315 | ' available DUTS', board, pool, len(hosts), |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 316 | minimum_duts) |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 317 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 318 | available_hosts = 0 |
Prashanth Balasubramanian | a6c03be | 2014-10-20 18:47:24 -0700 | [diff] [blame] | 319 | for host in hosts: |
Allen Li | f74957d | 2017-11-20 17:46:48 -0800 | [diff] [blame] | 320 | if host.is_available(): |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 321 | available_hosts += 1 |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 322 | logging.debug('%d of %d DUTs are available for board %s pool %s.', |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 323 | available_hosts, len(hosts), board, pool) |
| 324 | if available_hosts < minimum_duts: |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 325 | raise NotEnoughDutsError( |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 326 | board=board, |
| 327 | pool=pool, |
| 328 | num_available=available_hosts, |
| 329 | num_required=minimum_duts, |
| 330 | hosts=hosts) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 331 | |
| 332 | |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 333 | def diagnose_job(self, job_id, instance_server): |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 334 | """Diagnose a suite job. |
| 335 | |
| 336 | Logs information about the jobs that are still to run in the suite. |
| 337 | |
| 338 | @param job_id: The id of the suite job to get information about. |
| 339 | No meaningful information gets logged if the id is for a sub-job. |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 340 | @param instance_server: The instance server. |
| 341 | Eg: cautotest, cautotest-cq, localhost. |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 342 | """ |
| 343 | incomplete_jobs = self.rpc_interface.get_jobs( |
| 344 | parent_job_id=job_id, summary=True, |
| 345 | hostqueueentry__complete=False) |
| 346 | if incomplete_jobs: |
| 347 | logging.info('\n%s printing summary of incomplete jobs (%s):\n', |
| 348 | JobTimer.format_time(datetime.now()), |
| 349 | len(incomplete_jobs)) |
| 350 | for job in incomplete_jobs: |
| 351 | logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 352 | reporting_utils.link_job(job.id, instance_server)) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 353 | else: |
| 354 | logging.info('All jobs in suite have already completed.') |