Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # |
| 3 | # Copyright (c) 2014 The Chromium OS Authors. All rights reserved. |
| 4 | # Use of this source code is governed by a BSD-style license that can be |
| 5 | # found in the LICENSE file. |
| 6 | |
| 7 | import datetime as datetime_base |
| 8 | import logging |
| 9 | from datetime import datetime |
| 10 | |
| 11 | import common |
| 12 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 13 | from autotest_lib.client.common_lib import global_config |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 14 | from autotest_lib.client.common_lib import time_utils |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 15 | from autotest_lib.server import utils |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 16 | from autotest_lib.server.cros.dynamic_suite import reporting_utils |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 17 | from autotest_lib.server.lib import status_history |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 18 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 19 | CONFIG = global_config.global_config |
| 20 | |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 21 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 22 | class DUTsNotAvailableError(utils.TestLabException): |
| 23 | """Raised when a DUT label combination is not available in the lab.""" |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 24 | |
| 25 | |
| 26 | class NotEnoughDutsError(utils.TestLabException): |
| 27 | """Rasied when the lab doesn't have the minimum number of duts.""" |
| 28 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 29 | def __init__(self, labels, num_available, num_required, hosts): |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 30 | """Initialize instance. |
| 31 | |
| 32 | Please pass arguments by keyword. |
| 33 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 34 | @param labels: Labels required, including board an pool labels. |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 35 | @param num_available: Number of available hosts. |
| 36 | @param num_required: Number of hosts required. |
| 37 | @param hosts: Sequence of Host instances for given board and pool. |
| 38 | """ |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 39 | self.labels = labels |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 40 | self.num_available = num_available |
| 41 | self.num_required = num_required |
| 42 | self.hosts = hosts |
| 43 | self.bug_id = None |
| 44 | self.suite_name = None |
| 45 | self.build = None |
| 46 | |
| 47 | |
| 48 | def __repr__(self): |
| 49 | return ( |
| 50 | '<{cls} at 0x{id:x} with' |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 51 | ' labels={this.labels!r},' |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 52 | ' num_available={this.num_available!r},' |
| 53 | ' num_required={this.num_required!r},' |
| 54 | ' bug_id={this.bug_id!r},' |
| 55 | ' suite_name={this.suite_name!r},' |
| 56 | ' build={this.build!r}>' |
| 57 | .format(cls=type(self).__name__, id=id(self), this=self) |
| 58 | ) |
| 59 | |
| 60 | |
| 61 | def __str__(self): |
| 62 | msg_parts = [ |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 63 | 'Not enough DUTs for requirements: {this.labels};' |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 64 | ' required: {this.num_required}, found: {this.num_available}' |
| 65 | ] |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 66 | format_dict = {'this': self} |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 67 | if self.bug_id is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 68 | msg_parts.append('bug: {bug_url}') |
| 69 | format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id) |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 70 | if self.suite_name is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 71 | msg_parts.append('suite: {this.suite_name}') |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 72 | if self.build is not None: |
Allen Li | dc2c69a | 2016-09-14 19:05:47 -0700 | [diff] [blame] | 73 | msg_parts.append('build: {this.build}') |
| 74 | return ', '.join(msg_parts).format(**format_dict) |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 75 | |
| 76 | |
| 77 | def add_bug_id(self, bug_id): |
| 78 | """Add crbug id associated with this exception. |
| 79 | |
| 80 | @param bug_id crbug id whose str() value is used in a crbug URL. |
| 81 | """ |
| 82 | self.bug_id = bug_id |
| 83 | |
| 84 | |
| 85 | def add_suite_name(self, suite_name): |
| 86 | """Add name of test suite that needed the DUTs. |
| 87 | |
| 88 | @param suite_name Name of test suite. |
| 89 | """ |
| 90 | self.suite_name = suite_name |
| 91 | |
| 92 | |
| 93 | def add_build(self, build): |
| 94 | """Add name of build of job that needed the DUTs. |
| 95 | |
| 96 | @param build Name of build. |
| 97 | """ |
| 98 | self.build = build |
| 99 | |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 100 | |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 101 | class SimpleTimer(object): |
| 102 | """A simple timer used to periodically check if a deadline has passed.""" |
| 103 | |
| 104 | def _reset(self): |
| 105 | """Reset the deadline.""" |
| 106 | if not self.interval_hours or self.interval_hours < 0: |
| 107 | logging.error('Bad interval %s', self.interval_hours) |
| 108 | self.deadline = None |
| 109 | return |
| 110 | self.deadline = datetime.now() + datetime_base.timedelta( |
| 111 | hours=self.interval_hours) |
| 112 | |
| 113 | |
| 114 | def __init__(self, interval_hours=0.5): |
| 115 | """Initialize a simple periodic deadline timer. |
| 116 | |
| 117 | @param interval_hours: Interval of the deadline. |
| 118 | """ |
| 119 | self.interval_hours = interval_hours |
| 120 | self._reset() |
| 121 | |
| 122 | |
| 123 | def poll(self): |
| 124 | """Poll the timer to see if we've hit the deadline. |
| 125 | |
| 126 | This method resets the deadline if it has passed. If the deadline |
| 127 | hasn't been set, or the current time is less than the deadline, the |
| 128 | method returns False. |
| 129 | |
| 130 | @return: True if the deadline has passed, False otherwise. |
| 131 | """ |
| 132 | if not self.deadline or datetime.now() < self.deadline: |
| 133 | return False |
| 134 | self._reset() |
| 135 | return True |
| 136 | |
| 137 | |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 138 | class JobTimer(object): |
| 139 | """Utility class capable of measuring job timeouts. |
| 140 | """ |
| 141 | |
| 142 | # Format used in datetime - string conversion. |
| 143 | time_format = '%m-%d-%Y [%H:%M:%S]' |
| 144 | |
| 145 | def __init__(self, job_created_time, timeout_mins): |
| 146 | """JobTimer constructor. |
| 147 | |
| 148 | @param job_created_time: float representing the time a job was |
| 149 | created. Eg: time.time() |
| 150 | @param timeout_mins: float representing the timeout in minutes. |
| 151 | """ |
| 152 | self.job_created_time = datetime.fromtimestamp(job_created_time) |
| 153 | self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 154 | self.debug_output_timer = SimpleTimer(interval_hours=0.5) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 155 | self.past_halftime = False |
| 156 | |
| 157 | |
| 158 | @classmethod |
| 159 | def format_time(cls, datetime_obj): |
| 160 | """Get the string formatted version of the datetime object. |
| 161 | |
| 162 | @param datetime_obj: A datetime.datetime object. |
| 163 | Eg: datetime.datetime.now() |
| 164 | |
| 165 | @return: A formatted string containing the date/time of the |
| 166 | input datetime. |
| 167 | """ |
| 168 | return datetime_obj.strftime(cls.time_format) |
| 169 | |
| 170 | |
| 171 | def elapsed_time(self): |
| 172 | """Get the time elapsed since this job was created. |
| 173 | |
| 174 | @return: A timedelta object representing the elapsed time. |
| 175 | """ |
| 176 | return datetime.now() - self.job_created_time |
| 177 | |
| 178 | |
| 179 | def is_suite_timeout(self): |
| 180 | """Check if the suite timed out. |
| 181 | |
| 182 | @return: True if more than timeout_hours has elapsed since the suite job |
| 183 | was created. |
| 184 | """ |
| 185 | if self.elapsed_time() >= self.timeout_hours: |
| 186 | logging.info('Suite timed out. Started on %s, timed out on %s', |
| 187 | self.format_time(self.job_created_time), |
| 188 | self.format_time(datetime.now())) |
| 189 | return True |
| 190 | return False |
| 191 | |
| 192 | |
| 193 | def first_past_halftime(self): |
| 194 | """Check if we just crossed half time. |
| 195 | |
| 196 | This method will only return True once, the first time it is called |
| 197 | after a job's elapsed time is past half its timeout. |
| 198 | |
| 199 | @return True: If this is the first call of the method after halftime. |
| 200 | """ |
| 201 | if (not self.past_halftime and |
| 202 | self.elapsed_time() > self.timeout_hours/2): |
| 203 | self.past_halftime = True |
| 204 | return True |
| 205 | return False |
| 206 | |
| 207 | |
| 208 | class RPCHelper(object): |
| 209 | """A class to help diagnose a suite run through the rpc interface. |
| 210 | """ |
| 211 | |
| 212 | def __init__(self, rpc_interface): |
| 213 | """Constructor for rpc helper class. |
| 214 | |
| 215 | @param rpc_interface: An rpc object, eg: A RetryingAFE instance. |
| 216 | """ |
| 217 | self.rpc_interface = rpc_interface |
| 218 | |
| 219 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 220 | def diagnose_pool(self, labels, time_delta_hours, limit=10): |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 221 | """Log diagnostic information about a timeout for a board/pool. |
| 222 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 223 | @param labels: DUT label dependencies, including board and pool |
| 224 | labels. |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 225 | @param time_delta_hours: The time from which we should log information. |
| 226 | This is a datetime.timedelta object, as stored by the JobTimer. |
| 227 | @param limit: The maximum number of jobs per host, to log. |
| 228 | |
| 229 | @raises proxy.JSONRPCException: For exceptions thrown across the wire. |
| 230 | """ |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 231 | end_time = datetime.now() |
| 232 | start_time = end_time - time_delta_hours |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 233 | host_histories = status_history.HostJobHistory.get_multiple_histories( |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 234 | self.rpc_interface, |
| 235 | time_utils.to_epoch_time(start_time), |
| 236 | time_utils.to_epoch_time(end_time), |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 237 | labels, |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 238 | ) |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 239 | if not host_histories: |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 240 | logging.error('No hosts found for labels %r', labels) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 241 | return |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 242 | status_map = { |
| 243 | status_history.UNUSED: 'Unused', |
| 244 | status_history.UNKNOWN: 'No job history', |
| 245 | status_history.WORKING: 'Working', |
| 246 | status_history.BROKEN: 'Failed repair' |
| 247 | } |
| 248 | for history in host_histories: |
| 249 | count = 0 |
| 250 | job_info ='' |
| 251 | for job in history: |
| 252 | start_time = ( |
| 253 | time_utils.epoch_time_to_date_string(job.start_time)) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 254 | job_info += ('%s %s started on: %s status %s\n' % |
J. Richard Barnette | af1e826 | 2016-03-04 12:55:11 -0800 | [diff] [blame] | 255 | (job.id, job.name, start_time, job.job_status)) |
| 256 | count += 1 |
| 257 | if count >= limit: |
| 258 | break |
| 259 | host = history.host |
| 260 | logging.error('host: %s, status: %s, locked: %s ' |
| 261 | 'diagnosis: %s\n' |
| 262 | 'labels: %s\nLast %s jobs within %s:\n' |
| 263 | '%s', |
| 264 | history.hostname, host.status, host.locked, |
| 265 | status_map[history.last_diagnosis()[0]], |
| 266 | host.labels, limit, time_delta_hours, |
| 267 | job_info) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 268 | |
| 269 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 270 | def check_dut_availability(self, labels, minimum_duts=0, |
| 271 | skip_duts_check=False): |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 272 | """Check if DUT availability for a given board and pool is less than |
| 273 | minimum. |
| 274 | |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 275 | @param labels: DUT label dependencies, including board and pool |
| 276 | labels. |
Dan Shi | 8136337 | 2014-06-03 22:27:37 -0700 | [diff] [blame] | 277 | @param minimum_duts: Minimum Number of available machines required to |
| 278 | run the suite. Default is set to 0, which means do |
| 279 | not force the check of available machines before |
| 280 | running the suite. |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 281 | @param skip_duts_check: If True, skip minimum available DUTs check. |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 282 | @raise: NotEnoughDutsError if DUT availability is lower than minimum. |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 283 | @raise: DUTsNotAvailableError if no host found for requested |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 284 | board/pool. |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 285 | """ |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 286 | if minimum_duts == 0: |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 287 | return |
| 288 | |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 289 | hosts = self.rpc_interface.get_hosts( |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 290 | invalid=False, multiple_labels=labels) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 291 | if not hosts: |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 292 | raise DUTsNotAvailableError( |
| 293 | 'No hosts found for labels %r. The test lab ' |
| 294 | 'currently does not cover test for those DUTs.' % |
| 295 | (labels,)) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 296 | |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 297 | if skip_duts_check: |
| 298 | # Bypass minimum avilable DUTs check |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 299 | logging.debug('skip_duts_check is on, do not enforce minimum ' |
| 300 | 'DUTs check.') |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 301 | return |
| 302 | |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 303 | if len(hosts) < minimum_duts: |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 304 | logging.debug('The total number of DUTs for %r is %d, ' |
| 305 | 'which is less than %d, the required minimum ' |
| 306 | 'number of available DUTS', labels, len(hosts), |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 307 | minimum_duts) |
Ningning Xia | f2c206c | 2016-04-13 14:15:51 -0700 | [diff] [blame] | 308 | |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 309 | available_hosts = 0 |
Prashanth Balasubramanian | a6c03be | 2014-10-20 18:47:24 -0700 | [diff] [blame] | 310 | for host in hosts: |
Allen Li | f74957d | 2017-11-20 17:46:48 -0800 | [diff] [blame] | 311 | if host.is_available(): |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 312 | available_hosts += 1 |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 313 | logging.debug('%d of %d DUTs are available for %r.', |
| 314 | available_hosts, len(hosts), labels) |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 315 | if available_hosts < minimum_duts: |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 316 | raise NotEnoughDutsError( |
Richard Barnette | ae9eaa4 | 2018-09-12 10:28:07 -0700 | [diff] [blame] | 317 | labels=labels, |
Allen Li | 6a61239 | 2016-08-18 12:09:32 -0700 | [diff] [blame] | 318 | num_available=available_hosts, |
| 319 | num_required=minimum_duts, |
| 320 | hosts=hosts) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 321 | |
| 322 | |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 323 | def diagnose_job(self, job_id, instance_server): |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 324 | """Diagnose a suite job. |
| 325 | |
| 326 | Logs information about the jobs that are still to run in the suite. |
| 327 | |
| 328 | @param job_id: The id of the suite job to get information about. |
| 329 | No meaningful information gets logged if the id is for a sub-job. |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 330 | @param instance_server: The instance server. |
| 331 | Eg: cautotest, cautotest-cq, localhost. |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 332 | """ |
| 333 | incomplete_jobs = self.rpc_interface.get_jobs( |
| 334 | parent_job_id=job_id, summary=True, |
| 335 | hostqueueentry__complete=False) |
| 336 | if incomplete_jobs: |
| 337 | logging.info('\n%s printing summary of incomplete jobs (%s):\n', |
| 338 | JobTimer.format_time(datetime.now()), |
| 339 | len(incomplete_jobs)) |
| 340 | for job in incomplete_jobs: |
| 341 | logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 342 | reporting_utils.link_job(job.id, instance_server)) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 343 | else: |
| 344 | logging.info('All jobs in suite have already completed.') |