Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # |
| 3 | # Copyright (c) 2014 The Chromium OS Authors. All rights reserved. |
| 4 | # Use of this source code is governed by a BSD-style license that can be |
| 5 | # found in the LICENSE file. |
| 6 | |
| 7 | import datetime as datetime_base |
| 8 | import logging |
| 9 | from datetime import datetime |
| 10 | |
| 11 | import common |
| 12 | |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 13 | from autotest_lib.server import utils |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 14 | from autotest_lib.server.cros.dynamic_suite import reporting_utils |
| 15 | |
| 16 | |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 17 | class BoardNotAvailableError(utils.TestLabException): |
| 18 | """Raised when a board is not available in the lab.""" |
| 19 | |
| 20 | |
| 21 | class NotEnoughDutsError(utils.TestLabException): |
| 22 | """Rasied when the lab doesn't have the minimum number of duts.""" |
| 23 | |
| 24 | |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 25 | class SimpleTimer(object): |
| 26 | """A simple timer used to periodically check if a deadline has passed.""" |
| 27 | |
| 28 | def _reset(self): |
| 29 | """Reset the deadline.""" |
| 30 | if not self.interval_hours or self.interval_hours < 0: |
| 31 | logging.error('Bad interval %s', self.interval_hours) |
| 32 | self.deadline = None |
| 33 | return |
| 34 | self.deadline = datetime.now() + datetime_base.timedelta( |
| 35 | hours=self.interval_hours) |
| 36 | |
| 37 | |
| 38 | def __init__(self, interval_hours=0.5): |
| 39 | """Initialize a simple periodic deadline timer. |
| 40 | |
| 41 | @param interval_hours: Interval of the deadline. |
| 42 | """ |
| 43 | self.interval_hours = interval_hours |
| 44 | self._reset() |
| 45 | |
| 46 | |
| 47 | def poll(self): |
| 48 | """Poll the timer to see if we've hit the deadline. |
| 49 | |
| 50 | This method resets the deadline if it has passed. If the deadline |
| 51 | hasn't been set, or the current time is less than the deadline, the |
| 52 | method returns False. |
| 53 | |
| 54 | @return: True if the deadline has passed, False otherwise. |
| 55 | """ |
| 56 | if not self.deadline or datetime.now() < self.deadline: |
| 57 | return False |
| 58 | self._reset() |
| 59 | return True |
| 60 | |
| 61 | |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 62 | class JobTimer(object): |
| 63 | """Utility class capable of measuring job timeouts. |
| 64 | """ |
| 65 | |
| 66 | # Format used in datetime - string conversion. |
| 67 | time_format = '%m-%d-%Y [%H:%M:%S]' |
| 68 | |
| 69 | def __init__(self, job_created_time, timeout_mins): |
| 70 | """JobTimer constructor. |
| 71 | |
| 72 | @param job_created_time: float representing the time a job was |
| 73 | created. Eg: time.time() |
| 74 | @param timeout_mins: float representing the timeout in minutes. |
| 75 | """ |
| 76 | self.job_created_time = datetime.fromtimestamp(job_created_time) |
| 77 | self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) |
Prashanth B | a7be207 | 2014-07-15 15:03:21 -0700 | [diff] [blame] | 78 | self.debug_output_timer = SimpleTimer(interval_hours=0.5) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 79 | self.past_halftime = False |
| 80 | |
| 81 | |
| 82 | @classmethod |
| 83 | def format_time(cls, datetime_obj): |
| 84 | """Get the string formatted version of the datetime object. |
| 85 | |
| 86 | @param datetime_obj: A datetime.datetime object. |
| 87 | Eg: datetime.datetime.now() |
| 88 | |
| 89 | @return: A formatted string containing the date/time of the |
| 90 | input datetime. |
| 91 | """ |
| 92 | return datetime_obj.strftime(cls.time_format) |
| 93 | |
| 94 | |
| 95 | def elapsed_time(self): |
| 96 | """Get the time elapsed since this job was created. |
| 97 | |
| 98 | @return: A timedelta object representing the elapsed time. |
| 99 | """ |
| 100 | return datetime.now() - self.job_created_time |
| 101 | |
| 102 | |
| 103 | def is_suite_timeout(self): |
| 104 | """Check if the suite timed out. |
| 105 | |
| 106 | @return: True if more than timeout_hours has elapsed since the suite job |
| 107 | was created. |
| 108 | """ |
| 109 | if self.elapsed_time() >= self.timeout_hours: |
| 110 | logging.info('Suite timed out. Started on %s, timed out on %s', |
| 111 | self.format_time(self.job_created_time), |
| 112 | self.format_time(datetime.now())) |
| 113 | return True |
| 114 | return False |
| 115 | |
| 116 | |
| 117 | def first_past_halftime(self): |
| 118 | """Check if we just crossed half time. |
| 119 | |
| 120 | This method will only return True once, the first time it is called |
| 121 | after a job's elapsed time is past half its timeout. |
| 122 | |
| 123 | @return True: If this is the first call of the method after halftime. |
| 124 | """ |
| 125 | if (not self.past_halftime and |
| 126 | self.elapsed_time() > self.timeout_hours/2): |
| 127 | self.past_halftime = True |
| 128 | return True |
| 129 | return False |
| 130 | |
| 131 | |
| 132 | class RPCHelper(object): |
| 133 | """A class to help diagnose a suite run through the rpc interface. |
| 134 | """ |
| 135 | |
| 136 | def __init__(self, rpc_interface): |
| 137 | """Constructor for rpc helper class. |
| 138 | |
| 139 | @param rpc_interface: An rpc object, eg: A RetryingAFE instance. |
| 140 | """ |
| 141 | self.rpc_interface = rpc_interface |
| 142 | |
| 143 | |
| 144 | def diagnose_pool(self, board, pool, time_delta_hours, limit=5): |
| 145 | """Log diagnostic information about a timeout for a board/pool. |
| 146 | |
| 147 | @param board: The board for which the current suite was run. |
| 148 | @param pool: The pool against which the current suite was run. |
| 149 | @param time_delta_hours: The time from which we should log information. |
| 150 | This is a datetime.timedelta object, as stored by the JobTimer. |
| 151 | @param limit: The maximum number of jobs per host, to log. |
| 152 | |
| 153 | @raises proxy.JSONRPCException: For exceptions thrown across the wire. |
| 154 | """ |
| 155 | hosts = self.rpc_interface.get_hosts( |
| 156 | multiple_labels=('pool:%s' % pool, 'board:%s' % board)) |
| 157 | if not hosts: |
Alex Miller | 8bf5020 | 2014-06-02 04:10:51 -0700 | [diff] [blame] | 158 | logging.warning('No hosts found for board:%s in pool:%s', |
| 159 | board, pool) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 160 | return |
| 161 | cutoff = datetime.now() - time_delta_hours |
| 162 | for host in hosts: |
| 163 | jobs = self.rpc_interface.get_host_queue_entries( |
| 164 | host__id=host.id, started_on__gte=str(cutoff)) |
| 165 | job_info = '' |
| 166 | for job in jobs[-limit:]: |
| 167 | job_info += ('%s %s started on: %s status %s\n' % |
| 168 | (job.id, job.job.name, job.started_on, job.status)) |
| 169 | logging.error('host:%s, status:%s, locked: %s' |
| 170 | '\nlabels: %s\nLast %s jobs within %s:\n%s', |
| 171 | host.hostname, host.status, host.locked, host.labels, |
| 172 | limit, time_delta_hours, job_info) |
| 173 | |
| 174 | |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 175 | def check_dut_availability(self, board, pool, minimum_duts=0): |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 176 | """Check if DUT availability for a given board and pool is less than |
| 177 | minimum. |
| 178 | |
| 179 | @param board: The board to check DUT availability. |
| 180 | @param pool: The pool to check DUT availability. |
Dan Shi | 8136337 | 2014-06-03 22:27:37 -0700 | [diff] [blame] | 181 | @param minimum_duts: Minimum Number of available machines required to |
| 182 | run the suite. Default is set to 0, which means do |
| 183 | not force the check of available machines before |
| 184 | running the suite. |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 185 | @raise: NotEnoughDutsError if DUT availability is lower than minimum. |
| 186 | @raise: BoardNotAvailableError if no host found for requested |
| 187 | board/pool. |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 188 | """ |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 189 | if minimum_duts == 0: |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 190 | return |
| 191 | |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 192 | hosts = self.rpc_interface.get_hosts( |
| 193 | invalid=False, |
| 194 | multiple_labels=('pool:%s' % pool, 'board:%s' % board)) |
| 195 | if not hosts: |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 196 | raise BoardNotAvailableError( |
Alex Miller | 8bf5020 | 2014-06-02 04:10:51 -0700 | [diff] [blame] | 197 | 'No hosts found for board:%s in pool:%s' % |
| 198 | (board, pool)) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 199 | |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 200 | if len(hosts) < minimum_duts: |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 201 | logging.debug('The total number of DUTs for %s in pool:%s is %d, ' |
| 202 | 'which is no more than the required minimum number of' |
| 203 | ' available DUTS of %d. Minimum available DUT rule is' |
| 204 | ' not enforced.', board, pool, len(hosts), |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 205 | minimum_duts) |
Dan Shi | 94234cb | 2014-05-23 20:04:31 -0700 | [diff] [blame] | 206 | return |
| 207 | |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 208 | # TODO(dshi): Replace the hard coded string with enum value, |
| 209 | # models.Host.Status.REPAIRING and REPAIR_FAILED |
| 210 | # setup_django_environment can't be imported now as paygen server does |
| 211 | # not have django package. |
Dan Shi | f1f77a5 | 2014-10-08 14:34:34 -0700 | [diff] [blame] | 212 | bad_statuses = ('Repair Failed', 'Repairing', 'Verifying') |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 213 | available_hosts = [host for host in hosts |
Dan Shi | f1f77a5 | 2014-10-08 14:34:34 -0700 | [diff] [blame] | 214 | if not host.status in bad_statuses and |
| 215 | not host.locked] |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 216 | logging.debug('%d of %d DUTs are available for board %s pool %s.', |
| 217 | len(available_hosts), len(hosts), board, pool) |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 218 | if len(available_hosts) < minimum_duts: |
Fang Deng | 6197da3 | 2014-09-25 10:18:48 -0700 | [diff] [blame] | 219 | raise NotEnoughDutsError( |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 220 | 'Number of available DUTs for board %s pool %s is %d, which' |
| 221 | ' is less than the minimum value %d.' % |
Dan Shi | 8de6d1b | 2014-06-12 09:10:37 -0700 | [diff] [blame] | 222 | (board, pool, len(available_hosts), minimum_duts)) |
Dan Shi | 20952c1 | 2014-05-14 17:07:38 -0700 | [diff] [blame] | 223 | |
| 224 | |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 225 | def diagnose_job(self, job_id, instance_server): |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 226 | """Diagnose a suite job. |
| 227 | |
| 228 | Logs information about the jobs that are still to run in the suite. |
| 229 | |
| 230 | @param job_id: The id of the suite job to get information about. |
| 231 | No meaningful information gets logged if the id is for a sub-job. |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 232 | @param instance_server: The instance server. |
| 233 | Eg: cautotest, cautotest-cq, localhost. |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 234 | """ |
| 235 | incomplete_jobs = self.rpc_interface.get_jobs( |
| 236 | parent_job_id=job_id, summary=True, |
| 237 | hostqueueentry__complete=False) |
| 238 | if incomplete_jobs: |
| 239 | logging.info('\n%s printing summary of incomplete jobs (%s):\n', |
| 240 | JobTimer.format_time(datetime.now()), |
| 241 | len(incomplete_jobs)) |
| 242 | for job in incomplete_jobs: |
| 243 | logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], |
MK Ryu | 4790eec | 2014-07-31 11:39:02 -0700 | [diff] [blame] | 244 | reporting_utils.link_job(job.id, instance_server)) |
Prashanth B | 923ca26 | 2014-03-14 12:36:29 -0700 | [diff] [blame] | 245 | else: |
| 246 | logging.info('All jobs in suite have already completed.') |