blob: 25abb0622707f767bd6b920219929b648bd9146c [file] [log] [blame]
Prashanth B923ca262014-03-14 12:36:29 -07001#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -080013from autotest_lib.client.common_lib import time_utils
Dan Shi20952c12014-05-14 17:07:38 -070014from autotest_lib.server import utils
Prashanth Balasubramaniana6c03be2014-10-20 18:47:24 -070015from autotest_lib.server.cros.dynamic_suite import reporting
Prashanth B923ca262014-03-14 12:36:29 -070016from autotest_lib.server.cros.dynamic_suite import reporting_utils
Aviv Keshet7ee95862016-08-30 15:18:27 -070017from autotest_lib.server.lib import status_history
Prashanth B923ca262014-03-14 12:36:29 -070018
19
Fang Deng6197da32014-09-25 10:18:48 -070020class BoardNotAvailableError(utils.TestLabException):
21 """Raised when a board is not available in the lab."""
22
23
24class NotEnoughDutsError(utils.TestLabException):
25 """Rasied when the lab doesn't have the minimum number of duts."""
26
27
Prashanth Ba7be2072014-07-15 15:03:21 -070028class SimpleTimer(object):
29 """A simple timer used to periodically check if a deadline has passed."""
30
31 def _reset(self):
32 """Reset the deadline."""
33 if not self.interval_hours or self.interval_hours < 0:
34 logging.error('Bad interval %s', self.interval_hours)
35 self.deadline = None
36 return
37 self.deadline = datetime.now() + datetime_base.timedelta(
38 hours=self.interval_hours)
39
40
41 def __init__(self, interval_hours=0.5):
42 """Initialize a simple periodic deadline timer.
43
44 @param interval_hours: Interval of the deadline.
45 """
46 self.interval_hours = interval_hours
47 self._reset()
48
49
50 def poll(self):
51 """Poll the timer to see if we've hit the deadline.
52
53 This method resets the deadline if it has passed. If the deadline
54 hasn't been set, or the current time is less than the deadline, the
55 method returns False.
56
57 @return: True if the deadline has passed, False otherwise.
58 """
59 if not self.deadline or datetime.now() < self.deadline:
60 return False
61 self._reset()
62 return True
63
64
Prashanth B923ca262014-03-14 12:36:29 -070065class JobTimer(object):
66 """Utility class capable of measuring job timeouts.
67 """
68
69 # Format used in datetime - string conversion.
70 time_format = '%m-%d-%Y [%H:%M:%S]'
71
72 def __init__(self, job_created_time, timeout_mins):
73 """JobTimer constructor.
74
75 @param job_created_time: float representing the time a job was
76 created. Eg: time.time()
77 @param timeout_mins: float representing the timeout in minutes.
78 """
79 self.job_created_time = datetime.fromtimestamp(job_created_time)
80 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
Prashanth Ba7be2072014-07-15 15:03:21 -070081 self.debug_output_timer = SimpleTimer(interval_hours=0.5)
Prashanth B923ca262014-03-14 12:36:29 -070082 self.past_halftime = False
83
84
85 @classmethod
86 def format_time(cls, datetime_obj):
87 """Get the string formatted version of the datetime object.
88
89 @param datetime_obj: A datetime.datetime object.
90 Eg: datetime.datetime.now()
91
92 @return: A formatted string containing the date/time of the
93 input datetime.
94 """
95 return datetime_obj.strftime(cls.time_format)
96
97
98 def elapsed_time(self):
99 """Get the time elapsed since this job was created.
100
101 @return: A timedelta object representing the elapsed time.
102 """
103 return datetime.now() - self.job_created_time
104
105
106 def is_suite_timeout(self):
107 """Check if the suite timed out.
108
109 @return: True if more than timeout_hours has elapsed since the suite job
110 was created.
111 """
112 if self.elapsed_time() >= self.timeout_hours:
113 logging.info('Suite timed out. Started on %s, timed out on %s',
114 self.format_time(self.job_created_time),
115 self.format_time(datetime.now()))
116 return True
117 return False
118
119
120 def first_past_halftime(self):
121 """Check if we just crossed half time.
122
123 This method will only return True once, the first time it is called
124 after a job's elapsed time is past half its timeout.
125
126 @return True: If this is the first call of the method after halftime.
127 """
128 if (not self.past_halftime and
129 self.elapsed_time() > self.timeout_hours/2):
130 self.past_halftime = True
131 return True
132 return False
133
134
135class RPCHelper(object):
136 """A class to help diagnose a suite run through the rpc interface.
137 """
138
139 def __init__(self, rpc_interface):
140 """Constructor for rpc helper class.
141
142 @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
143 """
144 self.rpc_interface = rpc_interface
Prashanth Balasubramanianae437212014-10-27 11:17:26 -0700145 # Any bug filed during diagnosis. Generally these bugs are critical so
146 # there should only be one.
147 self.bug = ''
Prashanth B923ca262014-03-14 12:36:29 -0700148
149
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800150 def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
Prashanth B923ca262014-03-14 12:36:29 -0700151 """Log diagnostic information about a timeout for a board/pool.
152
153 @param board: The board for which the current suite was run.
154 @param pool: The pool against which the current suite was run.
155 @param time_delta_hours: The time from which we should log information.
156 This is a datetime.timedelta object, as stored by the JobTimer.
157 @param limit: The maximum number of jobs per host, to log.
158
159 @raises proxy.JSONRPCException: For exceptions thrown across the wire.
160 """
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800161 end_time = datetime.now()
162 start_time = end_time - time_delta_hours
163 get_histories = status_history.HostJobHistory.get_multiple_histories
164 host_histories = get_histories(
165 self.rpc_interface,
166 time_utils.to_epoch_time(start_time),
167 time_utils.to_epoch_time(end_time),
168 board=board, pool=pool)
169 if not host_histories:
170 logging.error('No hosts found for board:%s in pool:%s',
Alex Miller8bf50202014-06-02 04:10:51 -0700171 board, pool)
Prashanth B923ca262014-03-14 12:36:29 -0700172 return
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800173 status_map = {
174 status_history.UNUSED: 'Unused',
175 status_history.UNKNOWN: 'No job history',
176 status_history.WORKING: 'Working',
177 status_history.BROKEN: 'Failed repair'
178 }
179 for history in host_histories:
180 count = 0
181 job_info =''
182 for job in history:
183 start_time = (
184 time_utils.epoch_time_to_date_string(job.start_time))
Prashanth B923ca262014-03-14 12:36:29 -0700185 job_info += ('%s %s started on: %s status %s\n' %
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800186 (job.id, job.name, start_time, job.job_status))
187 count += 1
188 if count >= limit:
189 break
190 host = history.host
191 logging.error('host: %s, status: %s, locked: %s '
192 'diagnosis: %s\n'
193 'labels: %s\nLast %s jobs within %s:\n'
194 '%s',
195 history.hostname, host.status, host.locked,
196 status_map[history.last_diagnosis()[0]],
197 host.labels, limit, time_delta_hours,
198 job_info)
Prashanth B923ca262014-03-14 12:36:29 -0700199
200
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700201 def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
Dan Shi20952c12014-05-14 17:07:38 -0700202 """Check if DUT availability for a given board and pool is less than
203 minimum.
204
205 @param board: The board to check DUT availability.
206 @param pool: The pool to check DUT availability.
Dan Shi81363372014-06-03 22:27:37 -0700207 @param minimum_duts: Minimum Number of available machines required to
208 run the suite. Default is set to 0, which means do
209 not force the check of available machines before
210 running the suite.
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700211 @param skip_duts_check: If True, skip minimum available DUTs check.
Fang Deng6197da32014-09-25 10:18:48 -0700212 @raise: NotEnoughDutsError if DUT availability is lower than minimum.
213 @raise: BoardNotAvailableError if no host found for requested
214 board/pool.
Dan Shi20952c12014-05-14 17:07:38 -0700215 """
Dan Shi8de6d1b2014-06-12 09:10:37 -0700216 if minimum_duts == 0:
Dan Shi94234cb2014-05-23 20:04:31 -0700217 return
218
Dan Shi20952c12014-05-14 17:07:38 -0700219 hosts = self.rpc_interface.get_hosts(
220 invalid=False,
221 multiple_labels=('pool:%s' % pool, 'board:%s' % board))
222 if not hosts:
Fang Deng6197da32014-09-25 10:18:48 -0700223 raise BoardNotAvailableError(
Shuqian Zhaoade6e7d2015-12-07 18:01:11 -0800224 'No hosts found for board:%s in pool:%s. The test lab '
225 'currently does not cover test for this board and pool.'%
Alex Miller8bf50202014-06-02 04:10:51 -0700226 (board, pool))
Dan Shi20952c12014-05-14 17:07:38 -0700227
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700228 if skip_duts_check:
229 # Bypass minimum avilable DUTs check
230 logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
231 return
232
Dan Shi8de6d1b2014-06-12 09:10:37 -0700233 if len(hosts) < minimum_duts:
Dan Shi94234cb2014-05-23 20:04:31 -0700234 logging.debug('The total number of DUTs for %s in pool:%s is %d, '
235 'which is no more than the required minimum number of'
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700236 ' available DUTS of %d.', board, pool, len(hosts),
Dan Shi8de6d1b2014-06-12 09:10:37 -0700237 minimum_duts)
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700238
239 # skip_duts_check is off, enfore checking dut availability
240 raise NotEnoughDutsError('skip_duts_check option is on and total number DUTs '
241 'in the pool is less than the required '
242 'minimum avaialble DUTs.')
Dan Shi94234cb2014-05-23 20:04:31 -0700243
Dan Shi20952c12014-05-14 17:07:38 -0700244 # TODO(dshi): Replace the hard coded string with enum value,
245 # models.Host.Status.REPAIRING and REPAIR_FAILED
246 # setup_django_environment can't be imported now as paygen server does
247 # not have django package.
Dan Shif1f77a52014-10-08 14:34:34 -0700248 bad_statuses = ('Repair Failed', 'Repairing', 'Verifying')
Prashanth Balasubramaniana6c03be2014-10-20 18:47:24 -0700249 unusable_hosts = []
250 available_hosts = []
251 for host in hosts:
252 if host.status in bad_statuses or host.locked:
253 unusable_hosts.append(host.hostname)
254 else:
255 available_hosts.append(host)
Dan Shi20952c12014-05-14 17:07:38 -0700256 logging.debug('%d of %d DUTs are available for board %s pool %s.',
257 len(available_hosts), len(hosts), board, pool)
Dan Shi8de6d1b2014-06-12 09:10:37 -0700258 if len(available_hosts) < minimum_duts:
Prashanth Balasubramaniana6c03be2014-10-20 18:47:24 -0700259 if unusable_hosts:
260 pool_health_bug = reporting.PoolHealthBug(
261 pool, board, unusable_hosts)
Prashanth Balasubramanianae437212014-10-27 11:17:26 -0700262 self.bug = reporting.Reporter().report(pool_health_bug)[0]
Fang Deng6197da32014-09-25 10:18:48 -0700263 raise NotEnoughDutsError(
Dan Shi20952c12014-05-14 17:07:38 -0700264 'Number of available DUTs for board %s pool %s is %d, which'
Prashanth Balasubramaniana6c03be2014-10-20 18:47:24 -0700265 ' is less than the minimum value %d. '
266 'Filed https://crbug.com/%s' %
Prashanth Balasubramanianae437212014-10-27 11:17:26 -0700267 (board, pool, len(available_hosts), minimum_duts, self.bug))
Dan Shi20952c12014-05-14 17:07:38 -0700268
269
MK Ryu4790eec2014-07-31 11:39:02 -0700270 def diagnose_job(self, job_id, instance_server):
Prashanth B923ca262014-03-14 12:36:29 -0700271 """Diagnose a suite job.
272
273 Logs information about the jobs that are still to run in the suite.
274
275 @param job_id: The id of the suite job to get information about.
276 No meaningful information gets logged if the id is for a sub-job.
MK Ryu4790eec2014-07-31 11:39:02 -0700277 @param instance_server: The instance server.
278 Eg: cautotest, cautotest-cq, localhost.
Prashanth B923ca262014-03-14 12:36:29 -0700279 """
280 incomplete_jobs = self.rpc_interface.get_jobs(
281 parent_job_id=job_id, summary=True,
282 hostqueueentry__complete=False)
283 if incomplete_jobs:
284 logging.info('\n%s printing summary of incomplete jobs (%s):\n',
285 JobTimer.format_time(datetime.now()),
286 len(incomplete_jobs))
287 for job in incomplete_jobs:
288 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
MK Ryu4790eec2014-07-31 11:39:02 -0700289 reporting_utils.link_job(job.id, instance_server))
Prashanth B923ca262014-03-14 12:36:29 -0700290 else:
291 logging.info('All jobs in suite have already completed.')