blob: 061db78b009899a7f1811f3700931e0f2500624e [file] [log] [blame]
Prashanth B923ca262014-03-14 12:36:29 -07001#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
Allen Li6a612392016-08-18 12:09:32 -070013from autotest_lib.client.common_lib import global_config
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -080014from autotest_lib.client.common_lib import time_utils
Dan Shi20952c12014-05-14 17:07:38 -070015from autotest_lib.server import utils
Prashanth B923ca262014-03-14 12:36:29 -070016from autotest_lib.server.cros.dynamic_suite import reporting_utils
Aviv Keshet7ee95862016-08-30 15:18:27 -070017from autotest_lib.server.lib import status_history
Prathmesh Prabhu68acc402017-11-09 15:24:15 -080018from autotest_lib.utils import labellib
Prashanth B923ca262014-03-14 12:36:29 -070019
Allen Li6a612392016-08-18 12:09:32 -070020CONFIG = global_config.global_config
21
Prashanth B923ca262014-03-14 12:36:29 -070022
Fang Deng6197da32014-09-25 10:18:48 -070023class BoardNotAvailableError(utils.TestLabException):
24 """Raised when a board is not available in the lab."""
25
26
27class NotEnoughDutsError(utils.TestLabException):
28 """Rasied when the lab doesn't have the minimum number of duts."""
29
Allen Li6a612392016-08-18 12:09:32 -070030 def __init__(self, board, pool, num_available, num_required, hosts):
31 """Initialize instance.
32
33 Please pass arguments by keyword.
34
35 @param board: Name of board.
36 @param pool: Name of pool.
37 @param num_available: Number of available hosts.
38 @param num_required: Number of hosts required.
39 @param hosts: Sequence of Host instances for given board and pool.
40 """
41 self.board = board
42 self.pool = pool
43 self.num_available = num_available
44 self.num_required = num_required
45 self.hosts = hosts
46 self.bug_id = None
47 self.suite_name = None
48 self.build = None
49
50
51 def __repr__(self):
52 return (
53 '<{cls} at 0x{id:x} with'
54 ' board={this.board!r},'
55 ' pool={this.pool!r},'
56 ' num_available={this.num_available!r},'
57 ' num_required={this.num_required!r},'
58 ' bug_id={this.bug_id!r},'
59 ' suite_name={this.suite_name!r},'
60 ' build={this.build!r}>'
61 .format(cls=type(self).__name__, id=id(self), this=self)
62 )
63
64
65 def __str__(self):
66 msg_parts = [
67 'Not enough DUTs for board: {this.board}, pool: {this.pool};'
68 ' required: {this.num_required}, found: {this.num_available}'
69 ]
Allen Lidc2c69a2016-09-14 19:05:47 -070070 format_dict = {'this': self}
Allen Li6a612392016-08-18 12:09:32 -070071 if self.bug_id is not None:
Allen Lidc2c69a2016-09-14 19:05:47 -070072 msg_parts.append('bug: {bug_url}')
73 format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
Allen Li6a612392016-08-18 12:09:32 -070074 if self.suite_name is not None:
Allen Lidc2c69a2016-09-14 19:05:47 -070075 msg_parts.append('suite: {this.suite_name}')
Allen Li6a612392016-08-18 12:09:32 -070076 if self.build is not None:
Allen Lidc2c69a2016-09-14 19:05:47 -070077 msg_parts.append('build: {this.build}')
78 return ', '.join(msg_parts).format(**format_dict)
Allen Li6a612392016-08-18 12:09:32 -070079
80
81 def add_bug_id(self, bug_id):
82 """Add crbug id associated with this exception.
83
84 @param bug_id crbug id whose str() value is used in a crbug URL.
85 """
86 self.bug_id = bug_id
87
88
89 def add_suite_name(self, suite_name):
90 """Add name of test suite that needed the DUTs.
91
92 @param suite_name Name of test suite.
93 """
94 self.suite_name = suite_name
95
96
97 def add_build(self, build):
98 """Add name of build of job that needed the DUTs.
99
100 @param build Name of build.
101 """
102 self.build = build
103
Fang Deng6197da32014-09-25 10:18:48 -0700104
Prashanth Ba7be2072014-07-15 15:03:21 -0700105class SimpleTimer(object):
106 """A simple timer used to periodically check if a deadline has passed."""
107
108 def _reset(self):
109 """Reset the deadline."""
110 if not self.interval_hours or self.interval_hours < 0:
111 logging.error('Bad interval %s', self.interval_hours)
112 self.deadline = None
113 return
114 self.deadline = datetime.now() + datetime_base.timedelta(
115 hours=self.interval_hours)
116
117
118 def __init__(self, interval_hours=0.5):
119 """Initialize a simple periodic deadline timer.
120
121 @param interval_hours: Interval of the deadline.
122 """
123 self.interval_hours = interval_hours
124 self._reset()
125
126
127 def poll(self):
128 """Poll the timer to see if we've hit the deadline.
129
130 This method resets the deadline if it has passed. If the deadline
131 hasn't been set, or the current time is less than the deadline, the
132 method returns False.
133
134 @return: True if the deadline has passed, False otherwise.
135 """
136 if not self.deadline or datetime.now() < self.deadline:
137 return False
138 self._reset()
139 return True
140
141
Prashanth B923ca262014-03-14 12:36:29 -0700142class JobTimer(object):
143 """Utility class capable of measuring job timeouts.
144 """
145
146 # Format used in datetime - string conversion.
147 time_format = '%m-%d-%Y [%H:%M:%S]'
148
149 def __init__(self, job_created_time, timeout_mins):
150 """JobTimer constructor.
151
152 @param job_created_time: float representing the time a job was
153 created. Eg: time.time()
154 @param timeout_mins: float representing the timeout in minutes.
155 """
156 self.job_created_time = datetime.fromtimestamp(job_created_time)
157 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
Prashanth Ba7be2072014-07-15 15:03:21 -0700158 self.debug_output_timer = SimpleTimer(interval_hours=0.5)
Prashanth B923ca262014-03-14 12:36:29 -0700159 self.past_halftime = False
160
161
162 @classmethod
163 def format_time(cls, datetime_obj):
164 """Get the string formatted version of the datetime object.
165
166 @param datetime_obj: A datetime.datetime object.
167 Eg: datetime.datetime.now()
168
169 @return: A formatted string containing the date/time of the
170 input datetime.
171 """
172 return datetime_obj.strftime(cls.time_format)
173
174
175 def elapsed_time(self):
176 """Get the time elapsed since this job was created.
177
178 @return: A timedelta object representing the elapsed time.
179 """
180 return datetime.now() - self.job_created_time
181
182
183 def is_suite_timeout(self):
184 """Check if the suite timed out.
185
186 @return: True if more than timeout_hours has elapsed since the suite job
187 was created.
188 """
189 if self.elapsed_time() >= self.timeout_hours:
190 logging.info('Suite timed out. Started on %s, timed out on %s',
191 self.format_time(self.job_created_time),
192 self.format_time(datetime.now()))
193 return True
194 return False
195
196
197 def first_past_halftime(self):
198 """Check if we just crossed half time.
199
200 This method will only return True once, the first time it is called
201 after a job's elapsed time is past half its timeout.
202
203 @return True: If this is the first call of the method after halftime.
204 """
205 if (not self.past_halftime and
206 self.elapsed_time() > self.timeout_hours/2):
207 self.past_halftime = True
208 return True
209 return False
210
211
212class RPCHelper(object):
213 """A class to help diagnose a suite run through the rpc interface.
214 """
215
216 def __init__(self, rpc_interface):
217 """Constructor for rpc helper class.
218
219 @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
220 """
221 self.rpc_interface = rpc_interface
222
223
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800224 def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
Prashanth B923ca262014-03-14 12:36:29 -0700225 """Log diagnostic information about a timeout for a board/pool.
226
227 @param board: The board for which the current suite was run.
228 @param pool: The pool against which the current suite was run.
229 @param time_delta_hours: The time from which we should log information.
230 This is a datetime.timedelta object, as stored by the JobTimer.
231 @param limit: The maximum number of jobs per host, to log.
232
233 @raises proxy.JSONRPCException: For exceptions thrown across the wire.
234 """
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800235 end_time = datetime.now()
236 start_time = end_time - time_delta_hours
Prathmesh Prabhu68acc402017-11-09 15:24:15 -0800237 labels = labellib.LabelsMapping()
238 labels['board'] = board
239 labels['pool'] = pool
240 host_histories = status_history.HostJobHistory.get_multiple_histories(
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800241 self.rpc_interface,
242 time_utils.to_epoch_time(start_time),
243 time_utils.to_epoch_time(end_time),
Prathmesh Prabhu68acc402017-11-09 15:24:15 -0800244 labels.getlabels(),
245 )
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800246 if not host_histories:
247 logging.error('No hosts found for board:%s in pool:%s',
Alex Miller8bf50202014-06-02 04:10:51 -0700248 board, pool)
Prashanth B923ca262014-03-14 12:36:29 -0700249 return
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800250 status_map = {
251 status_history.UNUSED: 'Unused',
252 status_history.UNKNOWN: 'No job history',
253 status_history.WORKING: 'Working',
254 status_history.BROKEN: 'Failed repair'
255 }
256 for history in host_histories:
257 count = 0
258 job_info =''
259 for job in history:
260 start_time = (
261 time_utils.epoch_time_to_date_string(job.start_time))
Prashanth B923ca262014-03-14 12:36:29 -0700262 job_info += ('%s %s started on: %s status %s\n' %
J. Richard Barnetteaf1e8262016-03-04 12:55:11 -0800263 (job.id, job.name, start_time, job.job_status))
264 count += 1
265 if count >= limit:
266 break
267 host = history.host
268 logging.error('host: %s, status: %s, locked: %s '
269 'diagnosis: %s\n'
270 'labels: %s\nLast %s jobs within %s:\n'
271 '%s',
272 history.hostname, host.status, host.locked,
273 status_map[history.last_diagnosis()[0]],
274 host.labels, limit, time_delta_hours,
275 job_info)
Prashanth B923ca262014-03-14 12:36:29 -0700276
277
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700278 def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
Dan Shi20952c12014-05-14 17:07:38 -0700279 """Check if DUT availability for a given board and pool is less than
280 minimum.
281
282 @param board: The board to check DUT availability.
283 @param pool: The pool to check DUT availability.
Dan Shi81363372014-06-03 22:27:37 -0700284 @param minimum_duts: Minimum Number of available machines required to
285 run the suite. Default is set to 0, which means do
286 not force the check of available machines before
287 running the suite.
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700288 @param skip_duts_check: If True, skip minimum available DUTs check.
Fang Deng6197da32014-09-25 10:18:48 -0700289 @raise: NotEnoughDutsError if DUT availability is lower than minimum.
290 @raise: BoardNotAvailableError if no host found for requested
291 board/pool.
Dan Shi20952c12014-05-14 17:07:38 -0700292 """
Dan Shi8de6d1b2014-06-12 09:10:37 -0700293 if minimum_duts == 0:
Dan Shi94234cb2014-05-23 20:04:31 -0700294 return
295
Allen Li6a612392016-08-18 12:09:32 -0700296 # TODO(ayatane): Replace label prefixes with constants in
Xixuan Wu93e646c2017-12-07 18:36:10 -0800297 # autotest_lib.server.constants
Dan Shi20952c12014-05-14 17:07:38 -0700298 hosts = self.rpc_interface.get_hosts(
299 invalid=False,
300 multiple_labels=('pool:%s' % pool, 'board:%s' % board))
301 if not hosts:
Fang Deng6197da32014-09-25 10:18:48 -0700302 raise BoardNotAvailableError(
Shuqian Zhaoade6e7d2015-12-07 18:01:11 -0800303 'No hosts found for board:%s in pool:%s. The test lab '
304 'currently does not cover test for this board and pool.'%
Alex Miller8bf50202014-06-02 04:10:51 -0700305 (board, pool))
Dan Shi20952c12014-05-14 17:07:38 -0700306
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700307 if skip_duts_check:
308 # Bypass minimum avilable DUTs check
309 logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
310 return
311
Dan Shi8de6d1b2014-06-12 09:10:37 -0700312 if len(hosts) < minimum_duts:
Dan Shi94234cb2014-05-23 20:04:31 -0700313 logging.debug('The total number of DUTs for %s in pool:%s is %d, '
Allen Li6a612392016-08-18 12:09:32 -0700314 'which is less than %d, the required minimum number of'
315 ' available DUTS', board, pool, len(hosts),
Dan Shi8de6d1b2014-06-12 09:10:37 -0700316 minimum_duts)
Ningning Xiaf2c206c2016-04-13 14:15:51 -0700317
Allen Li6a612392016-08-18 12:09:32 -0700318 available_hosts = 0
Prashanth Balasubramaniana6c03be2014-10-20 18:47:24 -0700319 for host in hosts:
Allen Lif74957d2017-11-20 17:46:48 -0800320 if host.is_available():
Allen Li6a612392016-08-18 12:09:32 -0700321 available_hosts += 1
Dan Shi20952c12014-05-14 17:07:38 -0700322 logging.debug('%d of %d DUTs are available for board %s pool %s.',
Allen Li6a612392016-08-18 12:09:32 -0700323 available_hosts, len(hosts), board, pool)
324 if available_hosts < minimum_duts:
Fang Deng6197da32014-09-25 10:18:48 -0700325 raise NotEnoughDutsError(
Allen Li6a612392016-08-18 12:09:32 -0700326 board=board,
327 pool=pool,
328 num_available=available_hosts,
329 num_required=minimum_duts,
330 hosts=hosts)
Dan Shi20952c12014-05-14 17:07:38 -0700331
332
MK Ryu4790eec2014-07-31 11:39:02 -0700333 def diagnose_job(self, job_id, instance_server):
Prashanth B923ca262014-03-14 12:36:29 -0700334 """Diagnose a suite job.
335
336 Logs information about the jobs that are still to run in the suite.
337
338 @param job_id: The id of the suite job to get information about.
339 No meaningful information gets logged if the id is for a sub-job.
MK Ryu4790eec2014-07-31 11:39:02 -0700340 @param instance_server: The instance server.
341 Eg: cautotest, cautotest-cq, localhost.
Prashanth B923ca262014-03-14 12:36:29 -0700342 """
343 incomplete_jobs = self.rpc_interface.get_jobs(
344 parent_job_id=job_id, summary=True,
345 hostqueueentry__complete=False)
346 if incomplete_jobs:
347 logging.info('\n%s printing summary of incomplete jobs (%s):\n',
348 JobTimer.format_time(datetime.now()),
349 len(incomplete_jobs))
350 for job in incomplete_jobs:
351 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
MK Ryu4790eec2014-07-31 11:39:02 -0700352 reporting_utils.link_job(job.id, instance_server))
Prashanth B923ca262014-03-14 12:36:29 -0700353 else:
354 logging.info('All jobs in suite have already completed.')