blob: 01fb48eaefba2ee3693f018ec32cb37bd892de40 [file] [log] [blame]
Dan Shi4df39252013-03-19 13:19:45 -07001# pylint: disable-msg=C0111
2
Chris Masone859fdec2012-01-30 08:38:09 -08003# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7__author__ = 'cmasone@chromium.org (Chris Masone)'
8
9import common
Simran Basi773a86e2015-05-13 19:15:42 -070010import ConfigParser
Chris Masonea8066a92012-05-01 16:52:31 -070011import datetime
Chris Masone859fdec2012-01-30 08:38:09 -080012import logging
Simran Basi71206ef2014-08-13 13:51:18 -070013import os
14import shutil
Aviv Keshetd83ef442013-01-16 16:19:35 -080015
Jakob Juelich82b7d1c2014-09-15 16:10:57 -070016from autotest_lib.frontend.afe import models
Matthew Sartorid96fb9b2015-05-19 18:04:58 -070017from autotest_lib.client.common_lib import control_data
Aviv Keshetd83ef442013-01-16 16:19:35 -080018from autotest_lib.client.common_lib import error
Simran Basi71206ef2014-08-13 13:51:18 -070019from autotest_lib.client.common_lib import global_config
Alex Miller7d658cf2013-09-04 16:00:35 -070020from autotest_lib.client.common_lib import priorities
Dan Shidfea3682014-08-10 23:38:40 -070021from autotest_lib.client.common_lib import time_utils
Chris Masone859fdec2012-01-30 08:38:09 -080022from autotest_lib.client.common_lib.cros import dev_server
Gabe Black1e1c41b2015-02-04 23:55:15 -080023from autotest_lib.client.common_lib.cros.graphite import autotest_stats
Jakob Juelich9fffe4f2014-08-14 18:07:05 -070024from autotest_lib.frontend.afe import rpc_utils
Simran Basib6ec8ae2014-04-23 12:05:08 -070025from autotest_lib.server import utils
Dan Shi36cfd832014-10-10 13:38:51 -070026from autotest_lib.server.cros import provision
Chris Masone44e4d6c2012-08-15 14:25:53 -070027from autotest_lib.server.cros.dynamic_suite import constants
Chris Masoneb4935552012-08-14 12:05:54 -070028from autotest_lib.server.cros.dynamic_suite import control_file_getter
Chris Masone44e4d6c2012-08-15 14:25:53 -070029from autotest_lib.server.cros.dynamic_suite import tools
Dan Shi36cfd832014-10-10 13:38:51 -070030from autotest_lib.server.cros.dynamic_suite.suite import Suite
Simran Basi71206ef2014-08-13 13:51:18 -070031from autotest_lib.server.hosts import moblab_host
Dan Shidfea3682014-08-10 23:38:40 -070032from autotest_lib.site_utils import host_history
Dan Shi193905e2014-07-25 23:33:09 -070033from autotest_lib.site_utils import job_history
Dan Shid7bb4f12015-01-06 10:53:50 -080034from autotest_lib.site_utils import server_manager_utils
Dan Shi6964fa52014-12-18 11:04:27 -080035from autotest_lib.site_utils import stable_version_utils
Simran Basi71206ef2014-08-13 13:51:18 -070036
37
# Shared global_config object; used below to read config sections/values and
# to locate the shadow config file.
_CONFIG = global_config.global_config
# Path of the boto file under the moblab user's home directory.
# NOTE(review): set_boto_key() below copies to
# moblab_host.MOBLAB_BOTO_LOCATION rather than this constant -- confirm
# whether this one is still needed.
MOBLAB_BOTO_LOCATION = '/home/moblab/.boto'
Chris Masone859fdec2012-01-30 08:38:09 -080040
Chris Masonef8b53062012-05-08 22:14:18 -070041# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
Chris Masone859fdec2012-01-30 08:38:09 -080042
43
def canonicalize_suite_name(suite_name):
    """Map a bare suite name to the path of its suite control file.

    @param suite_name: short name of the suite, e.g. 'bvt'.

    @return: control file path, e.g. 'test_suites/control.bvt'.
    """
    # Do not change this naming convention without updating
    # site_utils.parse_job_name.
    control_path_template = 'test_suites/control.%s'
    return control_path_template % suite_name
48
49
def formatted_now():
    """Return datetime.datetime.now() rendered with time_utils.TIME_FMT."""
    current_time = datetime.datetime.now()
    return current_time.strftime(time_utils.TIME_FMT)
Chris Masoneaa10f8e2012-05-15 13:34:21 -070052
53
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.
    @raises ControlFileMalformed if the control file contains non-ascii
                                 characters.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    # Stat name: control_files.parse.<server name, dots replaced>.<suffix of
    # the suite name after its last dot>.
    server_key = ds.get_server_name(ds.url()).replace('.', '_')
    suite_key = suite_name.rsplit('.')[-1]
    timer = autotest_stats.Timer('control_files.parse.%s.%s' %
                                 (server_key, suite_key))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                    suite_name)
    except error.CrosDynamicSuiteException as e:
        # Re-raise the same exception type with the build appended to the
        # message so the failure can be tied to the image under test.
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeError as e:
        # A byte string with non-ascii bytes fails with UnicodeDecodeError
        # (implicit decode), a unicode string fails with UnicodeEncodeError.
        # Both derive from UnicodeError, so either one is reported as a
        # malformed control file.
        raise error.ControlFileMalformed(str(e))

    return control_file_in
92
93
def _stage_build_artifacts(build):
    """
    Stage the 'test_suites' artifact of |build| on a dev server.

    @param build: image whose suite control files should be staged.

    @raises StageControlFileFailure: if the dev server raises a
                                     DevServerException while staging.

    @return: tuple (image_server, staging_times), where image_server is the
             dev_server.ImageServer resolved for this build and
             staging_times is a dictionary holding the staging start and
             end times.
    """
    image_server = dev_server.ImageServer.resolve(build)
    staging_times = {}
    staging_times[constants.DOWNLOAD_STARTED_TIME] = formatted_now()

    # Stat name: control_files.stage.<server name with dots replaced>.
    server_key = image_server.get_server_name(
            image_server.url()).replace('.', '_')
    stage_timer = autotest_stats.Timer('control_files.stage.%s' % server_key)
    try:
        with stage_timer:
            image_server.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                "Failed to stage %s: %s" % (build, e))

    staging_times[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (image_server, staging_times)
122
123
@rpc_utils.route_rpc_to_master
def create_suite_job(name='', board='', build='', pool='', control_file='',
                     check_hosts=True, num=None, file_bugs=False, timeout=24,
                     timeout_mins=None, priority=priorities.Priority.DEFAULT,
                     suite_args=None, wait_for_results=True, job_retry=False,
                     max_retries=None, max_runtime_mins=None, suite_min_duts=0,
                     offload_failures_only=False, builds=None,
                     test_source_build=None, run_prod_code=False,
                     delay_minutes=0, **kwargs):
    """
    Create a job to run a test suite on the given device with the given image.

    When the timeout specified in the control file is reached, the
    job is guaranteed to have completed and results will be available.

    @param name: The test name if control_file is supplied, otherwise the name
                 of the test suite to run, e.g. 'bvt'.
    @param board: the kind of device to run the tests on. If empty, it is
                  parsed from the CrOS build in |builds|.
    @param build: unique name by which to refer to the image from now on.
    @param builds: the builds to install e.g.
                   {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                    'fw-version:': 'x86-alex-firmware/R36-5771.50.0',
                    'fwro-version:': 'x86-alex-firmware/R36-5771.49.0'}
                   If builds is given a value, it overrides argument build.
                   None (the default) is treated as an empty dictionary.
    @param test_source_build: Build that contains the server-side test code.
    @param pool: Specify the pool of machines to use for scheduling
            purposes.
    @param control_file: Contents of the suite control file. If empty, the
                         control file is looked up by suite name. Replaced by
                         the control file found under the drone installation
                         directory when run_prod_code is True.
    @param check_hosts: require appropriate live hosts to exist in the lab.
    @param num: Specify the number of machines to schedule across (integer).
                Leave unspecified or use None to use default sharding factor.
    @param file_bugs: File a bug on each test failure in this suite.
    @param timeout: The max lifetime of this suite, in hours.
    @param timeout_mins: The max lifetime of this suite, in minutes. Takes
                         priority over timeout.
    @param priority: Integer denoting priority. Higher is more important.
    @param suite_args: Optional arguments which will be parsed by the suite
                       control file. Used by control.test_that_wrapper to
                       determine which tests to run.
    @param wait_for_results: Set to False to run the suite job without waiting
                             for test jobs to finish. Default is True.
    @param job_retry: Set to True to enable job-level retry. Default is False.
    @param max_retries: Integer, maximum job retries allowed at suite level.
                        None for no max.
    @param max_runtime_mins: Maximum amount of time a job can be running in
                             minutes. Defaults to timeout * 60.
    @param suite_min_duts: Integer. Scheduler will prioritize getting the
                           minimum number of machines for the suite when it is
                           competing with another suite that has a higher
                           priority but already got minimum machines it needs.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @param run_prod_code: If True, the suite will run the test code that
                          lives in prod aka the test code currently on the
                          lab servers. If False, the control files and test
                          code for this suite run will be retrieved from the
                          build artifacts.
    @param delay_minutes: Delay the creation of test jobs for a given number of
                          minutes.
    @param kwargs: extra keyword args. NOT USED.

    @raises SuiteArgumentException: if num is neither an int nor None, or if
                                    an RO firmware build is requested.
    @raises ControlFileNotFound: if a unique suite control file doesn't exist.
    @raises NoControlFileList: if we can't list the control files at all.
    @raises StageControlFileFailure: If the dev server throws 500 while
                                     staging test_suites.
    @raises ControlFileEmpty: if the control file exists on the server, but
                              can't be read.

    @return: the value returned by rpc_utils.create_job_common for the suite
             job.
    """
    if type(num) is not int and num is not None:
        raise error.SuiteArgumentException('Ill specified num argument %r. '
                                           'Must be an integer or None.' % num)
    if num == 0:
        logging.warning("Can't run on 0 hosts; using default.")
        num = None

    # Use a fresh dictionary per call rather than a mutable default argument
    # shared between calls.
    if builds is None:
        builds = {}

    # TODO(dshi): crbug.com/496782 Remove argument build and its reference after
    # R45 falls out of stable channel.
    if build and not builds:
        builds = {provision.CROS_VERSION_PREFIX: build}
    # TODO(dshi): crbug.com/497236 Remove this check after firmware ro provision
    # is supported in Autotest.
    if provision.FW_RO_VERSION_PREFIX in builds:
        raise error.SuiteArgumentException(
                'Updating RO firmware is not supported yet.')
    # Default test source build to CrOS build if it's not specified and
    # run_prod_code is set to False.
    if not run_prod_code:
        test_source_build = Suite.get_test_source_build(
                builds, test_source_build=test_source_build)

    suite_name = canonicalize_suite_name(name)
    if run_prod_code:
        ds = dev_server.ImageServer.resolve(build)
        keyvals = {}
        # Read the suite control file from the drone installation directory
        # instead of from the build artifacts.
        getter = control_file_getter.FileSystemGetter(
                [_CONFIG.get_config_value('SCHEDULER',
                                          'drone_installation_directory')])
        control_file = getter.get_control_file_contents_by_name(suite_name)
    else:
        (ds, keyvals) = _stage_build_artifacts(test_source_build)
    keyvals[constants.SUITE_MIN_DUTS_KEY] = suite_min_duts

    if not control_file:
        # No control file was supplied so look it up from the build artifacts.
        control_file = _get_control_file_contents_by_name(test_source_build,
                                                          ds, suite_name)
    # Do not change this naming convention without updating
    # site_utils.parse_job_name.
    if not run_prod_code:
        name = '%s-%s' % (test_source_build, suite_name)
    else:
        # If run_prod_code is True, test_source_build is not set, use the
        # first build in the builds list for the suite job name.
        name = '%s-%s' % (builds.values()[0], suite_name)

    timeout_mins = timeout_mins or timeout * 60
    max_runtime_mins = max_runtime_mins or timeout * 60

    if not board:
        board = utils.ParseBuildName(builds[provision.CROS_VERSION_PREFIX])[0]

    # TODO(dshi): crbug.com/496782 Remove argument build and its reference after
    # R45 falls out of stable channel.
    # Prepend build and board to the control file.
    inject_dict = {'board': board,
                   'build': builds.get(provision.CROS_VERSION_PREFIX),
                   'builds': builds,
                   'check_hosts': check_hosts,
                   'pool': pool,
                   'num': num,
                   'file_bugs': file_bugs,
                   'timeout': timeout,
                   'timeout_mins': timeout_mins,
                   'devserver_url': ds.url(),
                   'priority': priority,
                   'suite_args' : suite_args,
                   'wait_for_results': wait_for_results,
                   'job_retry': job_retry,
                   'max_retries': max_retries,
                   'max_runtime_mins': max_runtime_mins,
                   'offload_failures_only': offload_failures_only,
                   'test_source_build': test_source_build,
                   'run_prod_code': run_prod_code,
                   'delay_minutes': delay_minutes,
                   }

    control_file = tools.inject_vars(inject_dict, control_file)

    return rpc_utils.create_job_common(name,
                                       priority=priority,
                                       timeout_mins=timeout_mins,
                                       max_runtime_mins=max_runtime_mins,
                                       control_type='Server',
                                       control_file=control_file,
                                       hostless=True,
                                       keyvals=keyvals)
Simran Basi71206ef2014-08-13 13:51:18 -0700281
282
283# TODO: hide the following rpcs under is_moblab
def moblab_only(func):
    """Ensure moblab specific functions only run on Moblab devices.

    @param func: function to guard.

    @return: wrapper that raises error.RPCException when utils.is_moblab()
             is false and otherwise calls |func| with the same arguments.
             The wrapper keeps the name and docstring of |func|.
    """
    # Imported here so the decorator carries its own dependency.
    import functools

    @functools.wraps(func)
    def verify(*args, **kwargs):
        if not utils.is_moblab():
            # Interpolate explicitly: exception constructors do not apply
            # %-formatting to extra arguments the way logging calls do.
            raise error.RPCException(
                    'RPC: %s can only run on Moblab Systems!' % func.__name__)
        return func(*args, **kwargs)
    return verify
292
293
@moblab_only
def get_config_values():
    """Return all config values known to the global config object.

    The result maps every section name to the list of name value pairs of
    that section, passed through rpc_utils.prepare_for_serialization.
    """
    values_by_section = dict(
            (section_name, _CONFIG.config.items(section_name))
            for section_name in _CONFIG.get_sections())
    return rpc_utils.prepare_for_serialization(values_by_section)
Simran Basi71206ef2014-08-13 13:51:18 -0700306
307
@moblab_only
def update_config_handler(config_values):
    """
    Rewrite the shadow config from |config_values| and reboot.

    Only values that differ from a config object whose shadow file is set to
    '' are written to the shadow config file.

    @param config_values: mapping of section name to a list of (key, value)
                          pairs; see get_config_values().

    @raises error.RPCException: if the shadow config file does not exist.
    """
    baseline_config = global_config.global_config_class()
    baseline_config.set_config_files(shadow_file='')
    shadow_overrides = ConfigParser.RawConfigParser()
    for section, entries in config_values.iteritems():
        for key, value in entries:
            baseline_value = baseline_config.get_config_value(
                    section, key, default='', allow_blank=True)
            if baseline_value == value:
                # Same as the baseline; no override needed.
                continue
            if not shadow_overrides.has_section(section):
                shadow_overrides.add_section(section)
            shadow_overrides.set(section, key, value)

    shadow_path = _CONFIG.shadow_file
    if not (shadow_path and os.path.exists(shadow_path)):
        raise error.RPCException('Shadow config file does not exist.')

    with open(shadow_path, 'w') as shadow_file:
        shadow_overrides.write(shadow_file)
    # TODO (sbasi) crbug.com/403916 - Remove the reboot command and
    # instead restart the services that rely on the config values.
    os.system('sudo reboot')
334
335
@moblab_only
def reset_config_settings():
    """Empty the shadow config file and reboot the machine."""
    # Opening in 'w' mode truncates the file; nothing is written back.
    open(_CONFIG.shadow_file, 'w').close()
    os.system('sudo reboot')
341
342
@moblab_only
def set_boto_key(boto_key):
    """Copy an uploaded boto key file to moblab_host.MOBLAB_BOTO_LOCATION.

    @param boto_key: File name of boto_key uploaded through handle_file_upload.

    @raises error.RPCException: if |boto_key| does not exist.
    """
    key_is_present = os.path.exists(boto_key)
    if not key_is_present:
        raise error.RPCException('Boto key: %s does not exist!' % boto_key)
    destination = moblab_host.MOBLAB_BOTO_LOCATION
    shutil.copyfile(boto_key, destination)
Dan Shi193905e2014-07-25 23:33:09 -0700352
353
@moblab_only
def set_launch_control_key(launch_control_key):
    """Install an uploaded launch control key and restart the devserver.

    @param launch_control_key: File name of launch_control_key uploaded through
            handle_file_upload.

    @raises error.RPCException: if |launch_control_key| does not exist.
    """
    key_is_present = os.path.exists(launch_control_key)
    if not key_is_present:
        raise error.RPCException('Launch Control key: %s does not exist!' %
                                 launch_control_key)
    destination = moblab_host.MOBLAB_LAUNCH_CONTROL_KEY_LOCATION
    shutil.copyfile(launch_control_key, destination)
    # Restart the devserver service.
    os.system('sudo restart moblab-devserver-init')
368
369
def get_job_history(**filter_data):
    """Get history of the job, including the special tasks executed for it.

    @param filter_data: filter for the call, must include
                        {'job_id': [job id]}
    @returns: the job's history as returned by job_info.get_history(),
              passed through rpc_utils.prepare_for_serialization.
    """
    job_info = job_history.get_job_info(filter_data['job_id'])
    history = job_info.get_history()
    return rpc_utils.prepare_for_serialization(history)
382
383
def get_host_history(start_time, end_time, hosts=None, board=None, pool=None):
    """Get history of a list of hosts.

    The return is a JSON string of host history for each host, for example,
    {'172.22.33.51': [{'status': 'Resetting',
                       'start_time': '2014-08-07 10:02:16',
                       'end_time': '2014-08-07 10:03:16',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'Task: Special Task 19441991 (host ...)'},
                      {'status': 'Running',
                       'start_time': '2014-08-07 10:03:18',
                       'end_time': '2014-08-07 10:13:00',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'HQE: 15305005, for job: 14995562'}
                     ]
    }
    @param start_time: start time to search for history, can be string value or
                       epoch time.
    @param end_time: end time to search for history, can be string value or
                     epoch time.
    @param hosts: A list of hosts to search for history. Default is None.
    @param board: board type of hosts. Default is None.
    @param pool: pool type of hosts. Default is None.
    @returns: JSON string of the host history.
    """
    history_details = host_history.get_history_details(
            start_time=start_time, end_time=end_time,
            hosts=hosts, board=board, pool=pool,
            process_pool_size=4)
    return rpc_utils.prepare_for_serialization(history_details)
Jakob Juelich59cfe542014-09-02 16:37:46 -0700414
415
def shard_heartbeat(shard_hostname, jobs=(), hqes=(), known_job_ids=(),
                    known_host_ids=(), known_host_statuses=()):
    """Receive updates for job statuses from shards and assign hosts and jobs.

    @param shard_hostname: Hostname of the calling shard
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated with
                 newer status from a shard. Note that for every hostqueueentry
                 the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.
    @param known_host_statuses: List of statuses of hosts the shard already
                                has; must have the same length as
                                known_host_ids.

    @returns: Serialized representations of hosts, jobs, suite job keyvals
              and their dependencies to be inserted into a shard's database.
    """
    # The following alternatives to sending host and job ids in every heartbeat
    # have been considered:
    # 1. Sending the highest known job and host ids. This would work for jobs:
    #    Newer jobs always have larger ids. Also, if a job is not assigned to a
    #    particular shard during a heartbeat, it never will be assigned to this
    #    shard later.
    #    This is not true for hosts though: A host that is leased won't be sent
    #    to the shard now, but might be sent in a future heartbeat. This means
    #    sometimes hosts should be transferred that have a lower id than the
    #    maximum host id the shard knows.
    # 2. Send the number of jobs/hosts the shard knows to the master in each
    #    heartbeat. Compare these to the number of records that already have
    #    the shard_id set to this shard. In the normal case, they should match.
    #    In case they don't, resend all entities of that type.
    #    This would work well for hosts, because there aren't that many.
    #    Resending all jobs is quite a big overhead though.
    #    Also, this approach might run into edge cases when entities are
    #    ever deleted.
    # 3. Mixtures of the above: Use 1 for jobs and 2 for hosts.
    #    Using two different approaches isn't consistent and might cause
    #    confusion. Also the issues with the case of deletions might still
    #    occur.
    #
    # The overhead of sending all job and host ids in every heartbeat is low:
    # At peaks one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming that job id's have 6 digits and that json serialization takes a
    # comma and a space as overhead, the traffic per id sent is about 8 bytes.
    # If 5000 ids need to be sent, this means 40 kilobytes of traffic.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # solutions described above.
    heartbeat_timer = autotest_stats.Timer('shard_heartbeat')
    with heartbeat_timer:
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        rpc_utils.persist_records_sent_from_shard(shard_obj, jobs, hqes)
        assert len(known_host_ids) == len(known_host_statuses)
        # Adopt the status reported by the shard for every host whose stored
        # status differs.
        for index, host_id in enumerate(known_host_ids):
            host_model = models.Host.objects.get(pk=host_id)
            if host_model.status != known_host_statuses[index]:
                host_model.status = known_host_statuses[index]
                host_model.save()

        hosts, jobs, suite_keyvals = rpc_utils.find_records_for_shard(
                shard_obj, known_job_ids=known_job_ids,
                known_host_ids=known_host_ids)
        serialized_hosts = [host.serialize() for host in hosts]
        serialized_jobs = [job.serialize() for job in jobs]
        serialized_keyvals = [kv.serialize() for kv in suite_keyvals]
        return {
            'hosts': serialized_hosts,
            'jobs': serialized_jobs,
            'suite_keyvals': serialized_keyvals,
        }
Jakob Juelich82b7d1c2014-09-15 16:10:57 -0700484
485
def get_shards(**filter_data):
    """Return the shards matching |filter_data|.

    @param filter_data: filter passed to models.Shard.query_objects.

    @returns A sequence of nested dictionaries of shard information, each
             with an extra 'labels' entry listing the shard's label names.
    """
    shard_models = models.Shard.query_objects(filter_data)
    shard_dicts = rpc_utils.prepare_rows_as_nested_dicts(shard_models, ())
    for shard_dict, shard_model in zip(shard_dicts, shard_models):
        label_names = [label.name for label in shard_model.labels.all()]
        shard_dict['labels'] = label_names

    return shard_dicts
497
498
def add_shard(hostname, labels):
    """Add a shard and start running jobs on it.

    @param hostname: The hostname of the shard to be added; needs to be unique.
    @param labels: Board labels separated by a comma. Jobs of one of the labels
                   will be assigned to the shard.

    @raises error.RPCException: If label provided doesn't start with `board:`
    @raises model_logic.ValidationError: If a shard with the given hostname
            already exists.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @return: id of the newly created shard.
    """
    board_labels = []
    for label_name in labels.split(','):
        if not label_name.startswith('board:'):
            raise error.RPCException('Sharding only supports for `board:.*` '
                                     'labels.')
        # Fetch label first, so shard isn't created when label doesn't exist.
        board_labels.append(models.Label.smart_get(label_name))

    new_shard = models.Shard.add_object(hostname=hostname)
    for board_label in board_labels:
        new_shard.labels.add(board_label)
    return new_shard.id
524
525
def delete_shard(hostname):
    """Delete a shard and reclaim all resources from it.

    This claims back all assigned hosts from the shard. To ensure all DUTs are
    in a sane state, a Repair task is scheduled for them. This reboots the DUTs
    and therefore clears all running processes that might be left.

    The shard_id of jobs of that shard will be set to None.

    The status of jobs that haven't been reported to be finished yet, will be
    lost. The master scheduler will pick up the jobs and execute them.

    @param hostname: Hostname of the shard to delete.
    """
    doomed_shard = rpc_utils.retrieve_shard(shard_hostname=hostname)

    # TODO(beeps): Power off shard

    # For ChromeOS hosts, repair reboots the DUT.
    # Repair will escalate through multiple repair steps and will verify the
    # success after each of them. Anyway, it will always run at least the first
    # one, which includes a reboot.
    # After a reboot we can be sure no processes from prior tests that were run
    # by a shard are still running on the DUT.
    # Important: Don't just set the status to Repair Failed, as that would run
    # Verify first, before doing any repair measures. Verify would probably
    # succeed, so this wouldn't change anything on the DUT.
    shard_hosts = models.Host.objects.filter(shard=doomed_shard)
    for shard_host in shard_hosts:
        models.SpecialTask.objects.create(
                task=models.SpecialTask.Task.REPAIR,
                host=shard_host,
                requested_by=models.User.current_user())
    # Detach hosts and jobs from the shard before removing it.
    shard_hosts.update(shard=None)

    models.Job.objects.filter(shard=doomed_shard).update(shard=None)

    doomed_shard.labels.clear()

    doomed_shard.delete()
Dan Shi6964fa52014-12-18 11:04:27 -0800565
566
def get_servers(hostname=None, role=None, status=None):
    """Get details of the servers with matching hostname, role and status.

    @param hostname: FQDN of the server.
    @param role: Name of the server role, e.g., drone, scheduler. Default to
                 None to match any role.
    @param status: Status of the server, e.g., primary, backup, repair_required.
                   Default to None to match any server status.

    @raises error.RPCException: If server database is not used.
    @return: A list holding the get_details() result of every matching server.
    """
    if not server_manager_utils.use_server_db():
        raise error.RPCException('Server database is not enabled. Please try '
                                 'retrieve servers from global config.')
    matching_servers = server_manager_utils.get_servers(
            hostname=hostname, role=role, status=status)
    server_details = []
    for server in matching_servers:
        server_details.append(server.get_details())
    return server_details
585
586
@rpc_utils.route_rpc_to_master
def get_stable_version(board=stable_version_utils.DEFAULT, android=False):
    """Look up the stable version configured for a board.

    @param board: Name of the board.
    @param android: If True, the given board is an Android-based device. If
                    False, assume it's a Chrome OS-based device.

    @return: Stable version of the given board. Return global configure value
             of CROS.stable_cros_version if stable_versions table does not
             have entry of board DEFAULT.
    """
    # The lookup itself is delegated to stable_version_utils.
    version = stable_version_utils.get(board=board, android=android)
    return version
Dan Shi25e1fd42014-12-19 14:36:42 -0800600
601
@rpc_utils.route_rpc_to_master
def get_all_stable_versions():
    """Look up the stable versions of every board.

    @return: A dictionary of board:version.
    """
    all_versions = stable_version_utils.get_all()
    return all_versions
609
610
@rpc_utils.route_rpc_to_master
def set_stable_version(version, board=stable_version_utils.DEFAULT):
    """Set the stable version of a board.

    @param version: The new value of stable version for given board.
    @param board: Name of the board, default to value `DEFAULT`.
    """
    # Delegates to stable_version_utils; nothing is returned to the caller.
    stable_version_utils.set(board=board, version=version)
619
620
@rpc_utils.route_rpc_to_master
def delete_stable_version(board):
    """Delete the stable version of a board.

    Removes the stable version entry in afe_stable_versions table for the
    given board, so default stable version will be used.

    @param board: Name of the board.
    """
    # Delegates to stable_version_utils; nothing is returned to the caller.
    stable_version_utils.delete(board=board)
Matthew Sartorid96fb9b2015-05-19 18:04:58 -0700631
632
def get_tests_by_build(build):
    """Describe every test shipped with the given build for the AFE.

    The build's 'test_suites' artifact is staged on a devserver, each control
    file it offers is parsed, and one dictionary per control file is built
    using the key names the AFE expects.

    @param build: unique name by which to refer to the image.

    @raises ValueError: If the devserver could not resolve the build.
    @raises error.StageControlFileFailure: If staging the test suites failed.

    @return: A list of all tests in the build, sorted by test name and passed
             through rpc_utils.prepare_for_serialization.
    """
    # Attributes of the parsed control data that are required by the AFE.
    afe_attributes = [
            'author', 'doc', 'name', 'time', 'test_type', 'experimental',
            'test_category', 'test_class', 'dependencies', 'run_verify',
            'sync_count', 'job_retries', 'retries', 'path']

    # Pick a devserver for the build and let it translate the build name.
    try:
        ds = dev_server.ImageServer.resolve(build)
        build = ds.translate(build)
    except dev_server.DevServerException as e:
        raise ValueError('Could not resolve build %s: %s' % (build, e))

    # Stage the test artifacts.
    try:
        ds.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                'Failed to stage %s: %s' % (build, e))

    # Collect the control files specified in this build.
    getter = control_file_getter.DevServerGetter.create(build, ds)

    tests = []
    for index, path in enumerate(getter.get_control_file_list()):
        # Read and parse the control file.
        contents = getter.get_control_file_contents(path)
        parsed = control_data.parse_control_string(contents)

        # Copy over the required attributes; absent ones become ''.
        test = {}
        for attribute in afe_attributes:
            if hasattr(parsed, attribute):
                test[attribute] = getattr(parsed, attribute)
            else:
                test[attribute] = ''

        # The AFE uses different key names for some values; provide them so
        # that tests are not omitted by the AFE. 'id' is an extra value used
        # by the AFE (the position of the control file in the list).
        # 'run_reset' is not referenced by the control_data parsing, but it
        # is also used in the AFE and defaults to True.
        test['id'] = index
        test['run_reset'] = True
        test['description'] = test.get('doc', '')
        test['test_time'] = test.get('time', 0)
        test['test_retry'] = test.get('retries', 0)

        # Present the name the way the AFE currently does: the name of the
        # directory holding the control file, followed by the dot-separated
        # parts of the file name after the first, all joined with ':'.
        directory, filename = os.path.split(path)
        name_parts = [os.path.basename(directory)] + filename.split('.')[1:]
        test['name'] = ':'.join(name_parts)

        # Correct the test path as parse_control_string sets an empty string.
        test['path'] = path

        tests.append(test)

    tests.sort(key=lambda test: test.get('name'))
    return rpc_utils.prepare_for_serialization(tests)