blob: 1ea6c4e0141783bcb198182e33b0797faab42045 [file] [log] [blame]
Chris Masone859fdec2012-01-30 08:38:09 -08001# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5__author__ = 'cmasone@chromium.org (Chris Masone)'
6
7import common
Chris Masonea8066a92012-05-01 16:52:31 -07008import datetime
Chris Masone859fdec2012-01-30 08:38:09 -08009import logging
Simran Basi71206ef2014-08-13 13:51:18 -070010import os
Aviv Keshetd83ef442013-01-16 16:19:35 -080011
Jakob Juelich82b7d1c2014-09-15 16:10:57 -070012from autotest_lib.frontend.afe import models
Matthew Sartorid96fb9b2015-05-19 18:04:58 -070013from autotest_lib.client.common_lib import control_data
Aviv Keshetd83ef442013-01-16 16:19:35 -080014from autotest_lib.client.common_lib import error
Simran Basi71206ef2014-08-13 13:51:18 -070015from autotest_lib.client.common_lib import global_config
Alex Miller7d658cf2013-09-04 16:00:35 -070016from autotest_lib.client.common_lib import priorities
Dan Shidfea3682014-08-10 23:38:40 -070017from autotest_lib.client.common_lib import time_utils
Chris Masone859fdec2012-01-30 08:38:09 -080018from autotest_lib.client.common_lib.cros import dev_server
Aviv Keshet14cac442016-11-20 21:44:11 -080019# TODO(akeshet): Replace with monarch.
Gabe Black1e1c41b2015-02-04 23:55:15 -080020from autotest_lib.client.common_lib.cros.graphite import autotest_stats
Jakob Juelich9fffe4f2014-08-14 18:07:05 -070021from autotest_lib.frontend.afe import rpc_utils
Simran Basib6ec8ae2014-04-23 12:05:08 -070022from autotest_lib.server import utils
Dan Shi36cfd832014-10-10 13:38:51 -070023from autotest_lib.server.cros import provision
Chris Masone44e4d6c2012-08-15 14:25:53 -070024from autotest_lib.server.cros.dynamic_suite import constants
Chris Masoneb4935552012-08-14 12:05:54 -070025from autotest_lib.server.cros.dynamic_suite import control_file_getter
Chris Masone44e4d6c2012-08-15 14:25:53 -070026from autotest_lib.server.cros.dynamic_suite import tools
xixuan0f7755d2016-04-18 14:49:12 -070027from autotest_lib.server.cros.dynamic_suite import suite as SuiteBase
Dan Shi36cfd832014-10-10 13:38:51 -070028from autotest_lib.server.cros.dynamic_suite.suite import Suite
Dan Shidfea3682014-08-10 23:38:40 -070029from autotest_lib.site_utils import host_history
Dan Shi193905e2014-07-25 23:33:09 -070030from autotest_lib.site_utils import job_history
Dan Shid7bb4f12015-01-06 10:53:50 -080031from autotest_lib.site_utils import server_manager_utils
Dan Shi6964fa52014-12-18 11:04:27 -080032from autotest_lib.site_utils import stable_version_utils
Simran Basi71206ef2014-08-13 13:51:18 -070033
34
35_CONFIG = global_config.global_config
Chris Masone859fdec2012-01-30 08:38:09 -080036
Chris Masonef8b53062012-05-08 22:14:18 -070037# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
Chris Masone859fdec2012-01-30 08:38:09 -080038
39
def canonicalize_suite_name(suite_name):
    """Build the canonical control file path for a suite.

    @param suite_name: the name of the suite, e.g. 'bvt'.

    @return: the control file path of the suite, e.g.
             'test_suites/control.bvt'.
    """
    # Do not change this naming convention without updating
    # site_utils.parse_job_name.
    canonical_name = 'test_suites/control.%s' % suite_name
    return canonical_name
48
49
def formatted_now():
    """Return the current (naive, local) time formatted with TIME_FMT.

    @return: datetime.datetime.now() rendered as a string using
             time_utils.TIME_FMT.
    """
    current_time = datetime.datetime.now()
    return current_time.strftime(time_utils.TIME_FMT)
Chris Masoneaa10f8e2012-05-15 13:34:21 -070053
54
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.
    @raises ControlFileMalformed if the control file contains non-ascii
                                 characters.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    # The stat is keyed by devserver name (dots replaced, since they separate
    # stat components) and by the last dot-separated piece of the suite name.
    server_name = ds.get_server_name(ds.url()).replace('.', '_')
    timer = autotest_stats.Timer('control_files.parse.%s.%s' %
                                 (server_name, suite_name.rsplit('.')[-1]))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                    suite_name)
    except error.CrosDynamicSuiteException as e:
        # Re-raise the same exception type with the build name appended.
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeError as e:
        # UnicodeError covers both UnicodeDecodeError (byte strings holding
        # non-ascii bytes) and UnicodeEncodeError (unicode strings holding
        # non-ascii characters).
        raise error.ControlFileMalformed(str(e))

    return control_file_in
93
94
def _stage_build_artifacts(build, hostname=None):
    """
    Stage the 'test_suites' artifact of |build| on a dev server.

    @param build: image we want to stage.
    @param hostname: hostname of a dut that may run the test. It is passed to
                     dev_server.resolve to help locate a devserver closer to
                     the duts if needed. Default is None.

    @raises StageControlFileFailure: if staging on the dev server raises
                                     dev_server.DevServerException.

    @return: a tuple (ds, timings), where ds is the resolved dev server
             instance to use with this build and timings is a dictionary
             holding the staging start/end times.
    """
    ds = dev_server.resolve(build, hostname=hostname)
    timings = {constants.DOWNLOAD_STARTED_TIME: formatted_now()}
    # Dots separate stat components, so flatten them in the server name.
    stat_name = 'control_files.stage.%s' % (
            ds.get_server_name(ds.url()).replace('.', '_'))
    stage_timer = autotest_stats.Timer(stat_name)
    try:
        with stage_timer:
            ds.stage_artifacts(image=build, artifacts=['test_suites'])
    except dev_server.DevServerException as staging_error:
        raise error.StageControlFileFailure(
                "Failed to stage %s: %s" % (build, staging_error))
    timings[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (ds, timings)
125
126
@rpc_utils.route_rpc_to_master
def create_suite_job(name='', board='', pool='', control_file='',
                     check_hosts=True, num=None, file_bugs=False, timeout=24,
                     timeout_mins=None, priority=priorities.Priority.DEFAULT,
                     suite_args=None, wait_for_results=True, job_retry=False,
                     max_retries=None, max_runtime_mins=None, suite_min_duts=0,
                     offload_failures_only=False, builds={},
                     test_source_build=None, run_prod_code=False,
                     delay_minutes=0, is_cloning=False, **kwargs):
    """
    Create a job to run a test suite on the given device with the given image.

    When the timeout specified in the control file is reached, the
    job is guaranteed to have completed and results will be available.

    @param name: The test name if control_file is supplied, otherwise the name
                 of the test suite to run, e.g. 'bvt'.
    @param board: the kind of device to run the tests on. If empty, it is
                  parsed from the CrOS build found in |builds|.
    @param builds: the builds to install e.g.
                   {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                    'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0',
                    'fwro-version:': 'x86-alex-firmware/R36-5771.49.0'}
    @param test_source_build: Build that contains the server-side test code.
    @param pool: Specify the pool of machines to use for scheduling
            purposes.
    @param control_file: the control file of the job.
    @param check_hosts: require appropriate live hosts to exist in the lab.
    @param num: Specify the number of machines to schedule across (integer).
                Leave unspecified or use None to use default sharding factor.
    @param file_bugs: File a bug on each test failure in this suite.
    @param timeout: The max lifetime of this suite, in hours.
    @param timeout_mins: The max lifetime of this suite, in minutes. Takes
                         priority over timeout.
    @param priority: Integer denoting priority. Higher is more important.
    @param suite_args: Optional arguments which will be parsed by the suite
                       control file. Used by control.test_that_wrapper to
                       determine which tests to run.
    @param wait_for_results: Set to False to run the suite job without waiting
                             for test jobs to finish. Default is True.
    @param job_retry: Set to True to enable job-level retry. Default is False.
    @param max_retries: Integer, maximum job retries allowed at suite level.
                        None for no max.
    @param max_runtime_mins: Maximum amount of time a job can be running in
                             minutes. Defaults to |timeout| hours.
    @param suite_min_duts: Integer. Scheduler will prioritize getting the
                           minimum number of machines for the suite when it is
                           competing with another suite that has a higher
                           priority but already got minimum machines it needs.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @param run_prod_code: If True, the suite will run the test code that
                          lives in prod aka the test code currently on the
                          lab servers. If False, the control files and test
                          code for this suite run will be retrieved from the
                          build artifacts.
    @param delay_minutes: Delay the creation of test jobs for a given number of
                          minutes.
    @param is_cloning: True if creating a cloning job.
    @param kwargs: extra keyword args. NOT USED.

    @raises SuiteArgumentException: if num is neither an integer nor None.
    @raises ControlFileNotFound: if a unique suite control file doesn't exist.
    @raises NoControlFileList: if we can't list the control files at all.
    @raises StageControlFileFailure: If the dev server throws 500 while
                                     staging test_suites.
    @raises ControlFileEmpty: if the control file exists on the server, but
                              can't be read.

    @return: the job ID of the suite, as returned by
             rpc_utils.create_job_common.
    """
    # The default value of |builds| is a single dict shared by every call;
    # work on a private copy so nothing downstream can modify that default.
    builds = dict(builds) if builds else {}

    if type(num) is not int and num is not None:
        raise error.SuiteArgumentException('Ill specified num argument %r. '
                                           'Must be an integer or None.' % num)
    if num == 0:
        logging.warning("Can't run on 0 hosts; using default.")
        num = None

    # Default test source build to CrOS build if it's not specified and
    # run_prod_code is set to False.
    if not run_prod_code:
        test_source_build = Suite.get_test_source_build(
                builds, test_source_build=test_source_build)

    # If 'prefer_local_devserver' is True in global setting, and both board
    # and pool are specified, pick a dut in the given board and pool, and
    # use that to help to pick a devserver in the same subnet of the duts
    # to be used to run tests.
    if dev_server.PREFER_LOCAL_DEVSERVER and pool and board:
        sample_dut = rpc_utils.get_sample_dut(board, pool)
    else:
        sample_dut = None

    suite_name = canonicalize_suite_name(name)
    if run_prod_code:
        # Use the control file from the test code installed on the lab
        # servers instead of the one shipped with the build.
        ds = dev_server.resolve(test_source_build, hostname=sample_dut)
        keyvals = {}
        getter = control_file_getter.FileSystemGetter(
                [_CONFIG.get_config_value('SCHEDULER',
                                          'drone_installation_directory')])
        control_file = getter.get_control_file_contents_by_name(suite_name)
    else:
        (ds, keyvals) = _stage_build_artifacts(
                test_source_build, hostname=sample_dut)
    keyvals[constants.SUITE_MIN_DUTS_KEY] = suite_min_duts

    if not control_file:
        # No control file was supplied so look it up from the build artifacts.
        control_file = _get_control_file_contents_by_name(test_source_build,
                                                          ds, suite_name)
    # Do not change this naming convention without updating
    # site_utils.parse_job_name.
    if not run_prod_code:
        name = '%s-%s' % (test_source_build, suite_name)
    else:
        # If run_prod_code is True, test_source_build is not set, use the
        # first build in the builds list for the suite job name.
        name = '%s-%s' % (list(builds.values())[0], suite_name)

    # |timeout| is in hours; both minute-based limits fall back to it.
    timeout_mins = timeout_mins or timeout * 60
    max_runtime_mins = max_runtime_mins or timeout * 60

    if not board:
        board = utils.ParseBuildName(builds[provision.CROS_VERSION_PREFIX])[0]

    # Prepend builds and board to the control file.
    inject_dict = {'board': board,
                   # `build` is needed for suites like AU to stage image inside
                   # suite control file.
                   'build': test_source_build,
                   'builds': builds,
                   'check_hosts': check_hosts,
                   'pool': pool,
                   'num': num,
                   'file_bugs': file_bugs,
                   'timeout': timeout,
                   'timeout_mins': timeout_mins,
                   'devserver_url': ds.url(),
                   'priority': priority,
                   'suite_args' : suite_args,
                   'wait_for_results': wait_for_results,
                   'job_retry': job_retry,
                   'max_retries': max_retries,
                   'max_runtime_mins': max_runtime_mins,
                   'offload_failures_only': offload_failures_only,
                   'test_source_build': test_source_build,
                   'run_prod_code': run_prod_code,
                   'delay_minutes': delay_minutes,
                   }

    if is_cloning:
        # A cloned control file already carries an injected variable block;
        # strip it so the fresh values below replace it.
        control_file = tools.remove_injection(control_file)
    control_file = tools.inject_vars(inject_dict, control_file)

    return rpc_utils.create_job_common(name,
                                       priority=priority,
                                       timeout_mins=timeout_mins,
                                       max_runtime_mins=max_runtime_mins,
                                       control_type='Server',
                                       control_file=control_file,
                                       hostless=True,
                                       keyvals=keyvals)
Simran Basi71206ef2014-08-13 13:51:18 -0700288
289
def get_job_history(**filter_data):
    """Get history of the job, including the special tasks executed for it.

    @param filter_data: filter for the call, should at least include
                        {'job_id': [job id]}
    @returns: the job's history prepared for serialization, including
              information such as the hosts that ran the job and the special
              tasks executed before and after the job.
    """
    history = job_history.get_job_info(filter_data['job_id']).get_history()
    return rpc_utils.prepare_for_serialization(history)
302
303
def get_host_history(start_time, end_time, hosts=None, board=None, pool=None):
    """Get history of a list of hosts.

    The result maps each host to its list of history entries, for example,
    {'172.22.33.51': [{'status': 'Resetting'
                       'start_time': '2014-08-07 10:02:16',
                       'end_time': '2014-08-07 10:03:16',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'Task: Special Task 19441991 (host ...)'},
                      {'status': 'Running'
                       'start_time': '2014-08-07 10:03:18',
                       'end_time': '2014-08-07 10:13:00',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'HQE: 15305005, for job: 14995562'}
                     ]
    }
    @param start_time: start time to search for history, can be string value or
                       epoch time.
    @param end_time: end time to search for history, can be string value or
                     epoch time.
    @param hosts: A list of hosts to search for history. Default is None.
    @param board: board type of hosts. Default is None.
    @param pool: pool type of hosts. Default is None.
    @returns: the host history, prepared for serialization.
    """
    history_details = host_history.get_history_details(
            start_time=start_time,
            end_time=end_time,
            hosts=hosts,
            board=board,
            pool=pool,
            process_pool_size=4)
    return rpc_utils.prepare_for_serialization(history_details)
Jakob Juelich59cfe542014-09-02 16:37:46 -0700334
335
def shard_heartbeat(shard_hostname, jobs=(), hqes=(), known_job_ids=(),
                    known_host_ids=(), known_host_statuses=()):
    """Receive updates for job statuses from shards and assign hosts and jobs.

    @param shard_hostname: Hostname of the calling shard
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated with
                 newer status from a shard. Note that for every hostqueueentry
                 the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.
    @param known_host_statuses: List of statuses of hosts the shard already has.
                                Must be as long as known_host_ids; the n-th
                                status belongs to the n-th host id.

    @raises error.RPCException: If known_host_ids and known_host_statuses
                                differ in length.

    @returns: Serialized representations of hosts, jobs, suite job keyvals
              and their dependencies to be inserted into a shard's database.
    """
    # The following alternatives to sending host and job ids in every heartbeat
    # have been considered:
    # 1. Sending the highest known job and host ids. This would work for jobs:
    #    Newer jobs always have larger ids. Also, if a job is not assigned to a
    #    particular shard during a heartbeat, it never will be assigned to this
    #    shard later.
    #    This is not true for hosts though: A host that is leased won't be sent
    #    to the shard now, but might be sent in a future heartbeat. This means
    #    sometimes hosts should be transferred that have a lower id than the
    #    maximum host id the shard knows.
    # 2. Send the number of jobs/hosts the shard knows to the master in each
    #    heartbeat. Compare these to the number of records that already have
    #    the shard_id set to this shard. In the normal case, they should match.
    #    In case they don't, resend all entities of that type.
    #    This would work well for hosts, because there aren't that many.
    #    Resending all jobs is quite a big overhead though.
    #    Also, this approach might run into edge cases when entities are
    #    ever deleted.
    # 3. Mixtures of the above: Use 1 for jobs and 2 for hosts.
    #    Using two different approaches isn't consistent and might cause
    #    confusion. Also the issues with the case of deletions might still
    #    occur.
    #
    # The overhead of sending all job and host ids in every heartbeat is low:
    # At peaks one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming that job id's have 6 digits and that json serialization takes a
    # comma and a space as overhead, the traffic per id sent is about 8 bytes.
    # If 5000 ids need to be sent, this means 40 kilobytes of traffic.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # solutions described above.
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer:
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        rpc_utils.persist_records_sent_from_shard(shard_obj, jobs, hqes)

        # Validate explicitly instead of with `assert`, which is skipped when
        # Python runs with optimizations enabled.
        if len(known_host_ids) != len(known_host_statuses):
            raise error.RPCException(
                    'known_host_ids and known_host_statuses must have the '
                    'same length (got %d and %d).' %
                    (len(known_host_ids), len(known_host_statuses)))

        # Bring the master's host statuses in line with what the shard reports.
        for host_id, host_status in zip(known_host_ids, known_host_statuses):
            host_model = models.Host.objects.get(pk=host_id)
            if host_model.status != host_status:
                host_model.status = host_status
                host_model.save()

        hosts, jobs, suite_keyvals = rpc_utils.find_records_for_shard(
                shard_obj, known_job_ids=known_job_ids,
                known_host_ids=known_host_ids)
        return {
            'hosts': [host.serialize() for host in hosts],
            'jobs': [job.serialize() for job in jobs],
            'suite_keyvals': [kv.serialize() for kv in suite_keyvals],
        }
Jakob Juelich82b7d1c2014-09-15 16:10:57 -0700404
405
def get_shards(**filter_data):
    """Return the shards matching the given filters, with their label names.

    @param filter_data: filters passed to models.Shard.query_objects.

    @returns A sequence of nested dictionaries of shard information; each
             one gets a 'labels' entry listing the names of the shard's labels.
    """
    shard_rows = models.Shard.query_objects(filter_data)
    shard_dicts = rpc_utils.prepare_rows_as_nested_dicts(shard_rows, ())
    for shard_dict, shard_row in zip(shard_dicts, shard_rows):
        label_names = []
        for label in shard_row.labels.all():
            label_names.append(label.name)
        shard_dict['labels'] = label_names

    return shard_dicts
417
418
def _assign_board_to_shard_precheck(labels):
    """Verify whether board labels are valid to be added to a given shard.

    First check whether board label is in correct format. Second, check whether
    the board label exist. Third, check whether the board has already been
    assigned to shard.

    @param labels: Board labels separated by comma.

    @raises error.RPCException: If label provided doesn't start with `board:`
            or board has been added to shard already.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @returns: A list of label models that ready to be added to shard.
    """
    label_models = []
    for label_name in labels.split(','):
        # Check whether the board label is in correct format.
        if not label_name.startswith('board:'):
            raise error.RPCException('Sharding only supports `board:.*` label.')
        # Check whether the board label exist. If not, exception will be thrown
        # by smart_get function.
        label = models.Label.smart_get(label_name)
        # Check whether the board has been sharded already
        try:
            shard = models.Shard.objects.get(labels=label)
        except models.Shard.DoesNotExist:
            # board is not on any shard, so it's valid.
            label_models.append(label)
        else:
            raise error.RPCException(
                    '%s is already on shard %s' % (label, shard.hostname))
    return label_models
Jakob Juelich8b110ee2014-09-15 16:13:42 -0700453
Shuqian Zhao68139fb2016-08-03 16:29:56 -0700454
def add_shard(hostname, labels):
    """Create a shard and assign the given board labels to it.

    The labels are validated before the shard is created.

    @param hostname: The hostname of the shard to be added; needs to be unique.
    @param labels: Board labels separated by comma. Jobs of one of the labels
                   will be assigned to the shard.

    @raises error.RPCException: If label provided doesn't start with `board:` or
            board has been added to shard already.
    @raises model_logic.ValidationError: If a shard with the given hostname
            already exist.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @returns: The id of the added shard.
    """
    board_labels = _assign_board_to_shard_precheck(labels)
    new_shard = models.Shard.add_object(hostname=hostname)
    for board_label in board_labels:
        new_shard.labels.add(board_label)
    return new_shard.id
475
476
def add_board_to_shard(hostname, labels):
    """Assign additional board labels to an existing shard.

    The labels are validated before the shard is looked up.

    @param hostname: The hostname of the shard to be changed.
    @param labels: Board labels separated by comma.

    @raises error.RPCException: If label provided doesn't start with `board:` or
            board has been added to shard already.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @returns: The id of the changed shard.
    """
    board_labels = _assign_board_to_shard_precheck(labels)
    target_shard = models.Shard.objects.get(hostname=hostname)
    for board_label in board_labels:
        target_shard.labels.add(board_label)
    return target_shard.id
494
495
def delete_shard(hostname):
    """Delete a shard and reclaim all resources from it.

    This claims back all assigned hosts from the shard. To ensure all DUTs are
    in a sane state, a Reboot task with highest priority is scheduled for them.
    This reboots the DUTs and then all left tasks continue to run in drone of
    the master.

    The procedure for deleting a shard:
        * Lock all unlocked hosts on that shard.
        * Remove shard information.
        * Assign a reboot task with highest priority to these hosts.
        * Unlock these hosts, then, the reboot tasks run in front of all other
        tasks.

    The status of jobs that haven't been reported to be finished yet, will be
    lost. The master scheduler will pick up the jobs and execute them.

    @param hostname: Hostname of the shard to delete.
    """
    shard = rpc_utils.retrieve_shard(shard_hostname=hostname)
    hostnames_to_lock = [h.hostname for h in
                         models.Host.objects.filter(shard=shard, locked=False)]

    # Load the reboot control file before changing anything, and only when
    # there are hosts to reboot: if the test or its control file is missing,
    # the call then fails without leaving hosts locked or the shard
    # half-deleted.
    reboot_control_file = None
    if hostnames_to_lock:
        reboot_test = models.Test.objects.get(
                name='platform_BootPerfServer:shard')
        reboot_control_file = utils.read_file(
                os.path.join(common.autotest_dir, reboot_test.path))

    # TODO(beeps): Power off shard
    # For ChromeOS hosts, a reboot test with the highest priority is added to
    # the DUT. After a reboot it should be guaranteed that no processes from
    # prior tests that were run by a shard are still running on.

    # Lock all unlocked hosts.
    dicts = {'locked': True, 'lock_time': datetime.datetime.now()}
    models.Host.objects.filter(hostname__in=hostnames_to_lock).update(**dicts)

    # Remove shard information.
    models.Host.objects.filter(shard=shard).update(shard=None)
    models.Job.objects.filter(shard=shard).update(shard=None)
    shard.labels.clear()
    shard.delete()

    # Assign a reboot task with highest priority: Super.
    if hostnames_to_lock:
        rpc_utils.create_job_common(
                'reboot_dut_for_shard_deletion',
                priority=priorities.Priority.SUPER,
                control_type='Server',
                control_file=reboot_control_file, hosts=hostnames_to_lock)

    # Unlock these shard-related hosts.
    dicts = {'locked': False, 'lock_time': None}
    models.Host.objects.filter(hostname__in=hostnames_to_lock).update(**dicts)
548
Dan Shi6964fa52014-12-18 11:04:27 -0800549
def get_servers(hostname=None, role=None, status=None):
    """Look up servers in the server database and return their details.

    @param hostname: FQDN of the server. Defaults to None.
    @param role: Name of the server role, e.g., drone, scheduler. Default to
                 None to match any role.
    @param status: Status of the server, e.g., primary, backup,
                   repair_required. Default to None to match any server
                   status.

    @raises error.RPCException: If server database is not used.
    @return: A list with the result of get_details() for every server that
             matches the given hostname, role and status.
    """
    if server_manager_utils.use_server_db():
        matching_servers = server_manager_utils.get_servers(
                hostname=hostname, role=role, status=status)
        return [server.get_details() for server in matching_servers]

    # Without the server database there is nothing to query.
    raise error.RPCException('Server database is not enabled. Please try '
                             'retrieve servers from global config.')
568
569
@rpc_utils.route_rpc_to_master
def get_stable_version(board=stable_version_utils.DEFAULT, android=False):
    """Look up the stable version configured for a board.

    @param board: Name of the board. Defaults to
                  stable_version_utils.DEFAULT.
    @param android: True if the given board is an Android-based device,
                    False to treat it as a Chrome OS-based device.

    @return: Stable version of the given board. Return global configure value
             of CROS.stable_cros_version if stable_versions table does not
             have entry of board DEFAULT.
    """
    stable_version = stable_version_utils.get(board=board, android=android)
    return stable_version
Dan Shi25e1fd42014-12-19 14:36:42 -0800583
584
@rpc_utils.route_rpc_to_master
def get_all_stable_versions():
    """Fetch the stable versions of every board.

    @return: A dictionary of board:version, as returned by
             stable_version_utils.get_all().
    """
    all_versions = stable_version_utils.get_all()
    return all_versions
592
593
@rpc_utils.route_rpc_to_master
def set_stable_version(version, board=stable_version_utils.DEFAULT):
    """Set the stable version of a board.

    @param version: The new value of stable version for given board.
    @param board: Name of the board, default to value `DEFAULT`.
    """
    stable_version_utils.set(board=board, version=version)
602
603
@rpc_utils.route_rpc_to_master
def delete_stable_version(board):
    """Delete the stable version of a board.

    Removes the stable version entry in afe_stable_versions table for the
    given board, so default stable version will be used.

    @param board: Name of the board.
    """
    stable_version_utils.delete(board=board)
Matthew Sartorid96fb9b2015-05-19 18:04:58 -0700614
615
def _initialize_control_file_getter(build):
    """Stage the test suites of a build and create a getter for them.

    @param build: unique name by which to refer to a remote build image.

    @raises ValueError: If the devserver cannot be resolved for the build or
                        the build name cannot be translated.
    @raises error.StageControlFileFailure: If staging the 'test_suites'
                                           artifact fails.
    @return: A control file getter object.
    """
    # Find a devserver for the build and let it translate the build name.
    try:
        devserver = dev_server.ImageServer.resolve(build)
        build = devserver.translate(build)
    except dev_server.DevServerException as err:
        resolve_failure = 'Could not resolve build %s: %s' % (build, err)
        raise ValueError(resolve_failure)

    # Stage the test artifacts.
    try:
        devserver.stage_artifacts(image=build, artifacts=['test_suites'])
    except dev_server.DevServerException as err:
        stage_failure = 'Failed to stage %s: %s' % (build, err)
        raise error.StageControlFileFailure(stage_failure)

    # The getter collects the control files specified in this build.
    getter = control_file_getter.DevServerGetter.create(build, devserver)
    return getter
638
639
def get_tests_by_build(build, ignore_invalid_tests=True):
    """Get the tests that are available for the specified build.

    Every control file of the build is parsed and turned into a dictionary
    carrying the fields the AFE expects.

    @param build: unique name by which to refer to the image.
    @param ignore_invalid_tests: If True, control files that fail to parse
                                 are logged and left out of the result. If
                                 False, the parse error is re-raised.

    @return: A list of test dictionaries sorted by test name, passed through
             rpc_utils.prepare_for_serialization.
    """
    # Collect the control files specified in this build
    cfile_getter = _initialize_control_file_getter(build)
    controls_in_batch = SuiteBase.ENABLE_CONTROLS_IN_BATCH
    if controls_in_batch:
        control_file_info_list = cfile_getter.get_suite_info()
        control_file_list = control_file_info_list.keys()
    else:
        control_file_list = cfile_getter.get_control_file_list()

    # The keys list represents attributes in the control_obj that
    # are required by the AFE
    keys = ['author', 'doc', 'name', 'time', 'test_type', 'experimental',
            'test_category', 'test_class', 'dependencies', 'run_verify',
            'sync_count', 'job_retries', 'retries', 'path']

    test_objects = []
    for control_file_path in control_file_list:
        # Read and parse the control file
        if controls_in_batch:
            control_file = control_file_info_list[control_file_path]
        else:
            control_file = cfile_getter.get_control_file_contents(
                    control_file_path)
        try:
            control_obj = control_data.parse_control_string(control_file)
        except Exception:
            logging.info('Failed to parse control file: %s', control_file_path)
            if not ignore_invalid_tests:
                raise
            # Skip the unparsable file. Falling through would read a
            # control_obj that is either unbound or left over from the
            # previous control file.
            continue

        # Extract the values needed for the AFE from the control_obj,
        # using an empty string for attributes the control file lacks.
        test_object = {}
        for key in keys:
            test_object[key] = getattr(control_obj, key, '')

        # Unfortunately, the AFE expects different key-names for certain
        # values, these must be corrected to avoid the risk of tests
        # being omitted by the AFE.
        # The 'id' is an additional value used in the AFE; it counts the
        # tests collected so far, so skipped files leave no gaps.
        # The control_data parsing does not reference 'run_reset', but it
        # is also used in the AFE and defaults to True.
        test_object['id'] = len(test_objects)
        test_object['run_reset'] = True
        test_object['description'] = test_object.get('doc', '')
        test_object['test_time'] = test_object.get('time', 0)
        test_object['test_retry'] = test_object.get('retries', 0)

        # Fix the test name to be consistent with the current presentation
        # of test names in the AFE: <test directory>[:<control file suffix>]
        testpath, subname = os.path.split(control_file_path)
        testname = os.path.basename(testpath)
        subname = subname.split('.')[1:]
        if subname:
            testname = '%s:%s' % (testname, ':'.join(subname))

        test_object['name'] = testname

        # Correct the test path as parse_control_string sets an empty string.
        test_object['path'] = control_file_path

        test_objects.append(test_object)

    test_objects = sorted(test_objects, key=lambda x: x.get('name'))
    return rpc_utils.prepare_for_serialization(test_objects)
Michael Tang84a2ecf2016-06-07 15:10:53 -0700714
715
def get_test_control_files_by_build(tests, build, ignore_invalid_tests=False):
    """Fetch the control file contents of the given tests from a build.

    @param tests: A sequence of test objects to run; the `path` attribute of
                  each test selects its control file.
    @param build: unique name by which to refer to the image.
    @param ignore_invalid_tests: flag on if unparsable tests are ignored.
                                 It is not used by this function.

    @return: A list of control file contents, one per test, in the order of
             `tests`. An empty list if no tests are given.
    """
    # shortcut to avoid staging the image.
    if not tests:
        return []

    getter = _initialize_control_file_getter(build)
    if SuiteBase.ENABLE_CONTROLS_IN_BATCH:
        # All control files were fetched in one batch; pick them by path.
        suite_info = getter.get_suite_info()
        return [suite_info[test.path] for test in tests]

    # Otherwise read the control files one by one.
    return [getter.get_control_file_contents(test.path) for test in tests]