blob: 0cd373923ded913a33d6f3f907a7c9a1e106711b [file] [log] [blame]
Dan Shi4df39252013-03-19 13:19:45 -07001# pylint: disable-msg=C0111
2
Chris Masone859fdec2012-01-30 08:38:09 -08003# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7__author__ = 'cmasone@chromium.org (Chris Masone)'
8
9import common
Simran Basi773a86e2015-05-13 19:15:42 -070010import ConfigParser
Chris Masonea8066a92012-05-01 16:52:31 -070011import datetime
Chris Masone859fdec2012-01-30 08:38:09 -080012import logging
Simran Basi71206ef2014-08-13 13:51:18 -070013import os
14import shutil
Aviv Keshetd83ef442013-01-16 16:19:35 -080015
Jakob Juelich82b7d1c2014-09-15 16:10:57 -070016from autotest_lib.frontend.afe import models
Matthew Sartorid96fb9b2015-05-19 18:04:58 -070017from autotest_lib.client.common_lib import control_data
Aviv Keshetd83ef442013-01-16 16:19:35 -080018from autotest_lib.client.common_lib import error
Simran Basi71206ef2014-08-13 13:51:18 -070019from autotest_lib.client.common_lib import global_config
Alex Miller7d658cf2013-09-04 16:00:35 -070020from autotest_lib.client.common_lib import priorities
Dan Shidfea3682014-08-10 23:38:40 -070021from autotest_lib.client.common_lib import time_utils
Chris Masone859fdec2012-01-30 08:38:09 -080022from autotest_lib.client.common_lib.cros import dev_server
Gabe Black1e1c41b2015-02-04 23:55:15 -080023from autotest_lib.client.common_lib.cros.graphite import autotest_stats
Jakob Juelich9fffe4f2014-08-14 18:07:05 -070024from autotest_lib.frontend.afe import rpc_utils
Simran Basib6ec8ae2014-04-23 12:05:08 -070025from autotest_lib.server import utils
Chris Masone44e4d6c2012-08-15 14:25:53 -070026from autotest_lib.server.cros.dynamic_suite import constants
Chris Masoneb4935552012-08-14 12:05:54 -070027from autotest_lib.server.cros.dynamic_suite import control_file_getter
Chris Masone44e4d6c2012-08-15 14:25:53 -070028from autotest_lib.server.cros.dynamic_suite import tools
Simran Basi71206ef2014-08-13 13:51:18 -070029from autotest_lib.server.hosts import moblab_host
Dan Shidfea3682014-08-10 23:38:40 -070030from autotest_lib.site_utils import host_history
Dan Shi193905e2014-07-25 23:33:09 -070031from autotest_lib.site_utils import job_history
Dan Shid7bb4f12015-01-06 10:53:50 -080032from autotest_lib.site_utils import server_manager_utils
Dan Shi6964fa52014-12-18 11:04:27 -080033from autotest_lib.site_utils import stable_version_utils
Simran Basi71206ef2014-08-13 13:51:18 -070034
35
# Module-level alias for the global_config.global_config object; the Moblab
# config RPCs below read sections, values and the shadow file path through it.
_CONFIG = global_config.global_config
# Presumably the path of the boto credentials file on a Moblab device.
# NOTE(review): nothing visible in this file reads this constant; set_boto_key
# uses moblab_host.MOBLAB_BOTO_LOCATION instead -- confirm it is still needed.
MOBLAB_BOTO_LOCATION = '/home/moblab/.boto'
Chris Masone859fdec2012-01-30 08:38:09 -080038
Chris Masonef8b53062012-05-08 22:14:18 -070039# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
Chris Masone859fdec2012-01-30 08:38:09 -080040
41
def canonicalize_suite_name(suite_name):
    """Build the control file path used to look up a suite.

    @param suite_name: short name of the suite, e.g. 'bvt'.

    @return: |suite_name| substituted into 'test_suites/control.%s',
             e.g. 'test_suites/control.bvt'.
    """
    path_template = 'test_suites/control.%s'
    return path_template % suite_name
44
45
def formatted_now():
    """Return the current time as a formatted string.

    @return: datetime.datetime.now() rendered with time_utils.TIME_FMT.
    """
    current_time = datetime.datetime.now()
    return current_time.strftime(time_utils.TIME_FMT)
Chris Masoneaa10f8e2012-05-15 13:34:21 -070048
49
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.
    @raises ControlFileMalformed if the control file contains non-ascii
                                 characters.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    # The stats key is built from the dev server name (dots replaced with
    # underscores) and the last dot-separated piece of the suite name.
    server_key = ds.get_server_name(ds.url()).replace('.', '_')
    timer = autotest_stats.Timer('control_files.parse.%s.%s' %
                                 (server_key, suite_name.rsplit('.')[-1]))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                    suite_name)
    except error.CrosDynamicSuiteException as e:
        # Re-raise the same exception type with the build added for context.
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    # A byte string with non-ascii data fails with UnicodeDecodeError (Python 2
    # decodes it implicitly before encoding), while a unicode string fails with
    # UnicodeEncodeError. Both derive from UnicodeError, so catch the base
    # class to report either case as a malformed control file.
    try:
        control_file_in.encode('ascii')
    except UnicodeError as e:
        raise error.ControlFileMalformed(str(e))

    return control_file_in
88
89
def _stage_build_artifacts(build):
    """
    Stage the 'test_suites' artifacts of |build| on a dev server.

    @param build: image we want to stage.

    @raises StageControlFileFailure: if the dev server raises a
                                     DevServerException while staging.

    @return: a (ds, timings) tuple. ds is the dev_server.ImageServer resolved
             for |build|; timings is a dictionary holding the staging start
             and end times as formatted strings.
    """
    image_server = dev_server.ImageServer.resolve(build)
    staging_times = {constants.DOWNLOAD_STARTED_TIME: formatted_now()}
    # Dots in the server name are replaced with underscores for the stats key.
    server_key = image_server.get_server_name(
            image_server.url()).replace('.', '_')
    stage_timer = autotest_stats.Timer('control_files.stage.%s' % server_key)
    try:
        with stage_timer:
            image_server.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                "Failed to stage %s: %s" % (build, e))
    staging_times[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (image_server, staging_times)
118
119
def create_suite_job(name='', board='', build='', pool='', control_file='',
                     check_hosts=True, num=None, file_bugs=False, timeout=24,
                     timeout_mins=None, priority=priorities.Priority.DEFAULT,
                     suite_args=None, wait_for_results=True, job_retry=False,
                     max_retries=None, max_runtime_mins=None, suite_min_duts=0,
                     offload_failures_only=False, **kwargs):
    """
    Create a hostless server job that runs a test suite against an image.

    The 'test_suites' artifacts of |build| are staged first. Unless a control
    file is supplied, the suite control file is fetched from the staged build.
    The arguments are then injected into the control file and the job is
    created through rpc_utils.create_job_common.

    @param name: The test name if control_file is supplied, otherwise the name
                 of the test suite to run, e.g. 'bvt'.
    @param board: the kind of device to run the tests on. When empty it is
                  derived from |build| with utils.ParseBuildName.
    @param build: unique name by which to refer to the image from now on.
    @param pool: Specify the pool of machines to use for scheduling
                 purposes.
    @param control_file: Control file contents to run. When empty, the suite
                         control file for |name| is looked up in the build
                         artifacts.
    @param check_hosts: require appropriate live hosts to exist in the lab.
    @param num: Specify the number of machines to schedule across (integer).
                Leave unspecified or use None to use default sharding factor.
                0 is treated as None.
    @param file_bugs: File a bug on each test failure in this suite.
    @param timeout: The max lifetime of this suite, in hours.
    @param timeout_mins: The max lifetime of this suite, in minutes. Takes
                         priority over timeout.
    @param priority: Integer denoting priority. Higher is more important.
    @param suite_args: Optional arguments which will be parsed by the suite
                       control file. Used by control.test_that_wrapper to
                       determine which tests to run.
    @param wait_for_results: Set to False to run the suite job without waiting
                             for test jobs to finish. Default is True.
    @param job_retry: Set to True to enable job-level retry. Default is False.
    @param max_retries: Integer, maximum job retries allowed at suite level.
                        None for no max.
    @param max_runtime_mins: Maximum amount of time a job can be running in
                             minutes. Falls back to timeout * 60 when unset.
    @param suite_min_duts: Integer. Scheduler will prioritize getting the
                           minimum number of machines for the suite when it is
                           competing with another suite that has a higher
                           priority but already got minimum machines it needs.
    @param offload_failures_only: Only enable gs_offloading for failed jobs.
    @param kwargs: extra keyword args. NOT USED.

    @raises SuiteArgumentException: if |num| is neither an int nor None.
    @raises ControlFileNotFound: if a unique suite control file doesn't exist.
    @raises NoControlFileList: if we can't list the control files at all.
    @raises StageControlFileFailure: If the dev server fails while staging
                                     test_suites.
    @raises ControlFileEmpty: if the control file exists on the server, but
                              can't be read.

    @return: the value returned by rpc_utils.create_job_common for the new
             job (described by the original documentation as the job ID of
             the suite).
    """
    num_is_acceptable = num is None or type(num) is int
    if not num_is_acceptable:
        raise error.SuiteArgumentException('Ill specified num argument %r. '
                                           'Must be an integer or None.' % num)
    if num == 0:
        logging.warning("Can't run on 0 hosts; using default.")
        num = None

    # The staging timings double as the initial keyvals of the suite job.
    (ds, keyvals) = _stage_build_artifacts(build)
    keyvals[constants.SUITE_MIN_DUTS_KEY] = suite_min_duts

    if not control_file:
        # No control file was supplied so look it up from the build artifacts.
        suite_name = canonicalize_suite_name(name)
        control_file = _get_control_file_contents_by_name(build, ds, suite_name)
        name = '%s-%s' % (build, suite_name)

    # Explicit minute values win; otherwise convert |timeout| from hours.
    timeout_mins = timeout_mins or timeout * 60
    max_runtime_mins = max_runtime_mins or timeout * 60

    if not board:
        board = utils.ParseBuildName(build)[0]

    # Values made available to the control file by prepending them to it.
    injected_vars = dict(
            board=board,
            build=build,
            check_hosts=check_hosts,
            pool=pool,
            num=num,
            file_bugs=file_bugs,
            timeout=timeout,
            timeout_mins=timeout_mins,
            devserver_url=ds.url(),
            priority=priority,
            suite_args=suite_args,
            wait_for_results=wait_for_results,
            job_retry=job_retry,
            max_retries=max_retries,
            max_runtime_mins=max_runtime_mins,
            offload_failures_only=offload_failures_only)
    control_file = tools.inject_vars(injected_vars, control_file)

    job_options = dict(
            priority=priority,
            timeout_mins=timeout_mins,
            max_runtime_mins=max_runtime_mins,
            control_type='Server',
            control_file=control_file,
            hostless=True,
            keyvals=keyvals)
    return rpc_utils.create_job_common(name, **job_options)
Simran Basi71206ef2014-08-13 13:51:18 -0700222
223
224# TODO: hide the following rpcs under is_moblab
def moblab_only(func):
    """Ensure moblab specific functions only run on Moblab devices.

    @param func: the function to guard.

    @return: a wrapper that raises error.RPCException when utils.is_moblab()
             is false, and otherwise calls |func| with the same arguments and
             returns its result. The wrapper keeps the name and docstring of
             |func|.
    """
    # Imported locally so this block does not depend on the file-level imports.
    import functools

    @functools.wraps(func)
    def verify(*args, **kwargs):
        if not utils.is_moblab():
            # Interpolate the name into the message; passing it as a second
            # exception argument would leave the '%s' placeholder unformatted.
            raise error.RPCException(
                    'RPC: %s can only run on Moblab Systems!' % func.__name__)
        return func(*args, **kwargs)
    return verify
233
234
@moblab_only
def get_config_values():
    """Return every config value known to _CONFIG, grouped by section.

    @return: a mapping of section name to the (name, value) items of that
             section, passed through rpc_utils.prepare_for_serialization.
    """
    items_by_section = dict(
            (section_name, _CONFIG.config.items(section_name))
            for section_name in _CONFIG.get_sections())
    return rpc_utils.prepare_for_serialization(items_by_section)
Simran Basi71206ef2014-08-13 13:51:18 -0700247
248
@moblab_only
def update_config_handler(config_values):
    """
    Overwrite the shadow config with the supplied overrides, then reboot.

    Only values that differ from a config loaded with an empty shadow file
    are written to the shadow config.

    @param config_values: mapping of section name to a list of (key, value)
                          pairs; presumably the shape that
                          get_config_values() returns.

    @raises error.RPCException: if no shadow config file is set or it does
                                not exist on disk.
    """
    # Baseline config, loaded with shadow_file='' to compare incoming values
    # against.
    baseline = global_config.global_config_class()
    baseline.set_config_files(shadow_file='')
    shadow_parser = ConfigParser.RawConfigParser()
    for section, pairs in config_values.iteritems():
        for key, value in pairs:
            baseline_value = baseline.get_config_value(
                    section, key, default='', allow_blank=True)
            if baseline_value == value:
                # Unchanged values do not need an override.
                continue
            if not shadow_parser.has_section(section):
                shadow_parser.add_section(section)
            shadow_parser.set(section, key, value)

    shadow_file_usable = (_CONFIG.shadow_file and
                          os.path.exists(_CONFIG.shadow_file))
    if not shadow_file_usable:
        raise error.RPCException('Shadow config file does not exist.')

    with open(_CONFIG.shadow_file, 'w') as shadow_out:
        shadow_parser.write(shadow_out)
    # TODO (sbasi) crbug.com/403916 - Remove the reboot command and
    # instead restart the services that rely on the config values.
    os.system('sudo reboot')
275
276
@moblab_only
def reset_config_settings():
    """Empty the shadow config file and reboot the machine."""
    # Opening the file for writing truncates it; nothing is written back.
    with open(_CONFIG.shadow_file, 'w'):
        pass
    os.system('sudo reboot')
282
283
@moblab_only
def set_boto_key(boto_key):
    """Copy an uploaded boto key file to moblab_host.MOBLAB_BOTO_LOCATION.

    @param boto_key: File name of boto_key uploaded through handle_file_upload.

    @raises error.RPCException: if |boto_key| does not exist.
    """
    if os.path.exists(boto_key):
        shutil.copyfile(boto_key, moblab_host.MOBLAB_BOTO_LOCATION)
        return
    raise error.RPCException('Boto key: %s does not exist!' % boto_key)
Dan Shi193905e2014-07-25 23:33:09 -0700293
294
def get_job_history(**filter_data):
    """Get the history of a job.

    @param filter_data: filter for the call; must include the key 'job_id'
                        (a KeyError is raised otherwise).

    @returns: the history reported by job_history.get_job_info(job_id),
              passed through rpc_utils.prepare_for_serialization. Per the
              original documentation this covers the hosts that ran the job
              and the special tasks executed before and after it.
    """
    info = job_history.get_job_info(filter_data['job_id'])
    history = info.get_history()
    return rpc_utils.prepare_for_serialization(history)
307
308
def get_host_history(start_time, end_time, hosts=None, board=None, pool=None):
    """Get history of a list of hosts.

    The result holds the history of each host, for example,
    {'172.22.33.51': [{'status': 'Resetting',
                       'start_time': '2014-08-07 10:02:16',
                       'end_time': '2014-08-07 10:03:16',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'Task: Special Task 19441991 (host ...)'},
                      {'status': 'Running',
                       'start_time': '2014-08-07 10:03:18',
                       'end_time': '2014-08-07 10:13:00',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'HQE: 15305005, for job: 14995562'}
                     ]
    }

    @param start_time: start time to search for history, can be string value or
                       epoch time.
    @param end_time: end time to search for history, can be string value or
                     epoch time.
    @param hosts: A list of hosts to search for history. Default is None.
    @param board: board type of hosts. Default is None.
    @param pool: pool type of hosts. Default is None.
    @returns: the result of host_history.get_history_details, passed through
              rpc_utils.prepare_for_serialization.
    """
    history_details = host_history.get_history_details(
            start_time=start_time,
            end_time=end_time,
            hosts=hosts,
            board=board,
            pool=pool,
            process_pool_size=4)
    return rpc_utils.prepare_for_serialization(history_details)
Jakob Juelich59cfe542014-09-02 16:37:46 -0700339
340
def shard_heartbeat(shard_hostname, jobs=(), hqes=(),
                    known_job_ids=(), known_host_ids=()):
    """Receive updates for job statuses from shards and assign hosts and jobs.

    The records uploaded by the shard are persisted first; then the records
    the shard does not know yet are looked up and returned in serialized form.

    @param shard_hostname: Hostname of the calling shard
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated with
                 newer status from a shard. Note that for every hostqueueentry
                 the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.

    @returns: Dict with the keys 'hosts', 'jobs' and 'suite_keyvals', each
              holding the serialized records to be inserted into the shard's
              database.
    """
    # The following alternatives to sending host and job ids in every heartbeat
    # have been considered:
    # 1. Sending the highest known job and host ids. This would work for jobs:
    #    Newer jobs always have larger ids. Also, if a job is not assigned to a
    #    particular shard during a heartbeat, it never will be assigned to this
    #    shard later.
    #    This is not true for hosts though: A host that is leased won't be sent
    #    to the shard now, but might be sent in a future heartbeat. This means
    #    sometimes hosts should be transferred that have a lower id than the
    #    maximum host id the shard knows.
    # 2. Send the number of jobs/hosts the shard knows to the master in each
    #    heartbeat. Compare these to the number of records that already have
    #    the shard_id set to this shard. In the normal case, they should match.
    #    In case they don't, resend all entities of that type.
    #    This would work well for hosts, because there aren't that many.
    #    Resending all jobs is quite a big overhead though.
    #    Also, this approach might run into edge cases when entities are
    #    ever deleted.
    # 3. Mixtures of the above: Use 1 for jobs and 2 for hosts.
    #    Using two different approaches isn't consistent and might cause
    #    confusion. Also the issues with the case of deletions might still
    #    occur.
    #
    # The overhead of sending all job and host ids in every heartbeat is low:
    # At peaks one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming that job id's have 6 digits and that json serialization takes a
    # comma and a space as overhead, the traffic per id sent is about 8 bytes.
    # If 5000 ids need to be sent, this means 40 kilobytes of traffic.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # solutions described above.
    heartbeat_timer = autotest_stats.Timer('shard_heartbeat')
    with heartbeat_timer:
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        # Store what the shard reported before deciding what to send back.
        rpc_utils.persist_records_sent_from_shard(shard_obj, jobs, hqes)
        new_hosts, new_jobs, new_keyvals = rpc_utils.find_records_for_shard(
                shard_obj,
                known_job_ids=known_job_ids,
                known_host_ids=known_host_ids)
        serialized_hosts = [host.serialize() for host in new_hosts]
        serialized_jobs = [job.serialize() for job in new_jobs]
        serialized_keyvals = [kv.serialize() for kv in new_keyvals]
        return {
            'hosts': serialized_hosts,
            'jobs': serialized_jobs,
            'suite_keyvals': serialized_keyvals,
        }
Jakob Juelich82b7d1c2014-09-15 16:10:57 -0700401
402
def get_shards(**filter_data):
    """Return the shards matching |filter_data|.

    @param filter_data: filters handed to models.Shard.query_objects.

    @returns A sequence of nested dictionaries of shard information; each
             dictionary gets a 'labels' entry listing the names of the
             shard's labels.
    """
    shard_rows = models.Shard.query_objects(filter_data)
    shard_dicts = rpc_utils.prepare_rows_as_nested_dicts(shard_rows, ())
    for shard_dict, shard_row in zip(shard_dicts, shard_rows):
        label_names = [label.name for label in shard_row.labels.all()]
        shard_dict['labels'] = label_names

    return shard_dicts
414
415
def add_shard(hostname, label):
    """Add a shard and start running jobs on it.

    @param hostname: The hostname of the shard to be added; needs to be unique.
    @param label: A platform label. Jobs of this label will be assigned to the
                  shard.

    @raises error.RPCException: If label provided doesn't start with `board:`
    @raises model_logic.ValidationError: If a shard with the given hostname
            already exists.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @returns: The id of the newly created shard.
    """
    is_board_label = label.startswith('board:')
    if not is_board_label:
        raise error.RPCException('Sharding only supported for `board:.*` '
                                 'labels.')

    # Fetch label first, so shard isn't created when label doesn't exist.
    label_obj = models.Label.smart_get(label)
    new_shard = models.Shard.add_object(hostname=hostname)
    new_shard.labels.add(label_obj)
    return new_shard.id
437
438
def delete_shard(hostname):
    """Delete a shard and reclaim all resources from it.

    This claims back all assigned hosts from the shard. To ensure all DUTs are
    in a sane state, a Repair task is scheduled for them. This reboots the DUTs
    and therefore clears all running processes that might be left.

    The shard_id of jobs of that shard will be set to None.

    The status of jobs that haven't been reported to be finished yet, will be
    lost. The master scheduler will pick up the jobs and execute them.

    @param hostname: Hostname of the shard to delete.
    """
    doomed_shard = rpc_utils.retrieve_shard(shard_hostname=hostname)

    # TODO(beeps): Power off shard

    # For ChromeOS hosts, repair reboots the DUT.
    # Repair will escalate through multiple repair steps and will verify the
    # success after each of them. Anyway, it will always run at least the first
    # one, which includes a reboot.
    # After a reboot we can be sure no processes from prior tests that were run
    # by a shard are still running on the DUT.
    # Important: Don't just set the status to Repair Failed, as that would run
    # Verify first, before doing any repair measures. Verify would probably
    # succeed, so this wouldn't change anything on the DUT.
    for shard_host in models.Host.objects.filter(shard=doomed_shard):
        models.SpecialTask.objects.create(
                task=models.SpecialTask.Task.REPAIR,
                host=shard_host,
                requested_by=models.User.current_user())

    # Detach hosts and jobs from the shard before removing the shard itself.
    models.Host.objects.filter(shard=doomed_shard).update(shard=None)
    models.Job.objects.filter(shard=doomed_shard).update(shard=None)

    doomed_shard.labels.clear()
    doomed_shard.delete()
Dan Shi6964fa52014-12-18 11:04:27 -0800478
479
def get_servers(role=None, status=None):
    """Get the details of servers with matching role and status.

    @param role: Name of the server role, e.g., drone, scheduler. Default to
                 None to match any role.
    @param status: Status of the server, e.g., primary, backup, repair_required.
                   Default to None to match any server status.

    @raises error.RPCException: If server database is not used.
    @return: A list holding the get_details() result of every server with
             matching role and status.
    """
    if server_manager_utils.use_server_db():
        matching_servers = server_manager_utils.get_servers(
                hostname=None, role=role, status=status)
        return [server.get_details() for server in matching_servers]
    raise error.RPCException('Server database is not enabled. Please try '
                             'retrieve servers from global config.')
497
498
def get_stable_version(board=stable_version_utils.DEFAULT):
    """Get stable version for the given board.

    On a shard the call is forwarded to the master.

    @param board: Name of the board.
    @return: The value of stable_version_utils.get(board), or the master's
             answer when called on a shard. Per the original documentation,
             the global config value CROS.stable_cros_version is used when
             the stable versions table has no entry for board DEFAULT; that
             fallback lives in stable_version_utils, not in this function.
    """
    # This RPC call should be accepted only by master.
    if not utils.is_shard():
        return stable_version_utils.get(board)
    return rpc_utils.route_rpc_to_master('get_stable_version', board=board)
511
512
def get_all_stable_versions():
    """Get stable versions for all boards.

    On a shard the call is forwarded to the master.

    @return: The value of stable_version_utils.get_all() (documented as a
             dictionary of board:version), or the master's answer when
             called on a shard.
    """
    # This RPC call should be accepted only by master.
    if not utils.is_shard():
        return stable_version_utils.get_all()
    return rpc_utils.route_rpc_to_master('get_all_stable_versions')
522
523
def set_stable_version(version, board=stable_version_utils.DEFAULT):
    """Modify stable version for the given board.

    On a shard the call is forwarded to the master.

    @param version: The new value of stable version for given board.
    @param board: Name of the board, default to value `DEFAULT`.
    @return: The master's answer when called on a shard, otherwise None.
    """
    # This RPC call should be accepted only by master.
    if not utils.is_shard():
        stable_version_utils.set(version=version, board=board)
        return None
    return rpc_utils.route_rpc_to_master('set_stable_version',
                                         version=version, board=board)
535
536
def delete_stable_version(board):
    """Delete the stable version entry of the given board.

    Delete a stable version entry in afe_stable_versions table for a given
    board, so default stable version will be used.

    On a shard the call is forwarded to the master.

    @param board: Name of the board.
    @return: The master's answer when called on a shard, otherwise None.
    """
    # This RPC call should be accepted only by master.
    if not utils.is_shard():
        stable_version_utils.delete(board=board)
        return None
    return rpc_utils.route_rpc_to_master('delete_stable_version',
                                         board=board)
Matthew Sartorid96fb9b2015-05-19 18:04:58 -0700550
551
def get_tests_by_build(build):
    """Get the tests that are available for the specified build.

    @param build: unique name by which to refer to the image.

    @raises ValueError: if the dev server cannot resolve or translate |build|.
    @raises StageControlFileFailure: if staging 'test_suites' fails.

    @return: A list with one test dictionary per control file in the build,
             in the order the control file getter lists them (no sorting is
             applied here), passed through
             rpc_utils.prepare_for_serialization.
    """
    # Stage the test artifacts.
    try:
        ds = dev_server.ImageServer.resolve(build)
        build = ds.translate(build)
    except dev_server.DevServerException as e:
        raise ValueError('Could not resolve build %s: %s' % (build, e))

    try:
        ds.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                'Failed to stage %s: %s' % (build, e))

    # Collect the control files specified in this build
    cfile_getter = control_file_getter.DevServerGetter.create(build, ds)
    control_file_list = cfile_getter.get_control_file_list()

    # Attributes of the parsed control object that are required by the AFE.
    # Defined once here because the list is the same for every control file.
    keys = ('author', 'doc', 'name', 'time', 'test_type', 'experimental',
            'test_category', 'test_class', 'dependencies', 'run_verify',
            'sync_count', 'job_retries', 'retries', 'path')

    test_objects = []
    for _id, control_file_path in enumerate(control_file_list):
        # Read and parse the control file
        control_file = cfile_getter.get_control_file_contents(
                control_file_path)
        control_obj = control_data.parse_control_string(control_file)

        # Extract the values needed for the AFE from the control_obj;
        # attributes the control file does not define become ''.
        test_object = dict((key, getattr(control_obj, key, ''))
                           for key in keys)

        # Unfortunately, the AFE expects different key-names for certain
        # values, these must be corrected to avoid the risk of tests
        # being omitted by the AFE.
        # The 'id' is an additional value used in the AFE.
        # 'doc', 'time' and 'retries' are always present (see keys above), so
        # they can be read directly.
        test_object['id'] = _id
        test_object['description'] = test_object['doc']
        test_object['test_time'] = test_object['time']
        test_object['test_retry'] = test_object['retries']

        # Fix the test name to be consistent with the current presentation
        # of test names in the AFE: the name of the directory holding the
        # control file, followed by the dot-separated pieces after the first
        # piece of the file name, joined with ':'.
        testpath, subname = os.path.split(control_file_path)
        testname = os.path.basename(testpath)
        subname = subname.split('.')[1:]
        if subname:
            testname = '%s:%s' % (testname, ':'.join(subname))

        test_object['name'] = testname
        test_objects.append(test_object)

    return rpc_utils.prepare_for_serialization(test_objects)
618 return rpc_utils.prepare_for_serialization(test_objects)