blob: b7a945c29bc2b176bb015fd3f8859037797fd2fd [file] [log] [blame]
Dan Shi4df39252013-03-19 13:19:45 -07001# pylint: disable-msg=C0111
2
Chris Masone859fdec2012-01-30 08:38:09 -08003# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7__author__ = 'cmasone@chromium.org (Chris Masone)'
8
9import common
Chris Masonea8066a92012-05-01 16:52:31 -070010import datetime
Chris Masone859fdec2012-01-30 08:38:09 -080011import logging
Simran Basi71206ef2014-08-13 13:51:18 -070012import os
13import shutil
Aviv Keshetd83ef442013-01-16 16:19:35 -080014
Jakob Juelich82b7d1c2014-09-15 16:10:57 -070015from autotest_lib.frontend.afe import models
Aviv Keshetd83ef442013-01-16 16:19:35 -080016from autotest_lib.client.common_lib import error
Simran Basi71206ef2014-08-13 13:51:18 -070017from autotest_lib.client.common_lib import global_config
Alex Miller7d658cf2013-09-04 16:00:35 -070018from autotest_lib.client.common_lib import priorities
Dan Shidfea3682014-08-10 23:38:40 -070019from autotest_lib.client.common_lib import time_utils
Chris Masone859fdec2012-01-30 08:38:09 -080020from autotest_lib.client.common_lib.cros import dev_server
Jakob Juelich59cfe542014-09-02 16:37:46 -070021from autotest_lib.client.common_lib.cros.graphite import stats
Jakob Juelich9fffe4f2014-08-14 18:07:05 -070022from autotest_lib.frontend.afe import rpc_utils
Simran Basib6ec8ae2014-04-23 12:05:08 -070023from autotest_lib.server import utils
Chris Masone44e4d6c2012-08-15 14:25:53 -070024from autotest_lib.server.cros.dynamic_suite import constants
Chris Masoneb4935552012-08-14 12:05:54 -070025from autotest_lib.server.cros.dynamic_suite import control_file_getter
Chris Masone44e4d6c2012-08-15 14:25:53 -070026from autotest_lib.server.cros.dynamic_suite import tools
Simran Basi71206ef2014-08-13 13:51:18 -070027from autotest_lib.server.hosts import moblab_host
Dan Shidfea3682014-08-10 23:38:40 -070028from autotest_lib.site_utils import host_history
Dan Shi193905e2014-07-25 23:33:09 -070029from autotest_lib.site_utils import job_history
Simran Basi71206ef2014-08-13 13:51:18 -070030
31
# Shared configuration object; the RPCs below read sections from it, override
# values on it and write it out to its shadow file.
_CONFIG = global_config.global_config
# Location of the boto file under the moblab user's home directory.
# NOTE(review): set_boto_key() below copies to
# moblab_host.MOBLAB_BOTO_LOCATION rather than to this constant -- confirm
# whether this module-level copy is still needed.
MOBLAB_BOTO_LOCATION = '/home/moblab/.boto'
Chris Masone859fdec2012-01-30 08:38:09 -080034
Chris Masonef8b53062012-05-08 22:14:18 -070035# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
Chris Masone859fdec2012-01-30 08:38:09 -080036
37
def canonicalize_suite_name(suite_name):
    """Turn a short suite name into its control file path.

    @param suite_name: short name of the suite, e.g. 'bvt'.

    @return the string 'test_suites/control.<suite_name>'.
    """
    control_path_template = 'test_suites/control.%s'
    return control_path_template % suite_name
40
41
def formatted_now():
    """Return the current time as a formatted string.

    @return datetime.datetime.now() rendered with time_utils.TIME_FMT.
    """
    current_time = datetime.datetime.now()
    return current_time.strftime(time_utils.TIME_FMT)
Chris Masoneaa10f8e2012-05-15 13:34:21 -070044
45
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.
    @raises ControlFileMalformed if the control file contains non-ascii
                                 characters.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    # Get the control file for the suite.
    try:
        control_file_in = getter.get_control_file_contents_by_name(suite_name)
    except error.CrosDynamicSuiteException as e:
        # Re-raise the same exception type with the build name appended so
        # the caller can tell which build the failure belongs to.
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeError as e:
        # UnicodeError is the common base of UnicodeDecodeError (raised for a
        # byte string holding non-ascii bytes) and UnicodeEncodeError (raised
        # for a unicode string holding non-ascii characters); both mean the
        # control file is not pure ascii.
        raise error.ControlFileMalformed(str(e))

    return control_file_in
78
79
def _stage_build_artifacts(build):
    """
    Stage the 'test_suites' artifact of |build| on a dev server.

    @param build: image we want to stage.

    @raises StageBuildFailure: if the dev server raises a DevServerException
                               while staging the build.

    @return: a tuple (ds, timings). ds is the object returned by
             dev_server.ImageServer.resolve() for this build; timings is a
             dictionary holding the formatted staging start and end times
             under constants.DOWNLOAD_STARTED_TIME and
             constants.PAYLOAD_FINISHED_TIME.
    """
    image_server = dev_server.ImageServer.resolve(build)
    staging_times = {constants.DOWNLOAD_STARTED_TIME: formatted_now()}
    try:
        image_server.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageBuildFailure(
                "Failed to stage %s: %s" % (build, e))
    # Only reached when staging succeeded.
    staging_times[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (image_server, staging_times)
104
105
def create_suite_job(name='', board='', build='', pool='', control_file='',
                     check_hosts=True, num=None, file_bugs=False, timeout=24,
                     timeout_mins=None, priority=priorities.Priority.DEFAULT,
                     suite_args=None, wait_for_results=True, job_retry=False,
                     max_runtime_mins=None, **kwargs):
    """
    Create a hostless server job that runs a test suite against |build|.

    The build's 'test_suites' artifact is staged first. Unless a control file
    is passed in, the suite control file is fetched from the dev server. The
    arguments below are then injected into the control file as variables and
    the job is created through rpc_utils.create_job_common.

    @param name: The test name if control_file is supplied, otherwise the name
                 of the test suite to run, e.g. 'bvt'.
    @param board: the kind of device to run the tests on. When empty it is
                  taken from the first element of
                  utils.ParseBuildName(build).
    @param build: unique name by which to refer to the image from now on.
    @param pool: Specify the pool of machines to use for scheduling
                 purposes.
    @param control_file: Contents of the control file to run. When empty, the
                         suite control file named by |name| is looked up in
                         the build artifacts.
    @param check_hosts: require appropriate live hosts to exist in the lab.
    @param num: Specify the number of machines to schedule across (integer).
                Leave unspecified or use None to use default sharding factor.
                0 is logged as a warning and treated as None.
    @param file_bugs: File a bug on each test failure in this suite.
    @param timeout: The max lifetime of this suite, in hours.
    @param timeout_mins: The max lifetime of this suite, in minutes. Takes
                         priority over timeout.
    @param priority: Integer denoting priority. Higher is more important.
    @param suite_args: Optional arguments which will be parsed by the suite
                       control file. Used by control.test_that_wrapper to
                       determine which tests to run.
    @param wait_for_results: Set to False to run the suite job without waiting
                             for test jobs to finish. Default is True.
    @param job_retry: Set to True to enable job-level retry. Default is False.
    @param max_runtime_mins: Maximum amount of time a job can be running in
                             minutes. Falls back to timeout * 60 when unset.
    @param kwargs: extra keyword args. NOT USED.

    @raises SuiteArgumentException: if num is neither an int nor None.
    @raises ControlFileNotFound: if a unique suite control file doesn't exist.
    @raises NoControlFileList: if we can't list the control files at all.
    @raises StageBuildFailure: if the dev server throws 500 while staging build.
    @raises ControlFileEmpty: if the control file exists on the server, but
                              can't be read.

    @return: the value returned by rpc_utils.create_job_common (documented
             upstream as the job ID of the suite).
    """
    if num is not None and type(num) is not int:
        raise error.SuiteArgumentException('Ill specified num argument %r. '
                                           'Must be an integer or None.' % num)
    if num == 0:
        logging.warning("Can't run on 0 hosts; using default.")
        num = None

    (ds, timings) = _stage_build_artifacts(build)

    if not control_file:
        # No control file was supplied so look it up from the build artifacts.
        suite_name = canonicalize_suite_name(name)
        control_file = _get_control_file_contents_by_name(build, ds, suite_name)
        name = '%s-%s' % (build, suite_name)

    # Both minute values default to the hour-based timeout; the product is
    # only computed when the explicit value is missing or falsy.
    if not timeout_mins:
        timeout_mins = timeout * 60
    if not max_runtime_mins:
        max_runtime_mins = timeout * 60

    if not board:
        board = utils.ParseBuildName(build)[0]

    # Variables that get prepended to the control file.
    inject_dict = {'board': board,
                   'build': build,
                   'check_hosts': check_hosts,
                   'pool': pool,
                   'num': num,
                   'file_bugs': file_bugs,
                   'timeout': timeout,
                   'timeout_mins': timeout_mins,
                   'devserver_url': ds.url(),
                   'priority': priority,
                   'suite_args': suite_args,
                   'wait_for_results': wait_for_results,
                   'job_retry': job_retry,
                   'max_runtime_mins': max_runtime_mins,
                   }
    control_file = tools.inject_vars(inject_dict, control_file)

    suite_job = rpc_utils.create_job_common(
            name,
            priority=priority,
            timeout_mins=timeout_mins,
            max_runtime_mins=max_runtime_mins,
            control_type='Server',
            control_file=control_file,
            hostless=True,
            keyvals=timings)
    return suite_job
Simran Basi71206ef2014-08-13 13:51:18 -0700197
198
199# TODO: hide the following rpcs under is_moblab
def moblab_only(func):
    """Ensure moblab specific functions only run on Moblab devices.

    @param func: the function to guard.

    @return a wrapper that raises error.RPCException when utils.is_moblab()
            is false and otherwise calls func with the given arguments.
    """
    # Function-scope import: functools is not among this file's top-level
    # imports.
    import functools

    # wraps() keeps func's __name__ and docstring on the wrapper.
    @functools.wraps(func)
    def verify(*args, **kwargs):
        if not utils.is_moblab():
            # Interpolate the name into the message; passing it as a second
            # exception argument would leave '%s' unformatted.
            raise error.RPCException(
                    'RPC: %s can only run on Moblab Systems!' % func.__name__)
        return func(*args, **kwargs)
    return verify
208
209
@moblab_only
def get_config_values():
    """Returns all config values parsed from global and shadow configs.

    The result maps every section reported by _CONFIG.get_sections() to the
    list of name value pairs of that section, passed through
    rpc_utils.prepare_for_serialization.
    """
    values_by_section = dict(
            (section_name, _CONFIG.config.items(section_name))
            for section_name in _CONFIG.get_sections())
    return rpc_utils.prepare_for_serialization(values_by_section)
Simran Basi71206ef2014-08-13 13:51:18 -0700222
223
@moblab_only
def update_config_handler(config_values):
    """
    Update config values and override shadow config.

    The shadow config file is checked before anything is modified, so a
    request that cannot be persisted does not leave overridden values behind
    in the in-memory config.

    @param config_values: dictionary mapping a section name to a list of
                          (key, value) pairs; see get_config_values().

    @raises RPCException: if the shadow config file is not set or does not
                          exist.
    """
    if not _CONFIG.shadow_file or not os.path.exists(_CONFIG.shadow_file):
        raise error.RPCException('Shadow config file does not exist.')

    for section, config_value_list in config_values.iteritems():
        for key, value in config_value_list:
            _CONFIG.override_config_value(section, key, value)

    with open(_CONFIG.shadow_file, 'w') as config_file:
        _CONFIG.config.write(config_file)
    # TODO (sbasi) crbug.com/403916 - Remove the reboot command and
    # instead restart the services that rely on the config values.
    os.system('sudo reboot')
242
243
@moblab_only
def reset_config_settings():
    """Empty the shadow config file and reboot.

    The shadow file is opened in 'w' mode, which truncates it, and closed
    again without writing anything. 'sudo reboot' is run afterwards.
    """
    shadow_config = open(_CONFIG.shadow_file, 'w')
    shadow_config.close()
    os.system('sudo reboot')
249
250
@moblab_only
def set_boto_key(boto_key):
    """Update the boto_key file.

    Copies the given file to moblab_host.MOBLAB_BOTO_LOCATION.

    @param boto_key: File name of boto_key uploaded through handle_file_upload.

    @raises RPCException: if no file exists at |boto_key|.
    """
    if os.path.exists(boto_key):
        shutil.copyfile(boto_key, moblab_host.MOBLAB_BOTO_LOCATION)
        return
    raise error.RPCException('Boto key: %s does not exist!' % boto_key)
Dan Shi193905e2014-07-25 23:33:09 -0700260
261
def get_job_history(**filter_data):
    """Get history of the job, including the special tasks executed for the job

    @param filter_data: filter for the call, must include
                        {'job_id': [job id]}; a missing 'job_id' key raises
                        KeyError.
    @returns: the job's history as returned by get_history() on the job info
              object, passed through rpc_utils.prepare_for_serialization.
              Documented upstream as including the hosts that ran the job
              and the special tasks executed before and after the job.
    """
    info = job_history.get_job_info(filter_data['job_id'])
    history = info.get_history()
    return rpc_utils.prepare_for_serialization(history)
274
275
def get_host_history(start_time, end_time, hosts=None, board=None, pool=None):
    """Get history of a list of host.

    The return is the host history for each host, prepared for
    serialization, for example,
    {'172.22.33.51': [{'status': 'Resetting',
                       'start_time': '2014-08-07 10:02:16',
                       'end_time': '2014-08-07 10:03:16',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'Task: Special Task 19441991 (host ...)'},
                      {'status': 'Running',
                       'start_time': '2014-08-07 10:03:18',
                       'end_time': '2014-08-07 10:13:00',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'HQE: 15305005, for job: 14995562'}
                     ]
    }
    @param start_time: start time to search for history, can be string value or
                       epoch time.
    @param end_time: end time to search for history, can be string value or
                     epoch time.
    @param hosts: A list of hosts to search for history. Default is None.
    @param board: board type of hosts. Default is None.
    @param pool: pool type of hosts. Default is None.
    @returns: the host history from host_history.get_history_details, passed
              through rpc_utils.prepare_for_serialization.
    """
    history_details = host_history.get_history_details(
            start_time=start_time, end_time=end_time,
            hosts=hosts, board=board, pool=pool,
            process_pool_size=4)
    return rpc_utils.prepare_for_serialization(history_details)
Jakob Juelich59cfe542014-09-02 16:37:46 -0700306
307
def shard_heartbeat(shard_hostname):
    """Look up the calling shard and hand it its hosts and jobs.

    The whole call runs inside a stats.Timer named 'shard_heartbeat'.
    Per the upstream documentation, the shard is registered if it doesn't
    exist yet.

    @param shard_hostname: Hostname of the calling shard
    @returns: dictionary with the serialized hosts under 'hosts' and the
              serialized jobs under 'jobs', to be inserted into a shard's
              database.
    """
    with stats.Timer('shard_heartbeat'):
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        shard_hosts, shard_jobs = rpc_utils.find_records_for_shard(shard_obj)
        serialized_hosts = [record.serialize() for record in shard_hosts]
        serialized_jobs = [record.serialize() for record in shard_jobs]
        return {
            'hosts': serialized_hosts,
            'jobs': serialized_jobs,
        }
Jakob Juelich82b7d1c2014-09-15 16:10:57 -0700323
324
def get_shards(**filter_data):
    """Return the shards matching |filter_data| (all shards when empty).

    @param filter_data: filters passed on to models.Shard.query_objects.

    @returns A sequence of nested dictionaries of shard information; every
             dictionary also carries the shard's label names under 'labels'.
    """
    shards = models.Shard.query_objects(filter_data)
    shard_dicts = rpc_utils.prepare_rows_as_nested_dicts(shards, ())
    # Rows and shards are paired by position.
    for shard_dict, shard in zip(shard_dicts, shards):
        label_names = []
        for label in shard.labels.all():
            label_names.append(label.name)
        shard_dict['labels'] = label_names

    return shard_dicts
336
337
def add_shard(hostname, label):
    """Add a shard and start running jobs on it.

    @param hostname: The hostname of the shard to be added; needs to be unique.
    @param label: A platform label. Jobs of this label will be assigned to the
                  shard.

    @raises model_logic.ValidationError if a shard with the given hostname
            already exists.

    @returns the id of the newly created shard.
    """
    new_shard = models.Shard.add_object(hostname=hostname)
    platform_label = models.Label.smart_get(label)
    new_shard.labels.add(platform_label)
    return new_shard.id
351
352
def delete_shard(hostname):
    """Delete a shard and reclaim all resources from it.

    This claims back all assigned hosts from the shard. To ensure all DUTs are
    in a sane state, a Repair task is scheduled for them. This reboots the DUTs
    and therefore clears all running processes that might be left.

    The shard_id of jobs of that shard will be set to None.

    The status of jobs that haven't been reported to be finished yet, will be
    lost. The master scheduler will pick up the jobs and execute them.

    @param hostname: Hostname of the shard to delete.
    """
    doomed_shard = rpc_utils.retrieve_shard(shard_hostname=hostname)

    # TODO(beeps): Power off shard

    # For ChromeOS hosts, repair reboots the DUT.
    # Repair will escalate through multiple repair steps and will verify the
    # success after each of them. Anyway, it will always run at least the first
    # one, which includes a reboot.
    # After a reboot we can be sure no processes from prior tests that were run
    # by a shard are still running on the DUT.
    # Important: Don't just set the status to Repair Failed, as that would run
    # Verify first, before doing any repair measures. Verify would probably
    # succeed, so this wouldn't change anything on the DUT.
    for dut in models.Host.objects.filter(shard=doomed_shard):
        repair_task_fields = {
            'task': models.SpecialTask.Task.REPAIR,
            'host': dut,
            'requested_by': models.User.current_user(),
        }
        models.SpecialTask.objects.create(**repair_task_fields)

    # Detach hosts and jobs from the shard before removing the shard itself.
    models.Host.objects.filter(shard=doomed_shard).update(shard=None)
    models.Job.objects.filter(shard=doomed_shard).update(shard=None)

    doomed_shard.labels.clear()
    doomed_shard.delete()