blob: 9de362cab40c6186bf0f37b9f19c2c5cd0f65503 [file] [log] [blame]
Dan Shi4df39252013-03-19 13:19:45 -07001# pylint: disable-msg=C0111
2
Chris Masone859fdec2012-01-30 08:38:09 -08003# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7__author__ = 'cmasone@chromium.org (Chris Masone)'
8
9import common
Chris Masonea8066a92012-05-01 16:52:31 -070010import datetime
Chris Masone859fdec2012-01-30 08:38:09 -080011import logging
Simran Basi71206ef2014-08-13 13:51:18 -070012import os
13import shutil
Aviv Keshetd83ef442013-01-16 16:19:35 -080014
Jakob Juelich82b7d1c2014-09-15 16:10:57 -070015from autotest_lib.frontend.afe import models
Aviv Keshetd83ef442013-01-16 16:19:35 -080016from autotest_lib.client.common_lib import error
Simran Basi71206ef2014-08-13 13:51:18 -070017from autotest_lib.client.common_lib import global_config
Alex Miller7d658cf2013-09-04 16:00:35 -070018from autotest_lib.client.common_lib import priorities
Dan Shidfea3682014-08-10 23:38:40 -070019from autotest_lib.client.common_lib import time_utils
Chris Masone859fdec2012-01-30 08:38:09 -080020from autotest_lib.client.common_lib.cros import dev_server
Jakob Juelich59cfe542014-09-02 16:37:46 -070021from autotest_lib.client.common_lib.cros.graphite import stats
Jakob Juelich9fffe4f2014-08-14 18:07:05 -070022from autotest_lib.frontend.afe import rpc_utils
Simran Basib6ec8ae2014-04-23 12:05:08 -070023from autotest_lib.server import utils
Chris Masone44e4d6c2012-08-15 14:25:53 -070024from autotest_lib.server.cros.dynamic_suite import constants
Chris Masoneb4935552012-08-14 12:05:54 -070025from autotest_lib.server.cros.dynamic_suite import control_file_getter
Chris Masone44e4d6c2012-08-15 14:25:53 -070026from autotest_lib.server.cros.dynamic_suite import tools
Simran Basi71206ef2014-08-13 13:51:18 -070027from autotest_lib.server.hosts import moblab_host
Dan Shidfea3682014-08-10 23:38:40 -070028from autotest_lib.site_utils import host_history
Dan Shi193905e2014-07-25 23:33:09 -070029from autotest_lib.site_utils import job_history
Simran Basi71206ef2014-08-13 13:51:18 -070030
31
32_CONFIG = global_config.global_config
33MOBLAB_BOTO_LOCATION = '/home/moblab/.boto'
Chris Masone859fdec2012-01-30 08:38:09 -080034
Chris Masonef8b53062012-05-08 22:14:18 -070035# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
Chris Masone859fdec2012-01-30 08:38:09 -080036
37
def canonicalize_suite_name(suite_name):
    """Map a bare suite name onto the path of its suite control file.

    @param suite_name: short name of the suite, e.g. 'bvt'.

    @return the string 'test_suites/control.<suite_name>'.
    """
    control_file_path = 'test_suites/control.%s' % suite_name
    return control_file_path
40
41
def formatted_now():
    """Return the current time formatted with time_utils.TIME_FMT.

    @return the timestamp string produced by datetime.datetime.now().
    """
    current_time = datetime.datetime.now()
    return current_time.strftime(time_utils.TIME_FMT)
Chris Masoneaa10f8e2012-05-15 13:34:21 -070044
45
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.
    @raises ControlFileMalformed if the control file contains non-ascii
                                 characters.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    timer = stats.Timer('control_files.parse.%s.%s' %
                        (ds.get_server_name(ds.url()).replace('.', '_'),
                         suite_name.rsplit('.')[-1]))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                    suite_name)
    except error.CrosDynamicSuiteException as e:
        # Re-raise the same exception type with the build name appended so
        # the caller can tell which image the failure belongs to.
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeError as e:
        # A byte string fails with UnicodeDecodeError (implicit decode) while
        # a unicode string fails with UnicodeEncodeError; UnicodeError is the
        # common base class, so both are reported as a malformed control file.
        raise error.ControlFileMalformed(str(e))

    return control_file_in
83
84
def _stage_build_artifacts(build):
    """
    Stage the suite control files of |build| on a dev server.

    @param build: image whose 'test_suites' artifacts should be staged.

    @raises StageControlFileFailure: if the dev server raises a
                                     DevServerException while staging.

    @return: tuple (ds, timings). ds is the dev_server.ImageServer resolved
             for this build; timings maps constants.DOWNLOAD_STARTED_TIME and
             constants.PAYLOAD_FINISHED_TIME to formatted timestamps taken
             before and after staging.
    """
    image_server = dev_server.ImageServer.resolve(build)
    download_started = formatted_now()
    server_name = image_server.get_server_name(image_server.url())
    staging_timer = stats.Timer(
            'control_files.stage.%s' % (server_name.replace('.', '_')))
    # Only the 'test_suites' artifact is requested here.
    try:
        with staging_timer:
            image_server.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                "Failed to stage %s: %s" % (build, e))
    payload_finished = formatted_now()
    staging_timings = {
        constants.DOWNLOAD_STARTED_TIME: download_started,
        constants.PAYLOAD_FINISHED_TIME: payload_finished,
    }
    return (image_server, staging_timings)
113
114
def create_suite_job(name='', board='', build='', pool='', control_file='',
                     check_hosts=True, num=None, file_bugs=False, timeout=24,
                     timeout_mins=None, priority=priorities.Priority.DEFAULT,
                     suite_args=None, wait_for_results=True, job_retry=False,
                     max_runtime_mins=None, suite_min_duts=0, **kwargs):
    """
    Create a hostless server job that runs a test suite against |build|.

    The suite control file is either the one passed in or the one looked up
    by suite name in the staged build artifacts. The scheduling parameters
    are injected at the top of the control file before the job is created.

    @param name: The test name if control_file is supplied, otherwise the name
                 of the test suite to run, e.g. 'bvt'.
    @param board: the kind of device to run the tests on. When empty it is
                  taken from utils.ParseBuildName(build).
    @param build: unique name by which to refer to the image from now on.
    @param pool: Specify the pool of machines to use for scheduling
                 purposes.
    @param control_file: contents of the control file to run. When empty the
                         control file of suite |name| is fetched from the
                         build artifacts.
    @param check_hosts: require appropriate live hosts to exist in the lab.
    @param num: Specify the number of machines to schedule across (integer).
                Leave unspecified or use None to use default sharding factor.
                0 is treated like None, with a warning.
    @param file_bugs: File a bug on each test failure in this suite.
    @param timeout: The max lifetime of this suite, in hours.
    @param timeout_mins: The max lifetime of this suite, in minutes. Takes
                         priority over timeout.
    @param priority: Integer denoting priority. Higher is more important.
    @param suite_args: Optional arguments which will be parsed by the suite
                       control file. Used by control.test_that_wrapper to
                       determine which tests to run.
    @param wait_for_results: Set to False to run the suite job without waiting
                             for test jobs to finish. Default is True.
    @param job_retry: Set to True to enable job-level retry. Default is False.
    @param max_runtime_mins: Maximum amount of time a job can be running in
                             minutes. Defaults to timeout * 60.
    @param suite_min_duts: Integer. Scheduler will prioritize getting the
                           minimum number of machines for the suite when it is
                           competing with another suite that has a higher
                           priority but already got minimum machines it needs.
    @param kwargs: extra keyword args. NOT USED.

    @raises SuiteArgumentException: if num is neither an int nor None.
    @raises ControlFileNotFound: if a unique suite control file doesn't exist.
    @raises NoControlFileList: if we can't list the control files at all.
    @raises StageControlFileFailure: If the dev server throws 500 while
                                     staging test_suites.
    @raises ControlFileEmpty: if the control file exists on the server, but
                              can't be read.

    @return: whatever rpc_utils.create_job_common returns for the new suite
             job (the suite's job ID according to the original contract).
    """
    num_is_acceptable = num is None or type(num) is int
    if not num_is_acceptable:
        raise error.SuiteArgumentException('Ill specified num argument %r. '
                                           'Must be an integer or None.' % num)
    if num == 0:
        logging.warning("Can't run on 0 hosts; using default.")
        num = None

    # Staging yields the dev server plus the timing keyvals for the job.
    ds, keyvals = _stage_build_artifacts(build)
    keyvals[constants.SUITE_MIN_DUTS_KEY] = suite_min_duts

    if not control_file:
        # No control file was supplied so look it up from the build artifacts.
        suite_name = canonicalize_suite_name(name)
        control_file = _get_control_file_contents_by_name(build, ds, suite_name)
        name = '%s-%s' % (build, suite_name)

    # Both minute-based limits fall back to the hour-based timeout.
    if not timeout_mins:
        timeout_mins = timeout * 60
    if not max_runtime_mins:
        max_runtime_mins = timeout * 60

    if not board:
        board = utils.ParseBuildName(build)[0]

    # Variables prepended to the control file.
    inject_dict = {'board': board,
                   'build': build,
                   'check_hosts': check_hosts,
                   'pool': pool,
                   'num': num,
                   'file_bugs': file_bugs,
                   'timeout': timeout,
                   'timeout_mins': timeout_mins,
                   'devserver_url': ds.url(),
                   'priority': priority,
                   'suite_args': suite_args,
                   'wait_for_results': wait_for_results,
                   'job_retry': job_retry,
                   'max_runtime_mins': max_runtime_mins,
                   }
    control_file = tools.inject_vars(inject_dict, control_file)

    job_options = {
        'priority': priority,
        'timeout_mins': timeout_mins,
        'max_runtime_mins': max_runtime_mins,
        'control_type': 'Server',
        'control_file': control_file,
        'hostless': True,
        'keyvals': keyvals,
    }
    return rpc_utils.create_job_common(name, **job_options)
Simran Basi71206ef2014-08-13 13:51:18 -0700211
212
213# TODO: hide the following rpcs under is_moblab
def moblab_only(func):
    """Ensure moblab specific functions only run on Moblab devices.

    @param func: the function to guard.

    @raises error.RPCException: raised by the returned wrapper, at call time,
                                when utils.is_moblab() is false.

    @return a wrapper that calls |func| with the same arguments and keeps
            its name and docstring.
    """
    # Imported locally so the decorator does not rely on a module-level
    # import of functools.
    import functools

    @functools.wraps(func)
    def verify(*args, **kwargs):
        if not utils.is_moblab():
            # Interpolate the name into the message; passing it as a second
            # exception argument would leave the '%s' unformatted.
            raise error.RPCException(
                    'RPC: %s can only run on Moblab Systems!' % func.__name__)
        return func(*args, **kwargs)
    return verify
222
223
@moblab_only
def get_config_values():
    """Collect every config option, grouped by section.

    Each section name reported by _CONFIG.get_sections() is mapped to the
    name/value pairs returned by _CONFIG.config.items() for that section.

    @return the section mapping, passed through
            rpc_utils.prepare_for_serialization().
    """
    items_by_section = dict(
            (section_name, _CONFIG.config.items(section_name))
            for section_name in _CONFIG.get_sections())
    return rpc_utils.prepare_for_serialization(items_by_section)
Simran Basi71206ef2014-08-13 13:51:18 -0700236
237
@moblab_only
def update_config_handler(config_values):
    """
    Update config values and override shadow config.

    The new values are applied to the in-memory config, the whole config is
    written to the shadow config file, and the machine is rebooted.

    @param config_values: dict mapping a section name to a list of
                          (key, value) pairs; see get_config_values().

    @raises error.RPCException: if the shadow config file is not set or does
                                not exist. Nothing is modified in that case.
    """
    # Validate first so a failing call does not leave the in-memory config
    # overridden without the change being persisted.
    if not _CONFIG.shadow_file or not os.path.exists(_CONFIG.shadow_file):
        raise error.RPCException('Shadow config file does not exist.')

    for section, config_value_list in config_values.items():
        for key, value in config_value_list:
            _CONFIG.override_config_value(section, key, value)

    with open(_CONFIG.shadow_file, 'w') as config_file:
        _CONFIG.config.write(config_file)
    # TODO (sbasi) crbug.com/403916 - Remove the reboot command and
    # instead restart the services that rely on the config values.
    os.system('sudo reboot')
256
257
@moblab_only
def reset_config_settings():
    """Empty the shadow config file and reboot the machine.

    @raises error.RPCException: if no shadow config file is configured.
    """
    if not _CONFIG.shadow_file:
        # Fail with the RPC error type instead of a TypeError from open().
        raise error.RPCException('Shadow config file does not exist.')
    # Opening the file for writing truncates it; nothing is written.
    with open(_CONFIG.shadow_file, 'w'):
        pass
    os.system('sudo reboot')
263
264
@moblab_only
def set_boto_key(boto_key):
    """Copy an uploaded boto key file to moblab_host.MOBLAB_BOTO_LOCATION.

    @param boto_key: File name of boto_key uploaded through handle_file_upload.

    @raises error.RPCException: if |boto_key| does not exist.
    """
    key_is_present = os.path.exists(boto_key)
    if key_is_present:
        shutil.copyfile(boto_key, moblab_host.MOBLAB_BOTO_LOCATION)
        return
    raise error.RPCException('Boto key: %s does not exist!' % boto_key)
Dan Shi193905e2014-07-25 23:33:09 -0700274
275
def get_job_history(**filter_data):
    """Look up the history of a job, including its special tasks.

    @param filter_data: filter for the call; must contain the key 'job_id'
                        (a KeyError is raised otherwise).
    @returns: the job's history as returned by job_history, prepared for
              serialization. According to the original contract it covers
              the hosts that ran the job and the special tasks executed
              before and after it.
    """
    history = job_history.get_job_info(filter_data['job_id']).get_history()
    return rpc_utils.prepare_for_serialization(history)
288
289
def get_host_history(start_time, end_time, hosts=None, board=None, pool=None):
    """Fetch the history of a set of hosts within a time window.

    The result holds one list of history entries per host, for example,
    {'172.22.33.51': [{'status': 'Resetting',
                       'start_time': '2014-08-07 10:02:16',
                       'end_time': '2014-08-07 10:03:16',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'Task: Special Task 19441991 (host ...)'},
                      {'status': 'Running',
                       'start_time': '2014-08-07 10:03:18',
                       'end_time': '2014-08-07 10:13:00',
                       'log_url': 'http://autotest/reset-546546/debug',
                       'dbg_str': 'HQE: 15305005, for job: 14995562'}
                     ]
    }
    @param start_time: start time to search for history, can be string value or
                       epoch time.
    @param end_time: end time to search for history, can be string value or
                     epoch time.
    @param hosts: A list of hosts to search for history. Default is None.
    @param board: board type of hosts. Default is None.
    @param pool: pool type of hosts. Default is None.
    @returns: the host history, prepared for serialization.
    """
    # The lookup is always requested with a process pool of size 4.
    history_query = {
        'start_time': start_time,
        'end_time': end_time,
        'hosts': hosts,
        'board': board,
        'pool': pool,
        'process_pool_size': 4,
    }
    history_details = host_history.get_history_details(**history_query)
    return rpc_utils.prepare_for_serialization(history_details)
Jakob Juelich59cfe542014-09-02 16:37:46 -0700320
321
def shard_heartbeat(shard_hostname, jobs=(), hqes=(),
                    known_job_ids=(), known_host_ids=()):
    """Persist status updates sent by a shard and hand it new hosts and jobs.

    @param shard_hostname: Hostname of the calling shard
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated with
                 newer status from a shard. Note that for every hostqueueentry
                 the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.

    @returns: dict with the keys 'hosts' and 'jobs' holding the serialized
              records (including their dependencies) to be inserted into the
              shard's database.
    """
    # Why every heartbeat carries the full lists of known job and host ids.
    # These alternatives were considered and rejected:
    # 1. Send only the highest known job and host ids. That works for jobs:
    #    newer jobs always have larger ids, and a job that is not assigned
    #    to a shard during one heartbeat is never assigned to it later.
    #    It does not work for hosts: a leased host is not sent now but may
    #    be sent in a later heartbeat, so hosts with an id lower than the
    #    shard's maximum known host id sometimes have to be transferred.
    # 2. Send the number of jobs/hosts the shard knows and compare it with
    #    the number of records whose shard_id already points at this shard.
    #    Normally they match; if not, resend all entities of that type.
    #    That is fine for hosts, because there aren't that many, but
    #    resending all jobs is a big overhead. It might also hit edge cases
    #    if entities are ever deleted.
    # 3. A mixture: 1 for jobs and 2 for hosts. Two different approaches
    #    are inconsistent and confusing, and the deletion issue might still
    #    occur.
    #
    # The overhead of sending all ids is low:
    # At peaks one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming 6-digit job ids plus a comma and a space of json overhead,
    # each id costs about 8 bytes, so 5000 ids are about 40 kilobytes.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # alternatives above.
    with stats.Timer('shard_heartbeat'):
        shard = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        rpc_utils.persist_records_sent_from_shard(shard, jobs, hqes)
        new_hosts, new_jobs = rpc_utils.find_records_for_shard(
                shard,
                known_job_ids=known_job_ids, known_host_ids=known_host_ids)
        serialized_hosts = [new_host.serialize() for new_host in new_hosts]
        serialized_jobs = [new_job.serialize() for new_job in new_jobs]
        return {
            'hosts': serialized_hosts,
            'jobs': serialized_jobs,
        }
Jakob Juelich82b7d1c2014-09-15 16:10:57 -0700381
382
def get_shards(**filter_data):
    """Return the shards matching |filter_data|, with their label names.

    @param filter_data: filters passed to models.Shard.query_objects(); no
                        filters selects all shards.

    @returns A sequence of nested dictionaries of shard information; each
             dictionary gets a 'labels' entry listing the shard's label names.
    """
    shard_query = models.Shard.query_objects(filter_data)
    shard_dicts = rpc_utils.prepare_rows_as_nested_dicts(shard_query, ())
    # Pair every serialized row with its model object to attach the labels.
    for shard_dict, shard in zip(shard_dicts, shard_query):
        label_names = []
        for label in shard.labels.all():
            label_names.append(label.name)
        shard_dict['labels'] = label_names

    return shard_dicts
394
395
def add_shard(hostname, label):
    """Register a new shard and attach a board label to it.

    @param hostname: The hostname of the shard to be added; needs to be unique.
    @param label: A platform label. Jobs of this label will be assigned to the
                  shard.

    @raises error.RPCException: If label provided doesn't start with `board:`
    @raises model_logic.ValidationError: If a shard with the given hostname
            already exists.
    @raises models.Label.DoesNotExist: If the label specified doesn't exist.

    @returns: the id of the newly created shard.
    """
    is_board_label = label.startswith('board:')
    if not is_board_label:
        raise error.RPCException('Sharding only supported for `board:.*` '
                                 'labels.')

    # Resolve the label before creating the shard, so that a missing label
    # does not leave a shard behind.
    board_label = models.Label.smart_get(label)
    new_shard = models.Shard.add_object(hostname=hostname)
    new_shard.labels.add(board_label)
    return new_shard.id
417
418
def delete_shard(hostname):
    """Remove a shard and take back everything that was assigned to it.

    All hosts assigned to the shard are claimed back. To ensure all DUTs are
    in a sane state, a Repair task is scheduled for each of them. This
    reboots the DUTs and therefore clears all running processes that might
    be left.

    The shard_id of jobs of that shard will be set to None.

    The status of jobs that haven't been reported to be finished yet, will be
    lost. The master scheduler will pick up the jobs and execute them.

    @param hostname: Hostname of the shard to delete.
    """
    shard = rpc_utils.retrieve_shard(shard_hostname=hostname)

    # TODO(beeps): Power off shard

    # For ChromeOS hosts, repair reboots the DUT.
    # Repair escalates through multiple repair steps and verifies the result
    # after each of them, but it always runs at least the first step, which
    # includes a reboot. After that reboot no processes from tests that the
    # shard ran before can still be running on the DUT.
    # Important: Don't just set the status to Repair Failed, as that would run
    # Verify first, before doing any repair measures. Verify would probably
    # succeed, so this wouldn't change anything on the DUT.
    hosts_on_shard = models.Host.objects.filter(shard=shard)
    for shard_host in hosts_on_shard:
        models.SpecialTask.objects.create(
                task=models.SpecialTask.Task.REPAIR,
                host=shard_host,
                requested_by=models.User.current_user())

    # Detach hosts and jobs from the shard before the shard row goes away.
    models.Host.objects.filter(shard=shard).update(shard=None)
    models.Job.objects.filter(shard=shard).update(shard=None)

    shard.labels.clear()
    shard.delete()