Blame - server/cros/dynamic_suite.py - platform/external/autotest

blob: 0104416f412ef1068c5f40387708b4f8aef45529 [file] [log] [blame]

Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	1	# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
				5	import common
				6	import compiler, logging, os, random, re, time
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	7	from autotest_lib.client.common_lib import control_data, global_config, error
				8	from autotest_lib.client.common_lib import utils
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	9	from autotest_lib.server.cros import control_file_getter
				10	from autotest_lib.server import frontend
				11
				12
				13	VERSION_PREFIX = 'cros-version-'
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	14	CONFIG = global_config.global_config
				15
				16
				17	def _image_url_pattern():
				18	return CONFIG.get_config_value('CROS', 'image_url_pattern', type=str)
				19
				20
				21	def _package_url_pattern():
				22	return CONFIG.get_config_value('CROS', 'package_url_pattern', type=str)
				23
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	24
				25	class Reimager(object):
				26	"""
				27	A class that can run jobs to reimage devices.
				28
				29	@var _afe: a frontend.AFE instance used to talk to autotest.
				30	@var _tko: a frontend.TKO instance used to query the autotest results db.
				31	@var _cf_getter: a ControlFileGetter used to get the AU control file.
				32	"""
				33
				34
				35	def __init__(self, autotest_dir, afe=None, tko=None):
				36	"""
				37	Constructor
				38
				39	@param autotest_dir: the place to find autotests.
				40	@param afe: an instance of AFE as defined in server/frontend.py.
				41	@param tko: an instance of TKO as defined in server/frontend.py.
				42	"""
				43	self._afe = afe or frontend.AFE(debug=False)
				44	self._tko = tko or frontend.TKO(debug=False)
				45	self._cf_getter = control_file_getter.FileSystemGetter(
				46	[os.path.join(autotest_dir, 'server/site_tests')])
				47
				48
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	49	def skip(self, g):
				50	return 'SKIP_IMAGE' in g and g['SKIP_IMAGE']
				51
				52
				53	def attempt(self, name, num, board, record):
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	54	"""
				55	Synchronously attempt to reimage some machines.
				56
				57	Fire off attempts to reimage \|num\| machines of type \|board\|, using an
				58	image at \|url\| called \|name\|. Wait for completion, polling every
				59	10s, and log results with \|record\| upon completion.
				60
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	61	@param name: the name of the image to install (must be unique).
				62	@param num: how many devices to reimage.
				63	@param board: which kind of devices to reimage.
				64	@param record: callable that records job status.
				65	prototype:
				66	record(status, subdir, name, reason)
				67	@return True if all reimaging jobs succeed, false otherwise.
				68	"""
				69	record('START', None, 'try new image')
				70	self._ensure_version_label(VERSION_PREFIX+name)
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	71	canary = self._schedule_reimage_job(name, num, board)
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	72	logging.debug('Created re-imaging job: %d', canary.id)
				73	while len(self._afe.get_jobs(id=canary.id, not_yet_run=True)) > 0:
				74	time.sleep(10)
				75	logging.debug('Re-imaging job running.')
				76	while len(self._afe.get_jobs(id=canary.id, finished=True)) == 0:
				77	time.sleep(10)
				78	logging.debug('Re-imaging job finished.')
				79	canary.result = self._afe.poll_job_results(self._tko, canary, 0)
				80
				81	if canary.result is True:
				82	self._report_results(canary, record)
				83	record('END GOOD', None, None)
				84	return True
				85
				86	if canary.result is None:
				87	record('FAIL', None, canary.name, 're-imaging tasks did not run')
				88	else: # canary.result is False
				89	self._report_results(canary, record)
				90
				91	record('END FAIL', None, None)
				92	return False
				93
				94
				95	def _ensure_version_label(self, name):
				96	"""
				97	Ensure that a label called \|name\| exists in the autotest DB.
				98
				99	@param name: the label to check for/create.
				100	"""
				101	labels = self._afe.get_labels(name=name)
				102	if len(labels) == 0:
				103	self._afe.create_label(name=name)
				104
				105
				106	def _inject_vars(self, vars, control_file_in):
				107	"""
				108	Inject the contents of \|vars\| into \|control_file_in\|
				109
				110	@param vars: a dict to shoehorn into the provided control file string.
				111	@param control_file_in: the contents of a control file to munge.
				112	@return the modified control file string.
				113	"""
				114	control_file = ''
				115	for key, value in vars.iteritems():
				116	control_file += "%s='%s'\n" % (key, value)
				117	return control_file + control_file_in
				118
				119
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	120	def _schedule_reimage_job(self, name, num_machines, board):
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	121	"""
				122	Schedules the reimaging of \|num_machines\| \|board\| devices with \|image\|.
				123
				124	Sends an RPC to the autotest frontend to enqueue reimaging jobs on
				125	\|num_machines\| devices of type \|board\|
				126
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	127	@param name: the name of the image to install (must be unique).
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	128	@param num_machines: how many devices to reimage.
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	129	@param board: which kind of devices to reimage.
				130	@return a frontend.Job object for the reimaging job we scheduled.
				131	"""
				132	control_file = self._inject_vars(
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	133	{ 'image_url': _image_url_pattern() % name,
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	134	'image_name': name },
				135	self._cf_getter.get_control_file_contents_by_name('autoupdate'))
				136
Chris Masone	2ef1d4e	2011-12-20 11:06:53 -0800	[diff] [blame]	137	return self._afe.create_job(control_file=control_file,
				138	name=name + '-try',
				139	control_type='Server',
				140	meta_hosts=[board] * num_machines)
Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame]	141
				142
				143	def _report_results(self, job, record):
				144	"""
				145	Record results from a completed frontend.Job object.
				146
				147	@param job: a completed frontend.Job object populated by
				148	frontend.AFE.poll_job_results.
				149	@param record: callable that records job status.
				150	prototype:
				151	record(status, subdir, name, reason)
				152	"""
				153	if job.result == True:
				154	record('GOOD', None, job.name)
				155	return
				156
				157	for platform in job.results_platform_map:
				158	for status in job.results_platform_map[platform]:
				159	if status == 'Total':
				160	continue
				161	for host in job.results_platform_map[platform][status]:
				162	if host not in job.test_status:
				163	record('ERROR', None, host, 'Job failed to run.')
				164	elif status == 'Failed':
				165	for test_status in job.test_status[host].fail:
				166	record('FAIL', None, host, test_status.reason)
				167	elif status == 'Aborted':
				168	for test_status in job.test_status[host].fail:
				169	record('ABORT', None, host, test_status.reason)
				170	elif status == 'Completed':
				171	record('GOOD', None, host)
				172
				173
				174	class Suite(object):
				175	"""
				176	A suite of tests, defined by some predicate over control file variables.
				177
				178	Given a place to search for control files a predicate to match the desired
				179	tests, can gather tests and fire off jobs to run them, and then wait for
				180	results.
				181
				182	@var _predicate: a function that should return True when run over a
				183	ControlData representation of a control file that should be in
				184	this Suite.
				185	@var _tag: a string with which to tag jobs run in this suite.
				186	@var _afe: an instance of AFE as defined in server/frontend.py.
				187	@var _tko: an instance of TKO as defined in server/frontend.py.
				188	@var _jobs: currently scheduled jobs, if any.
				189	@var _cf_getter: a control_file_getter.ControlFileGetter
				190	"""
				191
				192
				193	@classmethod
				194	def create_from_name(cls, name, autotest_dir, afe=None, tko=None):
				195	"""
				196	Create a Suite using a predicate based on the SUITE control file var.
				197
				198	Makes a predicate based on \|name\| and uses it to instantiate a Suite
				199	that looks for tests in \|autotest_dir\| and will schedule them using
				200	\|afe\|. Results will be pulled from \|tko\| upon completion
				201
				202	@param name: a value of the SUITE control file variable to search for.
				203	@param autotest_dir: the place to find autotests.
				204	@param afe: an instance of AFE as defined in server/frontend.py.
				205	@param tko: an instance of TKO as defined in server/frontend.py.
				206	@return a Suite instance.
				207	"""
				208	return Suite(lambda t: hasattr(t, 'suite') and t.suite == name,
				209	name, autotest_dir, afe, tko)
				210
				211
				212	def __init__(self, predicate, tag, autotest_dir, afe=None, tko=None):
				213	"""
				214	Constructor
				215
				216	@param predicate: a function that should return True when run over a
				217	ControlData representation of a control file that should be in
				218	this Suite.
				219	@param tag: a string with which to tag jobs run in this suite.
				220	@param autotest_dir: the place to find autotests.
				221	@param afe: an instance of AFE as defined in server/frontend.py.
				222	@param tko: an instance of TKO as defined in server/frontend.py.
				223	"""
				224	self._predicate = predicate
				225	self._tag = tag
				226	self._afe = afe or frontend.AFE(debug=False)
				227	self._tko = tko or frontend.TKO(debug=False)
				228	self._jobs = []
				229
				230	# currently hard-coded places to look for tests.
				231	subpaths = [ 'server/site_tests', 'client/site_tests']
				232	directories = [ os.path.join(autotest_dir, p) for p in subpaths ]
				233	self._cf_getter = control_file_getter.FileSystemGetter(directories)
				234
				235	self._tests = Suite.find_and_parse_tests(self._cf_getter,
				236	self._predicate,
				237	add_experimental=True)
				238
				239
				240	@property
				241	def tests(self):
				242	"""
				243	A list of ControlData objects in the suite, with added \|text\| attr.
				244	"""
				245	return self._tests
				246
				247
				248	def stable_tests(self):
				249	"""
				250	\|self.tests\|, filtered for non-experimental tests.
				251	"""
				252	return filter(lambda t: not t.experimental, self.tests)
				253
				254
				255	def unstable_tests(self):
				256	"""
				257	\|self.tests\|, filtered for experimental tests.
				258	"""
				259	return filter(lambda t: t.experimental, self.tests)
				260
				261
				262	def _create_job(self, test, image_name):
				263	"""
				264	Thin wrapper around frontend.AFE.create_job().
				265
				266	@param test: ControlData object for a test to run.
				267	@param image_name: the name of an image against which to test.
				268	@return frontend.Job object for the job just scheduled.
				269	"""
				270	return self._afe.create_job(
				271	control_file=test.text,
				272	name='/'.join([image_name, self._tag, test.name]),
				273	control_type=test.test_type.capitalize(),
				274	meta_hosts=[VERSION_PREFIX+image_name])
				275
				276
				277	def run_and_wait(self, image_name, record, add_experimental=True):
				278	"""
				279	Synchronously run tests in \|self.tests\|.
				280
				281	Schedules tests against a device running image \|image_name\|, and
				282	then polls for status, using \|record\| to print status when each
				283	completes.
				284
				285	Tests returned by self.stable_tests() will always be run, while tests
				286	in self.unstable_tests() will only be run if \|add_experimental\| is true.
				287
				288	@param image_name: the name of an image against which to test.
				289	@param record: callable that records job status.
				290	prototype:
				291	record(status, subdir, name, reason)
				292	@param add_experimental: schedule experimental tests as well, or not.
				293	"""
				294	try:
				295	record('START', None, self._tag)
				296	self.schedule(image_name, add_experimental)
				297	try:
				298	for result in self.wait_for_results():
				299	record(*result)
				300	record('END GOOD', None, None)
				301	except Exception as e:
				302	logging.error(e)
				303	record('END ERROR', None, None, 'Exception waiting for results')
				304	except Exception as e:
				305	logging.error(e)
				306	record('END ERROR', None, None, 'Exception while scheduling suite')
				307
				308
				309	def schedule(self, image_name, add_experimental=True):
				310	"""
				311	Schedule jobs using \|self._afe\|.
				312
				313	frontend.Job objects representing each scheduled job will be put in
				314	\|self._jobs\|.
				315
				316	@param image_name: the name of an image against which to test.
				317	@param add_experimental: schedule experimental tests as well, or not.
				318	"""
				319	for test in self.stable_tests():
				320	logging.debug('Scheduling %s', test.name)
				321	self._jobs.append(self._create_job(test, image_name))
				322
				323	if add_experimental:
				324	# TODO(cmasone): ensure I can log results from these differently.
				325	for test in self.unstable_tests():
				326	logging.debug('Scheduling %s', test.name)
				327	self._jobs.append(self._create_job(test, image_name))
				328
				329
				330	def _status_is_relevant(self, status):
				331	"""
				332	Indicates whether the status of a given test is meaningful or not.
				333
				334	@param status: frontend.TestStatus object to look at.
				335	@return True if this is a test result worth looking at further.
				336	"""
				337	return not (status.test_name.startswith('SERVER_JOB') or
				338	status.test_name.startswith('CLIENT_JOB'))
				339
				340
				341	def _collate_aborted(self, current_value, entry):
				342	"""
				343	reduce() over a list of HostQueueEntries for a job; True if any aborted.
				344
				345	Functor that can be reduced()ed over a list of
				346	HostQueueEntries for a job. If any were aborted
				347	(\|entry.aborted\| exists and is True), then the reduce() will
				348	return True.
				349
				350	Ex:
				351	entries = self._afe.run('get_host_queue_entries', job=job.id)
				352	reduce(self._collate_aborted, entries, False)
				353
				354	@param current_value: the current accumulator (a boolean).
				355	@param entry: the current entry under consideration.
				356	@return the value of \|entry.aborted\| if it exists, False if not.
				357	"""
				358	return current_value or ('aborted' in entry and entry['aborted'])
				359
				360
				361	def wait_for_results(self):
				362	"""
				363	Wait for results of all tests in all jobs in \|self._jobs\|.
				364
				365	Currently polls for results every 5s. When all results are available,
				366	@return a list of tuples, one per test: (status, subdir, name, reason)
				367	"""
				368	results = []
				369	while self._jobs:
				370	for job in list(self._jobs):
				371	if not self._afe.get_jobs(id=job.id, finished=True):
				372	continue
				373
				374	self._jobs.remove(job)
				375
				376	entries = self._afe.run('get_host_queue_entries', job=job.id)
				377	if reduce(self._collate_aborted, entries, False):
				378	results.append(('ABORT', None, job.name))
				379	else:
				380	statuses = self._tko.get_status_counts(job=job.id)
				381	for s in filter(self._status_is_relevant, statuses):
				382	results.append((s.status, None, s.test_name, s.reason))
				383	time.sleep(5)
				384
				385	return results
				386
				387
				388	@classmethod
				389	def find_and_parse_tests(cls, cf_getter, predicate, add_experimental=False):
				390	"""
				391	Function to scan through all tests and find eligible tests.
				392
				393	Looks at control files returned by _cf_getter.get_control_file_list()
				394	for tests that pass self._predicate().
				395
				396	@param cf_getter: a control_file_getter.ControlFileGetter used to list
				397	and fetch the content of control files
				398	@param predicate: a function that should return True when run over a
				399	ControlData representation of a control file that should be in
				400	this Suite.
				401	@param add_experimental: add tests with experimental attribute set.
				402
				403	@return list of ControlData objects that should be run, with control
				404	file text added in \|text\| attribute.
				405	"""
				406	tests = {}
				407	files = cf_getter.get_control_file_list()
				408	for file in files:
				409	text = cf_getter.get_control_file_contents(file)
				410	try:
				411	found_test = control_data.parse_control_string(text,
				412	raise_warnings=True)
				413	if not add_experimental and found_test.experimental:
				414	continue
				415
				416	found_test.text = text
				417	tests[file] = found_test
				418	except control_data.ControlVariableException, e:
				419	logging.warn("Skipping %s\n%s", file, e)
				420	except Exception, e:
				421	logging.error("Bad %s\n%s", file, e)
				422
				423	return [test for test in tests.itervalues() if predicate(test)]