Blame - server/cros/dynamic_suite.py - platform/external/autotest

blob: 066661d30256b4d1a9c4c7f8403b6a08697b8a5d [file] [log] [blame]

Chris Masone	6fed646	2011-10-20 16:36:43 -0700	[diff] [blame^]	1	# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
				5	import common
				6	import compiler, logging, os, random, re, time
				7	from autotest_lib.client.common_lib import control_data, error, utils
				8	from autotest_lib.server.cros import control_file_getter
				9	from autotest_lib.server import frontend
				10
				11
# Prefix prepended to an image name to form an autotest label.
# Reimager.attempt() makes sure the label VERSION_PREFIX + <image name>
# exists, and Suite._create_job() passes that same label as the meta_host
# for the test jobs it schedules.
VERSION_PREFIX = 'cros-version-'
				13
				14	class Reimager(object):
				15	"""
				16	A class that can run jobs to reimage devices.
				17
				18	@var _afe: a frontend.AFE instance used to talk to autotest.
				19	@var _tko: a frontend.TKO instance used to query the autotest results db.
				20	@var _cf_getter: a ControlFileGetter used to get the AU control file.
				21	"""
				22
				23
				24	def __init__(self, autotest_dir, afe=None, tko=None):
				25	"""
				26	Constructor
				27
				28	@param autotest_dir: the place to find autotests.
				29	@param afe: an instance of AFE as defined in server/frontend.py.
				30	@param tko: an instance of TKO as defined in server/frontend.py.
				31	"""
				32	self._afe = afe or frontend.AFE(debug=False)
				33	self._tko = tko or frontend.TKO(debug=False)
				34	self._cf_getter = control_file_getter.FileSystemGetter(
				35	[os.path.join(autotest_dir, 'server/site_tests')])
				36
				37
				38	def attempt(self, url, name, num, board, record):
				39	"""
				40	Synchronously attempt to reimage some machines.
				41
				42	Fire off attempts to reimage \|num\| machines of type \|board\|, using an
				43	image at \|url\| called \|name\|. Wait for completion, polling every
				44	10s, and log results with \|record\| upon completion.
				45
				46	@param url: the URL of the image to install.
				47	@param name: the name of the image to install (must be unique).
				48	@param num: how many devices to reimage.
				49	@param board: which kind of devices to reimage.
				50	@param record: callable that records job status.
				51	prototype:
				52	record(status, subdir, name, reason)
				53	@return True if all reimaging jobs succeed, false otherwise.
				54	"""
				55	record('START', None, 'try new image')
				56	self._ensure_version_label(VERSION_PREFIX+name)
				57	canary = self._schedule_reimage_job(url, name, num, board)
				58	logging.debug('Created re-imaging job: %d', canary.id)
				59	while len(self._afe.get_jobs(id=canary.id, not_yet_run=True)) > 0:
				60	time.sleep(10)
				61	logging.debug('Re-imaging job running.')
				62	while len(self._afe.get_jobs(id=canary.id, finished=True)) == 0:
				63	time.sleep(10)
				64	logging.debug('Re-imaging job finished.')
				65	canary.result = self._afe.poll_job_results(self._tko, canary, 0)
				66
				67	if canary.result is True:
				68	self._report_results(canary, record)
				69	record('END GOOD', None, None)
				70	return True
				71
				72	if canary.result is None:
				73	record('FAIL', None, canary.name, 're-imaging tasks did not run')
				74	else: # canary.result is False
				75	self._report_results(canary, record)
				76
				77	record('END FAIL', None, None)
				78	return False
				79
				80
				81	def _ensure_version_label(self, name):
				82	"""
				83	Ensure that a label called \|name\| exists in the autotest DB.
				84
				85	@param name: the label to check for/create.
				86	"""
				87	labels = self._afe.get_labels(name=name)
				88	if len(labels) == 0:
				89	self._afe.create_label(name=name)
				90
				91
				92	def _inject_vars(self, vars, control_file_in):
				93	"""
				94	Inject the contents of \|vars\| into \|control_file_in\|
				95
				96	@param vars: a dict to shoehorn into the provided control file string.
				97	@param control_file_in: the contents of a control file to munge.
				98	@return the modified control file string.
				99	"""
				100	control_file = ''
				101	for key, value in vars.iteritems():
				102	control_file += "%s='%s'\n" % (key, value)
				103	return control_file + control_file_in
				104
				105
				106	def _schedule_reimage_job(self, url, name, num_machines, board):
				107	"""
				108	Schedules the reimaging of \|num_machines\| \|board\| devices with \|image\|.
				109
				110	Sends an RPC to the autotest frontend to enqueue reimaging jobs on
				111	\|num_machines\| devices of type \|board\|
				112
				113	@param url: the URL of the image to install.
				114	@param name: the name of the image to install (must be unique).
				115	@param num: how many devices to reimage.
				116	@param board: which kind of devices to reimage.
				117	@return a frontend.Job object for the reimaging job we scheduled.
				118	"""
				119	control_file = self._inject_vars(
				120	{ 'image_url': url,
				121	'image_name': name },
				122	self._cf_getter.get_control_file_contents_by_name('autoupdate'))
				123
				124	dargs = { 'control_file': control_file,
				125	'name': name + '-try',
				126	'control_type': 'Server',
				127	'meta_hosts': [board] * num_machines }
				128
				129	return self._afe.create_job(**dargs)
				130
				131
				132	def _report_results(self, job, record):
				133	"""
				134	Record results from a completed frontend.Job object.
				135
				136	@param job: a completed frontend.Job object populated by
				137	frontend.AFE.poll_job_results.
				138	@param record: callable that records job status.
				139	prototype:
				140	record(status, subdir, name, reason)
				141	"""
				142	if job.result == True:
				143	record('GOOD', None, job.name)
				144	return
				145
				146	for platform in job.results_platform_map:
				147	for status in job.results_platform_map[platform]:
				148	if status == 'Total':
				149	continue
				150	for host in job.results_platform_map[platform][status]:
				151	if host not in job.test_status:
				152	record('ERROR', None, host, 'Job failed to run.')
				153	elif status == 'Failed':
				154	for test_status in job.test_status[host].fail:
				155	record('FAIL', None, host, test_status.reason)
				156	elif status == 'Aborted':
				157	for test_status in job.test_status[host].fail:
				158	record('ABORT', None, host, test_status.reason)
				159	elif status == 'Completed':
				160	record('GOOD', None, host)
				161
				162
				163	class Suite(object):
				164	"""
				165	A suite of tests, defined by some predicate over control file variables.
				166
				167	Given a place to search for control files a predicate to match the desired
				168	tests, can gather tests and fire off jobs to run them, and then wait for
				169	results.
				170
				171	@var _predicate: a function that should return True when run over a
				172	ControlData representation of a control file that should be in
				173	this Suite.
				174	@var _tag: a string with which to tag jobs run in this suite.
				175	@var _afe: an instance of AFE as defined in server/frontend.py.
				176	@var _tko: an instance of TKO as defined in server/frontend.py.
				177	@var _jobs: currently scheduled jobs, if any.
				178	@var _cf_getter: a control_file_getter.ControlFileGetter
				179	"""
				180
				181
				182	@classmethod
				183	def create_from_name(cls, name, autotest_dir, afe=None, tko=None):
				184	"""
				185	Create a Suite using a predicate based on the SUITE control file var.
				186
				187	Makes a predicate based on \|name\| and uses it to instantiate a Suite
				188	that looks for tests in \|autotest_dir\| and will schedule them using
				189	\|afe\|. Results will be pulled from \|tko\| upon completion
				190
				191	@param name: a value of the SUITE control file variable to search for.
				192	@param autotest_dir: the place to find autotests.
				193	@param afe: an instance of AFE as defined in server/frontend.py.
				194	@param tko: an instance of TKO as defined in server/frontend.py.
				195	@return a Suite instance.
				196	"""
				197	return Suite(lambda t: hasattr(t, 'suite') and t.suite == name,
				198	name, autotest_dir, afe, tko)
				199
				200
				201	def __init__(self, predicate, tag, autotest_dir, afe=None, tko=None):
				202	"""
				203	Constructor
				204
				205	@param predicate: a function that should return True when run over a
				206	ControlData representation of a control file that should be in
				207	this Suite.
				208	@param tag: a string with which to tag jobs run in this suite.
				209	@param autotest_dir: the place to find autotests.
				210	@param afe: an instance of AFE as defined in server/frontend.py.
				211	@param tko: an instance of TKO as defined in server/frontend.py.
				212	"""
				213	self._predicate = predicate
				214	self._tag = tag
				215	self._afe = afe or frontend.AFE(debug=False)
				216	self._tko = tko or frontend.TKO(debug=False)
				217	self._jobs = []
				218
				219	# currently hard-coded places to look for tests.
				220	subpaths = [ 'server/site_tests', 'client/site_tests']
				221	directories = [ os.path.join(autotest_dir, p) for p in subpaths ]
				222	self._cf_getter = control_file_getter.FileSystemGetter(directories)
				223
				224	self._tests = Suite.find_and_parse_tests(self._cf_getter,
				225	self._predicate,
				226	add_experimental=True)
				227
				228
				229	@property
				230	def tests(self):
				231	"""
				232	A list of ControlData objects in the suite, with added \|text\| attr.
				233	"""
				234	return self._tests
				235
				236
				237	def stable_tests(self):
				238	"""
				239	\|self.tests\|, filtered for non-experimental tests.
				240	"""
				241	return filter(lambda t: not t.experimental, self.tests)
				242
				243
				244	def unstable_tests(self):
				245	"""
				246	\|self.tests\|, filtered for experimental tests.
				247	"""
				248	return filter(lambda t: t.experimental, self.tests)
				249
				250
				251	def _create_job(self, test, image_name):
				252	"""
				253	Thin wrapper around frontend.AFE.create_job().
				254
				255	@param test: ControlData object for a test to run.
				256	@param image_name: the name of an image against which to test.
				257	@return frontend.Job object for the job just scheduled.
				258	"""
				259	return self._afe.create_job(
				260	control_file=test.text,
				261	name='/'.join([image_name, self._tag, test.name]),
				262	control_type=test.test_type.capitalize(),
				263	meta_hosts=[VERSION_PREFIX+image_name])
				264
				265
				266	def run_and_wait(self, image_name, record, add_experimental=True):
				267	"""
				268	Synchronously run tests in \|self.tests\|.
				269
				270	Schedules tests against a device running image \|image_name\|, and
				271	then polls for status, using \|record\| to print status when each
				272	completes.
				273
				274	Tests returned by self.stable_tests() will always be run, while tests
				275	in self.unstable_tests() will only be run if \|add_experimental\| is true.
				276
				277	@param image_name: the name of an image against which to test.
				278	@param record: callable that records job status.
				279	prototype:
				280	record(status, subdir, name, reason)
				281	@param add_experimental: schedule experimental tests as well, or not.
				282	"""
				283	try:
				284	record('START', None, self._tag)
				285	self.schedule(image_name, add_experimental)
				286	try:
				287	for result in self.wait_for_results():
				288	record(*result)
				289	record('END GOOD', None, None)
				290	except Exception as e:
				291	logging.error(e)
				292	record('END ERROR', None, None, 'Exception waiting for results')
				293	except Exception as e:
				294	logging.error(e)
				295	record('END ERROR', None, None, 'Exception while scheduling suite')
				296
				297
				298	def schedule(self, image_name, add_experimental=True):
				299	"""
				300	Schedule jobs using \|self._afe\|.
				301
				302	frontend.Job objects representing each scheduled job will be put in
				303	\|self._jobs\|.
				304
				305	@param image_name: the name of an image against which to test.
				306	@param add_experimental: schedule experimental tests as well, or not.
				307	"""
				308	for test in self.stable_tests():
				309	logging.debug('Scheduling %s', test.name)
				310	self._jobs.append(self._create_job(test, image_name))
				311
				312	if add_experimental:
				313	# TODO(cmasone): ensure I can log results from these differently.
				314	for test in self.unstable_tests():
				315	logging.debug('Scheduling %s', test.name)
				316	self._jobs.append(self._create_job(test, image_name))
				317
				318
				319	def _status_is_relevant(self, status):
				320	"""
				321	Indicates whether the status of a given test is meaningful or not.
				322
				323	@param status: frontend.TestStatus object to look at.
				324	@return True if this is a test result worth looking at further.
				325	"""
				326	return not (status.test_name.startswith('SERVER_JOB') or
				327	status.test_name.startswith('CLIENT_JOB'))
				328
				329
				330	def _collate_aborted(self, current_value, entry):
				331	"""
				332	reduce() over a list of HostQueueEntries for a job; True if any aborted.
				333
				334	Functor that can be reduced()ed over a list of
				335	HostQueueEntries for a job. If any were aborted
				336	(\|entry.aborted\| exists and is True), then the reduce() will
				337	return True.
				338
				339	Ex:
				340	entries = self._afe.run('get_host_queue_entries', job=job.id)
				341	reduce(self._collate_aborted, entries, False)
				342
				343	@param current_value: the current accumulator (a boolean).
				344	@param entry: the current entry under consideration.
				345	@return the value of \|entry.aborted\| if it exists, False if not.
				346	"""
				347	return current_value or ('aborted' in entry and entry['aborted'])
				348
				349
				350	def wait_for_results(self):
				351	"""
				352	Wait for results of all tests in all jobs in \|self._jobs\|.
				353
				354	Currently polls for results every 5s. When all results are available,
				355	@return a list of tuples, one per test: (status, subdir, name, reason)
				356	"""
				357	results = []
				358	while self._jobs:
				359	for job in list(self._jobs):
				360	if not self._afe.get_jobs(id=job.id, finished=True):
				361	continue
				362
				363	self._jobs.remove(job)
				364
				365	entries = self._afe.run('get_host_queue_entries', job=job.id)
				366	if reduce(self._collate_aborted, entries, False):
				367	results.append(('ABORT', None, job.name))
				368	else:
				369	statuses = self._tko.get_status_counts(job=job.id)
				370	for s in filter(self._status_is_relevant, statuses):
				371	results.append((s.status, None, s.test_name, s.reason))
				372	time.sleep(5)
				373
				374	return results
				375
				376
				377	@classmethod
				378	def find_and_parse_tests(cls, cf_getter, predicate, add_experimental=False):
				379	"""
				380	Function to scan through all tests and find eligible tests.
				381
				382	Looks at control files returned by _cf_getter.get_control_file_list()
				383	for tests that pass self._predicate().
				384
				385	@param cf_getter: a control_file_getter.ControlFileGetter used to list
				386	and fetch the content of control files
				387	@param predicate: a function that should return True when run over a
				388	ControlData representation of a control file that should be in
				389	this Suite.
				390	@param add_experimental: add tests with experimental attribute set.
				391
				392	@return list of ControlData objects that should be run, with control
				393	file text added in \|text\| attribute.
				394	"""
				395	tests = {}
				396	files = cf_getter.get_control_file_list()
				397	for file in files:
				398	text = cf_getter.get_control_file_contents(file)
				399	try:
				400	found_test = control_data.parse_control_string(text,
				401	raise_warnings=True)
				402	if not add_experimental and found_test.experimental:
				403	continue
				404
				405	found_test.text = text
				406	tests[file] = found_test
				407	except control_data.ControlVariableException, e:
				408	logging.warn("Skipping %s\n%s", file, e)
				409	except Exception, e:
				410	logging.error("Bad %s\n%s", file, e)
				411
				412	return [test for test in tests.itervalues() if predicate(test)]