Blame - server/cros/goofy_client.py - platform/external/autotest

blob: a0d6aae9299121d8f249877db1b4431602054b02 [file] [log] [blame]

beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame^]	1	# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
				5	import collections
				6	import httplib
				7	import logging
				8	import os
				9	import re
				10	import socket
				11	import time
				12
				13	import common
				14
				15	from autotest_lib.client.common_lib import error
				16	from autotest_lib.client.common_lib.cros import retry
				17	from autotest_lib.server import utils
				18
				19
				20	GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
				21	GOOFY_RUNNING = 'RUNNING'
				22
				23
				24	class GoofyProxyException(Exception):
				25	"""Exception raised when a goofy rpc fails."""
				26	pass
				27
				28
				29	class GoofyProxy(object):
				30	"""Client capable of making rpc calls to goofy.
				31
				32	Methods of this class that can cause goofy to change state
				33	usually need a retry decorator. Methods that have a retry decorator
				34	need to be 'pure', i.e return the same results when called multiple
				35	times with the same argument.
				36
				37	There are 2 known exceptions this class can deal with, a socket.error
				38	which happens when we try to execute an rpc when the DUT is, say, suspended
				39	and a BadStatusLine, which we get when we try to execute an rpc while the
				40	DUT is going through a factory_restart. Ideally we would like to handle
				41	socket timeouts different from BadStatusLines as we can get connection
				42	errors even when a device reboots and BadStatusLines ususally only when
				43	factory restarts. crbug.com/281714.
				44	"""
				45
				46	# This timeout was arbitrarily chosen as many tests in the factory test
				47	# suite run for days. Ideally we would like to split this into at least 2
				48	# timeouts, one which we use for rpcs that run while no other test is,
				49	# running and is smaller than the second that is designed for use with rpcs
				50	# that might execute simultaneously with a test. The latter needs a longer
				51	# timeout since tests could suspend,resume for a long time, and a call like
				52	# GetGoofyStatus should be tolerant to these suspend/resumes. In designing
				53	# the base timeout one needs to allocate time to component methods of this
				54	# class (such as _set_test_list) as a multiple of the number of rpcs it
				55	# executes.
				56	BASE_RPC_TIMEOUT = 1440
				57	POLLING_INTERVAL = 5
				58	FACTORY_BUG_RE = r'.(/tmp/factory_bug.tar.bz2).*'
				59	UNTAR_COMMAND = 'tar jxf %s -C %s'
				60
				61
				62	def __init__(self, host):
				63	"""
				64	@param host: The host object representing the DUT running goofy.
				65	"""
				66	self._host = host
				67	self._client = host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT)
				68
				69
				70	@retry.retry((httplib.BadStatusLine, socket.error),
				71	timeout_min=BASE_RPC_TIMEOUT)
				72	def _get_goofy_status(self):
				73	"""Return status of goofy, ignoring socket timeouts and http exceptions.
				74	"""
				75	status = self._client.GetGoofyStatus().get('status')
				76	return status
				77
				78
				79	def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
				80	"""Wait till goofy is running or a timeout occurs.
				81
				82	@param timeout_min: Minutes to wait before timing this call out.
				83	"""
				84	current_time = time.time()
				85	timeout_secs = timeout_min * 60
				86	logging.info('Waiting on goofy')
				87	while self._get_goofy_status() != GOOFY_RUNNING:
				88	if time.time() - current_time > timeout_secs:
				89	break
				90	return
				91
				92
				93	@retry.retry(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
				94	def _set_test_list(self, next_list):
				95	"""Set the given test list for execution.
				96
				97	Confirm that the given test list is a test that has been baked into
				98	the image, then run it. Some test lists are configured to start
				99	execution automatically when we call SetTestList, while others wait
				100	for a corresponding RunTest.
				101
				102	@param next_list: The name of the test list.
				103
				104	@raise jsonrpclib.ProtocolError: If the test list we're trying to switch
				105	to isn't on the DUT.
				106	"""
				107
				108	# As part of SwitchTestList we perform a factory restart,
				109	# which will throw a BadStatusLine. We don't want to retry
				110	# on this exception though, as that will lead to setting the same
				111	# test list over and over till the timeout expires. If the test
				112	# list is not already on the DUT this method will fail, emitting
				113	# the possible test lists one can switch to.
				114	try:
				115	self._client.SwitchTestList(next_list)
				116	except httplib.BadStatusLine:
				117	logging.info('Switched to list %s, goofy restarting', next_list)
				118	pass
				119
				120
				121	@retry.retry((httplib.BadStatusLine, socket.error),
				122	timeout_min=BASE_RPC_TIMEOUT*2)
				123	def _stop_running_tests(self):
				124	"""Stop all running tests.
				125
				126	Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
				127	is suspended or rebooting.
				128	"""
				129	logging.info('Stopping tests.')
				130	self._client.StopTest()
				131
				132
				133	def _get_test_map(self):
				134	"""Get a mapping of test suites -> tests.
				135
				136	Ignore entries for tests that don't have a path.
				137
				138	@return: A dictionary of the form
				139	{'suite_name': ['suite_name.path_to_test', ...]}.
				140	"""
				141	test_all = set([test['path'] for test in self._client.GetTests()
				142	if test.get('path')])
				143
				144	test_map = collections.defaultdict(list)
				145	for names in test_all:
				146	test_map[names.split('.')[0]].append(names)
				147	return test_map
				148
				149
				150	def _log_test_results(self, test_status, current_suite):
				151	"""Format test status results and write them to status.log.
				152
				153	@param test_status: The status dictionary of a single test.
				154	@param current_suite: The current suite name.
				155	"""
				156	try:
				157	self._host.job.record('INFO', None, None,
				158	'suite %s, test %s, status: %s' %
				159	(current_suite, test_status.get('path'),
				160	test_status.get('status')))
				161	except AttributeError as e:
				162	logging.error('Could not gather results for current test: %s', e)
				163
				164
				165	@retry.retry((httplib.BadStatusLine, socket.error),
				166	timeout_min=BASE_RPC_TIMEOUT*2)
				167	def _get_test_info(self, test_name):
				168	"""Get the status of one test.
				169
				170	@param test_name: The name of the test we need the status of.
				171
				172	@return: The entry for the test in the status dictionary.
				173	"""
				174	for test in self._client.GetTests():
				175	if test['path'] == test_name:
				176	return test
				177	raise ValueError('Could not find test_name %s in _get_test_info.' %
				178	test_name)
				179
				180
				181	def _wait_on_barrier(self, barrier_name):
				182	"""Wait on a barrier.
				183
				184	This method is designed to wait on the Barrier of a suite. A Barrier
				185	is used to synchronize several tests that run in parallel within a
				186	suite; it will cause the suite to hang while it attempts to show
				187	an operator the status of each test, and is activated once all the
				188	tests in the suite are done.
				189
				190	@param barrier_name: The name of the barrier.
				191	"""
				192	logging.info('Waiting on barrier %s', barrier_name)
				193
				194	# TODO(beeps): crbug.com/279473
				195	while self._get_test_info(barrier_name)['status'] != 'ACTIVE':
				196	time.sleep(self.POLLING_INTERVAL)
				197
				198
				199	def _wait_on_suite(self, suite_name):
				200	"""Wait till a suite stops being active.
				201
				202	This method is designed to wait on the suite to change
				203	status if it lacks a 'Barrier'. If a suite has a barrier
				204	one should use _wait_on_barrier instead.
				205
				206	@param suite_name: The name of the suite to wait on.
				207	"""
				208	logging.info('Waiting on suite %s', suite_name)
				209
				210	while self._get_test_info(suite_name)['status'] == 'ACTIVE':
				211	time.sleep(self.POLLING_INTERVAL)
				212
				213
				214	def _synchronous_run_suite(self, suite_name, barrier_name=None):
				215	"""Run one suite and wait for it to finish.
				216
				217	Will wait till the specified suite_name becomes active,
				218	then wait till it switches out of active. If the suite
				219	has a barrier, will wait till the barrier becomes active
				220	instead, as this indicates that all tests have finished
				221	running.
				222
				223	@param suite_name: The name of the suite to wait for.
				224	@param barrier_name: The name of the barrier, if any.
				225
				226	@raises GoofyProxyException: If the status of the suite
				227	doesn't switch to active after we call RunTest.
				228
				229	@return: The result of the suite.
				230	"""
				231	self._client.RunTest(suite_name)
				232	result = self._get_test_info(suite_name)
				233
				234	#TODO(beeps): crbug.com/292975
				235	if result['status'] != 'ACTIVE':
				236	raise GoofyProxyException('Not waiting for test list %s. Either we '
				237	'could not start it or the test list '
				238	'already finished.' % suite_name)
				239
				240	if barrier_name:
				241	self._wait_on_barrier(barrier_name)
				242	else:
				243	self._wait_on_suite(suite_name)
				244
				245	# Since the barrier itself counts as a 'test' we need to stop
				246	# it before asking goofy for the suites results, or goofy will
				247	# think that the suite is still running. We also need to stop
				248	# any orphaned test that might have been kicked off during this
				249	# suite.
				250	self._stop_running_tests()
				251	return self._get_test_info(suite_name)
				252
				253
				254	def monitor_tests(self, test_list):
				255	"""Run a test list.
				256
				257	Will run each suite in the given list in sequence, starting each one
				258	by name and waiting on its results. This method makes the following
				259	assumptions:
				260	- A test list is made up of self contained suites.
				261	- These suites trigger several things in parallel.
				262	- After a suite finishes it leaves goofy in an idle state.
				263
				264	It is not safe to pull results for individual tests during the suite
				265	as the device could be rebooting, or goofy could be under stress.
				266	Instead, this method synchronously waits on an entire suite, then
				267	asks goofy for the status of each test in the suite. Since certain
				268	test lists automatically start and others don't, this method stops
				269	test list execution regardless, and sequentially triggers each suite.
				270
				271	@param test_list: The test list to run.
				272	"""
				273	self._set_test_list(test_list)
				274	self._wait_for_goofy()
				275	self._stop_running_tests()
				276
				277	test_map = self._get_test_map()
				278	for current_suite in test_map.keys():
				279	logging.info('Processing suite %s', current_suite)
				280
				281	# Check if any of these tests are actually a Barrier.
				282	barrier = None
				283	for test in test_map.get(current_suite):
				284	if '.' in test and 'Barrier' in test.split('.')[1]:
				285	barrier = test
				286	break
				287
				288	logging.info('Current suite = %s, barrier: %s', current_suite,
				289	barrier)
				290
				291	result = self._synchronous_run_suite(current_suite, barrier)
				292	logging.info(result)
				293
				294	for test_names in test_map.get(current_suite):
				295	self._log_test_results(self._get_test_info(test_names),
				296	current_suite)
				297
				298
				299	@retry.retry((httplib.BadStatusLine, socket.timeout), timeout_min=1)
				300	def get_results(self, resultsdir):
				301	"""Copies results from the DUT to a local results directory.
				302
				303	Copy the tarball over to the results folder, untar, and delete the
				304	tarball if everything was successful. This will effectively place
				305	all the logs relevant to factory testing in the job's results folder.
				306
				307	@param resultsdir: The directory in which to untar the contents of the
				308	tarball factory_bug generates.
				309	"""
				310	logging.info('Getting results logs for test_list.')
				311
				312	try:
				313	factory_bug_log = self._host.run('factory_bug').stderr
				314	except error.CmdError as e:
				315	logging.error('Could not execute factory_bug: %s', e)
				316	return
				317
				318	try:
				319	factory_bug_tar = re.match(self.FACTORY_BUG_RE,
				320	factory_bug_log).groups(1)[0]
				321	except (IndexError, AttributeError):
				322	logging.error('could not collect logs for factory results, '
				323	'factory bug returned %s', factory_bug_log)
				324	return
				325
				326	factory_bug_tar_file = os.path.basename(factory_bug_tar)
				327	local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
				328
				329	try:
				330	self._host.get_file(factory_bug_tar, local_factory_bug_tar)
				331	except error.AutoservRunError as e:
				332	logging.error('Failed to pull back the results tarball: %s', e)
				333	return
				334
				335	try:
				336	utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
				337	except error.CmdError as e:
				338	logging.error('Failed to untar the results tarball: %s', e)
				339	return
				340	finally:
				341	if os.path.exists(local_factory_bug_tar):
				342	os.remove(local_factory_bug_tar)
				343
				344