# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import httplib
import logging
import os
import re
import socket
import time

import common

from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.server import utils


GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
GOOFY_RUNNING = 'RUNNING'


class GoofyProxyException(Exception):
    """Exception raised when a goofy rpc fails."""
    pass


class GoofyRuntimeException(Exception):
    """Exception raised when something goes wrong while a test is running."""
    pass


def retry_goofy_rpc(exception_tuple, timeout_min=30):
    """A decorator to use with goofy rpcs.

    This decorator tries to recreate the goofy client proxy on
    socket error. It will continue trying to do so until it
    executes the method without any socket errors or until the
    retry.retry decorator hits its timeout.

    Usage:
        If you just want to recreate the proxy:
        1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
                            timeout_min=<timeout>)
        2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
        Note: you need to specify the socket.error exception because we
        want to retry the call after recreating the proxy.

    @param exception_tuple: A tuple of exceptions to pass to
        the retry decorator. Any of these exceptions will result
        in retries for the duration of timeout_min.
    @param timeout_min: The timeout, in minutes, for which we should
        retry the method ignoring any exception in exception_tuple.
    """
    def inner_decorator(method):
        """Inner retry decorator applied to the method.

        @param method: The method that needs to be wrapped in the decorator.

        @return A wrapper function that implements the retry.
        """

        @retry.retry(exception_tuple, timeout_min=timeout_min)
        def wrapper(*args, **kwargs):
            """This wrapper handles socket errors.

            If the method in question:
            1. Throws an exception in exception_tuple and it is not a
               socket.error, retry for timeout_min through retry.retry.
            2. Throws a socket.error, recreate the client proxy, and
               retry for timeout_min through retry.retry.
            3. Throws an exception not in exception_tuple, fail.
            """
            try:
                return method(*args, **kwargs)
            except socket.error as e:
                goofy_proxy = args[0]
                if isinstance(goofy_proxy, GoofyProxy):
                    logging.warning('Socket error while running factory '
                                    'tests: %s. Recreating goofy proxy.', e)
                    goofy_proxy._create_client_proxy(timeout_min=timeout_min)
                else:
                    logging.warning('Connectivity was lost and the retry '
                                    'decorator was unable to recreate a goofy '
                                    'client proxy, args: %s.', args)
                raise

        return wrapper

    return inner_decorator
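
# A minimal usage sketch (the method below is hypothetical, for illustration
# only): on a socket.error the wrapper recreates the client proxy, then
# retry.retry re-invokes the rpc until it succeeds or timeout_min expires.
#
#     class GoofyProxy(object):
#         @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
#                          timeout_min=5)
#         def _example_status_rpc(self):
#             return self._client.GetGoofyStatus().get('status')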


class GoofyProxy(object):
    """Client capable of making rpc calls to goofy.

    Methods of this class that can cause goofy to change state
    usually need a retry decorator. Methods that have a retry decorator
    need to be 'pure', i.e. return the same results when called multiple
    times with the same argument.

    There are 2 known exceptions this class can deal with: a socket.error,
    which happens when we try to execute an rpc while the DUT is, say,
    suspended, and a BadStatusLine, which we get when we try to execute an
    rpc while the DUT is going through a factory_restart. Ideally we would
    like to handle socket timeouts differently from BadStatusLines, as we
    can get connection errors even when a device reboots, but BadStatusLines
    usually only when factory restarts. crbug.com/281714.
    """

    # This timeout was arbitrarily chosen as many tests in the factory test
    # suite run for days. Ideally we would like to split this into at least
    # 2 timeouts: one, which we use for rpcs that run while no other test is
    # running, and a second, larger one designed for use with rpcs that
    # might execute simultaneously with a test. The latter needs a longer
    # timeout since tests could suspend and resume for a long time, and a
    # call like GetGoofyStatus should be tolerant to these suspend/resumes.
    # In designing the base timeout one needs to allocate time to component
    # methods of this class (such as _set_test_list) as a multiple of the
    # number of rpcs each executes.
    BASE_RPC_TIMEOUT = 1440
    POLLING_INTERVAL = 5
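    # factory_bug writes the path of the tarball it creates to stderr;
    # FACTORY_BUG_RE extracts that path from output such as (a made-up
    # example line): '... created file /tmp/factory_bug.1234.tar.bz2 ...'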
    FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
    UNTAR_COMMAND = 'tar jxf %s -C %s'


    def __init__(self, host):
        """
        @param host: The host object representing the DUT running goofy.
        """
        self._host = host
        self._raw_stop_running_tests()
        self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)


    def _create_client_proxy(self, timeout_min=30):
        """Create a goofy client proxy.

        Ping the host till it's up, then proceed to create a goofy proxy. We
        don't wrap this method with a retry because it's used in the retry
        decorator itself.
        """

        # We don't ssh ping here because an ssh command against a sleeping
        # DUT can be delayed O(minutes) until it wakes up, which would incur
        # significant overhead over many reboots.
        self._host.ping_wait_up(timeout_min)
        logging.info('Host is pingable, creating goofy client proxy')
        self._client = self._host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT)


    @retry.retry((httplib.BadStatusLine, socket.error),
                 timeout_min=BASE_RPC_TIMEOUT)
    def _raw_stop_running_tests(self):
        """Stop running tests by shelling out to the DUT.

        Use this method only before we have actually created the client
        proxy, as shelling out has several pitfalls. We need to stop all
        tests in a retry loop because tests will start executing as soon
        as we have reimaged a DUT, and trying to create the proxy while
        the DUT is rebooting will lead to a spurious failure.

        Note that we use the plain retry decorator for this method since
        we don't need to recreate the client proxy on failure.
        """
        logging.info('Stopping all tests and clearing factory state')
        self._host.run('factory clear')


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT)
    def _get_goofy_status(self):
        """Return status of goofy, ignoring socket timeouts and http
        exceptions.
        """
        status = self._client.GetGoofyStatus().get('status')
        return status


    def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
        """Wait till goofy is running or a timeout occurs.

        @param timeout_min: Minutes to wait before timing this call out.
        """
        start_time = time.time()
        timeout_secs = timeout_min * 60
        logging.info('Waiting on goofy')
        while self._get_goofy_status() != GOOFY_RUNNING:
            if time.time() - start_time > timeout_secs:
                break
            # Sleep between polls so we don't hammer goofy with rpcs.
            time.sleep(self.POLLING_INTERVAL)


    @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
    def _set_test_list(self, next_list):
        """Set the given test list for execution.

        Confirm that the given test list is a test list that has been baked
        into the image, then run it. Some test lists are configured to start
        execution automatically when we call SetTestList, while others wait
        for a corresponding RunTest.

        @param next_list: The name of the test list.

        @raise jsonrpclib.ProtocolError: If the test list we're trying to
            switch to isn't on the DUT.
        """

        # We can get a BadStatusLine on 2 occasions:
        # 1. As part of SwitchTestList goofy performs a factory restart,
        #    which will throw a BadStatusLine because the rpc can't exit
        #    cleanly. We don't want to retry on this exception, since we've
        #    already set the right test list.
        # 2. If we try to set a test list while goofy is already down
        #    (from a previous factory restart). In this case we wouldn't
        #    have set the new test list, because we couldn't connect to
        #    goofy.
        # To properly set a new test list it's important to wait till goofy
        # is up before attempting to set the test list, while being aware
        # that the preceding httplib error is from the rpc we just executed
        # leading to a factory restart. Also note that if the test list is
        # not already on the DUT this method will fail, emitting the
        # possible test lists one can switch to.
        self._wait_for_goofy()
        logging.info('Switching to test list %s', next_list)
        try:
            self._client.SwitchTestList(next_list)
        except httplib.BadStatusLine:
            logging.info('Switched to list %s, goofy restarting', next_list)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _stop_running_tests(self):
        """Stop all running tests.

        Wrap the StopTest rpc so we can attempt to stop tests even while a
        DUT is suspended or rebooting.
        """
        logging.info('Stopping tests.')
        self._client.StopTest()


    def _get_test_map(self):
        """Get a mapping of test suites -> tests.

        Ignore entries for tests that don't have a path.

        @return: A dictionary of the form
            {'suite_name': ['suite_name.path_to_test', ...]}.
        """
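        # For example (hypothetical goofy test paths), entries 'SMT.Start'
        # and 'SMT.Barrier' would both land under the 'SMT' key:
        #     {'SMT': ['SMT.Start', 'SMT.Barrier']}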
        test_all = set([test['path'] for test in self._client.GetTests()
                        if test.get('path')])

        test_map = collections.defaultdict(list)
        for name in test_all:
            test_map[name.split('.')[0]].append(name)
        return test_map


    def _log_test_results(self, test_status, current_suite):
        """Format test status results and write them to status.log.

        @param test_status: The status dictionary of a single test.
        @param current_suite: The current suite name.
        """
        try:
            self._host.job.record('INFO', None, None,
                                  'suite %s, test %s, status: %s' %
                                  (current_suite, test_status.get('path'),
                                   test_status.get('status')))
        except AttributeError as e:
            logging.error('Could not gather results for current test: %s', e)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _get_test_info(self, test_name):
        """Get the status of one test.

        @param test_name: The name of the test we need the status of.

        @return: The entry for the test in the status dictionary.
        """
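        # Each entry returned by the GetTests rpc is a dict; the fields this
        # module relies on are 'path' and 'status', e.g. (values made up):
        #     {'path': 'SMT.Start', 'status': 'ACTIVE', ...}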
        for test in self._client.GetTests():
            if test['path'] == test_name:
                return test
        raise ValueError('Could not find test_name %s in _get_test_info.' %
                         test_name)


    def _wait_on_barrier(self, barrier_name):
        """Wait on a barrier.

        This method is designed to wait on the Barrier of a suite. A Barrier
        is used to synchronize several tests that run in parallel within a
        suite; it will cause the suite to hang while it attempts to show
        an operator the status of each test, and is activated once all the
        tests in the suite are done.

        @param barrier_name: The name of the barrier.
        """
        logging.info('Waiting on barrier %s', barrier_name)

        # TODO(beeps): crbug.com/279473
        while self._get_test_info(barrier_name)['status'] != 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _wait_on_suite(self, suite_name):
        """Wait till a suite stops being active.

        This method is designed to wait on the suite to change
        status if it lacks a 'Barrier'. If a suite has a barrier
        one should use _wait_on_barrier instead.

        @param suite_name: The name of the suite to wait on.
        """
        logging.info('Waiting on suite %s', suite_name)

        while self._get_test_info(suite_name)['status'] == 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _synchronous_run_suite(self, suite_name, barrier_name=None):
        """Run one suite and wait for it to finish.

        Will wait till the specified suite_name becomes active,
        then wait till it switches out of active. If the suite
        has a barrier, will wait till the barrier becomes active
        instead, as this indicates that all tests have finished
        running.

        @param suite_name: The name of the suite to wait for.
        @param barrier_name: The name of the barrier, if any.

        @raise GoofyProxyException: If the status of the suite
            doesn't switch to active after we call RunTest.

        @return: The result of the suite.
        """
        logging.info('Starting suite: %s', suite_name)
        self._client.RunTest(suite_name)
        result = self._get_test_info(suite_name)

        # TODO(beeps): crbug.com/292975
        if result['status'] != 'ACTIVE':
            raise GoofyProxyException('Not waiting for test list %s. Either '
                                      'we could not start it or the test '
                                      'list already finished.' % suite_name)

        if barrier_name:
            self._wait_on_barrier(barrier_name)
        else:
            self._wait_on_suite(suite_name)

        # Since the barrier itself counts as a 'test' we need to stop
        # it before asking goofy for the suite's results, or goofy will
        # think that the suite is still running. We also need to stop
        # any orphaned test that might have been kicked off during this
        # suite.
        self._stop_running_tests()
        return self._get_test_info(suite_name)


    def monitor_tests(self, test_list):
        """Run a test list.

        Will run each suite in the given list in sequence, starting each one
        by name and waiting on its results. This method makes the following
        assumptions:
            - A test list is made up of self-contained suites.
            - These suites trigger several things in parallel.
            - After a suite finishes it leaves goofy in an idle state.

        It is not safe to pull results for individual tests during the suite
        as the device could be rebooting, or goofy could be under stress.
        Instead, this method synchronously waits on an entire suite, then
        asks goofy for the status of each test in the suite. Since certain
        test lists automatically start and others don't, this method stops
        test list execution regardless, and sequentially triggers each suite.

        @param test_list: The test list to run.
        """
        self._set_test_list(test_list)
        self._wait_for_goofy()
        self._stop_running_tests()

        test_map = self._get_test_map()
        if test_map:
            logging.info('About to execute tests: %s', test_map)
        else:
            raise GoofyRuntimeException('Test map is empty, you might have '
                                        'an error in your test_list.')

        for current_suite in test_map.keys():
            logging.info('Processing suite %s', current_suite)

            # Check if any of these tests are actually a Barrier.
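            # A barrier path has the form '<suite>.<test>' where the test
            # component contains 'Barrier', e.g. 'RunIn.BarrierRunIn' (a
            # hypothetical name).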
            barrier = None
            for test in test_map.get(current_suite):
                if '.' in test and 'Barrier' in test.split('.')[1]:
                    barrier = test
                    break

            logging.info('Current suite = %s, barrier: %s', current_suite,
                         barrier)

            result = self._synchronous_run_suite(current_suite, barrier)
            logging.info(result)

            for test_name in test_map.get(current_suite):
                self._log_test_results(self._get_test_info(test_name),
                                       current_suite)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def get_results(self, resultsdir):
        """Copy results from the DUT to a local results directory.

        Copy the tarball over to the results folder, untar, and delete the
        tarball if everything was successful. This will effectively place
        all the logs relevant to factory testing in the job's results
        folder.

        @param resultsdir: The directory in which to untar the contents of
            the tarball factory_bug generates.
        """
        logging.info('Getting results logs for test_list.')

        try:
            factory_bug_log = self._host.run('factory_bug').stderr
        except error.CmdError as e:
            logging.error('Could not execute factory_bug: %s', e)
            return

        try:
            factory_bug_tar = re.match(self.FACTORY_BUG_RE,
                                       factory_bug_log).group(1)
        except (IndexError, AttributeError):
            logging.error('Could not collect logs for factory results, '
                          'factory_bug returned %s', factory_bug_log)
            return

        factory_bug_tar_file = os.path.basename(factory_bug_tar)
        local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)

        try:
            self._host.get_file(factory_bug_tar, local_factory_bug_tar)
        except error.AutoservRunError as e:
            logging.error('Failed to pull back the results tarball: %s', e)
            return

        try:
            utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
        except error.CmdError as e:
            logging.error('Failed to untar the results tarball: %s', e)
            return
        finally:
            if os.path.exists(local_factory_bug_tar):
                os.remove(local_factory_bug_tar)
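

# Typical use, as a sketch (assumes 'host' is an autotest host object for a
# DUT running goofy, e.g. from hosts.create_host, 'generic_main' is the name
# of a test list baked into the image, and 'resultsdir' is an existing local
# directory; all three are illustrative):
#
#     proxy = GoofyProxy(host)
#     proxy.monitor_tests('generic_main')
#     proxy.get_results(resultsdir)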