Blame - server/cros/goofy_client.py - platform/external/autotest

blob: c183960c607f6760269d128276354b748e92a8ff [file] [log] [blame]

beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	1	# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
				5	import collections
				6	import httplib
				7	import logging
				8	import os
				9	import re
				10	import socket
				11	import time
				12
				13	import common
				14
				15	from autotest_lib.client.common_lib import error
				16	from autotest_lib.client.common_lib.cros import retry
				17	from autotest_lib.server import utils
				18
				19
				20	GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
				21	GOOFY_RUNNING = 'RUNNING'
				22
				23
				24	class GoofyProxyException(Exception):
				25	"""Exception raised when a goofy rpc fails."""
				26	pass
				27
				28
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	29	class GoofyRuntimeException(Exception):
				30	"""Exception raised when something goes wrong while a test is running."""
				31	pass
				32
				33
				34	def retry_goofy_rpc(exception_tuple, timeout_min=30):
				35	"""A decorator to use with goofy rpcs.
				36
				37	This decorator tries to recreate the goofy client proxy on
				38	socket error. It will continue trying to do so until it
				39	executes the method without any socket errors or till the
				40	retry.retry decorator hits it's timeout.
				41
				42	Usage:
				43	If you just want to recreate the proxy:
				44	1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
				45	timeout_min=<timeout>)
				46	2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
				47	Note: you need to specify the socket.error exception because we
				48	want to retry the call after recreating the proxy.
				49
				50	@param exception_tuple: A tuple of exceptions to pass to
				51	the retry decorator. Any of these exceptions will result
				52	in retries for the duration of timeout_min.
				53	@param timeout_min: The timeout, in minutes, for which we should
				54	retry the method ignoring any exception in exception_tuple.
				55	"""
				56	def inner_decorator(method):
				57	"""Inner retry decorator applied to the method.
				58
				59	@param method: The method that needs to be wrapped in the decorator.
				60
				61	@return A wrapper function that implements the retry.
				62	"""
				63
				64	@retry.retry(exception_tuple, timeout_min=timeout_min)
				65	def wrapper(args, *kwargs):
				66	"""This wrapper handles socket errors.
				67
				68	If the method in question:
				69	1. Throws an exception in exception_tuple and it is not a
				70	socket.error, retry for timeout_min through retry.retry.
				71	2. Throws a socket.error, recreate the client proxy, and
				72	retry for timeout_min through retry.retry.
				73	3. Throws an exception not in exception_tuple, fail.
				74	"""
				75	try:
				76	return method(args, *kwargs)
				77	except socket.error as e:
				78	goofy_proxy = args[0]
				79	if type(goofy_proxy) is GoofyProxy:
				80	logging.warning('Socket error while running factory tests '
				81	'%s, recreating goofy proxy.', e)
				82	goofy_proxy._create_client_proxy(timeout_min=timeout_min)
				83	else:
				84	logging.warning('Connectivity was lost and the retry '
				85	'decorator was unable to recreate a goofy '
				86	'client proxy, args: %s.', args)
				87	raise
				88
				89	return wrapper
				90
				91	return inner_decorator
				92
				93
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	94	class GoofyProxy(object):
				95	"""Client capable of making rpc calls to goofy.
				96
				97	Methods of this class that can cause goofy to change state
				98	usually need a retry decorator. Methods that have a retry decorator
				99	need to be 'pure', i.e return the same results when called multiple
				100	times with the same argument.
				101
				102	There are 2 known exceptions this class can deal with, a socket.error
				103	which happens when we try to execute an rpc when the DUT is, say, suspended
				104	and a BadStatusLine, which we get when we try to execute an rpc while the
				105	DUT is going through a factory_restart. Ideally we would like to handle
				106	socket timeouts different from BadStatusLines as we can get connection
				107	errors even when a device reboots and BadStatusLines ususally only when
				108	factory restarts. crbug.com/281714.
				109	"""
				110
				111	# This timeout was arbitrarily chosen as many tests in the factory test
				112	# suite run for days. Ideally we would like to split this into at least 2
				113	# timeouts, one which we use for rpcs that run while no other test is,
				114	# running and is smaller than the second that is designed for use with rpcs
				115	# that might execute simultaneously with a test. The latter needs a longer
				116	# timeout since tests could suspend,resume for a long time, and a call like
				117	# GetGoofyStatus should be tolerant to these suspend/resumes. In designing
				118	# the base timeout one needs to allocate time to component methods of this
				119	# class (such as _set_test_list) as a multiple of the number of rpcs it
				120	# executes.
				121	BASE_RPC_TIMEOUT = 1440
				122	POLLING_INTERVAL = 5
				123	FACTORY_BUG_RE = r'.(/tmp/factory_bug.tar.bz2).*'
				124	UNTAR_COMMAND = 'tar jxf %s -C %s'
				125
				126
				127	def __init__(self, host):
				128	"""
				129	@param host: The host object representing the DUT running goofy.
				130	"""
				131	self._host = host
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	132	self._raw_stop_running_tests()
				133	self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)
				134
				135
				136	def _create_client_proxy(self, timeout_min=30):
				137	"""Create a goofy client proxy.
				138
				139	Ping the host till it's up, then proceed to create a goofy proxy. We
				140	don't wrap this method with a retry because it's used in the retry
				141	decorator itself.
				142	"""
				143
				144	# We don't ssh ping here as there is a potential dealy in O(minutes)
				145	# with our ssh command against a sleeping DUT, once it wakes up, and
				146	# that will lead to significant overhead incurred over many reboots.
				147	self._host.ping_wait_up(timeout_min)
				148	logging.info('Host is pingable, creating goofy client proxy')
Roshan Pius	58e5dd3	2015-10-16 15:16:42 -0700	[diff] [blame]	149	self._client = self._host.rpc_server_tracker.jsonrpc_connect(
				150	GOOFY_JSONRPC_SERVER_PORT)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	151
				152
				153	@retry.retry((httplib.BadStatusLine, socket.error),
				154	timeout_min=BASE_RPC_TIMEOUT)
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	155	def _raw_stop_running_tests(self):
				156	"""Stop running tests by shelling out to the DUT.
				157
				158	Use this method only before we have actually created the client
				159	proxy, as shelling out has several pitfalls. We need to stop all
				160	tests in a retry loop because tests will start executing as soon
				161	as we have reimaged a DUT and trying to create the proxy while
				162	the DUT is rebooting will lead to a spurious failure.
				163
				164	Note that we use the plain retry decorator for this method since
				165	we don't need to recreate the client proxy on failure.
				166	"""
				167	logging.info('Stopping all tests and clearing factory state')
				168	self._host.run('factory clear')
				169
				170
				171	@retry_goofy_rpc((httplib.BadStatusLine, socket.error),
				172	timeout_min=BASE_RPC_TIMEOUT)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	173	def _get_goofy_status(self):
				174	"""Return status of goofy, ignoring socket timeouts and http exceptions.
				175	"""
				176	status = self._client.GetGoofyStatus().get('status')
				177	return status
				178
				179
				180	def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
				181	"""Wait till goofy is running or a timeout occurs.
				182
				183	@param timeout_min: Minutes to wait before timing this call out.
				184	"""
				185	current_time = time.time()
				186	timeout_secs = timeout_min * 60
				187	logging.info('Waiting on goofy')
				188	while self._get_goofy_status() != GOOFY_RUNNING:
				189	if time.time() - current_time > timeout_secs:
				190	break
				191	return
				192
				193
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	194	@retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	195	def _set_test_list(self, next_list):
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	196	"""Set the given test list for execution and turn on test automation.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	197
				198	Confirm that the given test list is a test that has been baked into
				199	the image, then run it. Some test lists are configured to start
				200	execution automatically when we call SetTestList, while others wait
				201	for a corresponding RunTest.
				202
				203	@param next_list: The name of the test list.
				204
				205	@raise jsonrpclib.ProtocolError: If the test list we're trying to switch
				206	to isn't on the DUT.
				207	"""
				208
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	209	# We can get a BadStatus line on 2 occassions:
				210	# 1. As part of SwitchTestList goofy performs a factory restart, which
				211	# will throw a BadStatusLine because the rpc can't exit cleanly. We
				212	# don't want to retry on this exception, since we've already set the
				213	# right test list.
				214	# 2. If we try to set a test list while goofy is already down
				215	# (from a previous factory restart). In this case we wouldn't have
				216	# set the new test list, because we coulnd't connect to goofy.
				217	# To properly set a new test list it's important to wait till goofy is
				218	# up before attempting to set the test list, while being aware that the
				219	# preceding httplib error is from the rpc we just executed leading to
				220	# a factory restart. Also note that if the test list is not already on
				221	# the DUT this method will fail, emitting the possible test lists one
				222	# can switch to.
				223	self._wait_for_goofy()
				224	logging.info('Switching to test list %s', next_list)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	225	try:
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	226	# Enable full factory test automation. Full test automation mode
				227	# skips all manual tests and test barriers, which is what we want in
				228	# the test lab. There are other automation modes: partial and none.
				229	# In partial automation mode manual tests and barrier are enabled
				230	# and user intervention is required; none disables automation.
				231	self._client.SwitchTestList(next_list, 'full')
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	232	except httplib.BadStatusLine:
				233	logging.info('Switched to list %s, goofy restarting', next_list)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	234
				235
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	236	@retry_goofy_rpc((httplib.BadStatusLine, socket.error),
				237	timeout_min=BASE_RPC_TIMEOUT*2)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	238	def _stop_running_tests(self):
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	239	"""Stop all running tests.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	240
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	241	Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
				242	is suspended or rebooting.
				243	"""
				244	logging.info('Stopping tests.')
				245	self._client.StopTest()
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	246
				247
				248	def _get_test_map(self):
				249	"""Get a mapping of test suites -> tests.
				250
				251	Ignore entries for tests that don't have a path.
				252
				253	@return: A dictionary of the form
				254	{'suite_name': ['suite_name.path_to_test', ...]}.
				255	"""
				256	test_all = set([test['path'] for test in self._client.GetTests()
				257	if test.get('path')])
				258
				259	test_map = collections.defaultdict(list)
				260	for names in test_all:
				261	test_map[names.split('.')[0]].append(names)
				262	return test_map
				263
				264
				265	def _log_test_results(self, test_status, current_suite):
				266	"""Format test status results and write them to status.log.
				267
				268	@param test_status: The status dictionary of a single test.
				269	@param current_suite: The current suite name.
				270	"""
				271	try:
				272	self._host.job.record('INFO', None, None,
				273	'suite %s, test %s, status: %s' %
				274	(current_suite, test_status.get('path'),
				275	test_status.get('status')))
				276	except AttributeError as e:
				277	logging.error('Could not gather results for current test: %s', e)
				278
				279
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	280	@retry_goofy_rpc((httplib.BadStatusLine, socket.error),
				281	timeout_min=BASE_RPC_TIMEOUT*2)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	282	def _get_test_info(self, test_name):
				283	"""Get the status of one test.
				284
				285	@param test_name: The name of the test we need the status of.
				286
				287	@return: The entry for the test in the status dictionary.
				288	"""
				289	for test in self._client.GetTests():
				290	if test['path'] == test_name:
				291	return test
				292	raise ValueError('Could not find test_name %s in _get_test_info.' %
				293	test_name)
				294
				295
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	296	@retry_goofy_rpc((httplib.BadStatusLine, socket.error),
				297	timeout_min=BASE_RPC_TIMEOUT*2)
				298	def _get_test_run_info(self, run_id):
				299	"""Get the information about the given test run.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	300
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	301	@param run_id: The ID of the test run.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	302
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	303	@return: A dict of test run status.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	304	"""
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	305	return self._client.GetTestRunStatus(run_id)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	306
				307
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	308	def _wait_on_run(self, run_id):
				309	"""Wait until the given test run to end.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	310
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	311	@param run_id: The ID of the test run.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	312
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	313	@raises GoofyRuntimeException: If the test run does not finish
				314	gracefully.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	315	"""
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	316	finished_tests = set()
				317	run_info = self._get_test_run_info(run_id)
				318	while run_info['status'] == 'RUNNING':
				319	finished = [(t['path'], t['status']) for t in
				320	run_info['scheduled_tests']
				321	if t['status'] in ('PASSED', 'FAILED')]
				322	for t in finished:
				323	if t not in finished_tests:
				324	logging.info('[%s] %s', t[1], t[0])
				325	finished_tests.add(t)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	326	time.sleep(self.POLLING_INTERVAL)
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	327	run_info = self._get_test_run_info(run_id)
				328	if run_info['status'] != 'FINISHED':
				329	raise GoofyRuntimeException(
				330	'The requested test run was interrupted.')
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	331
				332
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	333	def _synchronous_run_suite(self, suite_name):
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	334	"""Run one suite and wait for it to finish.
				335
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	336	Will start a test run for the specified suite_name and wait until it
				337	ends.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	338
				339	@param suite_name: The name of the suite to wait for.
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	340
				341	@raises GoofyProxyException: If the status of the suite
				342	doesn't switch to active after we call RunTest.
				343
				344	@return: The result of the suite.
				345	"""
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	346	logging.info('Starting suite: %s', suite_name)
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	347	run_id = self._client.RunTest(suite_name)
				348	self._wait_on_run(run_id)
				349	return self._get_test_run_info(run_id)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	350
				351
				352	def monitor_tests(self, test_list):
				353	"""Run a test list.
				354
				355	Will run each suite in the given list in sequence, starting each one
				356	by name and waiting on its results. This method makes the following
				357	assumptions:
				358	- A test list is made up of self contained suites.
				359	- These suites trigger several things in parallel.
				360	- After a suite finishes it leaves goofy in an idle state.
				361
				362	It is not safe to pull results for individual tests during the suite
				363	as the device could be rebooting, or goofy could be under stress.
				364	Instead, this method synchronously waits on an entire suite, then
				365	asks goofy for the status of each test in the suite. Since certain
				366	test lists automatically start and others don't, this method stops
				367	test list execution regardless, and sequentially triggers each suite.
				368
				369	@param test_list: The test list to run.
				370	"""
				371	self._set_test_list(test_list)
				372	self._wait_for_goofy()
				373	self._stop_running_tests()
				374
				375	test_map = self._get_test_map()
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	376	if test_map:
				377	logging.info('About to execute tests: %s', test_map)
				378	else:
				379	raise GoofyRuntimeException('Test map is empty, you might have an '
				380	'error in your test_list.')
				381
				382
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	383	for current_suite in test_map.keys():
				384	logging.info('Processing suite %s', current_suite)
				385
Ricky Liang	7431706	2014-03-04 17:20:05 +0800	[diff] [blame]	386	result = self._synchronous_run_suite(current_suite)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	387	logging.info(result)
				388
				389	for test_names in test_map.get(current_suite):
				390	self._log_test_results(self._get_test_info(test_names),
				391	current_suite)
				392
				393
beeps	0456135	2013-10-04 11:07:47 -0700	[diff] [blame]	394	@retry_goofy_rpc((httplib.BadStatusLine, socket.error),
				395	timeout_min=BASE_RPC_TIMEOUT*2)
beeps	7583e58	2013-08-28 16:19:24 -0700	[diff] [blame]	396	def get_results(self, resultsdir):
				397	"""Copies results from the DUT to a local results directory.
				398
				399	Copy the tarball over to the results folder, untar, and delete the
				400	tarball if everything was successful. This will effectively place
				401	all the logs relevant to factory testing in the job's results folder.
				402
				403	@param resultsdir: The directory in which to untar the contents of the
				404	tarball factory_bug generates.
				405	"""
				406	logging.info('Getting results logs for test_list.')
				407
				408	try:
				409	factory_bug_log = self._host.run('factory_bug').stderr
				410	except error.CmdError as e:
				411	logging.error('Could not execute factory_bug: %s', e)
				412	return
				413
				414	try:
				415	factory_bug_tar = re.match(self.FACTORY_BUG_RE,
				416	factory_bug_log).groups(1)[0]
				417	except (IndexError, AttributeError):
				418	logging.error('could not collect logs for factory results, '
				419	'factory bug returned %s', factory_bug_log)
				420	return
				421
				422	factory_bug_tar_file = os.path.basename(factory_bug_tar)
				423	local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
				424
				425	try:
				426	self._host.get_file(factory_bug_tar, local_factory_bug_tar)
				427	except error.AutoservRunError as e:
				428	logging.error('Failed to pull back the results tarball: %s', e)
				429	return
				430
				431	try:
				432	utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
				433	except error.CmdError as e:
				434	logging.error('Failed to untar the results tarball: %s', e)
				435	return
				436	finally:
				437	if os.path.exists(local_factory_bug_tar):
				438	os.remove(local_factory_bug_tar)