| # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import collections |
| import httplib |
| import logging |
| import os |
| import re |
| import socket |
| import time |
| |
| import common |
| |
| from autotest_lib.client.common_lib import error |
| from autotest_lib.client.common_lib.cros import retry |
| from autotest_lib.server import utils |
| |
| |
| GOOFY_JSONRPC_SERVER_PORT = 0x0FAC |
| GOOFY_RUNNING = 'RUNNING' |
| |
| |
| class GoofyProxyException(Exception): |
| """Exception raised when a goofy rpc fails.""" |
| pass |
| |
| |
| class GoofyRuntimeException(Exception): |
| """Exception raised when something goes wrong while a test is running.""" |
| pass |
| |
| |
| def retry_goofy_rpc(exception_tuple, timeout_min=30): |
| """A decorator to use with goofy rpcs. |
| |
| This decorator tries to recreate the goofy client proxy on |
| socket error. It will continue trying to do so until it |
| executes the method without any socket errors or till the |
| retry.retry decorator hits it's timeout. |
| |
| Usage: |
| If you just want to recreate the proxy: |
| 1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error), |
| timeout_min=<timeout>) |
| 2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>) |
| Note: you need to specify the socket.error exception because we |
| want to retry the call after recreating the proxy. |
| |
| @param exception_tuple: A tuple of exceptions to pass to |
| the retry decorator. Any of these exceptions will result |
| in retries for the duration of timeout_min. |
| @param timeout_min: The timeout, in minutes, for which we should |
| retry the method ignoring any exception in exception_tuple. |
| """ |
| def inner_decorator(method): |
| """Inner retry decorator applied to the method. |
| |
| @param method: The method that needs to be wrapped in the decorator. |
| |
| @return A wrapper function that implements the retry. |
| """ |
| |
| @retry.retry(exception_tuple, timeout_min=timeout_min) |
| def wrapper(*args, **kwargs): |
| """This wrapper handles socket errors. |
| |
| If the method in question: |
| 1. Throws an exception in exception_tuple and it is not a |
| socket.error, retry for timeout_min through retry.retry. |
| 2. Throws a socket.error, recreate the client proxy, and |
| retry for timeout_min through retry.retry. |
| 3. Throws an exception not in exception_tuple, fail. |
| """ |
| try: |
| return method(*args, **kwargs) |
| except socket.error as e: |
| goofy_proxy = args[0] |
| if type(goofy_proxy) is GoofyProxy: |
| logging.warning('Socket error while running factory tests ' |
| '%s, recreating goofy proxy.', e) |
| goofy_proxy._create_client_proxy(timeout_min=timeout_min) |
| else: |
| logging.warning('Connectivity was lost and the retry ' |
| 'decorator was unable to recreate a goofy ' |
| 'client proxy, args: %s.', args) |
| raise |
| |
| return wrapper |
| |
| return inner_decorator |
| |
| |
| class GoofyProxy(object): |
| """Client capable of making rpc calls to goofy. |
| |
| Methods of this class that can cause goofy to change state |
| usually need a retry decorator. Methods that have a retry decorator |
| need to be 'pure', i.e return the same results when called multiple |
| times with the same argument. |
| |
| There are 2 known exceptions this class can deal with, a socket.error |
| which happens when we try to execute an rpc when the DUT is, say, suspended |
| and a BadStatusLine, which we get when we try to execute an rpc while the |
| DUT is going through a factory_restart. Ideally we would like to handle |
| socket timeouts different from BadStatusLines as we can get connection |
| errors even when a device reboots and BadStatusLines ususally only when |
| factory restarts. crbug.com/281714. |
| """ |
| |
| # This timeout was arbitrarily chosen as many tests in the factory test |
| # suite run for days. Ideally we would like to split this into at least 2 |
| # timeouts, one which we use for rpcs that run while no other test is, |
| # running and is smaller than the second that is designed for use with rpcs |
| # that might execute simultaneously with a test. The latter needs a longer |
| # timeout since tests could suspend,resume for a long time, and a call like |
| # GetGoofyStatus should be tolerant to these suspend/resumes. In designing |
| # the base timeout one needs to allocate time to component methods of this |
| # class (such as _set_test_list) as a multiple of the number of rpcs it |
| # executes. |
| BASE_RPC_TIMEOUT = 1440 |
| POLLING_INTERVAL = 5 |
| FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*' |
| UNTAR_COMMAND = 'tar jxf %s -C %s' |
| |
| |
| def __init__(self, host): |
| """ |
| @param host: The host object representing the DUT running goofy. |
| """ |
| self._host = host |
| self._raw_stop_running_tests() |
| self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT) |
| |
| |
| def _create_client_proxy(self, timeout_min=30): |
| """Create a goofy client proxy. |
| |
| Ping the host till it's up, then proceed to create a goofy proxy. We |
| don't wrap this method with a retry because it's used in the retry |
| decorator itself. |
| """ |
| |
| # We don't ssh ping here as there is a potential dealy in O(minutes) |
| # with our ssh command against a sleeping DUT, once it wakes up, and |
| # that will lead to significant overhead incurred over many reboots. |
| self._host.ping_wait_up(timeout_min) |
| logging.info('Host is pingable, creating goofy client proxy') |
| self._client = self._host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT) |
| |
| |
| @retry.retry((httplib.BadStatusLine, socket.error), |
| timeout_min=BASE_RPC_TIMEOUT) |
| def _raw_stop_running_tests(self): |
| """Stop running tests by shelling out to the DUT. |
| |
| Use this method only before we have actually created the client |
| proxy, as shelling out has several pitfalls. We need to stop all |
| tests in a retry loop because tests will start executing as soon |
| as we have reimaged a DUT and trying to create the proxy while |
| the DUT is rebooting will lead to a spurious failure. |
| |
| Note that we use the plain retry decorator for this method since |
| we don't need to recreate the client proxy on failure. |
| """ |
| logging.info('Stopping all tests and clearing factory state') |
| self._host.run('factory clear') |
| |
| |
| @retry_goofy_rpc((httplib.BadStatusLine, socket.error), |
| timeout_min=BASE_RPC_TIMEOUT) |
| def _get_goofy_status(self): |
| """Return status of goofy, ignoring socket timeouts and http exceptions. |
| """ |
| status = self._client.GetGoofyStatus().get('status') |
| return status |
| |
| |
| def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2): |
| """Wait till goofy is running or a timeout occurs. |
| |
| @param timeout_min: Minutes to wait before timing this call out. |
| """ |
| current_time = time.time() |
| timeout_secs = timeout_min * 60 |
| logging.info('Waiting on goofy') |
| while self._get_goofy_status() != GOOFY_RUNNING: |
| if time.time() - current_time > timeout_secs: |
| break |
| return |
| |
| |
| @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2) |
| def _set_test_list(self, next_list): |
| """Set the given test list for execution. |
| |
| Confirm that the given test list is a test that has been baked into |
| the image, then run it. Some test lists are configured to start |
| execution automatically when we call SetTestList, while others wait |
| for a corresponding RunTest. |
| |
| @param next_list: The name of the test list. |
| |
| @raise jsonrpclib.ProtocolError: If the test list we're trying to switch |
| to isn't on the DUT. |
| """ |
| |
| # We can get a BadStatus line on 2 occassions: |
| # 1. As part of SwitchTestList goofy performs a factory restart, which |
| # will throw a BadStatusLine because the rpc can't exit cleanly. We |
| # don't want to retry on this exception, since we've already set the |
| # right test list. |
| # 2. If we try to set a test list while goofy is already down |
| # (from a previous factory restart). In this case we wouldn't have |
| # set the new test list, because we coulnd't connect to goofy. |
| # To properly set a new test list it's important to wait till goofy is |
| # up before attempting to set the test list, while being aware that the |
| # preceding httplib error is from the rpc we just executed leading to |
| # a factory restart. Also note that if the test list is not already on |
| # the DUT this method will fail, emitting the possible test lists one |
| # can switch to. |
| self._wait_for_goofy() |
| logging.info('Switching to test list %s', next_list) |
| try: |
| self._client.SwitchTestList(next_list) |
| except httplib.BadStatusLine: |
| logging.info('Switched to list %s, goofy restarting', next_list) |
| |
| |
| @retry_goofy_rpc((httplib.BadStatusLine, socket.error), |
| timeout_min=BASE_RPC_TIMEOUT*2) |
| def _stop_running_tests(self): |
| """Stop all running tests. |
| |
| Wrap the StopTest rpc so we can attempt to stop tests even while a DUT |
| is suspended or rebooting. |
| """ |
| logging.info('Stopping tests.') |
| self._client.StopTest() |
| |
| |
| def _get_test_map(self): |
| """Get a mapping of test suites -> tests. |
| |
| Ignore entries for tests that don't have a path. |
| |
| @return: A dictionary of the form |
| {'suite_name': ['suite_name.path_to_test', ...]}. |
| """ |
| test_all = set([test['path'] for test in self._client.GetTests() |
| if test.get('path')]) |
| |
| test_map = collections.defaultdict(list) |
| for names in test_all: |
| test_map[names.split('.')[0]].append(names) |
| return test_map |
| |
| |
| def _log_test_results(self, test_status, current_suite): |
| """Format test status results and write them to status.log. |
| |
| @param test_status: The status dictionary of a single test. |
| @param current_suite: The current suite name. |
| """ |
| try: |
| self._host.job.record('INFO', None, None, |
| 'suite %s, test %s, status: %s' % |
| (current_suite, test_status.get('path'), |
| test_status.get('status'))) |
| except AttributeError as e: |
| logging.error('Could not gather results for current test: %s', e) |
| |
| |
| @retry_goofy_rpc((httplib.BadStatusLine, socket.error), |
| timeout_min=BASE_RPC_TIMEOUT*2) |
| def _get_test_info(self, test_name): |
| """Get the status of one test. |
| |
| @param test_name: The name of the test we need the status of. |
| |
| @return: The entry for the test in the status dictionary. |
| """ |
| for test in self._client.GetTests(): |
| if test['path'] == test_name: |
| return test |
| raise ValueError('Could not find test_name %s in _get_test_info.' % |
| test_name) |
| |
| |
| def _wait_on_barrier(self, barrier_name): |
| """Wait on a barrier. |
| |
| This method is designed to wait on the Barrier of a suite. A Barrier |
| is used to synchronize several tests that run in parallel within a |
| suite; it will cause the suite to hang while it attempts to show |
| an operator the status of each test, and is activated once all the |
| tests in the suite are done. |
| |
| @param barrier_name: The name of the barrier. |
| """ |
| logging.info('Waiting on barrier %s', barrier_name) |
| |
| # TODO(beeps): crbug.com/279473 |
| while self._get_test_info(barrier_name)['status'] != 'ACTIVE': |
| time.sleep(self.POLLING_INTERVAL) |
| |
| |
| def _wait_on_suite(self, suite_name): |
| """Wait till a suite stops being active. |
| |
| This method is designed to wait on the suite to change |
| status if it lacks a 'Barrier'. If a suite has a barrier |
| one should use _wait_on_barrier instead. |
| |
| @param suite_name: The name of the suite to wait on. |
| """ |
| logging.info('Waiting on suite %s', suite_name) |
| |
| while self._get_test_info(suite_name)['status'] == 'ACTIVE': |
| time.sleep(self.POLLING_INTERVAL) |
| |
| |
| def _synchronous_run_suite(self, suite_name, barrier_name=None): |
| """Run one suite and wait for it to finish. |
| |
| Will wait till the specified suite_name becomes active, |
| then wait till it switches out of active. If the suite |
| has a barrier, will wait till the barrier becomes active |
| instead, as this indicates that all tests have finished |
| running. |
| |
| @param suite_name: The name of the suite to wait for. |
| @param barrier_name: The name of the barrier, if any. |
| |
| @raises GoofyProxyException: If the status of the suite |
| doesn't switch to active after we call RunTest. |
| |
| @return: The result of the suite. |
| """ |
| logging.info('Starting suite: %s', suite_name) |
| self._client.RunTest(suite_name) |
| result = self._get_test_info(suite_name) |
| |
| #TODO(beeps): crbug.com/292975 |
| if result['status'] != 'ACTIVE': |
| raise GoofyProxyException('Not waiting for test list %s. Either we ' |
| 'could not start it or the test list ' |
| 'already finished.' % suite_name) |
| |
| if barrier_name: |
| self._wait_on_barrier(barrier_name) |
| else: |
| self._wait_on_suite(suite_name) |
| |
| # Since the barrier itself counts as a 'test' we need to stop |
| # it before asking goofy for the suites results, or goofy will |
| # think that the suite is still running. We also need to stop |
| # any orphaned test that might have been kicked off during this |
| # suite. |
| self._stop_running_tests() |
| return self._get_test_info(suite_name) |
| |
| |
| def monitor_tests(self, test_list): |
| """Run a test list. |
| |
| Will run each suite in the given list in sequence, starting each one |
| by name and waiting on its results. This method makes the following |
| assumptions: |
| - A test list is made up of self contained suites. |
| - These suites trigger several things in parallel. |
| - After a suite finishes it leaves goofy in an idle state. |
| |
| It is not safe to pull results for individual tests during the suite |
| as the device could be rebooting, or goofy could be under stress. |
| Instead, this method synchronously waits on an entire suite, then |
| asks goofy for the status of each test in the suite. Since certain |
| test lists automatically start and others don't, this method stops |
| test list execution regardless, and sequentially triggers each suite. |
| |
| @param test_list: The test list to run. |
| """ |
| self._set_test_list(test_list) |
| self._wait_for_goofy() |
| self._stop_running_tests() |
| |
| test_map = self._get_test_map() |
| if test_map: |
| logging.info('About to execute tests: %s', test_map) |
| else: |
| raise GoofyRuntimeException('Test map is empty, you might have an ' |
| 'error in your test_list.') |
| |
| |
| for current_suite in test_map.keys(): |
| logging.info('Processing suite %s', current_suite) |
| |
| # Check if any of these tests are actually a Barrier. |
| barrier = None |
| for test in test_map.get(current_suite): |
| if '.' in test and 'Barrier' in test.split('.')[1]: |
| barrier = test |
| break |
| |
| logging.info('Current suite = %s, barrier: %s', current_suite, |
| barrier) |
| |
| result = self._synchronous_run_suite(current_suite, barrier) |
| logging.info(result) |
| |
| for test_names in test_map.get(current_suite): |
| self._log_test_results(self._get_test_info(test_names), |
| current_suite) |
| |
| |
| @retry_goofy_rpc((httplib.BadStatusLine, socket.error), |
| timeout_min=BASE_RPC_TIMEOUT*2) |
| def get_results(self, resultsdir): |
| """Copies results from the DUT to a local results directory. |
| |
| Copy the tarball over to the results folder, untar, and delete the |
| tarball if everything was successful. This will effectively place |
| all the logs relevant to factory testing in the job's results folder. |
| |
| @param resultsdir: The directory in which to untar the contents of the |
| tarball factory_bug generates. |
| """ |
| logging.info('Getting results logs for test_list.') |
| |
| try: |
| factory_bug_log = self._host.run('factory_bug').stderr |
| except error.CmdError as e: |
| logging.error('Could not execute factory_bug: %s', e) |
| return |
| |
| try: |
| factory_bug_tar = re.match(self.FACTORY_BUG_RE, |
| factory_bug_log).groups(1)[0] |
| except (IndexError, AttributeError): |
| logging.error('could not collect logs for factory results, ' |
| 'factory bug returned %s', factory_bug_log) |
| return |
| |
| factory_bug_tar_file = os.path.basename(factory_bug_tar) |
| local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file) |
| |
| try: |
| self._host.get_file(factory_bug_tar, local_factory_bug_tar) |
| except error.AutoservRunError as e: |
| logging.error('Failed to pull back the results tarball: %s', e) |
| return |
| |
| try: |
| utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir)) |
| except error.CmdError as e: |
| logging.error('Failed to untar the results tarball: %s', e) |
| return |
| finally: |
| if os.path.exists(local_factory_bug_tar): |
| os.remove(local_factory_bug_tar) |
| |
| |