beeps | 7583e58 | 2013-08-28 16:19:24 -0700 | [diff] [blame^] | 1 | # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | import collections |
| 6 | import httplib |
| 7 | import logging |
| 8 | import os |
| 9 | import re |
| 10 | import socket |
| 11 | import time |
| 12 | |
| 13 | import common |
| 14 | |
| 15 | from autotest_lib.client.common_lib import error |
| 16 | from autotest_lib.client.common_lib.cros import retry |
| 17 | from autotest_lib.server import utils |
| 18 | |
| 19 | |
| 20 | GOOFY_JSONRPC_SERVER_PORT = 0x0FAC |
| 21 | GOOFY_RUNNING = 'RUNNING' |
| 22 | |
| 23 | |
| 24 | class GoofyProxyException(Exception): |
| 25 | """Exception raised when a goofy rpc fails.""" |
| 26 | pass |
| 27 | |
| 28 | |
| 29 | class GoofyProxy(object): |
| 30 | """Client capable of making rpc calls to goofy. |
| 31 | |
| 32 | Methods of this class that can cause goofy to change state |
| 33 | usually need a retry decorator. Methods that have a retry decorator |
| 34 | need to be 'pure', i.e return the same results when called multiple |
| 35 | times with the same argument. |
| 36 | |
| 37 | There are 2 known exceptions this class can deal with, a socket.error |
| 38 | which happens when we try to execute an rpc when the DUT is, say, suspended |
| 39 | and a BadStatusLine, which we get when we try to execute an rpc while the |
| 40 | DUT is going through a factory_restart. Ideally we would like to handle |
| 41 | socket timeouts different from BadStatusLines as we can get connection |
| 42 | errors even when a device reboots and BadStatusLines ususally only when |
| 43 | factory restarts. crbug.com/281714. |
| 44 | """ |
| 45 | |
| 46 | # This timeout was arbitrarily chosen as many tests in the factory test |
| 47 | # suite run for days. Ideally we would like to split this into at least 2 |
| 48 | # timeouts, one which we use for rpcs that run while no other test is, |
| 49 | # running and is smaller than the second that is designed for use with rpcs |
| 50 | # that might execute simultaneously with a test. The latter needs a longer |
| 51 | # timeout since tests could suspend,resume for a long time, and a call like |
| 52 | # GetGoofyStatus should be tolerant to these suspend/resumes. In designing |
| 53 | # the base timeout one needs to allocate time to component methods of this |
| 54 | # class (such as _set_test_list) as a multiple of the number of rpcs it |
| 55 | # executes. |
| 56 | BASE_RPC_TIMEOUT = 1440 |
| 57 | POLLING_INTERVAL = 5 |
| 58 | FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*' |
| 59 | UNTAR_COMMAND = 'tar jxf %s -C %s' |
| 60 | |
| 61 | |
| 62 | def __init__(self, host): |
| 63 | """ |
| 64 | @param host: The host object representing the DUT running goofy. |
| 65 | """ |
| 66 | self._host = host |
| 67 | self._client = host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT) |
| 68 | |
| 69 | |
| 70 | @retry.retry((httplib.BadStatusLine, socket.error), |
| 71 | timeout_min=BASE_RPC_TIMEOUT) |
| 72 | def _get_goofy_status(self): |
| 73 | """Return status of goofy, ignoring socket timeouts and http exceptions. |
| 74 | """ |
| 75 | status = self._client.GetGoofyStatus().get('status') |
| 76 | return status |
| 77 | |
| 78 | |
| 79 | def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2): |
| 80 | """Wait till goofy is running or a timeout occurs. |
| 81 | |
| 82 | @param timeout_min: Minutes to wait before timing this call out. |
| 83 | """ |
| 84 | current_time = time.time() |
| 85 | timeout_secs = timeout_min * 60 |
| 86 | logging.info('Waiting on goofy') |
| 87 | while self._get_goofy_status() != GOOFY_RUNNING: |
| 88 | if time.time() - current_time > timeout_secs: |
| 89 | break |
| 90 | return |
| 91 | |
| 92 | |
| 93 | @retry.retry(socket.error, timeout_min=BASE_RPC_TIMEOUT*2) |
| 94 | def _set_test_list(self, next_list): |
| 95 | """Set the given test list for execution. |
| 96 | |
| 97 | Confirm that the given test list is a test that has been baked into |
| 98 | the image, then run it. Some test lists are configured to start |
| 99 | execution automatically when we call SetTestList, while others wait |
| 100 | for a corresponding RunTest. |
| 101 | |
| 102 | @param next_list: The name of the test list. |
| 103 | |
| 104 | @raise jsonrpclib.ProtocolError: If the test list we're trying to switch |
| 105 | to isn't on the DUT. |
| 106 | """ |
| 107 | |
| 108 | # As part of SwitchTestList we perform a factory restart, |
| 109 | # which will throw a BadStatusLine. We don't want to retry |
| 110 | # on this exception though, as that will lead to setting the same |
| 111 | # test list over and over till the timeout expires. If the test |
| 112 | # list is not already on the DUT this method will fail, emitting |
| 113 | # the possible test lists one can switch to. |
| 114 | try: |
| 115 | self._client.SwitchTestList(next_list) |
| 116 | except httplib.BadStatusLine: |
| 117 | logging.info('Switched to list %s, goofy restarting', next_list) |
| 118 | pass |
| 119 | |
| 120 | |
| 121 | @retry.retry((httplib.BadStatusLine, socket.error), |
| 122 | timeout_min=BASE_RPC_TIMEOUT*2) |
| 123 | def _stop_running_tests(self): |
| 124 | """Stop all running tests. |
| 125 | |
| 126 | Wrap the StopTest rpc so we can attempt to stop tests even while a DUT |
| 127 | is suspended or rebooting. |
| 128 | """ |
| 129 | logging.info('Stopping tests.') |
| 130 | self._client.StopTest() |
| 131 | |
| 132 | |
| 133 | def _get_test_map(self): |
| 134 | """Get a mapping of test suites -> tests. |
| 135 | |
| 136 | Ignore entries for tests that don't have a path. |
| 137 | |
| 138 | @return: A dictionary of the form |
| 139 | {'suite_name': ['suite_name.path_to_test', ...]}. |
| 140 | """ |
| 141 | test_all = set([test['path'] for test in self._client.GetTests() |
| 142 | if test.get('path')]) |
| 143 | |
| 144 | test_map = collections.defaultdict(list) |
| 145 | for names in test_all: |
| 146 | test_map[names.split('.')[0]].append(names) |
| 147 | return test_map |
| 148 | |
| 149 | |
| 150 | def _log_test_results(self, test_status, current_suite): |
| 151 | """Format test status results and write them to status.log. |
| 152 | |
| 153 | @param test_status: The status dictionary of a single test. |
| 154 | @param current_suite: The current suite name. |
| 155 | """ |
| 156 | try: |
| 157 | self._host.job.record('INFO', None, None, |
| 158 | 'suite %s, test %s, status: %s' % |
| 159 | (current_suite, test_status.get('path'), |
| 160 | test_status.get('status'))) |
| 161 | except AttributeError as e: |
| 162 | logging.error('Could not gather results for current test: %s', e) |
| 163 | |
| 164 | |
| 165 | @retry.retry((httplib.BadStatusLine, socket.error), |
| 166 | timeout_min=BASE_RPC_TIMEOUT*2) |
| 167 | def _get_test_info(self, test_name): |
| 168 | """Get the status of one test. |
| 169 | |
| 170 | @param test_name: The name of the test we need the status of. |
| 171 | |
| 172 | @return: The entry for the test in the status dictionary. |
| 173 | """ |
| 174 | for test in self._client.GetTests(): |
| 175 | if test['path'] == test_name: |
| 176 | return test |
| 177 | raise ValueError('Could not find test_name %s in _get_test_info.' % |
| 178 | test_name) |
| 179 | |
| 180 | |
| 181 | def _wait_on_barrier(self, barrier_name): |
| 182 | """Wait on a barrier. |
| 183 | |
| 184 | This method is designed to wait on the Barrier of a suite. A Barrier |
| 185 | is used to synchronize several tests that run in parallel within a |
| 186 | suite; it will cause the suite to hang while it attempts to show |
| 187 | an operator the status of each test, and is activated once all the |
| 188 | tests in the suite are done. |
| 189 | |
| 190 | @param barrier_name: The name of the barrier. |
| 191 | """ |
| 192 | logging.info('Waiting on barrier %s', barrier_name) |
| 193 | |
| 194 | # TODO(beeps): crbug.com/279473 |
| 195 | while self._get_test_info(barrier_name)['status'] != 'ACTIVE': |
| 196 | time.sleep(self.POLLING_INTERVAL) |
| 197 | |
| 198 | |
| 199 | def _wait_on_suite(self, suite_name): |
| 200 | """Wait till a suite stops being active. |
| 201 | |
| 202 | This method is designed to wait on the suite to change |
| 203 | status if it lacks a 'Barrier'. If a suite has a barrier |
| 204 | one should use _wait_on_barrier instead. |
| 205 | |
| 206 | @param suite_name: The name of the suite to wait on. |
| 207 | """ |
| 208 | logging.info('Waiting on suite %s', suite_name) |
| 209 | |
| 210 | while self._get_test_info(suite_name)['status'] == 'ACTIVE': |
| 211 | time.sleep(self.POLLING_INTERVAL) |
| 212 | |
| 213 | |
| 214 | def _synchronous_run_suite(self, suite_name, barrier_name=None): |
| 215 | """Run one suite and wait for it to finish. |
| 216 | |
| 217 | Will wait till the specified suite_name becomes active, |
| 218 | then wait till it switches out of active. If the suite |
| 219 | has a barrier, will wait till the barrier becomes active |
| 220 | instead, as this indicates that all tests have finished |
| 221 | running. |
| 222 | |
| 223 | @param suite_name: The name of the suite to wait for. |
| 224 | @param barrier_name: The name of the barrier, if any. |
| 225 | |
| 226 | @raises GoofyProxyException: If the status of the suite |
| 227 | doesn't switch to active after we call RunTest. |
| 228 | |
| 229 | @return: The result of the suite. |
| 230 | """ |
| 231 | self._client.RunTest(suite_name) |
| 232 | result = self._get_test_info(suite_name) |
| 233 | |
| 234 | #TODO(beeps): crbug.com/292975 |
| 235 | if result['status'] != 'ACTIVE': |
| 236 | raise GoofyProxyException('Not waiting for test list %s. Either we ' |
| 237 | 'could not start it or the test list ' |
| 238 | 'already finished.' % suite_name) |
| 239 | |
| 240 | if barrier_name: |
| 241 | self._wait_on_barrier(barrier_name) |
| 242 | else: |
| 243 | self._wait_on_suite(suite_name) |
| 244 | |
| 245 | # Since the barrier itself counts as a 'test' we need to stop |
| 246 | # it before asking goofy for the suites results, or goofy will |
| 247 | # think that the suite is still running. We also need to stop |
| 248 | # any orphaned test that might have been kicked off during this |
| 249 | # suite. |
| 250 | self._stop_running_tests() |
| 251 | return self._get_test_info(suite_name) |
| 252 | |
| 253 | |
| 254 | def monitor_tests(self, test_list): |
| 255 | """Run a test list. |
| 256 | |
| 257 | Will run each suite in the given list in sequence, starting each one |
| 258 | by name and waiting on its results. This method makes the following |
| 259 | assumptions: |
| 260 | - A test list is made up of self contained suites. |
| 261 | - These suites trigger several things in parallel. |
| 262 | - After a suite finishes it leaves goofy in an idle state. |
| 263 | |
| 264 | It is not safe to pull results for individual tests during the suite |
| 265 | as the device could be rebooting, or goofy could be under stress. |
| 266 | Instead, this method synchronously waits on an entire suite, then |
| 267 | asks goofy for the status of each test in the suite. Since certain |
| 268 | test lists automatically start and others don't, this method stops |
| 269 | test list execution regardless, and sequentially triggers each suite. |
| 270 | |
| 271 | @param test_list: The test list to run. |
| 272 | """ |
| 273 | self._set_test_list(test_list) |
| 274 | self._wait_for_goofy() |
| 275 | self._stop_running_tests() |
| 276 | |
| 277 | test_map = self._get_test_map() |
| 278 | for current_suite in test_map.keys(): |
| 279 | logging.info('Processing suite %s', current_suite) |
| 280 | |
| 281 | # Check if any of these tests are actually a Barrier. |
| 282 | barrier = None |
| 283 | for test in test_map.get(current_suite): |
| 284 | if '.' in test and 'Barrier' in test.split('.')[1]: |
| 285 | barrier = test |
| 286 | break |
| 287 | |
| 288 | logging.info('Current suite = %s, barrier: %s', current_suite, |
| 289 | barrier) |
| 290 | |
| 291 | result = self._synchronous_run_suite(current_suite, barrier) |
| 292 | logging.info(result) |
| 293 | |
| 294 | for test_names in test_map.get(current_suite): |
| 295 | self._log_test_results(self._get_test_info(test_names), |
| 296 | current_suite) |
| 297 | |
| 298 | |
| 299 | @retry.retry((httplib.BadStatusLine, socket.timeout), timeout_min=1) |
| 300 | def get_results(self, resultsdir): |
| 301 | """Copies results from the DUT to a local results directory. |
| 302 | |
| 303 | Copy the tarball over to the results folder, untar, and delete the |
| 304 | tarball if everything was successful. This will effectively place |
| 305 | all the logs relevant to factory testing in the job's results folder. |
| 306 | |
| 307 | @param resultsdir: The directory in which to untar the contents of the |
| 308 | tarball factory_bug generates. |
| 309 | """ |
| 310 | logging.info('Getting results logs for test_list.') |
| 311 | |
| 312 | try: |
| 313 | factory_bug_log = self._host.run('factory_bug').stderr |
| 314 | except error.CmdError as e: |
| 315 | logging.error('Could not execute factory_bug: %s', e) |
| 316 | return |
| 317 | |
| 318 | try: |
| 319 | factory_bug_tar = re.match(self.FACTORY_BUG_RE, |
| 320 | factory_bug_log).groups(1)[0] |
| 321 | except (IndexError, AttributeError): |
| 322 | logging.error('could not collect logs for factory results, ' |
| 323 | 'factory bug returned %s', factory_bug_log) |
| 324 | return |
| 325 | |
| 326 | factory_bug_tar_file = os.path.basename(factory_bug_tar) |
| 327 | local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file) |
| 328 | |
| 329 | try: |
| 330 | self._host.get_file(factory_bug_tar, local_factory_bug_tar) |
| 331 | except error.AutoservRunError as e: |
| 332 | logging.error('Failed to pull back the results tarball: %s', e) |
| 333 | return |
| 334 | |
| 335 | try: |
| 336 | utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir)) |
| 337 | except error.CmdError as e: |
| 338 | logging.error('Failed to untar the results tarball: %s', e) |
| 339 | return |
| 340 | finally: |
| 341 | if os.path.exists(local_factory_bug_tar): |
| 342 | os.remove(local_factory_bug_tar) |
| 343 | |
| 344 | |