# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import httplib
import logging
import os
import re
import socket
import time

import common

from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.server import utils


GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
GOOFY_RUNNING = 'RUNNING'


class GoofyProxyException(Exception):
    """Exception raised when a goofy rpc fails."""
    pass


class GoofyRuntimeException(Exception):
    """Exception raised when something goes wrong while a test is running."""
    pass


def retry_goofy_rpc(exception_tuple, timeout_min=30):
    """A decorator to use with goofy rpcs.

    This decorator tries to recreate the goofy client proxy on
    socket error. It will continue trying to do so until it
    executes the method without any socket errors or until the
    retry.retry decorator hits its timeout.

    Usage:
        If you just want to recreate the proxy:
        1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
                            timeout_min=<timeout>)
        2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
        Note: you need to specify the socket.error exception because we
        want to retry the call after recreating the proxy.

    @param exception_tuple: A tuple of exceptions to pass to
        the retry decorator. Any of these exceptions will result
        in retries for the duration of timeout_min.
    @param timeout_min: The timeout, in minutes, for which we should
        retry the method ignoring any exception in exception_tuple.
    """
    def inner_decorator(method):
        """Inner retry decorator applied to the method.

        @param method: The method that needs to be wrapped in the decorator.

        @return A wrapper function that implements the retry.
        """

        @retry.retry(exception_tuple, timeout_min=timeout_min)
        def wrapper(*args, **kwargs):
            """This wrapper handles socket errors.

            If the method in question:
            1. Throws an exception in exception_tuple and it is not a
               socket.error, retry for timeout_min through retry.retry.
            2. Throws a socket.error, recreate the client proxy, and
               retry for timeout_min through retry.retry.
            3. Throws an exception not in exception_tuple, fail.
            """
            try:
                return method(*args, **kwargs)
            except socket.error as e:
                goofy_proxy = args[0]
                if isinstance(goofy_proxy, GoofyProxy):
                    logging.warning('Socket error while running factory '
                                    'tests: %s. Recreating goofy proxy.', e)
                    goofy_proxy._create_client_proxy(timeout_min=timeout_min)
                else:
                    logging.warning('Connectivity was lost and the retry '
                                    'decorator was unable to recreate a goofy '
                                    'client proxy, args: %s.', args)
                raise

        return wrapper

    return inner_decorator
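
# A minimal usage sketch (the method below is hypothetical, for illustration
# only): on a socket.error the wrapper recreates the client proxy, then
# retry.retry re-invokes the rpc until it succeeds or timeout_min expires.
#
#     class GoofyProxy(object):
#         @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
#                          timeout_min=5)
#         def _example_status_rpc(self):
#             return self._client.GetGoofyStatus().get('status')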


class GoofyProxy(object):
    """Client capable of making rpc calls to goofy.

    Methods of this class that can cause goofy to change state
    usually need a retry decorator. Methods that have a retry decorator
    need to be 'pure', i.e. return the same results when called multiple
    times with the same argument.

    There are 2 known exceptions this class can deal with: a socket.error,
    which happens when we try to execute an rpc while the DUT is, say,
    suspended, and a BadStatusLine, which we get when we try to execute an
    rpc while the DUT is going through a factory_restart. Ideally we would
    like to handle socket timeouts differently from BadStatusLines, as we
    can get connection errors even when a device reboots, but BadStatusLines
    usually only when factory restarts. crbug.com/281714.
    """

    # This timeout was arbitrarily chosen as many tests in the factory test
    # suite run for days. Ideally we would like to split this into at least
    # 2 timeouts: one, which we use for rpcs that run while no other test is
    # running, and a second, larger one designed for use with rpcs that
    # might execute simultaneously with a test. The latter needs a longer
    # timeout since tests could suspend and resume for a long time, and a
    # call like GetGoofyStatus should be tolerant to these suspend/resumes.
    # In designing the base timeout one needs to allocate time to component
    # methods of this class (such as _set_test_list) as a multiple of the
    # number of rpcs each executes.
    BASE_RPC_TIMEOUT = 1440
    POLLING_INTERVAL = 5
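    # factory_bug writes the path of the tarball it creates to stderr;
    # FACTORY_BUG_RE extracts that path from output such as (a made-up
    # example line): '... created file /tmp/factory_bug.1234.tar.bz2 ...'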
    FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
    UNTAR_COMMAND = 'tar jxf %s -C %s'


    def __init__(self, host):
        """
        @param host: The host object representing the DUT running goofy.
        """
        self._host = host
        self._raw_stop_running_tests()
        self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)


    def _create_client_proxy(self, timeout_min=30):
        """Create a goofy client proxy.

        Ping the host till it's up, then proceed to create a goofy proxy. We
        don't wrap this method with a retry because it's used in the retry
        decorator itself.
        """

        # We don't ssh ping here because an ssh command against a sleeping
        # DUT can be delayed O(minutes) until it wakes up, which would incur
        # significant overhead over many reboots.
        self._host.ping_wait_up(timeout_min)
        logging.info('Host is pingable, creating goofy client proxy')
        self._client = self._host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT)


    @retry.retry((httplib.BadStatusLine, socket.error),
                 timeout_min=BASE_RPC_TIMEOUT)
    def _raw_stop_running_tests(self):
        """Stop running tests by shelling out to the DUT.

        Use this method only before we have actually created the client
        proxy, as shelling out has several pitfalls. We need to stop all
        tests in a retry loop because tests will start executing as soon
        as we have reimaged a DUT, and trying to create the proxy while
        the DUT is rebooting will lead to a spurious failure.

        Note that we use the plain retry decorator for this method since
        we don't need to recreate the client proxy on failure.
        """
        logging.info('Stopping all tests and clearing factory state')
        self._host.run('factory clear')


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT)
    def _get_goofy_status(self):
        """Return status of goofy, ignoring socket timeouts and http
        exceptions.
        """
        status = self._client.GetGoofyStatus().get('status')
        return status


    def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
        """Wait till goofy is running or a timeout occurs.

        @param timeout_min: Minutes to wait before timing this call out.
        """
        start_time = time.time()
        timeout_secs = timeout_min * 60
        logging.info('Waiting on goofy')
        while self._get_goofy_status() != GOOFY_RUNNING:
            if time.time() - start_time > timeout_secs:
                break
            # Sleep between polls so we don't hammer goofy with rpcs.
            time.sleep(self.POLLING_INTERVAL)


    @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
    def _set_test_list(self, next_list):
        """Set the given test list for execution.

        Confirm that the given test list is a test list that has been baked
        into the image, then run it. Some test lists are configured to start
        execution automatically when we call SetTestList, while others wait
        for a corresponding RunTest.

        @param next_list: The name of the test list.

        @raise jsonrpclib.ProtocolError: If the test list we're trying to
            switch to isn't on the DUT.
        """

        # We can get a BadStatusLine on 2 occasions:
        # 1. As part of SwitchTestList goofy performs a factory restart,
        #    which will throw a BadStatusLine because the rpc can't exit
        #    cleanly. We don't want to retry on this exception, since we've
        #    already set the right test list.
        # 2. If we try to set a test list while goofy is already down
        #    (from a previous factory restart). In this case we wouldn't
        #    have set the new test list, because we couldn't connect to
        #    goofy.
        # To properly set a new test list it's important to wait till goofy
        # is up before attempting to set the test list, while being aware
        # that the preceding httplib error is from the rpc we just executed
        # leading to a factory restart. Also note that if the test list is
        # not already on the DUT this method will fail, emitting the
        # possible test lists one can switch to.
        self._wait_for_goofy()
        logging.info('Switching to test list %s', next_list)
        try:
            self._client.SwitchTestList(next_list)
        except httplib.BadStatusLine:
            logging.info('Switched to list %s, goofy restarting', next_list)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _stop_running_tests(self):
        """Stop all running tests.

        Wrap the StopTest rpc so we can attempt to stop tests even while a
        DUT is suspended or rebooting.
        """
        logging.info('Stopping tests.')
        self._client.StopTest()


    def _get_test_map(self):
        """Get a mapping of test suites -> tests.

        Ignore entries for tests that don't have a path.

        @return: A dictionary of the form
            {'suite_name': ['suite_name.path_to_test', ...]}.
        """
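        # For example (hypothetical goofy test paths), entries 'SMT.Start'
        # and 'SMT.Barrier' would both land under the 'SMT' key:
        #     {'SMT': ['SMT.Start', 'SMT.Barrier']}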
        test_all = set([test['path'] for test in self._client.GetTests()
                        if test.get('path')])

        test_map = collections.defaultdict(list)
        for name in test_all:
            test_map[name.split('.')[0]].append(name)
        return test_map


    def _log_test_results(self, test_status, current_suite):
        """Format test status results and write them to status.log.

        @param test_status: The status dictionary of a single test.
        @param current_suite: The current suite name.
        """
        try:
            self._host.job.record('INFO', None, None,
                                  'suite %s, test %s, status: %s' %
                                  (current_suite, test_status.get('path'),
                                   test_status.get('status')))
        except AttributeError as e:
            logging.error('Could not gather results for current test: %s', e)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _get_test_info(self, test_name):
        """Get the status of one test.

        @param test_name: The name of the test we need the status of.

        @return: The entry for the test in the status dictionary.
        """
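        # Each entry returned by the GetTests rpc is a dict; the fields this
        # module relies on are 'path' and 'status', e.g. (values made up):
        #     {'path': 'SMT.Start', 'status': 'ACTIVE', ...}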
        for test in self._client.GetTests():
            if test['path'] == test_name:
                return test
        raise ValueError('Could not find test_name %s in _get_test_info.' %
                         test_name)


    def _wait_on_barrier(self, barrier_name):
        """Wait on a barrier.

        This method is designed to wait on the Barrier of a suite. A Barrier
        is used to synchronize several tests that run in parallel within a
        suite; it will cause the suite to hang while it attempts to show
        an operator the status of each test, and is activated once all the
        tests in the suite are done.

        @param barrier_name: The name of the barrier.
        """
        logging.info('Waiting on barrier %s', barrier_name)

        # TODO(beeps): crbug.com/279473
        while self._get_test_info(barrier_name)['status'] != 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _wait_on_suite(self, suite_name):
        """Wait till a suite stops being active.

        This method is designed to wait on the suite to change
        status if it lacks a 'Barrier'. If a suite has a barrier
        one should use _wait_on_barrier instead.

        @param suite_name: The name of the suite to wait on.
        """
        logging.info('Waiting on suite %s', suite_name)

        while self._get_test_info(suite_name)['status'] == 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _synchronous_run_suite(self, suite_name, barrier_name=None):
        """Run one suite and wait for it to finish.

        Will wait till the specified suite_name becomes active,
        then wait till it switches out of active. If the suite
        has a barrier, will wait till the barrier becomes active
        instead, as this indicates that all tests have finished
        running.

        @param suite_name: The name of the suite to wait for.
        @param barrier_name: The name of the barrier, if any.

        @raise GoofyProxyException: If the status of the suite
            doesn't switch to active after we call RunTest.

        @return: The result of the suite.
        """
        logging.info('Starting suite: %s', suite_name)
        self._client.RunTest(suite_name)
        result = self._get_test_info(suite_name)

        # TODO(beeps): crbug.com/292975
        if result['status'] != 'ACTIVE':
            raise GoofyProxyException('Not waiting for test list %s. Either '
                                      'we could not start it or the test '
                                      'list already finished.' % suite_name)

        if barrier_name:
            self._wait_on_barrier(barrier_name)
        else:
            self._wait_on_suite(suite_name)

        # Since the barrier itself counts as a 'test' we need to stop
        # it before asking goofy for the suite's results, or goofy will
        # think that the suite is still running. We also need to stop
        # any orphaned test that might have been kicked off during this
        # suite.
        self._stop_running_tests()
        return self._get_test_info(suite_name)


    def monitor_tests(self, test_list):
        """Run a test list.

        Will run each suite in the given list in sequence, starting each one
        by name and waiting on its results. This method makes the following
        assumptions:
            - A test list is made up of self-contained suites.
            - These suites trigger several things in parallel.
            - After a suite finishes it leaves goofy in an idle state.

        It is not safe to pull results for individual tests during the suite
        as the device could be rebooting, or goofy could be under stress.
        Instead, this method synchronously waits on an entire suite, then
        asks goofy for the status of each test in the suite. Since certain
        test lists automatically start and others don't, this method stops
        test list execution regardless, and sequentially triggers each suite.

        @param test_list: The test list to run.
        """
        self._set_test_list(test_list)
        self._wait_for_goofy()
        self._stop_running_tests()

        test_map = self._get_test_map()
        if test_map:
            logging.info('About to execute tests: %s', test_map)
        else:
            raise GoofyRuntimeException('Test map is empty, you might have '
                                        'an error in your test_list.')

        for current_suite in test_map.keys():
            logging.info('Processing suite %s', current_suite)

            # Check if any of these tests are actually a Barrier.
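            # A barrier path has the form '<suite>.<test>' where the test
            # component contains 'Barrier', e.g. 'RunIn.BarrierRunIn' (a
            # hypothetical name).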
            barrier = None
            for test in test_map.get(current_suite):
                if '.' in test and 'Barrier' in test.split('.')[1]:
                    barrier = test
                    break

            logging.info('Current suite = %s, barrier: %s', current_suite,
                         barrier)

            result = self._synchronous_run_suite(current_suite, barrier)
            logging.info(result)

            for test_name in test_map.get(current_suite):
                self._log_test_results(self._get_test_info(test_name),
                                       current_suite)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def get_results(self, resultsdir):
        """Copy results from the DUT to a local results directory.

        Copy the tarball over to the results folder, untar, and delete the
        tarball if everything was successful. This will effectively place
        all the logs relevant to factory testing in the job's results
        folder.

        @param resultsdir: The directory in which to untar the contents of
            the tarball factory_bug generates.
        """
        logging.info('Getting results logs for test_list.')

        try:
            factory_bug_log = self._host.run('factory_bug').stderr
        except error.CmdError as e:
            logging.error('Could not execute factory_bug: %s', e)
            return

        try:
            factory_bug_tar = re.match(self.FACTORY_BUG_RE,
                                       factory_bug_log).group(1)
        except (IndexError, AttributeError):
            logging.error('Could not collect logs for factory results, '
                          'factory_bug returned %s', factory_bug_log)
            return

        factory_bug_tar_file = os.path.basename(factory_bug_tar)
        local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)

        try:
            self._host.get_file(factory_bug_tar, local_factory_bug_tar)
        except error.AutoservRunError as e:
            logging.error('Failed to pull back the results tarball: %s', e)
            return

        try:
            utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
        except error.CmdError as e:
            logging.error('Failed to untar the results tarball: %s', e)
            return
        finally:
            if os.path.exists(local_factory_bug_tar):
                os.remove(local_factory_bug_tar)
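

# Typical use, as a sketch (assumes 'host' is an autotest host object for a
# DUT running goofy, e.g. from hosts.create_host, 'generic_main' is the name
# of a test list baked into the image, and 'resultsdir' is an existing local
# directory; all three are illustrative):
#
#     proxy = GoofyProxy(host)
#     proxy.monitor_tests('generic_main')
#     proxy.get_results(resultsdir)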