# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import httplib
import logging
import os
import re
import socket
import time

import common

from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.server import utils


GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
GOOFY_RUNNING = 'RUNNING'


class GoofyProxyException(Exception):
    """Exception raised when a goofy rpc fails."""
    pass


class GoofyRuntimeException(Exception):
    """Exception raised when something goes wrong while a test is running."""
    pass


def retry_goofy_rpc(exception_tuple, timeout_min=30):
    """A decorator to use with goofy rpcs.

    This decorator tries to recreate the goofy client proxy on
    socket error. It will continue trying to do so until it
    executes the method without any socket errors or until the
    retry.retry decorator hits its timeout.

    Usage:
        If you just want to recreate the proxy:
        1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
                            timeout_min=<timeout>)
        2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
        Note: you need to specify the socket.error exception because we
            want to retry the call after recreating the proxy.

    @param exception_tuple: A tuple of exceptions to pass to
        the retry decorator. Any of these exceptions will result
        in retries for the duration of timeout_min.
    @param timeout_min: The timeout, in minutes, for which we should
        retry the method, ignoring any exception in exception_tuple.
    """
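
    # Illustrative sketch only (not used by production code): a GoofyProxy
    # method wrapped with this decorator so that a socket.error recreates the
    # client proxy before retry.retry re-invokes the method. The method name
    # below is hypothetical; the decorator arguments mirror real usage in
    # this file.
    #
    #     @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
    #                      timeout_min=30)
    #     def _example_rpc(self):
    #         return self._client.GetGoofyStatus()
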
    def inner_decorator(method):
        """Inner retry decorator applied to the method.

        @param method: The method that needs to be wrapped in the decorator.

        @return A wrapper function that implements the retry.
        """

        @retry.retry(exception_tuple, timeout_min=timeout_min)
        def wrapper(*args, **kwargs):
            """This wrapper handles socket errors.

            If the method in question:
            1. Throws an exception in exception_tuple and it is not a
               socket.error, retry for timeout_min through retry.retry.
            2. Throws a socket.error, recreate the client proxy, and
               retry for timeout_min through retry.retry.
            3. Throws an exception not in exception_tuple, fail.
            """
            try:
                return method(*args, **kwargs)
            except socket.error as e:
                goofy_proxy = args[0]
                if type(goofy_proxy) is GoofyProxy:
                    logging.warning('Socket error while running factory tests '
                                    '%s, recreating goofy proxy.', e)
                    goofy_proxy._create_client_proxy(timeout_min=timeout_min)
                else:
                    logging.warning('Connectivity was lost and the retry '
                                    'decorator was unable to recreate a goofy '
                                    'client proxy, args: %s.', args)
                raise

        return wrapper

    return inner_decorator


class GoofyProxy(object):
    """Client capable of making rpc calls to goofy.

    Methods of this class that can cause goofy to change state
    usually need a retry decorator. Methods that have a retry decorator
    need to be 'pure', i.e. return the same results when called multiple
    times with the same argument.

    There are 2 known exceptions this class can deal with: a socket.error,
    which happens when we try to execute an rpc while the DUT is, say,
    suspended, and a BadStatusLine, which we get when we try to execute an
    rpc while the DUT is going through a factory_restart. Ideally we would
    like to handle socket timeouts differently from BadStatusLines, as we can
    get connection errors even when a device reboots, while BadStatusLines
    usually occur only when factory restarts. crbug.com/281714.
    """

    # This timeout was arbitrarily chosen as many tests in the factory test
    # suite run for days. Ideally we would like to split this into at least 2
    # timeouts: one for rpcs that run while no other test is running, which
    # could be smaller than the second, meant for rpcs that might execute
    # simultaneously with a test. The latter needs a longer timeout since
    # tests could suspend/resume for a long time, and a call like
    # GetGoofyStatus should be tolerant to these suspend/resumes. In designing
    # the base timeout one needs to allocate time to component methods of this
    # class (such as _set_test_list) as a multiple of the number of rpcs it
    # executes.
    BASE_RPC_TIMEOUT = 1440
    POLLING_INTERVAL = 5
    FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
    UNTAR_COMMAND = 'tar jxf %s -C %s'
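
    # Illustrative usage sketch (hedged): a server-side test would typically
    # build a proxy around the DUT's host object, drive a test list to
    # completion, and then pull the factory logs back. The test list name and
    # results directory below are made-up examples.
    #
    #     proxy = GoofyProxy(host)
    #     proxy.monitor_tests('generic_main')
    #     proxy.get_results(job.resultsdir)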


    def __init__(self, host):
        """
        @param host: The host object representing the DUT running goofy.
        """
        self._host = host
        self._raw_stop_running_tests()
        self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)


    def _create_client_proxy(self, timeout_min=30):
        """Create a goofy client proxy.

        Ping the host till it's up, then proceed to create a goofy proxy. We
        don't wrap this method with a retry because it's used in the retry
        decorator itself.
        """

        # We don't ssh ping here as there is a potential delay of O(minutes)
        # with our ssh command against a sleeping DUT once it wakes up, and
        # that will lead to significant overhead incurred over many reboots.
        self._host.ping_wait_up(timeout_min)
        logging.info('Host is pingable, creating goofy client proxy')
        self._client = self._host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT)


    @retry.retry((httplib.BadStatusLine, socket.error),
                 timeout_min=BASE_RPC_TIMEOUT)
    def _raw_stop_running_tests(self):
        """Stop running tests by shelling out to the DUT.

        Use this method only before we have actually created the client
        proxy, as shelling out has several pitfalls. We need to stop all
        tests in a retry loop because tests will start executing as soon
        as we have reimaged a DUT, and trying to create the proxy while
        the DUT is rebooting will lead to a spurious failure.

        Note that we use the plain retry decorator for this method since
        we don't need to recreate the client proxy on failure.
        """
        logging.info('Stopping all tests and clearing factory state')
        self._host.run('factory clear')


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT)
    def _get_goofy_status(self):
        """Return status of goofy, ignoring socket timeouts and http exceptions.
        """
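        # GetGoofyStatus() returns a dict of goofy state; we only care about
        # its 'status' field here (GOOFY_RUNNING, i.e. 'RUNNING', while goofy
        # is up) and ignore the other fields.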
        status = self._client.GetGoofyStatus().get('status')
        return status


    def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
        """Wait till goofy is running or a timeout occurs.

        @param timeout_min: Minutes to wait before timing this call out.
        """
        current_time = time.time()
        timeout_secs = timeout_min * 60
        logging.info('Waiting on goofy')
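        # Each iteration re-issues the _get_goofy_status rpc; the loop exits
        # silently (without raising) if the timeout elapses before goofy
        # reports RUNNING.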
        while self._get_goofy_status() != GOOFY_RUNNING:
            if time.time() - current_time > timeout_secs:
                break
        return


    @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
    def _set_test_list(self, next_list):
        """Set the given test list for execution.

        Confirm that the given test list is a test list that has been baked
        into the image, then run it. Some test lists are configured to start
        execution automatically when we call SetTestList, while others wait
        for a corresponding RunTest.

        @param next_list: The name of the test list.

        @raise jsonrpclib.ProtocolError: If the test list we're trying to
            switch to isn't on the DUT.
        """

        # We can get a BadStatusLine on 2 occasions:
        # 1. As part of SwitchTestList goofy performs a factory restart, which
        #    will throw a BadStatusLine because the rpc can't exit cleanly. We
        #    don't want to retry on this exception, since we've already set the
        #    right test list.
        # 2. If we try to set a test list while goofy is already down
        #    (from a previous factory restart). In this case we wouldn't have
        #    set the new test list, because we couldn't connect to goofy.
        # To properly set a new test list it's important to wait till goofy is
        # up before attempting to set the test list, while being aware that the
        # preceding httplib error is from the rpc we just executed leading to
        # a factory restart. Also note that if the test list is not already on
        # the DUT this method will fail, emitting the possible test lists one
        # can switch to.
        self._wait_for_goofy()
        logging.info('Switching to test list %s', next_list)
        try:
            self._client.SwitchTestList(next_list)
        except httplib.BadStatusLine:
            logging.info('Switched to list %s, goofy restarting', next_list)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _stop_running_tests(self):
        """Stop all running tests.

        Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
        is suspended or rebooting.
        """
        logging.info('Stopping tests.')
        self._client.StopTest()


    def _get_test_map(self):
        """Get a mapping of test suites -> tests.

        Ignore entries for tests that don't have a path.

        @return: A dictionary of the form
            {'suite_name': ['suite_name.path_to_test', ...]}.
        """
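        # For example (hypothetical names): GetTests() entries whose paths are
        # 'RunIn.Start' and 'RunIn.Barrier' would both end up under the
        # 'RunIn' key of the returned map.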
        test_all = set([test['path'] for test in self._client.GetTests()
                        if test.get('path')])

        test_map = collections.defaultdict(list)
        for names in test_all:
            test_map[names.split('.')[0]].append(names)
        return test_map


    def _log_test_results(self, test_status, current_suite):
        """Format test status results and write them to status.log.

        @param test_status: The status dictionary of a single test.
        @param current_suite: The current suite name.
        """
        try:
            self._host.job.record('INFO', None, None,
                                  'suite %s, test %s, status: %s' %
                                  (current_suite, test_status.get('path'),
                                   test_status.get('status')))
        except AttributeError as e:
            logging.error('Could not gather results for current test: %s', e)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def _get_test_info(self, test_name):
        """Get the status of one test.

        @param test_name: The name of the test we need the status of.

        @return: The entry for the test in the status dictionary.
        """
        for test in self._client.GetTests():
            if test['path'] == test_name:
                return test
        raise ValueError('Could not find test_name %s in _get_test_info.' %
                         test_name)


    def _wait_on_barrier(self, barrier_name):
        """Wait on a barrier.

        This method is designed to wait on the Barrier of a suite. A Barrier
        is used to synchronize several tests that run in parallel within a
        suite; it will cause the suite to hang while it attempts to show
        an operator the status of each test, and is activated once all the
        tests in the suite are done.

        @param barrier_name: The name of the barrier.
        """
        logging.info('Waiting on barrier %s', barrier_name)

        # TODO(beeps): crbug.com/279473
        while self._get_test_info(barrier_name)['status'] != 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _wait_on_suite(self, suite_name):
        """Wait till a suite stops being active.

        This method is designed to wait on the suite to change
        status if it lacks a 'Barrier'. If a suite has a barrier
        one should use _wait_on_barrier instead.

        @param suite_name: The name of the suite to wait on.
        """
        logging.info('Waiting on suite %s', suite_name)

        while self._get_test_info(suite_name)['status'] == 'ACTIVE':
            time.sleep(self.POLLING_INTERVAL)


    def _synchronous_run_suite(self, suite_name, barrier_name=None):
        """Run one suite and wait for it to finish.

        Start the given suite, confirm that it is active, then wait
        till it switches out of active. If the suite has a barrier,
        wait till the barrier becomes active instead, as this
        indicates that all tests in the suite have finished running.

        @param suite_name: The name of the suite to wait for.
        @param barrier_name: The name of the barrier, if any.

        @raises GoofyProxyException: If the status of the suite
            doesn't switch to active after we call RunTest.

        @return: The result of the suite.
        """
        logging.info('Starting suite: %s', suite_name)
        self._client.RunTest(suite_name)
        result = self._get_test_info(suite_name)

        # TODO(beeps): crbug.com/292975
        if result['status'] != 'ACTIVE':
            raise GoofyProxyException('Not waiting for test list %s. Either we '
                                      'could not start it or the test list '
                                      'already finished.' % suite_name)

        if barrier_name:
            self._wait_on_barrier(barrier_name)
        else:
            self._wait_on_suite(suite_name)

        # Since the barrier itself counts as a 'test' we need to stop
        # it before asking goofy for the suite's results, or goofy will
        # think that the suite is still running. We also need to stop
        # any orphaned test that might have been kicked off during this
        # suite.
        self._stop_running_tests()
        return self._get_test_info(suite_name)


    def monitor_tests(self, test_list):
        """Run a test list.

        Will run each suite in the given list in sequence, starting each one
        by name and waiting on its results. This method makes the following
        assumptions:
            - A test list is made up of self-contained suites.
            - These suites trigger several things in parallel.
            - After a suite finishes it leaves goofy in an idle state.

        It is not safe to pull results for individual tests during the suite
        as the device could be rebooting, or goofy could be under stress.
        Instead, this method synchronously waits on an entire suite, then
        asks goofy for the status of each test in the suite. Since certain
        test lists start automatically and others don't, this method stops
        test list execution regardless, and sequentially triggers each suite.

        @param test_list: The test list to run.
        """
        self._set_test_list(test_list)
        self._wait_for_goofy()
        self._stop_running_tests()

        test_map = self._get_test_map()
        if test_map:
            logging.info('About to execute tests: %s', test_map)
        else:
            raise GoofyRuntimeException('Test map is empty, you might have an '
                                        'error in your test_list.')


        for current_suite in test_map.keys():
            logging.info('Processing suite %s', current_suite)

            # Check if any of these tests are actually a Barrier.
            barrier = None
            for test in test_map.get(current_suite):
                if '.' in test and 'Barrier' in test.split('.')[1]:
                    barrier = test
                    break

            logging.info('Current suite = %s, barrier: %s', current_suite,
                         barrier)

            result = self._synchronous_run_suite(current_suite, barrier)
            logging.info(result)

            for test_names in test_map.get(current_suite):
                self._log_test_results(self._get_test_info(test_names),
                                       current_suite)


    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
                     timeout_min=BASE_RPC_TIMEOUT*2)
    def get_results(self, resultsdir):
        """Copy results from the DUT to a local results directory.

        Copy the tarball over to the results folder, untar, and delete the
        tarball if everything was successful. This will effectively place
        all the logs relevant to factory testing in the job's results folder.

        @param resultsdir: The directory in which to untar the contents of the
            tarball factory_bug generates.
        """
        logging.info('Getting results logs for test_list.')

        try:
            factory_bug_log = self._host.run('factory_bug').stderr
        except error.CmdError as e:
            logging.error('Could not execute factory_bug: %s', e)
            return

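        # The factory_bug output captured from stderr above contains the path
        # of the tarball it generated; FACTORY_BUG_RE extracts that
        # /tmp/factory_bug*.tar.bz2 path so we can copy it back.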
        try:
            factory_bug_tar = re.match(self.FACTORY_BUG_RE,
                                       factory_bug_log).groups(1)[0]
        except (IndexError, AttributeError):
            logging.error('Could not collect logs for factory results, '
                          'factory_bug returned %s', factory_bug_log)
            return

        factory_bug_tar_file = os.path.basename(factory_bug_tar)
        local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)

        try:
            self._host.get_file(factory_bug_tar, local_factory_bug_tar)
        except error.AutoservRunError as e:
            logging.error('Failed to pull back the results tarball: %s', e)
            return

        try:
            utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
        except error.CmdError as e:
            logging.error('Failed to untar the results tarball: %s', e)
            return
        finally:
            if os.path.exists(local_factory_bug_tar):
                os.remove(local_factory_bug_tar)