blob: a0d6aae9299121d8f249877db1b4431602054b02 [file] [log] [blame]
beeps7583e582013-08-28 16:19:24 -07001# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import collections
6import httplib
7import logging
8import os
9import re
10import socket
11import time
12
13import common
14
15from autotest_lib.client.common_lib import error
16from autotest_lib.client.common_lib.cros import retry
17from autotest_lib.server import utils
18
19
20GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
21GOOFY_RUNNING = 'RUNNING'
22
23
24class GoofyProxyException(Exception):
25 """Exception raised when a goofy rpc fails."""
26 pass
27
28
29class GoofyProxy(object):
30 """Client capable of making rpc calls to goofy.
31
32 Methods of this class that can cause goofy to change state
33 usually need a retry decorator. Methods that have a retry decorator
34 need to be 'pure', i.e return the same results when called multiple
35 times with the same argument.
36
37 There are 2 known exceptions this class can deal with, a socket.error
38 which happens when we try to execute an rpc when the DUT is, say, suspended
39 and a BadStatusLine, which we get when we try to execute an rpc while the
40 DUT is going through a factory_restart. Ideally we would like to handle
41 socket timeouts different from BadStatusLines as we can get connection
42 errors even when a device reboots and BadStatusLines ususally only when
43 factory restarts. crbug.com/281714.
44 """
45
46 # This timeout was arbitrarily chosen as many tests in the factory test
47 # suite run for days. Ideally we would like to split this into at least 2
48 # timeouts, one which we use for rpcs that run while no other test is,
49 # running and is smaller than the second that is designed for use with rpcs
50 # that might execute simultaneously with a test. The latter needs a longer
51 # timeout since tests could suspend,resume for a long time, and a call like
52 # GetGoofyStatus should be tolerant to these suspend/resumes. In designing
53 # the base timeout one needs to allocate time to component methods of this
54 # class (such as _set_test_list) as a multiple of the number of rpcs it
55 # executes.
56 BASE_RPC_TIMEOUT = 1440
57 POLLING_INTERVAL = 5
58 FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
59 UNTAR_COMMAND = 'tar jxf %s -C %s'
60
61
62 def __init__(self, host):
63 """
64 @param host: The host object representing the DUT running goofy.
65 """
66 self._host = host
67 self._client = host.jsonrpc_connect(GOOFY_JSONRPC_SERVER_PORT)
68
69
70 @retry.retry((httplib.BadStatusLine, socket.error),
71 timeout_min=BASE_RPC_TIMEOUT)
72 def _get_goofy_status(self):
73 """Return status of goofy, ignoring socket timeouts and http exceptions.
74 """
75 status = self._client.GetGoofyStatus().get('status')
76 return status
77
78
79 def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
80 """Wait till goofy is running or a timeout occurs.
81
82 @param timeout_min: Minutes to wait before timing this call out.
83 """
84 current_time = time.time()
85 timeout_secs = timeout_min * 60
86 logging.info('Waiting on goofy')
87 while self._get_goofy_status() != GOOFY_RUNNING:
88 if time.time() - current_time > timeout_secs:
89 break
90 return
91
92
93 @retry.retry(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
94 def _set_test_list(self, next_list):
95 """Set the given test list for execution.
96
97 Confirm that the given test list is a test that has been baked into
98 the image, then run it. Some test lists are configured to start
99 execution automatically when we call SetTestList, while others wait
100 for a corresponding RunTest.
101
102 @param next_list: The name of the test list.
103
104 @raise jsonrpclib.ProtocolError: If the test list we're trying to switch
105 to isn't on the DUT.
106 """
107
108 # As part of SwitchTestList we perform a factory restart,
109 # which will throw a BadStatusLine. We don't want to retry
110 # on this exception though, as that will lead to setting the same
111 # test list over and over till the timeout expires. If the test
112 # list is not already on the DUT this method will fail, emitting
113 # the possible test lists one can switch to.
114 try:
115 self._client.SwitchTestList(next_list)
116 except httplib.BadStatusLine:
117 logging.info('Switched to list %s, goofy restarting', next_list)
118 pass
119
120
121 @retry.retry((httplib.BadStatusLine, socket.error),
122 timeout_min=BASE_RPC_TIMEOUT*2)
123 def _stop_running_tests(self):
124 """Stop all running tests.
125
126 Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
127 is suspended or rebooting.
128 """
129 logging.info('Stopping tests.')
130 self._client.StopTest()
131
132
133 def _get_test_map(self):
134 """Get a mapping of test suites -> tests.
135
136 Ignore entries for tests that don't have a path.
137
138 @return: A dictionary of the form
139 {'suite_name': ['suite_name.path_to_test', ...]}.
140 """
141 test_all = set([test['path'] for test in self._client.GetTests()
142 if test.get('path')])
143
144 test_map = collections.defaultdict(list)
145 for names in test_all:
146 test_map[names.split('.')[0]].append(names)
147 return test_map
148
149
150 def _log_test_results(self, test_status, current_suite):
151 """Format test status results and write them to status.log.
152
153 @param test_status: The status dictionary of a single test.
154 @param current_suite: The current suite name.
155 """
156 try:
157 self._host.job.record('INFO', None, None,
158 'suite %s, test %s, status: %s' %
159 (current_suite, test_status.get('path'),
160 test_status.get('status')))
161 except AttributeError as e:
162 logging.error('Could not gather results for current test: %s', e)
163
164
165 @retry.retry((httplib.BadStatusLine, socket.error),
166 timeout_min=BASE_RPC_TIMEOUT*2)
167 def _get_test_info(self, test_name):
168 """Get the status of one test.
169
170 @param test_name: The name of the test we need the status of.
171
172 @return: The entry for the test in the status dictionary.
173 """
174 for test in self._client.GetTests():
175 if test['path'] == test_name:
176 return test
177 raise ValueError('Could not find test_name %s in _get_test_info.' %
178 test_name)
179
180
181 def _wait_on_barrier(self, barrier_name):
182 """Wait on a barrier.
183
184 This method is designed to wait on the Barrier of a suite. A Barrier
185 is used to synchronize several tests that run in parallel within a
186 suite; it will cause the suite to hang while it attempts to show
187 an operator the status of each test, and is activated once all the
188 tests in the suite are done.
189
190 @param barrier_name: The name of the barrier.
191 """
192 logging.info('Waiting on barrier %s', barrier_name)
193
194 # TODO(beeps): crbug.com/279473
195 while self._get_test_info(barrier_name)['status'] != 'ACTIVE':
196 time.sleep(self.POLLING_INTERVAL)
197
198
199 def _wait_on_suite(self, suite_name):
200 """Wait till a suite stops being active.
201
202 This method is designed to wait on the suite to change
203 status if it lacks a 'Barrier'. If a suite has a barrier
204 one should use _wait_on_barrier instead.
205
206 @param suite_name: The name of the suite to wait on.
207 """
208 logging.info('Waiting on suite %s', suite_name)
209
210 while self._get_test_info(suite_name)['status'] == 'ACTIVE':
211 time.sleep(self.POLLING_INTERVAL)
212
213
214 def _synchronous_run_suite(self, suite_name, barrier_name=None):
215 """Run one suite and wait for it to finish.
216
217 Will wait till the specified suite_name becomes active,
218 then wait till it switches out of active. If the suite
219 has a barrier, will wait till the barrier becomes active
220 instead, as this indicates that all tests have finished
221 running.
222
223 @param suite_name: The name of the suite to wait for.
224 @param barrier_name: The name of the barrier, if any.
225
226 @raises GoofyProxyException: If the status of the suite
227 doesn't switch to active after we call RunTest.
228
229 @return: The result of the suite.
230 """
231 self._client.RunTest(suite_name)
232 result = self._get_test_info(suite_name)
233
234 #TODO(beeps): crbug.com/292975
235 if result['status'] != 'ACTIVE':
236 raise GoofyProxyException('Not waiting for test list %s. Either we '
237 'could not start it or the test list '
238 'already finished.' % suite_name)
239
240 if barrier_name:
241 self._wait_on_barrier(barrier_name)
242 else:
243 self._wait_on_suite(suite_name)
244
245 # Since the barrier itself counts as a 'test' we need to stop
246 # it before asking goofy for the suites results, or goofy will
247 # think that the suite is still running. We also need to stop
248 # any orphaned test that might have been kicked off during this
249 # suite.
250 self._stop_running_tests()
251 return self._get_test_info(suite_name)
252
253
254 def monitor_tests(self, test_list):
255 """Run a test list.
256
257 Will run each suite in the given list in sequence, starting each one
258 by name and waiting on its results. This method makes the following
259 assumptions:
260 - A test list is made up of self contained suites.
261 - These suites trigger several things in parallel.
262 - After a suite finishes it leaves goofy in an idle state.
263
264 It is not safe to pull results for individual tests during the suite
265 as the device could be rebooting, or goofy could be under stress.
266 Instead, this method synchronously waits on an entire suite, then
267 asks goofy for the status of each test in the suite. Since certain
268 test lists automatically start and others don't, this method stops
269 test list execution regardless, and sequentially triggers each suite.
270
271 @param test_list: The test list to run.
272 """
273 self._set_test_list(test_list)
274 self._wait_for_goofy()
275 self._stop_running_tests()
276
277 test_map = self._get_test_map()
278 for current_suite in test_map.keys():
279 logging.info('Processing suite %s', current_suite)
280
281 # Check if any of these tests are actually a Barrier.
282 barrier = None
283 for test in test_map.get(current_suite):
284 if '.' in test and 'Barrier' in test.split('.')[1]:
285 barrier = test
286 break
287
288 logging.info('Current suite = %s, barrier: %s', current_suite,
289 barrier)
290
291 result = self._synchronous_run_suite(current_suite, barrier)
292 logging.info(result)
293
294 for test_names in test_map.get(current_suite):
295 self._log_test_results(self._get_test_info(test_names),
296 current_suite)
297
298
299 @retry.retry((httplib.BadStatusLine, socket.timeout), timeout_min=1)
300 def get_results(self, resultsdir):
301 """Copies results from the DUT to a local results directory.
302
303 Copy the tarball over to the results folder, untar, and delete the
304 tarball if everything was successful. This will effectively place
305 all the logs relevant to factory testing in the job's results folder.
306
307 @param resultsdir: The directory in which to untar the contents of the
308 tarball factory_bug generates.
309 """
310 logging.info('Getting results logs for test_list.')
311
312 try:
313 factory_bug_log = self._host.run('factory_bug').stderr
314 except error.CmdError as e:
315 logging.error('Could not execute factory_bug: %s', e)
316 return
317
318 try:
319 factory_bug_tar = re.match(self.FACTORY_BUG_RE,
320 factory_bug_log).groups(1)[0]
321 except (IndexError, AttributeError):
322 logging.error('could not collect logs for factory results, '
323 'factory bug returned %s', factory_bug_log)
324 return
325
326 factory_bug_tar_file = os.path.basename(factory_bug_tar)
327 local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
328
329 try:
330 self._host.get_file(factory_bug_tar, local_factory_bug_tar)
331 except error.AutoservRunError as e:
332 logging.error('Failed to pull back the results tarball: %s', e)
333 return
334
335 try:
336 utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
337 except error.CmdError as e:
338 logging.error('Failed to untar the results tarball: %s', e)
339 return
340 finally:
341 if os.path.exists(local_factory_bug_tar):
342 os.remove(local_factory_bug_tar)
343
344