[autotest] Add retrying to autotest frontend RPCs
Add a decorator that will retry a function if it raises an exception,
sleeping for a randomly jittered delay between attempts until a deadline
passes. Create classes that wrap frontend.AFE and frontend.TKO, applying
the decorator to their run() methods.
BUG=chromium-os:26419
TEST=unit
TEST=use atest to schedule a suite and restart the afe while it runs
STATUS=Fixed
Change-Id: I8a8c48b4418f55933ab168fc92f19f03ca7e9fc3
Reviewed-on: https://gerrit.chromium.org/gerrit/15960
Commit-Ready: Chris Masone <cmasone@chromium.org>
Reviewed-by: Chris Masone <cmasone@chromium.org>
Tested-by: Chris Masone <cmasone@chromium.org>
diff --git a/server/cros/dynamic_suite.py b/server/cros/dynamic_suite.py
index ecf1b7b..3852644 100644
--- a/server/cros/dynamic_suite.py
+++ b/server/cros/dynamic_suite.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@@ -6,7 +6,7 @@
import compiler, logging, os, random, re, time
from autotest_lib.client.common_lib import control_data, global_config, error
from autotest_lib.client.common_lib import utils
-from autotest_lib.server.cros import control_file_getter
+from autotest_lib.server.cros import control_file_getter, frontend_wrappers
from autotest_lib.server import frontend
@@ -54,8 +54,12 @@
@param afe: an instance of AFE as defined in server/frontend.py.
@param tko: an instance of TKO as defined in server/frontend.py.
"""
- self._afe = afe or frontend.AFE(debug=False)
- self._tko = tko or frontend.TKO(debug=False)
+ self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
+ delay_sec=10,
+ debug=False)
+ self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
+ delay_sec=10,
+ debug=False)
self._cf_getter = control_file_getter.FileSystemGetter(
[os.path.join(autotest_dir, 'server/site_tests')])
@@ -239,8 +243,12 @@
"""
self._predicate = predicate
self._tag = tag
- self._afe = afe or frontend.AFE(debug=False)
- self._tko = tko or frontend.TKO(debug=False)
+ self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
+ delay_sec=10,
+ debug=False)
+ self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
+ delay_sec=10,
+ debug=False)
self._jobs = []
self._cf_getter = Suite.create_fs_getter(autotest_dir)
diff --git a/server/cros/frontend_wrappers.py b/server/cros/frontend_wrappers.py
new file mode 100644
index 0000000..25933a4
--- /dev/null
+++ b/server/cros/frontend_wrappers.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging, random, time
+import common
+from autotest_lib.client.common_lib import utils
+from autotest_lib.server import frontend
+
+
+def jittered_delay(delay):
+    """Apply random jitter of at most 50% to |delay|, in either direction.
+
+    A direction (-1 or +1) is picked at random and scaled by a random
+    fraction in [0.0, 1.0) of half the delay.  The resulting offset is then
+    added to the original delay.
+
+    @param delay: the delay to which to add jitter.
+    @return: the delay with jitter added in.
+    """
+    direction = random.choice([-1, 1])
+    fraction = random.random()
+    return delay + direction * fraction * .5 * delay
+
+
+def retry(ExceptionToCheck, timeout_min=1, delay_sec=3):
+    """Retry calling the decorated function using a delay with jitter.
+
+    The decorated function is called repeatedly until it returns without
+    raising |ExceptionToCheck| or until |timeout_min| minutes have elapsed.
+    Once the deadline has passed, the function is called one final time and
+    any exception it raises propagates to the caller.
+
+    original from:
+      http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+
+    @param ExceptionToCheck: the exception to check.  May be a tuple of
+                             exceptions to check.
+    @param timeout_min: timeout in minutes until giving up.
+    @param delay_sec: pre-jittered delay between retries in seconds.  The
+                      first delay is within 50% of this value; each later
+                      delay is within 50% of the delay that preceded it.
+    @return: a decorator that adds the retry behavior to a function.
+    """
+    # Imported here so this block does not depend on the module's import list.
+    import functools
+
+    def deco_retry(func):
+        random.seed()
+
+        @functools.wraps(func)  # keep func's name and docstring.
+        def func_retry(*args, **kwargs):
+            deadline = time.time() + timeout_min * 60  # convert to seconds.
+            delay = jittered_delay(delay_sec)
+            while time.time() < deadline:
+                try:
+                    return func(*args, **kwargs)
+                except ExceptionToCheck as e:
+                    # Arguments are passed separately so that logging only
+                    # formats the message if the record is emitted.
+                    logging.warning("%s(%s), Retrying in %f seconds...",
+                                    e.__class__, e, delay)
+                    time.sleep(delay)
+                    # The next delay is jittered around the current one.
+                    delay = jittered_delay(delay)
+            # The deadline has passed: make one last attempt and let any
+            # exception it raises reach the caller.
+            return func(*args, **kwargs)
+        return func_retry  # true decorator
+    return deco_retry
+
+
+class RetryingAFE(frontend.AFE):
+    """Wrapper around frontend.AFE that retries all RPCs.
+
+    Timeout for retries and delay between retries are configurable.
+    """
+    def __init__(self, timeout_min, delay_sec, **dargs):
+        """Constructor
+
+        @param timeout_min: timeout in minutes until giving up.
+        @param delay_sec: pre-jittered delay between retries in seconds.
+        @param dargs: keyword arguments passed through to frontend.AFE.
+        """
+        self.timeout_min = timeout_min
+        self.delay_sec = delay_sec
+        super(RetryingAFE, self).__init__(**dargs)
+
+
+    def run(self, call, **dargs):
+        """Run |call| through frontend.AFE.run(), retrying on any Exception.
+
+        @param call: passed through to frontend.AFE.run().
+        @param dargs: keyword arguments passed through to frontend.AFE.run().
+        @return: whatever frontend.AFE.run() returns.
+        """
+        # The timeout and delay are per-instance values, so the decorator is
+        # applied at call time; decorating run() itself would fix them when
+        # the class is defined and ignore the constructor arguments.
+        @retry(Exception, timeout_min=self.timeout_min,
+               delay_sec=self.delay_sec)
+        def _run(self, call, **dargs):
+            return super(RetryingAFE, self).run(call, **dargs)
+        return _run(self, call, **dargs)
+
+
+class RetryingTKO(frontend.TKO):
+    """Wrapper around frontend.TKO that retries all RPCs.
+
+    Timeout for retries and delay between retries are configurable.
+    """
+    def __init__(self, timeout_min, delay_sec, **dargs):
+        """Constructor
+
+        @param timeout_min: timeout in minutes until giving up.
+        @param delay_sec: pre-jittered delay between retries in seconds.
+        @param dargs: keyword arguments passed through to frontend.TKO.
+        """
+        self.timeout_min = timeout_min
+        self.delay_sec = delay_sec
+        super(RetryingTKO, self).__init__(**dargs)
+
+
+    def run(self, call, **dargs):
+        """Run |call| through frontend.TKO.run(), retrying on any Exception.
+
+        @param call: passed through to frontend.TKO.run().
+        @param dargs: keyword arguments passed through to frontend.TKO.run().
+        @return: whatever frontend.TKO.run() returns.
+        """
+        # The timeout and delay are per-instance values, so the decorator is
+        # applied at call time; decorating run() itself would fix them when
+        # the class is defined and ignore the constructor arguments.
+        @retry(Exception, timeout_min=self.timeout_min,
+               delay_sec=self.delay_sec)
+        def _run(self, call, **dargs):
+            return super(RetryingTKO, self).run(call, **dargs)
+        return _run(self, call, **dargs)
diff --git a/server/cros/frontend_wrappers_unittest.py b/server/cros/frontend_wrappers_unittest.py
new file mode 100644
index 0000000..7b3b8ff
--- /dev/null
+++ b/server/cros/frontend_wrappers_unittest.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Unit tests for server/cros/frontend_wrappers.py."""
+
+import logging
+import mox
+import time
+import unittest
+
+from autotest_lib.server.cros import frontend_wrappers
+from autotest_lib.server import frontend
+
+class FrontendWrappersTest(mox.MoxTestBase):
+    """Unit tests for the retry decorator in frontend_wrappers.
+
+    @var _FLAKY_FLAG: False until flaky_succeed() has failed once; used to
+                      simulate a function that fails on its first call only.
+    """
+
+    _FLAKY_FLAG = None
+
+
+    def setUp(self):
+        super(FrontendWrappersTest, self).setUp()
+        self._FLAKY_FLAG = False
+
+
+    def testRetryDecoratorSucceeds(self):
+        """Tests that a wrapped function succeeds without retrying."""
+        timeout_min = .1
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def succeed():
+            return True
+
+        deadline = time.time() + timeout_sec
+        self.assertTrue(succeed())
+        self.assertTrue(time.time() < deadline)
+
+
+    def testRetryDecoratorFlakySucceeds(self):
+        """Tests that a wrapped function can retry and succeed."""
+        timeout_min = .1
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def flaky_succeed():
+            # Fail on the first call only; every later call succeeds.
+            if self._FLAKY_FLAG:
+                return True
+            self._FLAKY_FLAG = True
+            raise Exception
+
+        deadline = time.time() + timeout_sec
+        self.assertTrue(flaky_succeed())
+        self.assertTrue(time.time() < deadline)
+
+
+    def testRetryDecoratorFails(self):
+        """Tests that a wrapped function retries til the timeout, then fails."""
+        timeout_min = .01
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def fail():
+            raise Exception()
+
+        deadline = time.time() + timeout_sec
+        self.assertRaises(Exception, fail)
+        # The exception must only surface once the deadline has passed.
+        self.assertTrue(time.time() >= deadline)