[autotest] Add retrying to autotest frontend RPCs

Add a decorator that will retry a function if it raises an exception,
waiting a jittered delay between attempts until a deadline.  Create
classes that wrap frontend.AFE and frontend.TKO, applying the decorator
to their run() method.

BUG=chromium-os:26419
TEST=unit
TEST=use atest to schedule a suite and restart the afe while it runs
STATUS=Fixed

Change-Id: I8a8c48b4418f55933ab168fc92f19f03ca7e9fc3
Reviewed-on: https://gerrit.chromium.org/gerrit/15960
Commit-Ready: Chris Masone <cmasone@chromium.org>
Reviewed-by: Chris Masone <cmasone@chromium.org>
Tested-by: Chris Masone <cmasone@chromium.org>
diff --git a/server/cros/dynamic_suite.py b/server/cros/dynamic_suite.py
index ecf1b7b..3852644 100644
--- a/server/cros/dynamic_suite.py
+++ b/server/cros/dynamic_suite.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
@@ -6,7 +6,7 @@
 import compiler, logging, os, random, re, time
 from autotest_lib.client.common_lib import control_data, global_config, error
 from autotest_lib.client.common_lib import utils
-from autotest_lib.server.cros import control_file_getter
+from autotest_lib.server.cros import control_file_getter, frontend_wrappers
 from autotest_lib.server import frontend
 
 
@@ -54,8 +54,12 @@
         @param afe: an instance of AFE as defined in server/frontend.py.
         @param tko: an instance of TKO as defined in server/frontend.py.
         """
-        self._afe = afe or frontend.AFE(debug=False)
-        self._tko = tko or frontend.TKO(debug=False)
+        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
+                                                         delay_sec=10,
+                                                         debug=False)
+        self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
+                                                         delay_sec=10,
+                                                         debug=False)
         self._cf_getter = control_file_getter.FileSystemGetter(
             [os.path.join(autotest_dir, 'server/site_tests')])
 
@@ -239,8 +243,12 @@
         """
         self._predicate = predicate
         self._tag = tag
-        self._afe = afe or frontend.AFE(debug=False)
-        self._tko = tko or frontend.TKO(debug=False)
+        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
+                                                         delay_sec=10,
+                                                         debug=False)
+        self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
+                                                         delay_sec=10,
+                                                         debug=False)
         self._jobs = []
 
         self._cf_getter = Suite.create_fs_getter(autotest_dir)
diff --git a/server/cros/frontend_wrappers.py b/server/cros/frontend_wrappers.py
new file mode 100644
index 0000000..25933a4
--- /dev/null
+++ b/server/cros/frontend_wrappers.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import functools, logging, random, time
+import common
+from autotest_lib.client.common_lib import utils
+from autotest_lib.server import frontend
+
+
+def jittered_delay(delay):
+    """Apply random jitter of at most 50% to |delay|.
+
+    A coin flip picks whether the adjustment is added or subtracted, and
+    a random float between 0.0 and 1.0 scales half of the delay to give
+    the size of the adjustment.
+
+    @param delay: the delay to which to add jitter.
+    @return: |delay| shifted up or down by the jitter.
+    """
+    direction = random.choice([-1, 1])
+    offset = random.random() * .5 * delay
+    return delay + direction * offset
+
+
+def retry(ExceptionToCheck, timeout_min=1, delay_sec=3):
+    """Retry calling the decorated function using a delay with jitter.
+
+    The function is retried whenever it raises |ExceptionToCheck|, until
+    |timeout_min| has elapsed.  It is then called one final time, and any
+    exception from that last call propagates to the caller.
+
+    original from:
+      http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+
+    @param ExceptionToCheck: the exception to check.  May be a tuple of
+                             exceptions to check.
+    @param timeout_min: timeout in minutes until giving up.
+    @param delay_sec: pre-jittered delay between retries in seconds.  Actual
+                      delays will be centered around this value, ranging up to
+                      50% off this midpoint.
+    """
+    def deco_retry(func):
+        # No random.seed(): decorating must not reset the global RNG state.
+        @functools.wraps(func)
+        def func_retry(*args, **kwargs):
+            deadline = time.time() + timeout_min * 60  # convert to seconds.
+            while time.time() < deadline:
+                try:
+                    return func(*args, **kwargs)
+                except ExceptionToCheck as e:
+                    # Always jitter around delay_sec so delays don't drift.
+                    delay = jittered_delay(delay_sec)
+                    logging.warning("%s(%s), Retrying in %f seconds...",
+                                    e.__class__, e, delay)
+                    time.sleep(delay)
+            return func(*args, **kwargs)  # deadline passed: last attempt.
+        return func_retry  # true decorator
+    return deco_retry
+
+
+class RetryingAFE(frontend.AFE):
+    """frontend.AFE whose RPCs retry with a per-instance timeout and delay."""
+    def __init__(self, timeout_min, delay_sec, **dargs):
+        """Constructor
+
+        @param timeout_min: timeout in minutes until giving up.
+        @param delay_sec: pre-jittered delay between retries in seconds.
+        """
+        self.timeout_min = timeout_min
+        self.delay_sec = delay_sec
+        super(RetryingAFE, self).__init__(**dargs)
+
+    def run(self, call, **dargs):
+        """Run |call|, retrying with this instance's timeout and delay."""
+        @retry(Exception, self.timeout_min, self.delay_sec)
+        def _run(self, call, **dargs):
+            return super(RetryingAFE, self).run(call, **dargs)
+        return _run(self, call, **dargs)
+
+
+class RetryingTKO(frontend.TKO):
+    """frontend.TKO whose RPCs retry with a per-instance timeout and delay."""
+    def __init__(self, timeout_min, delay_sec, **dargs):
+        """Constructor
+
+        @param timeout_min: timeout in minutes until giving up.
+        @param delay_sec: pre-jittered delay between retries in seconds.
+        @param dargs: keyword arguments passed through to frontend.TKO.
+        """
+        self.timeout_min = timeout_min
+        self.delay_sec = delay_sec
+        super(RetryingTKO, self).__init__(**dargs)
+
+    def run(self, call, **dargs):
+        """Run |call|, retrying with this instance's timeout and delay."""
+        @retry(Exception, self.timeout_min, self.delay_sec)
+        def _run(self, call, **dargs):
+            return super(RetryingTKO, self).run(call, **dargs)
+        return _run(self, call, **dargs)
diff --git a/server/cros/frontend_wrappers_unittest.py b/server/cros/frontend_wrappers_unittest.py
new file mode 100644
index 0000000..7b3b8ff
--- /dev/null
+++ b/server/cros/frontend_wrappers_unittest.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Unit tests for server/cros/frontend_wrappers.py."""
+
+import logging
+import mox
+import time
+import unittest
+
+from autotest_lib.server.cros import frontend_wrappers
+from autotest_lib.server import frontend
+
+class FrontendWrappersTest(mox.MoxTestBase):
+    """Unit tests for the frontend_wrappers.retry decorator.
+
+    @var _FLAKY_FLAG: set after the first simulated failure of a flaky call.
+    """
+
+    _FLAKY_FLAG = None
+
+    def setUp(self):
+        super(FrontendWrappersTest, self).setUp()
+        self._FLAKY_FLAG = False  # no simulated failure has happened yet.
+
+
+    def testRetryDecoratorSucceeds(self):
+        """Tests that a wrapped function succeeds without retrying."""
+        timeout_min = .1
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def succeed():
+            return True
+
+        deadline = time.time() + timeout_sec
+        self.assertTrue(succeed())
+        self.assertTrue(time.time() < deadline)
+
+
+    def testRetryDecoratorFlakySucceeds(self):
+        """Tests that a wrapped function can retry and succeed."""
+        timeout_min = .1
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def flaky_succeed():
+            if self._FLAKY_FLAG:
+                return True
+            self._FLAKY_FLAG = True  # fail only on the first call.
+            raise Exception
+
+        deadline = time.time() + timeout_sec
+        self.assertTrue(flaky_succeed())
+        self.assertTrue(time.time() < deadline)
+
+
+    def testRetryDecoratorFails(self):
+        """Tests that a wrapped function retries until timeout, then fails."""
+        timeout_min = .01
+        timeout_sec = timeout_min * 60
+        @frontend_wrappers.retry(Exception,
+                                 timeout_min=timeout_min,
+                                 delay_sec=1)
+        def fail():
+            raise Exception()
+
+        deadline = time.time() + timeout_sec
+        self.assertRaises(Exception, fail)
+        self.assertTrue(time.time() >= deadline)  # kept retrying to timeout.