[autotest] Add a timeout to the retry decorator.

Use signal.SIGALRM to prevent a retried method from hanging.

signal.SIGALRM is used to time out the retried method after a given
amount of time. The per-call timeout is derived from timeout_min, the
same value (in minutes) that bounds how long retries continue before
giving up.

BUG=chromium-os:34424
TEST=run unit test retry_unittest.py
Change-Id: I88f24f281cc1b8f9cb82ba10aae31a8345722d28
Reviewed-on: https://gerrit.chromium.org/gerrit/40095
Reviewed-by: Craig Harrison <craigdh@chromium.org>
Reviewed-by: Alex Miller <milleral@chromium.org>
Commit-Queue: Dan Shi <dshi@chromium.org>
Tested-by: Dan Shi <dshi@chromium.org>
diff --git a/client/common_lib/cros/retry.py b/client/common_lib/cros/retry.py
index 6401990..1a503dc 100644
--- a/client/common_lib/cros/retry.py
+++ b/client/common_lib/cros/retry.py
@@ -2,12 +2,68 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
-import logging, random, time
+import logging, random, time, signal, sys
+
 from autotest_lib.client.common_lib import error
 from autotest_lib.frontend.afe.json_rpc import proxy
 
 
-def retry(ExceptionToCheck, timeout_min=1, delay_sec=3):
+class TimeoutException(Exception):
+    """
+    Exception to be raised for when alarm is triggered.
+    """
+    pass
+
+
+def handler(signum, frame):
+    """
+    Register a handler for the timeout.
+    """
+    raise TimeoutException('Call is timed out.')
+
+
+def timeout(func, args=(), kwargs={}, timeout_sec=60.0, default=None):
+    """
+    This function run the given function using the args, kwargs and
+    return the given default value if the timeout_sec is exceeded.
+
+    @param func: function to be called.
+    @param args: arguments for function to be called.
+    @param kwargs: keyword arguments for function to be called.
+    @param timeout_sec: timeout setting for call to exit, in seconds.
+    @param default: default return value for the function call.
+    @return 1: is_timeout 2: result of the function call. If
+            is_timeout is True, the call is timed out. If the
+            value is False, the call is finished on time.
+    """
+    old_handler = signal.signal(signal.SIGALRM, handler)
+
+    timeout_sec_n = int(timeout_sec)
+    # In case the timeout is rounded to 0, force to set it to default value.
+    if timeout_sec_n == 0:
+        timeout_sec_n = 60
+    old_alarm_sec = signal.alarm(timeout_sec_n)
+    if old_alarm_sec > 0:
+        old_timeout_time = time.time() + old_alarm_sec
+    try:
+        result = func(*args, **kwargs)
+        # Cancel the timer if the function returned before timeout
+        signal.alarm(0)
+        return False, result
+    except TimeoutException:
+        return True, default
+    finally:
+        # Restore previous Signal handler and alarm
+        if old_handler is not None:
+            signal.signal(signal.SIGALRM, old_handler)
+        if old_alarm_sec > 0:
+            old_alarm_sec = int(old_timeout_time - time.time())
+            if old_alarm_sec <= 0:
+                old_alarm_sec = 1;
+            signal.alarm(old_alarm_sec)
+
+
+def retry(ExceptionToCheck, timeout_min=1.0, delay_sec=3):
     """Retry calling the decorated function using a delay with jitter.
 
     Will raise RPC ValidationError exceptions from the decorated
@@ -26,24 +82,46 @@
     """
     def deco_retry(func):
         random.seed()
+
+
+        def delay():
+            """
+            'Jitter' the delay, up to 50% in either direction.
+            """
+            random_delay = random.uniform(.5 * delay_sec, 1.5 * delay_sec)
+            logging.warning("Retrying in %f seconds...", random_delay)
+            time.sleep(random_delay)
+
+
         def func_retry(*args, **kwargs):
             deadline = time.time() + timeout_min * 60  # convert to seconds.
+            # Used to cache exception to be raised later.
+            exc_info = None
             while time.time() < deadline:
                 try:
-                    return func(*args, **kwargs)
-                except error.CrosDynamicSuiteException, e:
-                    raise e
-                except proxy.ValidationError, e:
-                    raise e
-                except ExceptionToCheck, e:
-                    # 'Jitter' the delay, up to 50% in either direction.
-                    delay = random.uniform(.5 * delay_sec, 1.5 * delay_sec)
-                    logging.warning("%s(%s), Retrying in %f seconds...",
-                                    e.__class__, e, delay)
-                    time.sleep(delay)
+                    # Clear the cache
+                    exc_info = None
+                    is_timeout, result = timeout(func, args, kwargs,
+                                                timeout_min*60)
+                    if is_timeout:
+                        delay()
+                    else:
+                        return result
+                except (error.CrosDynamicSuiteException,
+                        proxy.ValidationError):
+                    raise
+                except ExceptionToCheck as e:
+                    logging.warning("%s(%s)", e.__class__, e)
+                    # Cache the exception to be raised later.
+                    exc_info = sys.exc_info()
+                    delay()
+            # The call must have timed out or raised ExceptionToCheck.
+            if exc_info is None:
+                raise TimeoutException('Call is timed out.')
             else:
-                # On the last try, run func() and allow exceptions to escape.
-                return func(*args, **kwargs)
-            return
+                # Raise the cached exception with original backtrace.
+                raise exc_info[0], exc_info[1], exc_info[2]
+
+
         return func_retry  # true decorator
     return deco_retry
diff --git a/client/common_lib/cros/retry_unittest.py b/client/common_lib/cros/retry_unittest.py
index abf0536..0f27e3b 100644
--- a/client/common_lib/cros/retry_unittest.py
+++ b/client/common_lib/cros/retry_unittest.py
@@ -8,6 +8,7 @@
 import mox
 import time
 import unittest
+import signal
 
 from autotest_lib.client.common_lib.cros import retry
 from autotest_lib.client.common_lib import error
@@ -88,3 +89,53 @@
         self.mox.StubOutWithMock(time, 'sleep')
         self.mox.ReplayAll()
         self.assertRaises(proxy.ValidationError, fail)
+
+
+    def testRetryDecoratorFailsWithTimeout(self):
+        """Tests that a wrapped function retries til the timeout, then fails."""
+        @retry.retry(Exception, timeout_min=0.02, delay_sec=0.1)
+        def fail():
+            time.sleep(2)
+            return True
+
+        self.mox.ReplayAll()
+        #self.assertEquals(None, fail())
+        self.assertRaises(retry.TimeoutException, fail)
+
+    def testRetryDecoratorSucceedsBeforeTimeout(self):
+        """Tests that a wrapped function succeeds before the timeout."""
+        @retry.retry(Exception, timeout_min=0.02, delay_sec=0.1)
+        def succeed():
+            time.sleep(0.1)
+            return True
+
+        self.mox.ReplayAll()
+        self.assertTrue(succeed())
+
+
+    def testRetryDecoratorSucceedsWithExistingSignal(self):
+        """Tests that a wrapped function succeeds before the timeout and
+        previous signal being restored."""
+        class TestTimeoutException(Exception):
+            pass
+
+        def testFunc():
+            @retry.retry(Exception, timeout_min=0.05, delay_sec=0.1)
+            def succeed():
+                time.sleep(0.1)
+                return True
+
+            succeed()
+            # Wait for 1.5 second for previous signal to be raised
+            time.sleep(1.5)
+
+        def testHandler(signum, frame):
+            """
+            Register a handler for the timeout.
+            """
+            raise TestTimeoutException('Expected timed out.')
+
+        signal.signal(signal.SIGALRM, testHandler)
+        signal.alarm(1)
+        self.mox.ReplayAll()
+        self.assertRaises(TestTimeoutException, testFunc)