[autotest] Record exceptional job termination correctly

Ensure that exceptions that occur during Reimager.attempt get caught and
logged, so that we can record 'END ERROR' appropriately.

BUG=chromium-os:26550
TEST=new unit test
TEST=manually ran a job with a doomed-to-fail RPC in it
STATUS=Fixed

Change-Id: Ieb076c14424cb7699edfdb88079ea2b43de279e0
Reviewed-on: https://gerrit.chromium.org/gerrit/16441
Commit-Ready: Chris Masone <cmasone@chromium.org>
Reviewed-by: Chris Masone <cmasone@chromium.org>
Tested-by: Chris Masone <cmasone@chromium.org>
diff --git a/server/cros/dynamic_suite.py b/server/cros/dynamic_suite.py
index ed023fa..e0fd6d0 100644
--- a/server/cros/dynamic_suite.py
+++ b/server/cros/dynamic_suite.py
@@ -84,11 +84,11 @@
                       x86-alex-release/R18-1655.0.0-a1-b1584.
         @param board: which kind of devices to reimage.
         @param record: callable that records job status.
-                 prototype:
-                   record(status, subdir, name, reason)
+                       prototype:
+                         record(status, subdir, name, reason)
         @param num: how many devices to reimage.
         @param pool: Specify the pool of machines to use for scheduling
-                purposes.
+                     purposes.
         @return True if all reimaging jobs succeed, false otherwise.
         """
         if not num:
@@ -98,16 +98,22 @@
         logging.debug("scheduling reimaging across %d machines", num)
         wrapper_job_name = 'try new image'
         record('START', None, wrapper_job_name)
-        self._ensure_version_label(VERSION_PREFIX + build)
-        canary = self._schedule_reimage_job(build, num, board)
-        logging.debug('Created re-imaging job: %d', canary.id)
-        while len(self._afe.get_jobs(id=canary.id, not_yet_run=True)) > 0:
-            time.sleep(10)
-        logging.debug('Re-imaging job running.')
-        while len(self._afe.get_jobs(id=canary.id, finished=True)) == 0:
-            time.sleep(10)
-        logging.debug('Re-imaging job finished.')
-        canary.result = self._afe.poll_job_results(self._tko, canary, 0)
+        try:
+            self._ensure_version_label(VERSION_PREFIX + build)
+            canary = self._schedule_reimage_job(build, num, board)
+            logging.debug('Created re-imaging job: %d', canary.id)
+            while len(self._afe.get_jobs(id=canary.id, not_yet_run=True)) > 0:
+                time.sleep(10)
+            logging.debug('Re-imaging job running.')
+            while len(self._afe.get_jobs(id=canary.id, finished=True)) == 0:
+                time.sleep(10)
+            logging.debug('Re-imaging job finished.')
+            canary.result = self._afe.poll_job_results(self._tko, canary, 0)
+        except Exception as e:
+            # catch Exception so we record the job as terminated no matter what.
+            logging.error(e)
+            record('END ERROR', None, wrapper_job_name, str(e))
+            return False
 
         if canary.result is True:
             self._report_results(canary, record)
diff --git a/server/cros/dynamic_suite_unittest.py b/server/cros/dynamic_suite_unittest.py
index 2eb934e..04ca80a 100755
--- a/server/cros/dynamic_suite_unittest.py
+++ b/server/cros/dynamic_suite_unittest.py
@@ -167,10 +167,11 @@
         self.reimager._schedule_reimage_job(self._BUILD, self._NUM, self._BOARD)
 
 
-    def expect_attempt(self, success):
+    def expect_attempt(self, success, ex=None):
         """Sets up |self.reimager| to expect an attempt() that returns |success|
 
-        @param success the value returned by poll_job_results()
+        @param success: the value returned by poll_job_results()
+        @param ex: if not None, |ex| is raised by get_jobs()
         @return a FakeJob configured with appropriate expectations
         """
         canary = FakeJob()
@@ -186,8 +187,12 @@
             self.reimager._report_results(canary, mox.IgnoreArg())
 
         self.afe.get_jobs(id=canary.id, not_yet_run=True).AndReturn([])
-        self.afe.get_jobs(id=canary.id, finished=True).AndReturn([canary])
-        self.afe.poll_job_results(mox.IgnoreArg(), canary, 0).AndReturn(success)
+        if ex is not None:
+            self.afe.get_jobs(id=canary.id, finished=True).AndRaise(ex)
+        else:
+            self.afe.get_jobs(id=canary.id, finished=True).AndReturn([canary])
+            self.afe.poll_job_results(mox.IgnoreArg(),
+                                      canary, 0).AndReturn(success)
 
         return canary
 
@@ -226,6 +231,18 @@
         self.reimager.attempt(self._BUILD, self._BOARD, rjob.record)
 
 
+    def testReimageThatRaised(self):
+        """Should attempt a reimage that raises an exception and record that."""
+        ex_message = 'Oh no!'
+        canary = self.expect_attempt(None, Exception(ex_message))
+
+        rjob = self.mox.CreateMock(base_job.base_job)
+        rjob.record('START', mox.IgnoreArg(), mox.IgnoreArg())
+        rjob.record('END ERROR', mox.IgnoreArg(), mox.IgnoreArg(), ex_message)
+        self.mox.ReplayAll()
+        self.reimager.attempt(self._BUILD, self._BOARD, rjob.record)
+
+
 class SuiteTest(mox.MoxTestBase):
     """Unit tests for dynamic_suite.Suite.