faft: Identify the TestFail of firmware failed to auto-boot USB

On dev mode, USB inserted, triggering recovery will directly boot into USB.
However, we saw some TestFails that firmware failed to auto-boot USB.
Repluging the USB can help. This firmware mis-behavior should be identified
and report correctly. This CL tries to do so.

BUG=chromium-os:23309
TEST=on the firmware which failed to directly boot into USB, run:
$ run_remote_tests.sh --board link --remote dut RecoveryButton/control$
...
14:57:33 INFO | Server: Client machine is offline.
15:01:42 INFO | wait_for_client() timed out.
15:01:42 INFO | Try to retrieve recovery reason...
15:01:52 INFO | Setting usb_mux_sel1 to servo_sees_usbkey
15:02:02 INFO | Setting usb_mux_sel1 to dut_sees_usbkey
15:02:29 INFO | Server: Client machine is up.
...
15:02:45 INFO | Server: Relaunched remote faft.
15:02:45 INFO | Got the recovery reason 193.
15:02:45 INFO | calling <bound method firmware_UserRequestRecovery.crossystem_checker of <firmware_UserRequestRecovery.firmware_UserRequestRecovery object at 0x113fdd0>> with args ({'recovery_reason': '193', 'mainfw_type': 'recovery'},)
15:02:45 INFO | Try cold reboot...
...

Change-Id: I1fd6a157e901fb3bc2f720b767444936f1b9510e
Reviewed-on: https://gerrit.chromium.org/gerrit/37003
Commit-Ready: Tom Wai-Hong Tam <waihong@chromium.org>
Reviewed-by: Tom Wai-Hong Tam <waihong@chromium.org>
Tested-by: Tom Wai-Hong Tam <waihong@chromium.org>
diff --git a/server/cros/faftsequence.py b/server/cros/faftsequence.py
index baa17c5..cde9259 100644
--- a/server/cros/faftsequence.py
+++ b/server/cros/faftsequence.py
@@ -75,8 +75,6 @@
             test image to be installed.
         _firmware_update: Boolean. True if firmware update needed after
             installing the image.
-        _trapped_in_recovery_reason: Keep the recovery reason when the test is
-            trapped in the recovery screen.
     """
     version = 1
 
@@ -97,7 +95,6 @@
 
     _install_image_path = None
     _firmware_update = False
-    _trapped_in_recovery_reason = 0
 
     _backup_firmware_sha = ()
 
@@ -215,25 +212,13 @@
         self.unmark_setup_done('gbb_flags')
 
 
-    def reset_client(self):
-        """Reset client, if necessary.
+    def _retrieve_recovery_reason_from_trap(self):
+        """Try to retrieve the recovery reason from a trapped recovery screen.
 
-        This method is called when the client is not responsive. It may be
-        caused by the following cases:
-          - network flaky (can be recovered by replugging the Ethernet);
-          - halt on a firmware screen without timeout, e.g. REC_INSERT screen;
-          - corrupted firmware;
-          - corrutped OS image.
+        Returns:
+          The recovery_reason, 0 if any error.
         """
-        # DUT works fine, done.
-        if self._ping_test(self._client.ip, timeout=5):
-            return
-
-        # TODO(waihong@chromium.org): Implement replugging the Ethernet in the
-        # first reset item.
-
-        # DUT may be trapped in the recovery screen. Try to boot into USB to
-        # retrieve the recovery reason.
+        recovery_reason = 0
         logging.info('Try to retrieve recovery reason...')
         if self.servo.get('usb_mux_sel1') == 'dut_sees_usbkey':
             self.wait_fw_screen_and_plug_usb()
@@ -244,12 +229,22 @@
             self.wait_for_client(install_deps=True)
             lines = self.faft_client.run_shell_command_get_output(
                         'crossystem recovery_reason')
-            self._trapped_in_recovery_reason = int(lines[0])
-            logging.info('Got the recovery reason %d.',
-                         self._trapped_in_recovery_reason)
+            recovery_reason = int(lines[0])
+            logging.info('Got the recovery reason %d.', recovery_reason)
         except AssertionError:
             logging.info('Failed to get the recovery reason.')
+        return recovery_reason
 
+
+    def _reset_client(self):
+        """Reset client to a workable state.
+
+        This method is called when the client is not responsive. It may be
+        caused by the following cases:
+          - halt on a firmware screen without timeout, e.g. REC_INSERT screen;
+          - corrupted firmware;
+          - corrutped OS image.
+        """
         # DUT may halt on a firmware screen. Try cold reboot.
         logging.info('Try cold reboot...')
         self.cold_reboot()
@@ -264,7 +259,7 @@
         # code is in RO region and all FAFT tests don't change the RO region
         # except GBB.
         if self.is_firmware_saved():
-            self.ensure_client_in_recovery()
+            self._ensure_client_in_recovery()
             logging.info('Try restore the original firmware...')
             if self.is_firmware_changed():
                 try:
@@ -274,7 +269,7 @@
                     logging.info('Restoring firmware doesn\'t help.')
 
         # DUT may be broken by a corrupted OS image. Restore OS image.
-        self.ensure_client_in_recovery()
+        self._ensure_client_in_recovery()
         logging.info('Try restore the OS image...')
         self.faft_client.run_shell_command('chromeos-install --yes')
         self.sync_and_warm_reboot()
@@ -287,7 +282,7 @@
             logging.info('Restoring OS image doesn\'t help.')
 
 
-    def ensure_client_in_recovery(self):
+    def _ensure_client_in_recovery(self):
         """Ensure client in recovery boot; reboot into it if necessary.
 
         Raises:
@@ -308,6 +303,49 @@
             raise error.TestError('Failed to boot the USB image.')
 
 
+    def _restore_routine_from_timeout(self, next_step=None):
+        """A routine to try to restore the system from a timeout error.
+
+        This method is called when FAFT failed to connect DUT after reboot.
+
+        Args:
+          next_step: Optional, a FAFT_STEP dict of the next step, which is used
+                     for diagnostic.
+
+        Raises:
+          error.TestFail: This exception is already raised, with a decription
+                          why it failed.
+        """
+        next_checker_matched = False
+        if next_step is not None:
+            next_test = {}
+            next_test.update(self._faft_template)
+            next_test.update(next_step)
+
+        # TODO(waihong@chromium.org): Implement replugging the Ethernet to
+        # identify if it is a network flaky.
+
+        recovery_reason = self._retrieve_recovery_reason_from_trap()
+        if next_step is not None and recovery_reason:
+            if self._call_action(next_test['state_checker']):
+                # Repluging the USB can pass the state_checker of the next step,
+                # meaning that the firmware failed to boot into USB directly.
+                next_checker_matched = True
+
+        # Reset client to a workable state.
+        self._reset_client()
+
+        # Raise the proper TestFail exception.
+        if next_checker_matched:
+            raise error.TestFail('Firmware failed to auto-boot USB in the '
+                                 'recovery reason: %d.' % recovery_reason)
+        elif recovery_reason:
+            raise error.TestFail('Trapped in the recovery reason: %d' %
+                                 recovery_reason)
+        else:
+            raise error.TestFail('Timed out waiting for DUT reboot.')
+
+
     def assert_test_image_in_path(self, image_path):
         """Assert the image of image_path be a Chrome OS test image.
 
@@ -1239,7 +1277,7 @@
         self._faft_sequence = sequence
 
 
-    def run_faft_step(self, step, no_reboot=False):
+    def run_faft_step(self, step, no_reboot=False, next_step=None):
         """Run a single FAFT step.
 
         Any missing field falls back to faft_template. An empty step means
@@ -1248,6 +1286,8 @@
         Args:
           step: A FAFT_STEP dict.
           no_reboot: True to prevent running reboot_action and firmware_action.
+          next_step: Optional, a FAFT_STEP dict of the next step, which is used
+                     for diagnostic.
 
         Raises:
           error.TestError: An error when the given step is not valid.
@@ -1283,24 +1323,20 @@
                     self.wait_for_client()
             except AssertionError:
                 logging.info('wait_for_client() timed out.')
-                self.reset_client()
-                if self._trapped_in_recovery_reason:
-                    raise error.TestFail('Trapped in the recovery reason: %d' %
-                                          self._trapped_in_recovery_reason)
-                else:
-                    raise error.TestFail('Timed out waiting for DUT reboot.')
+                self._restore_routine_from_timeout(next_step)
 
 
     def run_faft_sequence(self):
         """Run FAFT sequence which was previously registered."""
         sequence = self._faft_sequence
-        index = 1
-        for step in sequence:
+        for index, step in enumerate(sequence):
             logging.info('======== Running FAFT sequence step %d ========',
-                         index)
+                         index + 1)
             # Don't reboot in the last step.
-            self.run_faft_step(step, no_reboot=(step is sequence[-1]))
-            index += 1
+            if index == len(sequence) - 1:
+                self.run_faft_step(step, no_reboot=True)
+            else:
+                self.run_faft_step(step, next_step=sequence[index + 1])
 
 
     def get_current_firmware_sha(self):