[autotest] Integrate crashlog collection into repair workflow.

When a DUT goes offline before its logs have been gathered, those
logs are lost if the DUT is re-imaged. To recover as many of these
logs as possible, integrate crashlog collection into the repair
workflow.
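
A minimal sketch of the intended repair-time flow (hypothetical driver
code; only try_collect_crashlogs(), check_device(), and the repair
function list come from this CL):

    # `host` is a CrosHost; the driver function itself is an assumption.
    def repair_with_crashlog_collection(host, repair_funcs):
        # Grab any pending crash-logs before touching the DUT.
        host.try_collect_crashlogs()
        for repair_func in repair_funcs:
            repair_func()
            # A repair step may have revived or re-imaged the DUT;
            # collect again before the logs can be lost.
            host.try_collect_crashlogs()
            host.check_device()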

BUG=chromium:215160
TEST=./server/autoserv -R -m <DUT ip> -r ~/tmp/repair
CQ-DEPEND=CL:221510

Change-Id: Ifd562bfd539b133381572aeec503d9a3940ab448
Reviewed-on: https://chromium-review.googlesource.com/219999
Reviewed-by: Fang Deng <fdeng@chromium.org>
Commit-Queue: Mungyung Ryu <mkryu@google.com>
Tested-by: Mungyung Ryu <mkryu@google.com>
diff --git a/server/constants.py b/server/constants.py
new file mode 100644
index 0000000..4abd631
--- /dev/null
+++ b/server/constants.py
@@ -0,0 +1,8 @@
+# Prefix of directory where collected crash-logs are stored.
+CRASHLOGS_DEST_DIR_PREFIX = 'crashlogs'
+
+# Host attribute marking a host whose logs still need to be collected.
+CRASHLOGS_HOST_ATTRIBUTE = 'need_crash_logs'
+
+# Marker file that is left when crash-logs are collected.
+CRASHLOGS_MARKER = '.crashjob'
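
A quick illustration of how these constants are expected to compose the
crash-log destination and marker paths (the results directory and hostname
below are made-up examples; the real composition lives in crashcollect.py
and cros_host.py later in this CL):

    import os
    from autotest_lib.server import constants

    results_dir = '/usr/local/autotest/results/repair'  # example, assumption
    hostname = 'chromeos1-rack1-host1'                   # example, assumption
    crashlogs_dir = os.path.join(
            results_dir,
            '%s.%s' % (constants.CRASHLOGS_DEST_DIR_PREFIX, hostname))
    marker = os.path.join(crashlogs_dir, constants.CRASHLOGS_MARKER)
    # crashlogs_dir == '.../repair/crashlogs.chromeos1-rack1-host1'
    # marker        == '.../repair/crashlogs.chromeos1-rack1-host1/.crashjob'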
diff --git a/server/crashcollect.py b/server/crashcollect.py
index a30bf61..b82a036 100644
--- a/server/crashcollect.py
+++ b/server/crashcollect.py
@@ -37,7 +37,7 @@
         # run any site-specific collection
         get_site_crashinfo(host, test_start_time)
 
-        crashinfo_dir = get_crashinfo_dir(host)
+        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
         collect_messages(host)
         collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
         collect_uncollected_logs(host)
@@ -90,10 +90,11 @@
         return True
 
 
-def get_crashinfo_dir(host):
+def get_crashinfo_dir(host, dir_prefix):
     """Find and if necessary create a directory to store crashinfo in.
 
     @param host: The RemoteHost object that crashinfo will be collected from
+    @param dir_prefix: Prefix of directory name.
 
     @returns: The path to an existing directory for writing crashinfo into
     """
@@ -102,7 +103,7 @@
         infodir = host_resultdir
     else:
         infodir = os.path.abspath(os.getcwd())
-    infodir = os.path.join(infodir, "crashinfo.%s" % host.hostname)
+    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
     if not os.path.exists(infodir):
         os.mkdir(infodir)
     return infodir
@@ -189,7 +190,7 @@
 
     @param host: The RemoteHost to collect from
     """
-    crashinfo_dir = get_crashinfo_dir(host)
+    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
 
     try:
         # paths to the messages files
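
A short usage sketch of the widened get_crashinfo_dir() signature (the
`host` object is assumed to be a connected RemoteHost):

    from autotest_lib.server import constants
    from autotest_lib.server import crashcollect

    # Existing callers keep the old layout by passing 'crashinfo' explicitly,
    info_dir = crashcollect.get_crashinfo_dir(host, 'crashinfo')
    # while the repair workflow stores crash-logs under a separate prefix.
    logs_dir = crashcollect.get_crashinfo_dir(
            host, constants.CRASHLOGS_DEST_DIR_PREFIX)
    # Both resolve to '<results dir>/<prefix>.<hostname>' and are created
    # on demand.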
diff --git a/server/hosts/abstract_ssh.py b/server/hosts/abstract_ssh.py
index 91722b4..39af9f9 100644
--- a/server/hosts/abstract_ssh.py
+++ b/server/hosts/abstract_ssh.py
@@ -738,5 +738,3 @@
                 shutil.rmtree(local_dest_dir, ignore_errors=ignore_errors)
             if not ignore_errors:
                 raise
-
-
diff --git a/server/hosts/cros_host.py b/server/hosts/cros_host.py
index 315caf6..f6b25c6 100644
--- a/server/hosts/cros_host.py
+++ b/server/hosts/cros_host.py
@@ -22,9 +22,12 @@
 from autotest_lib.client.common_lib.cros import dev_server
 from autotest_lib.client.common_lib.cros import retry
 from autotest_lib.client.common_lib.cros.graphite import stats
-from autotest_lib.client.cros import constants
+from autotest_lib.client.cros import constants as client_constants
 from autotest_lib.client.cros import cros_ui
-from autotest_lib.server import autoserv_parser, autotest
+from autotest_lib.server import autoserv_parser
+from autotest_lib.server import autotest
+from autotest_lib.server import constants
+from autotest_lib.server import crashcollect
 from autotest_lib.server import utils as server_utils
 from autotest_lib.server.cros import provision
 from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
@@ -170,6 +173,23 @@
     _FW_IMAGE_URL_PATTERN = global_config.global_config.get_config_value(
             'CROS', 'firmware_url_pattern', type=str)
 
+    # File that has a list of directories to be collected
+    _LOGS_TO_COLLECT_FILE = os.path.join(
+            common.client_dir, 'common_lib', 'logs_to_collect')
+
+    # Prefix of logging message w.r.t. crash collection
+    _CRASHLOGS_PREFIX = 'collect_crashlogs'
+
+    # Timeout in seconds for the host up/down check
+    _CHECK_HOST_UP_TIMEOUT_SECS = 15
+
+    # A command that touches the kernel and storage hardware (e.g. rm, mkdir)
+    # may not have fully propagated to the hardware by the time it returns, so
+    # data can be lost if the machine is powered down right afterwards.
+    # Wait a few seconds to give such commands a chance to complete.
+    _SAFE_WAIT_SECS = 10
+
+
     @staticmethod
     def check_host(host, timeout=10):
         """
@@ -816,8 +836,8 @@
 
     def show_update_engine_log(self):
         """Output update engine log."""
-        logging.debug('Dumping %s', constants.UPDATE_ENGINE_LOG)
-        self.run('cat %s' % constants.UPDATE_ENGINE_LOG)
+        logging.debug('Dumping %s', client_constants.UPDATE_ENGINE_LOG)
+        self.run('cat %s' % client_constants.UPDATE_ENGINE_LOG)
 
 
     def _get_board_from_afe(self):
@@ -931,7 +951,9 @@
         timer = stats.Timer(install_timer_key)
         timer.start()
         logging.info('Installing image through chromeos-install.')
-        self.run('chromeos-install --yes', timeout=install_timeout)
+        self.run('chromeos-install --yes --lab_preserve_logs=%s' %
+                 self._LOGS_TO_COLLECT_FILE,
+                 timeout=install_timeout)
         timer.stop()
 
         logging.info('Power cycling DUT through servo.')
@@ -1027,6 +1049,14 @@
                          failed_cycles)
 
 
+    def _reboot_repair(self):
+        """SSH to this host and reboot."""
+        if not self.is_up(self._CHECK_HOST_UP_TIMEOUT_SECS):
+            raise error.AutoservRepairMethodNA('DUT unreachable for reboot.')
+        logging.info('Attempting repair via SSH reboot.')
+        self.reboot(timeout=self.BOOT_TIMEOUT, wait=True)
+
+
     def check_device(self):
         """Check if a device is ssh-able, and if so, clean and verify it.
 
@@ -1057,15 +1087,17 @@
         the status using `self.check_device()` after each of them. This is done
         until both the repair and the veryfing step succeed.
 
-        Escalation order of repair procedures:
-          1. Try to re-install to a known stable image using
-             auto-update.
+        Escalation order of repair procedures, from least intrusive to
+        most intrusive:
+          1. SSH to the DUT and reboot.
           2. If there's a servo for the DUT, try to power the DUT off and
              on.
-          3. If there's a servo for the DUT, try to re-install via
-             the servo.
-          4. If the DUT can be power-cycled via RPM, try to repair
+          3. If the DUT can be power-cycled via RPM, try to repair
              by power-cycling.
+          4. Try to re-install to a known stable image using
+             auto-update.
+          5. If there's a servo for the DUT, try to re-install via
+             the servo.
 
         As with the parent method, the last operation performed on
         the DUT must be to call `self.check_device()`; If that call fails the
@@ -1088,19 +1120,23 @@
                 logging.error('Could not create a healthy servo: %s', e)
             self.servo = self._servo_host.get_servo()
 
+        self.try_collect_crashlogs()
+
         # TODO(scottz): This should use something similar to label_decorator,
         # but needs to be populated in order so DUTs are repaired with the
         # least amount of effort.
-        repair_funcs = [self._servo_repair_power,
+        repair_funcs = [self._reboot_repair,
+                        self._servo_repair_power,
+                        self._powercycle_to_repair,
                         self._install_repair,
                         self._install_repair_with_powerwash,
-                        self._servo_repair_reinstall,
-                        self._powercycle_to_repair]
+                        self._servo_repair_reinstall]
         errors = []
         board = self._get_board_from_afe()
         for repair_func in repair_funcs:
             try:
                 repair_func()
+                self.try_collect_crashlogs()
                 self.check_device()
                 stats.Counter(
                         '%s.SUCCEEDED' % repair_func.__name__).increment()
@@ -1137,6 +1173,142 @@
                 '\n'.join(errors))
 
 
+    def try_collect_crashlogs(self, check_host_up=True):
+        """
+        Check if a host is up and logs need to be collected from the host,
+        if yes, collect them.
+
+        @param check_host_up: Flag for checking host is up. Default is True.
+        """
+        try:
+            crash_job = self._need_crash_logs()
+            if crash_job:
+                logging.debug('%s: Job %s crashed', self._CRASHLOGS_PREFIX,
+                              crash_job)
+                if not check_host_up or self.is_up(
+                        self._CHECK_HOST_UP_TIMEOUT_SECS):
+                    self._collect_crashlogs(crash_job)
+                    logging.debug('%s: Completed collecting logs for the '
+                                  'crashed job %s', self._CRASHLOGS_PREFIX,
+                                  crash_job)
+        except Exception as e:
+            # Exception should not result in repair failure.
+            # Therefore, suppress all exceptions here.
+            logging.error('%s: Failed while trying to collect crash-logs: %s',
+                          self._CRASHLOGS_PREFIX, e)
+
+
+    def _need_crash_logs(self):
+        """Get the value of need_crash_logs attribute of this host.
+
+        @return: Value string of need_crash_logs attribute
+                 None if there is no need_crash_logs attribute
+        """
+        attrs = self._AFE.get_host_attribute(constants.CRASHLOGS_HOST_ATTRIBUTE,
+                                             hostname=self.hostname)
+        assert len(attrs) < 2
+        return attrs[0].value if attrs else None
+
+
+    def _collect_crashlogs(self, job_id):
+        """Grab logs from the host where a job was crashed.
+
+        First, check if PRIOR_LOGS_DIR exists in the host.
+        If yes, collect them.
+        Otherwise, check if a lab-machine marker (_LAB_MACHINE_FILE) exists
+        in the host.
+        If yes, the host was repaired automatically, and we collect normal
+        system logs.
+
+        @param job_id: Id of the job that was crashed.
+        """
+        crashlogs_dir = crashcollect.get_crashinfo_dir(self,
+                constants.CRASHLOGS_DEST_DIR_PREFIX)
+        flag_prior_logs = False
+
+        if self.path_exists(client_constants.PRIOR_LOGS_DIR):
+            flag_prior_logs = True
+            self._collect_prior_logs(crashlogs_dir)
+        elif self.path_exists(self._LAB_MACHINE_FILE):
+            self._collect_system_logs(crashlogs_dir)
+        else:
+            logging.warning('%s: Host was manually re-installed without the '
+                            '--lab_preserve_logs option. Skipping crash-log '
+                            'collection.', self._CRASHLOGS_PREFIX)
+
+        # Crash collection is a one-time effort.
+        # _collect_prior_logs() and _collect_system_logs() never raise, so
+        # the code below runs even when those methods fail.
+        # _collect_crashlogs() is called only when the host is up (see
+        # try_collect_crashlogs()), and we assume _collect_prior_logs() and
+        # _collect_system_logs() rarely fail while the host is up.
+        # In addition, it is not clear how many times crash collection
+        # should be retried without triggering the next repair
+        # unnecessarily.
+        # Therefore, crash collection is attempted exactly once.
+
+        # Create a marker file as soon as log collection is done.
+        # Write the job id into the marker for gs_offloader to consume.
+        marker_file = os.path.join(crashlogs_dir, constants.CRASHLOGS_MARKER)
+        with open(marker_file, 'a') as f:
+            f.write('%s\n' % job_id)
+
+        # Remove need_crash_logs attribute
+        logging.debug('%s: Remove attribute need_crash_logs from host %s',
+                      self._CRASHLOGS_PREFIX, self.hostname)
+        self._AFE.set_host_attribute(constants.CRASHLOGS_HOST_ATTRIBUTE,
+                                     None, hostname=self.hostname)
+
+        if flag_prior_logs:
+            logging.debug('%s: Remove %s from host %s', self._CRASHLOGS_PREFIX,
+                          client_constants.PRIOR_LOGS_DIR, self.hostname)
+            self.run('rm -rf %s; sync' % client_constants.PRIOR_LOGS_DIR)
+            # Wait a few seconds to give the removal a chance to be
+            # flushed all the way down to storage.
+            time.sleep(self._SAFE_WAIT_SECS)
+
+
+    def _collect_prior_logs(self, crashlogs_dir):
+        """Grab prior logs that were stashed before re-installing a host.
+
+        @param crashlogs_dir: Directory path where crash-logs are stored.
+        """
+        logging.debug('%s: Found %s, collecting them...',
+                      self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR)
+        try:
+            self.collect_logs(client_constants.PRIOR_LOGS_DIR,
+                              crashlogs_dir, False)
+            logging.debug('%s: Collected %s',
+                          self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR)
+        except Exception as e:
+            logging.error('%s: Failed to collect %s: %s',
+                          self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR,
+                          e)
+
+
+    def _collect_system_logs(self, crashlogs_dir):
+        """Grab normal system logs from a host.
+
+        @param crashlogs_dir: Directory path where crash-logs are stored.
+        """
+        logging.debug('%s: Found %s, collecting system logs...',
+                      self._CRASHLOGS_PREFIX, self._LAB_MACHINE_FILE)
+        sources = server_utils.parse_simple_config(self._LOGS_TO_COLLECT_FILE)
+        for src in sources:
+            try:
+                if self.path_exists(src):
+                    logging.debug('%s: Collecting %s...',
+                                  self._CRASHLOGS_PREFIX, src)
+                    dest = server_utils.concat_path_except_last(
+                            crashlogs_dir, src)
+                    self.collect_logs(src, dest, False)
+                    logging.debug('%s: Collected %s',
+                                  self._CRASHLOGS_PREFIX, src)
+            except Exception as e:
+                logging.error('%s: Failed to collect %s: %s',
+                              self._CRASHLOGS_PREFIX, src, e)
+
+
     def close(self):
         self.rpc_disconnect_all()
         super(CrosHost, self).close()
@@ -1194,7 +1366,7 @@
 
 
     def cleanup(self):
-        self.run('rm -f %s' % constants.CLEANUP_LOGS_PAUSED_FILE)
+        self.run('rm -f %s' % client_constants.CLEANUP_LOGS_PAUSED_FILE)
         try:
             self._restart_ui()
         except (error.AutotestRunError, error.AutoservRunError,
@@ -1271,6 +1443,12 @@
          4. update_engine answers a simple status request over DBus.
 
         """
+        # Check if a job crashed on this host.
+        # If so, skip verification until crash-logs have been collected.
+        if self._need_crash_logs():
+            raise error.AutoservCrashLogCollectRequired(
+                    'Need to collect crash-logs before verification')
+
         super(CrosHost, self).verify_software()
         self.check_diskspace(
             '/mnt/stateful_partition',
@@ -1935,7 +2113,7 @@
             of "chrome --version" and the milestone will be the empty string.
 
         """
-        version_string = self.run(constants.CHROME_VERSION_COMMAND).stdout
+        version_string = self.run(client_constants.CHROME_VERSION_COMMAND).stdout
         return utils.parse_chrome_version(version_string)
 
     @label_decorator()
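
A rough sketch of how the new verify_software() guard is meant to interact
with crash-log collection. The caller-side handling below is an assumption;
only the CrosHost methods and the exception name come from this CL and its
CQ-DEPEND change:

    from autotest_lib.client.common_lib import error

    def verify_or_collect_first(host):
        # Hypothetical caller; repair() already does the equivalent by
        # calling try_collect_crashlogs() before verification.
        try:
            host.verify_software()
        except error.AutoservCrashLogCollectRequired:
            host.try_collect_crashlogs()
            host.verify_software()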
diff --git a/server/site_utils.py b/server/site_utils.py
index b0fa276..7d95215 100644
--- a/server/site_utils.py
+++ b/server/site_utils.py
@@ -6,6 +6,7 @@
 import httplib
 import json
 import logging
+import os
 import random
 import re
 import time
@@ -309,3 +310,44 @@
         test_views[view['test_name']] = view['status']
 
     return test_views
+
+
+def parse_simple_config(config_file):
+    """Get paths by parsing a simple config file.
+
+    Each line of the config file is the path of a file or directory.
+    Empty lines and lines starting with a hash character ('#') are ignored.
+    One example of this kind of simple config file is
+    client/common_lib/logs_to_collect.
+
+    @param config_file: Config file path
+    @return: A list of path strings
+    """
+    dirs = []
+    for l in open(config_file):
+        l = l.strip()
+        if l and not l.startswith('#'):
+            dirs.append(l)
+    return dirs
+
+
+def concat_path_except_last(base, sub):
+    """Concatenate two paths but exclude last entry.
+
+    Take two paths as parameters and return a path string in which
+    the second path becomes under the first path.
+    In addition, remove the last path entry from the concatenated path.
+    This works even when two paths are absolute paths.
+
+    e.g., /usr/local/autotest/results/ + /var/log/ =
+    /usr/local/autotest/results/var
+
+    e.g., /usr/local/autotest/results/ + /var/log/syslog =
+    /usr/local/autotest/results/var/log
+
+    @param base: Beginning path
+    @param sub: The path that is concatenated to base
+    @return: Concatenated path string
+    """
+    dirname = os.path.dirname(sub.rstrip('/'))
+    return os.path.join(base, dirname.strip('/'))
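
Usage examples for the two new helpers; the paths are illustrative and
mirror the docstrings above:

    from autotest_lib.server import site_utils

    # logs_to_collect has one path per line; blanks and '#' lines are skipped.
    sources = site_utils.parse_simple_config(
            '/usr/local/autotest/client/common_lib/logs_to_collect')

    site_utils.concat_path_except_last('/usr/local/autotest/results/',
                                       '/var/log/')
    # -> '/usr/local/autotest/results/var'
    site_utils.concat_path_except_last('/usr/local/autotest/results/',
                                       '/var/log/syslog')
    # -> '/usr/local/autotest/results/var/log'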