[autotest] Integrate crashlog collection into repair workflow.
When a DUT goes offline before its logs are gathered, those logs
are lost if the DUT is re-imaged.
To recover as many of these logs as we can, we integrate crashlog
collection into the repair workflow.
BUG=chromium:215160
TEST=./server/autoserv -R -m <DUT ip> -r ~/tmp/repair
CQ-DEPEND=CL:221510
Change-Id: Ifd562bfd539b133381572aeec503d9a3940ab448
Reviewed-on: https://chromium-review.googlesource.com/219999
Reviewed-by: Fang Deng <fdeng@chromium.org>
Commit-Queue: Mungyung Ryu <mkryu@google.com>
Tested-by: Mungyung Ryu <mkryu@google.com>
diff --git a/server/constants.py b/server/constants.py
new file mode 100644
index 0000000..4abd631
--- /dev/null
+++ b/server/constants.py
@@ -0,0 +1,8 @@
+# Prefix of directory where collected crash-logs are stored.
+CRASHLOGS_DEST_DIR_PREFIX = 'crashlogs'
+
+# Host attribute indicating that crash-logs need to be collected from it.
+CRASHLOGS_HOST_ATTRIBUTE = 'need_crash_logs'
+
+# Marker file that is left when crash-logs are collected.
+CRASHLOGS_MARKER = '.crashjob'
diff --git a/server/crashcollect.py b/server/crashcollect.py
index a30bf61..b82a036 100644
--- a/server/crashcollect.py
+++ b/server/crashcollect.py
@@ -37,7 +37,7 @@
# run any site-specific collection
get_site_crashinfo(host, test_start_time)
- crashinfo_dir = get_crashinfo_dir(host)
+ crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
collect_messages(host)
collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
collect_uncollected_logs(host)
@@ -90,10 +90,11 @@
return True
-def get_crashinfo_dir(host):
+def get_crashinfo_dir(host, dir_prefix):
"""Find and if necessary create a directory to store crashinfo in.
@param host: The RemoteHost object that crashinfo will be collected from
+ @param dir_prefix: Prefix of directory name.
@returns: The path to an existing directory for writing crashinfo into
"""
@@ -102,7 +103,7 @@
infodir = host_resultdir
else:
infodir = os.path.abspath(os.getcwd())
- infodir = os.path.join(infodir, "crashinfo.%s" % host.hostname)
+ infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
if not os.path.exists(infodir):
os.mkdir(infodir)
return infodir
@@ -189,7 +190,7 @@
@param host: The RemoteHost to collect from
"""
- crashinfo_dir = get_crashinfo_dir(host)
+ crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
try:
# paths to the messages files
diff --git a/server/hosts/abstract_ssh.py b/server/hosts/abstract_ssh.py
index 91722b4..39af9f9 100644
--- a/server/hosts/abstract_ssh.py
+++ b/server/hosts/abstract_ssh.py
@@ -738,5 +738,3 @@
shutil.rmtree(local_dest_dir, ignore_errors=ignore_errors)
if not ignore_errors:
raise
-
-
diff --git a/server/hosts/cros_host.py b/server/hosts/cros_host.py
index 315caf6..f6b25c6 100644
--- a/server/hosts/cros_host.py
+++ b/server/hosts/cros_host.py
@@ -22,9 +22,12 @@
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.graphite import stats
-from autotest_lib.client.cros import constants
+from autotest_lib.client.cros import constants as client_constants
from autotest_lib.client.cros import cros_ui
-from autotest_lib.server import autoserv_parser, autotest
+from autotest_lib.server import autoserv_parser
+from autotest_lib.server import autotest
+from autotest_lib.server import constants
+from autotest_lib.server import crashcollect
from autotest_lib.server import utils as server_utils
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
@@ -170,6 +173,23 @@
_FW_IMAGE_URL_PATTERN = global_config.global_config.get_config_value(
'CROS', 'firmware_url_pattern', type=str)
+ # File that has a list of directories to be collected
+ _LOGS_TO_COLLECT_FILE = os.path.join(
+ common.client_dir, 'common_lib', 'logs_to_collect')
+
+    # Prefix for log messages related to crash collection
+ _CRASHLOGS_PREFIX = 'collect_crashlogs'
+
+    # Timeout in seconds for the host up/down check
+ _CHECK_HOST_UP_TIMEOUT_SECS = 15
+
+    # A command that touches the kernel and hardware (e.g., rm, mkdir)
+    # may not have fully propagated to the hardware if the machine is
+    # powered down right after the command returns.
+    # Wait a few seconds to let such commands finish. Fingers crossed.
+ _SAFE_WAIT_SECS = 10
+
+
@staticmethod
def check_host(host, timeout=10):
"""
@@ -816,8 +836,8 @@
def show_update_engine_log(self):
"""Output update engine log."""
- logging.debug('Dumping %s', constants.UPDATE_ENGINE_LOG)
- self.run('cat %s' % constants.UPDATE_ENGINE_LOG)
+ logging.debug('Dumping %s', client_constants.UPDATE_ENGINE_LOG)
+ self.run('cat %s' % client_constants.UPDATE_ENGINE_LOG)
def _get_board_from_afe(self):
@@ -931,7 +951,9 @@
timer = stats.Timer(install_timer_key)
timer.start()
logging.info('Installing image through chromeos-install.')
- self.run('chromeos-install --yes', timeout=install_timeout)
+ self.run('chromeos-install --yes --lab_preserve_logs=%s' %
+ self._LOGS_TO_COLLECT_FILE,
+ timeout=install_timeout)
timer.stop()
logging.info('Power cycling DUT through servo.')
@@ -1027,6 +1049,14 @@
failed_cycles)
+ def _reboot_repair(self):
+ """SSH to this host and reboot."""
+ if not self.is_up(self._CHECK_HOST_UP_TIMEOUT_SECS):
+ raise error.AutoservRepairMethodNA('DUT unreachable for reboot.')
+ logging.info('Attempting repair via SSH reboot.')
+ self.reboot(timeout=self.BOOT_TIMEOUT, wait=True)
+
+
def check_device(self):
"""Check if a device is ssh-able, and if so, clean and verify it.
@@ -1057,15 +1087,17 @@
the status using `self.check_device()` after each of them. This is done
         until both the repair and the verifying step succeed.
- Escalation order of repair procedures:
- 1. Try to re-install to a known stable image using
- auto-update.
+        Escalation order of repair procedures, from least intrusive to
+        most intrusive:
+ 1. SSH to the DUT and reboot.
2. If there's a servo for the DUT, try to power the DUT off and
on.
- 3. If there's a servo for the DUT, try to re-install via
- the servo.
- 4. If the DUT can be power-cycled via RPM, try to repair
+ 3. If the DUT can be power-cycled via RPM, try to repair
by power-cycling.
+ 4. Try to re-install to a known stable image using
+ auto-update.
+ 5. If there's a servo for the DUT, try to re-install via
+ the servo.
As with the parent method, the last operation performed on
the DUT must be to call `self.check_device()`; If that call fails the
@@ -1088,19 +1120,23 @@
logging.error('Could not create a healthy servo: %s', e)
self.servo = self._servo_host.get_servo()
+ self.try_collect_crashlogs()
+
# TODO(scottz): This should use something similar to label_decorator,
# but needs to be populated in order so DUTs are repaired with the
# least amount of effort.
- repair_funcs = [self._servo_repair_power,
+ repair_funcs = [self._reboot_repair,
+ self._servo_repair_power,
+ self._powercycle_to_repair,
self._install_repair,
self._install_repair_with_powerwash,
- self._servo_repair_reinstall,
- self._powercycle_to_repair]
+ self._servo_repair_reinstall]
errors = []
board = self._get_board_from_afe()
for repair_func in repair_funcs:
try:
repair_func()
+ self.try_collect_crashlogs()
self.check_device()
stats.Counter(
'%s.SUCCEEDED' % repair_func.__name__).increment()
@@ -1137,6 +1173,142 @@
'\n'.join(errors))
+ def try_collect_crashlogs(self, check_host_up=True):
+ """
+        Check whether the host is up and crash-logs need to be collected
+        from it; if so, collect them.
+
+        @param check_host_up: Whether to verify that the host is up before
+                collecting. Default is True.
+ """
+ try:
+ crash_job = self._need_crash_logs()
+ if crash_job:
+                logging.debug('%s: Job %s crashed',
+                              self._CRASHLOGS_PREFIX, crash_job)
+ if not check_host_up or self.is_up(
+ self._CHECK_HOST_UP_TIMEOUT_SECS):
+ self._collect_crashlogs(crash_job)
+ logging.debug('%s: Completed collecting logs for the '
+ 'crashed job %s', self._CRASHLOGS_PREFIX,
+ crash_job)
+ except Exception as e:
+            # An exception here should not cause the repair to fail,
+            # so suppress all exceptions.
+ logging.error('%s: Failed while trying to collect crash-logs: %s',
+ self._CRASHLOGS_PREFIX, e)
+
+
+ def _need_crash_logs(self):
+ """Get the value of need_crash_logs attribute of this host.
+
+        @return: The value string of the need_crash_logs attribute, or
+                 None if the attribute is not set.
+ """
+ attrs = self._AFE.get_host_attribute(constants.CRASHLOGS_HOST_ATTRIBUTE,
+ hostname=self.hostname)
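+        # A host is expected to carry at most one need_crash_logs attribute.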
+ assert len(attrs) < 2
+ return attrs[0].value if attrs else None
+
+
+ def _collect_crashlogs(self, job_id):
+ """Grab logs from the host where a job was crashed.
+
+ First, check if PRIOR_LOGS_DIR exists in the host.
+ If yes, collect them.
+ Otherwise, check if a lab-machine marker (_LAB_MACHINE_FILE) exists
+ in the host.
+ If yes, the host was repaired automatically, and we collect normal
+ system logs.
+
+ @param job_id: Id of the job that was crashed.
+ """
+ crashlogs_dir = crashcollect.get_crashinfo_dir(self,
+ constants.CRASHLOGS_DEST_DIR_PREFIX)
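+        # Track whether prior logs were found so they can be removed from
+        # the DUT after collection.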
+ flag_prior_logs = False
+
+ if self.path_exists(client_constants.PRIOR_LOGS_DIR):
+ flag_prior_logs = True
+ self._collect_prior_logs(crashlogs_dir)
+ elif self.path_exists(self._LAB_MACHINE_FILE):
+ self._collect_system_logs(crashlogs_dir)
+ else:
+            logging.warning('%s: Host was manually re-installed without '
+                            'the --lab_preserve_logs option. Skipping '
+                            'crash-log collection.', self._CRASHLOGS_PREFIX)
+
+        # Crash collection is a one-time effort.
+        # _collect_prior_logs() and _collect_system_logs() do not raise
+        # exceptions, so the code below runs even when those methods fail.
+        # _collect_crashlogs() is called only when the host is up (see
+        # try_collect_crashlogs()), and we assume _collect_prior_logs() and
+        # _collect_system_logs() rarely fail while the host is up.
+        # In addition, it is not clear how many times crash collection
+        # should be retried without unnecessarily triggering the next
+        # repair. Therefore, we attempt crash collection only once.
+
+ # Create a marker file as soon as log collection is done.
+        # Write the job id into the marker for gs_offloader to consume.
+ marker_file = os.path.join(crashlogs_dir, constants.CRASHLOGS_MARKER)
+ with open(marker_file, 'a') as f:
+ f.write('%s\n' % job_id)
+
+ # Remove need_crash_logs attribute
+ logging.debug('%s: Remove attribute need_crash_logs from host %s',
+ self._CRASHLOGS_PREFIX, self.hostname)
+ self._AFE.set_host_attribute(constants.CRASHLOGS_HOST_ATTRIBUTE,
+ None, hostname=self.hostname)
+
+ if flag_prior_logs:
+ logging.debug('%s: Remove %s from host %s', self._CRASHLOGS_PREFIX,
+ client_constants.PRIOR_LOGS_DIR, self.hostname)
+ self.run('rm -rf %s; sync' % client_constants.PRIOR_LOGS_DIR)
+            # Wait a few seconds to make sure the previous command has
+            # been flushed all the way to storage.
+ time.sleep(self._SAFE_WAIT_SECS)
+
+
+ def _collect_prior_logs(self, crashlogs_dir):
+ """Grab prior logs that were stashed before re-installing a host.
+
+ @param crashlogs_dir: Directory path where crash-logs are stored.
+ """
+ logging.debug('%s: Found %s, collecting them...',
+ self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR)
+ try:
+ self.collect_logs(client_constants.PRIOR_LOGS_DIR,
+ crashlogs_dir, False)
+ logging.debug('%s: %s is collected',
+ self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR)
+ except Exception as e:
+ logging.error('%s: Failed to collect %s: %s',
+ self._CRASHLOGS_PREFIX, client_constants.PRIOR_LOGS_DIR,
+ e)
+
+
+ def _collect_system_logs(self, crashlogs_dir):
+ """Grab normal system logs from a host.
+
+ @param crashlogs_dir: Directory path where crash-logs are stored.
+ """
+ logging.debug('%s: Found %s, collecting system logs...',
+ self._CRASHLOGS_PREFIX, self._LAB_MACHINE_FILE)
+ sources = server_utils.parse_simple_config(self._LOGS_TO_COLLECT_FILE)
+ for src in sources:
+ try:
+ if self.path_exists(src):
+ logging.debug('%s: Collecting %s...',
+ self._CRASHLOGS_PREFIX, src)
+ dest = server_utils.concat_path_except_last(
+ crashlogs_dir, src)
+ self.collect_logs(src, dest, False)
+ logging.debug('%s: %s is collected',
+ self._CRASHLOGS_PREFIX, src)
+ except Exception as e:
+ logging.error('%s: Failed to collect %s: %s',
+ self._CRASHLOGS_PREFIX, src, e)
+
+
def close(self):
self.rpc_disconnect_all()
super(CrosHost, self).close()
@@ -1194,7 +1366,7 @@
def cleanup(self):
- self.run('rm -f %s' % constants.CLEANUP_LOGS_PAUSED_FILE)
+ self.run('rm -f %s' % client_constants.CLEANUP_LOGS_PAUSED_FILE)
try:
self._restart_ui()
except (error.AutotestRunError, error.AutoservRunError,
@@ -1271,6 +1443,12 @@
4. update_engine answers a simple status request over DBus.
"""
+        # Check whether a job crashed on this host.
+        # If so, skip verification until crash-logs have been collected.
+ if self._need_crash_logs():
+ raise error.AutoservCrashLogCollectRequired(
+ 'Need to collect crash-logs before verification')
+
super(CrosHost, self).verify_software()
self.check_diskspace(
'/mnt/stateful_partition',
@@ -1935,7 +2113,7 @@
of "chrome --version" and the milestone will be the empty string.
"""
- version_string = self.run(constants.CHROME_VERSION_COMMAND).stdout
+ version_string = self.run(client_constants.CHROME_VERSION_COMMAND).stdout
return utils.parse_chrome_version(version_string)
@label_decorator()
diff --git a/server/site_utils.py b/server/site_utils.py
index b0fa276..7d95215 100644
--- a/server/site_utils.py
+++ b/server/site_utils.py
@@ -6,6 +6,7 @@
import httplib
import json
import logging
+import os
import random
import re
import time
@@ -309,3 +310,44 @@
test_views[view['test_name']] = view['status']
return test_views
+
+
+def parse_simple_config(config_file):
+ """Get paths by parsing a simple config file.
+
+    Each line of the config file is the path of a file or directory.
+    Empty lines and lines starting with a hash character ('#') are ignored.
+ One example of this kind of simple config file is
+ client/common_lib/logs_to_collect.
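+
+    An illustrative example of such a file (the actual contents of
+    logs_to_collect may differ):
+        # directories to preserve across re-install
+        /var/log
+        /var/spool/crash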
+
+ @param config_file: Config file path
+    @return: A list of path strings
+ """
+    dirs = []
+    with open(config_file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                dirs.append(line)
+    return dirs
+
+
+def concat_path_except_last(base, sub):
+ """Concatenate two paths but exclude last entry.
+
+    Take two paths and return a path string in which the second path
+    is placed under the first path, with the last entry of the second
+    path removed.
+    This works even when both paths are absolute.
+
+ e.g., /usr/local/autotest/results/ + /var/log/ =
+ /usr/local/autotest/results/var
+
+ e.g., /usr/local/autotest/results/ + /var/log/syslog =
+ /usr/local/autotest/results/var/log
+
+ @param base: Beginning path
+ @param sub: The path that is concatenated to base
+ @return: Concatenated path string
+ """
+ dirname = os.path.dirname(sub.rstrip('/'))
+ return os.path.join(base, dirname.strip('/'))