Reduce timeout conditions for Chrome OS for efficiency.

Many of the Autotest timeouts are far too long for Chrome OS. Where a
normal machine might take 10+ minutes to recover, Chromebooks take only
seconds. As such, I've tweaked many of the boot and crash timeouts to
reflect this.

When a host goes down, the scheduler idles the job for four hours while
waiting for crash collection, causing two main problems:

1. Metahost jobs take four hours to requeue upon verify failure.
2. Results are delayed four hours when a host dies during a job.

Additional issues fixed:

1. The timeout after reboot was effectively doubled in the default Host
implementation of wait_for_reboot. The fix works for us, but we will need
to check with upstream to ensure it works for everyone.
2. Unified timeouts across autotest.py and base_classes.py so boot
timeouts exist in only one place. This should be okay since all hosts
extend Host in base_classes.py. Timings are now configurable in
global_config.
3. Added global SSH options to ensure better timings. To do this, it
was necessary to add site extensibility to abstract_ssh.

Changes to crashcollect.py, autotest.py, abstract_ssh.py, and
base_classes.py will be upstreamed.

BUG=none
TEST=run_remote_tests. Will test in production as well.

Change-Id: I92964b140337e4ab919288b5190bb2e6d5a4fa2d
Reviewed-on: http://gerrit.chromium.org/gerrit/2269
Reviewed-by: Paul Pendlebury <pauldean@chromium.org>
Reviewed-by: Scott Zawalski <scottz@chromium.org>
Tested-by: Dale Curtis <dalecurtis@chromium.org>
diff --git a/server/autotest.py b/server/autotest.py
index 8591efd..726d043 100644
--- a/server/autotest.py
+++ b/server/autotest.py
@@ -7,14 +7,9 @@
 from autotest_lib.client.common_lib import global_config, packages
 from autotest_lib.client.common_lib import utils as client_utils
 
-AUTOTEST_SVN  = 'svn://test.kernel.org/autotest/trunk/client'
+AUTOTEST_SVN = 'svn://test.kernel.org/autotest/trunk/client'
 AUTOTEST_HTTP = 'http://test.kernel.org/svn/autotest/trunk/client'
 
-# Timeouts for powering down and up respectively
-HALT_TIME = 300
-BOOT_TIME = 1800
-CRASH_RECOVERY_TIME = 9000
-
 
 get_value = global_config.global_config.get_config_value
 autoserv_prebuild = get_value('AUTOSERV', 'enable_server_prebuild',
@@ -37,7 +32,7 @@
     implement the unimplemented methods in parent classes.
     """
 
-    def __init__(self, host = None):
+    def __init__(self, host=None):
         self.host = host
         self.got = False
         self.installed = False
@@ -223,7 +218,7 @@
             except (error.PackageInstallError, error.AutoservRunError,
                     global_config.ConfigError), e:
                 logging.info("Could not install autotest using the packaging "
-                             "system: %s. Trying other methods",  e)
+                             "system: %s. Trying other methods", e)
 
         # try to install from file or directory
         if self.source_material:
@@ -272,7 +267,7 @@
         self.installed = False
 
 
-    def get(self, location = None):
+    def get(self, location=None):
         if not location:
             location = os.path.join(self.serverdir, '../client')
             location = os.path.abspath(location)
@@ -290,7 +285,7 @@
 
     def run(self, control_file, results_dir='.', host=None, timeout=None,
             tag=None, parallel_flag=False, background=False,
-            client_disconnect_timeout=1800):
+            client_disconnect_timeout=None):
         """
         Run an autotest job on the remote machine.
 
@@ -307,7 +302,8 @@
                 a background job; the code calling run will be responsible
                 for monitoring the client and collecting the results.
         @param client_disconnect_timeout: Seconds to wait for the remote host
-                to come back after a reboot.  [default: 30 minutes]
+                to come back after a reboot. Defaults to the host setting for
+                DEFAULT_REBOOT_TIMEOUT.
 
         @raises AutotestRunError: If there is a problem executing
                 the control file.
@@ -315,6 +311,9 @@
         host = self._get_host_and_setup(host)
         results_dir = os.path.abspath(results_dir)
 
+        if client_disconnect_timeout is None:
+            client_disconnect_timeout = host.DEFAULT_REBOOT_TIMEOUT
+
         if tag:
             results_dir = os.path.join(results_dir, tag)
 
@@ -700,12 +699,13 @@
     def _wait_for_reboot(self, old_boot_id):
         logging.info("Client is rebooting")
         logging.info("Waiting for client to halt")
-        if not self.host.wait_down(HALT_TIME, old_boot_id=old_boot_id):
+        if not self.host.wait_down(self.host.WAIT_DOWN_REBOOT_TIMEOUT,
+                                   old_boot_id=old_boot_id):
             err = "%s failed to shutdown after %d"
-            err %= (self.host.hostname, HALT_TIME)
+            err %= (self.host.hostname, self.host.WAIT_DOWN_REBOOT_TIMEOUT)
             raise error.AutotestRunError(err)
         logging.info("Client down, waiting for restart")
-        if not self.host.wait_up(BOOT_TIME):
+        if not self.host.wait_up(self.host.DEFAULT_REBOOT_TIMEOUT):
             # since reboot failed
             # hardreset the machine once if possible
             # before failing this control file
@@ -719,7 +719,8 @@
                 warning %= self.host.hostname
                 logging.warning(warning)
             raise error.AutotestRunError("%s failed to boot after %ds" %
-                                         (self.host.hostname, BOOT_TIME))
+                                         (self.host.hostname,
+                                          self.host.DEFAULT_REBOOT_TIMEOUT))
         self.host.reboot_followup()
 
 
@@ -765,7 +766,7 @@
                 self.log_unexpected_abort(logger)
 
                 # give the client machine a chance to recover from a crash
-                self.host.wait_up(CRASH_RECOVERY_TIME)
+                self.host.wait_up(self.host.HOURS_TO_WAIT_FOR_RECOVERY * 3600)
                 msg = ("Aborting - unexpected final status message from "
                        "client on %s: %s\n") % (self.host.hostname, last)
                 raise error.AutotestRunError(msg)
diff --git a/server/crashcollect.py b/server/crashcollect.py
index 6e1de7f..dc582ac 100644
--- a/server/crashcollect.py
+++ b/server/crashcollect.py
@@ -1,5 +1,6 @@
 import os, time, pickle, logging, shutil
 
+from autotest_lib.client.common_lib import global_config
 from autotest_lib.server import utils
 
 
@@ -33,7 +34,12 @@
         collect_uncollected_logs(host)
 
 
-def wait_for_machine_to_recover(host, hours_to_wait=4.0):
+# Load default for number of hours to wait before giving up on crash collection.
+HOURS_TO_WAIT = global_config.global_config.get_config_value(
+    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
+
+
+def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
     """Wait for a machine (possibly down) to become accessible again.
 
     @param host: A RemoteHost instance to wait on
diff --git a/server/hosts/abstract_ssh.py b/server/hosts/abstract_ssh.py
index 3723c46..0f61391 100644
--- a/server/hosts/abstract_ssh.py
+++ b/server/hosts/abstract_ssh.py
@@ -10,8 +10,8 @@
                               default=False)
 
 
-def make_ssh_command(user="root", port=22, opts='', hosts_file='/dev/null',
-                     connect_timeout=30, alive_interval=300):
+def _make_ssh_cmd_default(user="root", port=22, opts='', hosts_file='/dev/null',
+                          connect_timeout=30, alive_interval=300):
     base_command = ("/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no "
                     "-o UserKnownHostsFile=%s -o BatchMode=yes "
                     "-o ConnectTimeout=%d -o ServerAliveInterval=%d "
@@ -22,6 +22,11 @@
                            alive_interval, user, port)
 
 
+make_ssh_command = utils.import_site_function(
+    __file__, "autotest_lib.server.hosts.site_host", "make_ssh_command",
+    _make_ssh_cmd_default)
+
+
 # import site specific Host class
 SiteHost = utils.import_site_class(
     __file__, "autotest_lib.server.hosts.site_host", "SiteHost",
diff --git a/server/hosts/site_host.py b/server/hosts/site_host.py
new file mode 100644
index 0000000..afe6342
--- /dev/null
+++ b/server/hosts/site_host.py
@@ -0,0 +1,22 @@
+def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
+                     connect_timeout=None, alive_interval=None):
+    """Override default make_ssh_command to use options tuned for Chrome OS.
+
+    Tuning changes:
+      - ConnectTimeout=10; maximum of 10 seconds allowed for an SSH connection
+      failure.
+
+      - ServerAliveInterval=60; which causes SSH to ping connection every
+      60 seconds. In conjunction with ServerAliveCountMax ensures that if the
+      connection dies, Autotest will bail out quickly.
+
+      - ServerAliveCountMax=1; only allow a single keep alive failure.
+
+      - UserKnownHostsFile=/dev/null; we don't care about the keys. Host keys
+      change with every new installation, don't waste memory/space saving them.
+    """
+    base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
+                    ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
+                    ' -o ConnectTimeout=10 -o ServerAliveInterval=60'
+                    ' -o ServerAliveCountMax=1 -l %s -p %d')
+    return base_command % (opts, user, port)