Add new test that has shorter reboot/down timeouts and increase others.

This CL adds platform_RebootAfterUpdate which specifically tests shorter
reboot timeouts. I originally planned on setting down_timeout and
reboot_timeout and passing these to reboot but it seemed to be actually a
worse solution for 2 reasons:

1) Since these are client errors, they are reinterpreted as AutoservRunErrors.
I'd literally have to parse the ssh commands to figure whether it was associated
with either not shutting down in time or rebooting in time.
2) Even with (1) wait_for_reboot in client/common_lib/hosts/base_classes.py
waits using the down_timeout + reboot_timeout -- sort of. It doesn't subtract
the time used waiting for wait_down from the total time in wait_up. This is
pretty bad logic and isn't really that useful to set a good timeout.

Given these, it doesn't make much sense to separate down/reboot errors (which I
originally planned to do). So all I could do was sort of pass this one aggregate
value (which wait_for_reboot doesn't allow). So given all these, I decided that
allowing the long timeouts and measuring the time was the best solution for
our problem.

BUG=chromium:242384
TEST=Ran it -- manually changed time outs to see failures correctly.

Change-Id: I52103c8f835cea326bf7efbeb126d9f145988a43
Reviewed-on: https://gerrit.chromium.org/gerrit/56348
Reviewed-by: Chris Sosa <sosa@chromium.org>
Tested-by: Chris Sosa <sosa@chromium.org>
Commit-Queue: Chris Sosa <sosa@chromium.org>
diff --git a/client/common_lib/cros/dev_server.py b/client/common_lib/cros/dev_server.py
index 9fce7f4..f0b2613 100644
--- a/client/common_lib/cros/dev_server.py
+++ b/client/common_lib/cros/dev_server.py
@@ -401,6 +401,16 @@
                                      "HTTP OK not accompanied by 'Success'." %
                                      image)
 
+    def get_update_url(self, image):
+        """Returns the url that should be passed to the updater.
+
+        @param image: the image that was fetched.
+        """
+        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
+                                              type=str)
+        return (url_pattern % (self.url(), image))
+
+
     def _get_image_url(self, image):
         """Returns the url of the directory for this image on the devserver.
 
diff --git a/client/site_tests/login_LoginSuccess/control b/client/site_tests/login_LoginSuccess/control
index b379be5..d418746 100644
--- a/client/site_tests/login_LoginSuccess/control
+++ b/client/site_tests/login_LoginSuccess/control
@@ -13,8 +13,8 @@
 TEST_TYPE = "client"
 
 DOC = """
-This is a helper test that will login a system through the login manager.
+This is a helper test that will login a system through the login manager. Sets
+client_success to True on success so callers can verify it ran correctly.
 """
 
 job.run_test('login_LoginSuccess')
-
diff --git a/client/site_tests/login_LoginSuccess/login_LoginSuccess.py b/client/site_tests/login_LoginSuccess/login_LoginSuccess.py
index 2b46ccb..f7e67c7 100644
--- a/client/site_tests/login_LoginSuccess/login_LoginSuccess.py
+++ b/client/site_tests/login_LoginSuccess/login_LoginSuccess.py
@@ -47,7 +47,8 @@
 
 
     def run_once(self):
-        pass
+        self.job.set_state('client_success', True)
+
 
 
     def cleanup(self):
diff --git a/global_config.ini b/global_config.ini
index d674484..a0df2aa 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -107,9 +107,9 @@
 wait_up_processes:
 default_protection: NO_PROTECTION
 # Time in seconds to wait for a machine to come back after reboot.
-default_reboot_timeout: 120
+default_reboot_timeout: 240
 # Time in seconds to wait for a machine to go down prior to reboot.
-wait_down_reboot_timeout: 30
+wait_down_reboot_timeout: 120
 # Time in seconds to wait before generating a warning if a host has not gone
 # down for reboot.
 wait_down_reboot_warning: 30
diff --git a/server/cros/dynamic_suite/tools.py b/server/cros/dynamic_suite/tools.py
index f106b93..52d82d2 100644
--- a/server/cros/dynamic_suite/tools.py
+++ b/server/cros/dynamic_suite/tools.py
@@ -4,6 +4,7 @@
 
 
 import random
+import re
 
 from autotest_lib.client.common_lib import global_config
 
@@ -12,26 +13,32 @@
 
 
 def image_url_pattern():
+    """Returns image_url_pattern from global_config."""
     return _CONFIG.get_config_value('CROS', 'image_url_pattern', type=str)
 
 
 def firmware_url_pattern():
+    """Returns firmware_url_pattern from global_config."""
     return _CONFIG.get_config_value('CROS', 'firmware_url_pattern', type=str)
 
 
 def sharding_factor():
+    """Returns sharding_factor from global_config."""
     return _CONFIG.get_config_value('CROS', 'sharding_factor', type=int)
 
 
 def infrastructure_user():
+    """Returns infrastructure_user from global_config."""
     return _CONFIG.get_config_value('CROS', 'infrastructure_user', type=str)
 
 
 def package_url_pattern():
+    """Returns package_url_pattern from global_config."""
     return _CONFIG.get_config_value('CROS', 'package_url_pattern', type=str)
 
 
 def try_job_timeout_mins():
+    """Returns try_job_timeout_mins from global_config."""
     return _CONFIG.get_config_value('SCHEDULER', 'try_job_timeout_mins',
                                     type=int, default=4*60)
 
@@ -47,6 +54,18 @@
     return package_url_pattern() % (devserver_url, build)
 
 
+def get_devserver_build_from_package_url(package_url):
+    """The inverse method of get_package_url.
+
+    @param package_url: a string specifying the package url.
+
+    @return tuple containing the devserver_url, build.
+    """
+    pattern = package_url_pattern()
+    re_pattern = pattern.replace('%s', '(\S+)')
+    return re.search(re_pattern, package_url).groups()
+
+
 def get_random_best_host(afe, host_list, require_usable_hosts=True):
     """
     Randomly choose the 'best' host from host_list, using fresh status.
diff --git a/server/hosts/site_host.py b/server/hosts/site_host.py
index 0888ab9..7c851ee 100644
--- a/server/hosts/site_host.py
+++ b/server/hosts/site_host.py
@@ -146,22 +146,18 @@
     #   including the 30 second dev-mode delay and time to start the
     #   network.
     # SHUTDOWN_TIMEOUT: Time to allow for shut down.
-    # REBOOT_TIMEOUT: Combination of shutdown and reboot times.
-    # _UPDATE_REBOOT_TIMEOUT: Time to allow for reboot after AU; this
-    #   time provides no allowance for the 30 second dev-mode delay,
-    #   but is deliberately generous to avoid try-job failures.
+    # REBOOT_TIMEOUT: How long to wait for a reboot.
     # _INSTALL_TIMEOUT: Time to allow for chromeos-install.
 
     SLEEP_TIMEOUT = 2
     RESUME_TIMEOUT = 10
     BOOT_TIMEOUT = 45
     USB_BOOT_TIMEOUT = 150
-    SHUTDOWN_TIMEOUT = 5
-    REBOOT_TIMEOUT = SHUTDOWN_TIMEOUT + BOOT_TIMEOUT
-    # TODO(jrbarnette) - temporarily set this value to 2 min to allow
-    # for http://crbug.com/224871.  Reset to 1 minute once that bug
-    # is fixed.
-    _UPDATE_REBOOT_TIMEOUT = 120
+
+    # We have a long timeout to ensure we don't flakily fail due to other
+    # issues. Shorter timeouts are vetted in platform_RebootAfterUpdate.
+    REBOOT_TIMEOUT = 300
+
     _INSTALL_TIMEOUT = 240
 
     # _USB_POWER_TIMEOUT: Time to allow for USB to power toggle ON and OFF.
@@ -290,6 +286,21 @@
         return self._AFE.get_hosts(hostname=self.hostname)
 
 
+    def lookup_job_repo_url(self):
+        """Looks up the job_repo_url for the host.
+
+        @returns job_repo_url from AFE or None if not found.
+
+        @raises KeyError if the host does not have a job_repo_url
+        """
+        if not self._host_in_AFE():
+            return None
+
+        hosts = self._AFE.get_hosts(hostname=self.hostname)
+        if hosts and constants.JOB_REPO_URL in hosts[0].attributes:
+            return hosts[0].attributes[constants.JOB_REPO_URL]
+
+
     def clear_cros_version_labels_and_job_repo_url(self):
         """Clear cros_version labels and host attribute job_repo_url."""
         if not self._host_in_AFE():
@@ -303,7 +314,7 @@
         for label in labels:
             label.remove_hosts(hosts=host_list)
 
-        self._AFE.set_host_attribute('job_repo_url', None,
+        self._AFE.set_host_attribute(constants.JOB_REPO_URL, None,
                                      hostname=self.hostname)
 
 
@@ -327,7 +338,7 @@
 
         label.add_hosts([self.hostname])
         repo_url = tools.get_package_url(devserver_url, image_name)
-        self._AFE.set_host_attribute('job_repo_url', repo_url,
+        self._AFE.set_host_attribute(constants.JOB_REPO_URL, repo_url,
                                      hostname=self.hostname)
 
 
@@ -366,7 +377,7 @@
             return False
 
         # Reboot to complete stateful update.
-        self.reboot(timeout=self._UPDATE_REBOOT_TIMEOUT, wait=True)
+        self.reboot(timeout=self.REBOOT_TIMEOUT, wait=True)
         check_file_cmd = 'test -f %s; echo $?'
         for folder in folders_to_check:
             test_file_path = os.path.join(folder, test_file)
@@ -510,7 +521,7 @@
         if repair:
             # In case the system is in a bad state, we always reboot the machine
             # before machine_install.
-            self.reboot(timeout=self._UPDATE_REBOOT_TIMEOUT, wait=True)
+            self.reboot(timeout=self.REBOOT_TIMEOUT, wait=True)
             self.run('stop update-engine; start update-engine')
             force_update = True
 
@@ -523,8 +534,8 @@
         # first. Stateful update does not update kernel and tends to run much
         # faster than a full reimage.
         try:
-            updated = self._try_stateful_update(update_url, force_update,
-                                                updater)
+            updated = self._try_stateful_update(
+                    update_url, force_update, updater)
             if updated:
                 logging.info('DUT is updated with stateful update.')
         except Exception as e:
@@ -536,7 +547,7 @@
         if not updated:
             # In case the system is in a bad state, we always reboot the
             # machine before machine_install.
-            self.reboot(timeout=self._UPDATE_REBOOT_TIMEOUT, wait=True)
+            self.reboot(timeout=self.REBOOT_TIMEOUT, wait=True)
 
             # TODO(sosa): Remove temporary hack to get rid of bricked machines
             # that can't update due to a corrupted policy.
@@ -562,8 +573,7 @@
                 logging.info('Dumping %s', update_engine_log)
                 self.run('cat %s' % update_engine_log)
                 # Updater has returned successfully; reboot the host.
-                self.reboot(timeout=self._UPDATE_REBOOT_TIMEOUT,
-                            wait=True)
+                self.reboot(timeout=self.REBOOT_TIMEOUT, wait=True)
 
         if updated:
             self._post_update_processing(updater, inactive_kernel)
diff --git a/server/site_tests/platform_RebootAfterUpdate/control b/server/site_tests/platform_RebootAfterUpdate/control
new file mode 100644
index 0000000..7c5cee1
--- /dev/null
+++ b/server/site_tests/platform_RebootAfterUpdate/control
@@ -0,0 +1,46 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from autotest_lib.client.common_lib import utils
+
+AUTHOR = "sosa@chromium.org"
+NAME = "platform_RebootAfterUpdate"
+TIME = "SHORT"
+TEST_CATEGORY = "Benchmark"
+TEST_CLASS = "platform"
+TEST_TYPE = "server"
+
+DOC = """
+This test checks the behavior of reboot after update. It specifically tests
+the amount of time it takes for a system to reboot both after an update and
+after first login.
+
+Arg:
+  job_repo_url: repo url to use to find image to update from -- assumes caller
+                has staged image. By default if host[repo_job_url] is set, it'll
+                use that. This overrides that value. This value must follow
+                the package_url_pattern in the global config.
+
+To run locally:
+  1) Setup your devserver in your shadow config that your DUT can reach.
+  2) Stage the image you want for example:
+     http://localhost:8080/download?archive_url=\
+     gs://chromeos-image-archive/x86-alex/R29-4165.0.0
+  3) Run with run_remote_test etc passing
+     args="job_repo_url=http://localhost:8080/static/archive/\
+     x86-alex/R29-4165.0.0/autotest/packages"
+"""
+
+args_dict = utils.args_to_dict(args)
+job_repo_url = args_dict.get('job_repo_url')
+
+def run_test(machine):
+    """Execute a test configuration on a given machine."""
+    host = hosts.create_host(machine)
+    job.run_test("platform_RebootAfterUpdate",
+                 host=host, job_repo_url=job_repo_url)
+
+
+# Invoke parallel tests.
+parallel_simple(run_test, machines)
diff --git a/server/site_tests/platform_RebootAfterUpdate/platform_RebootAfterUpdate.py b/server/site_tests/platform_RebootAfterUpdate/platform_RebootAfterUpdate.py
new file mode 100755
index 0000000..d9ac8b3
--- /dev/null
+++ b/server/site_tests/platform_RebootAfterUpdate/platform_RebootAfterUpdate.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+import time
+
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.common_lib.cros import autoupdater, dev_server
+from autotest_lib.server import autotest, test
+from autotest_lib.server.cros.dynamic_suite import tools
+
+
+class platform_RebootAfterUpdate(test.test):
+    """Test that updates the machine, reboots, and logs in / logs out.
+
+    This test keeps a reboot timeout in order to keep the system boot times
+    regressing in performance. For example, if Chrome takes a long time to
+    shutdown / hangs, other tests have much longer timeouts (to prevent them
+    from being flaky) while this test targets these hangs. Note, this test only
+    has smaller timeouts for boot, not for login/logout. Also, these timeouts
+    are still fairly conservative and are only meant to catch large regressions
+    or hangs, not small regressions.
+
+    """
+    version = 1
+
+    _REBOOT_ERROR_MESSAGE = (
+            'System failed to restart within the timeout after '
+            '%(reason)s. This failure indicates that the system after '
+            'receiving a reboot request and restarting did not '
+            'reconnect via ssh within the timeout. Actual time %(actual)d '
+            'seconds vs expected time: %(expected)d seconds')
+
+    # Timeouts specific to this test. These should be as low as possible.
+
+    # Total amount of time to wait for a reboot to return.
+    _REBOOT_TIMEOUT = 60
+
+
+    @classmethod
+    def reboot_with_timeout(cls, host, reason):
+        """Reboots the device and checks to see if it completed within desired.
+
+        @param host: Autotest host object to reboot.
+        @param reason: string representing why we are rebooting e.g. autoupdate.
+
+        Raises:
+            error.TestFail: If it takes too long to reboot.
+        """
+        start_time = time.time()
+        host.reboot()
+        reboot_duration = time.time() - start_time
+        if reboot_duration > cls._REBOOT_TIMEOUT:
+            raise error.TestFail(
+                cls._REBOOT_ERROR_MESSAGE % dict(
+                        reason=reason, actual=reboot_duration,
+                        expected=cls._REBOOT_TIMEOUT))
+
+
+    def run_once(self, host, job_repo_url=None):
+        """Runs the test.
+
+        @param host: a host object representing the DUT
+        @param job_repo_url: URL to get the image.
+
+        @raise error.TestError if anything went wrong with setting up the test;
+               error.TestFail if any part of the test has failed.
+
+        """
+        # Get the job_repo_url -- if not present, attempt to use the one
+        # specified in the host attributes for the host.
+        if not job_repo_url:
+            try:
+                job_repo_url = host.lookup_job_repo_url()
+            except KeyError:
+                logging.fatal('Could not lookup job_repo_url from afe.')
+
+            if not job_repo_url:
+                raise error.TestError(
+                        'Test could not be run. Missing the url with which to '
+                        're-image the device!')
+
+        # Get the devserver url and build (image) from the repo url e.g.
+        # 'http://mydevserver:8080', 'x86-alex-release/R27-123.0.0'
+        ds, build = tools.get_devserver_build_from_package_url(job_repo_url)
+        devserver = dev_server.ImageServer(ds)
+
+        # We only need to update stateful to do this test.
+        updater = autoupdater.ChromiumOSUpdater(
+                devserver.get_update_url(build), host=host)
+        updater.update_stateful(clobber=True)
+
+        logging.info('Rebooting after performing update.')
+        self.reboot_with_timeout(host, 'update')
+
+        # TODO(sosa): Ideally we would be able to just use
+        # autotest.run_static_method to login/logout, however, this
+        # functionality is currently nested deep into the test logic. Once
+        # telemetry has replaced pyauto login and has been librarized, we
+        # should switch to using that code and not have to rely on running a
+        # client test to do what we want.
+        logging.info('Running sanity desktop login to see that we can '
+                     'login and logout after performing an update.')
+        client_at = autotest.Autotest(host)
+        self.job.set_state('client_success', False)
+        client_at.run_test('login_LoginSuccess')
+        if not self.job.get_state('client_success'):
+            raise error.TestFail(
+                    'Failed to login successfully after an update.')
+
+        logging.info('Rebooting the DUT after first login/logout.')
+        self.reboot_with_timeout(host, 'first login')