Add desktopui_CrashyReboot{Server} autotest

To address some GPU hanging problems, we added logic to reboot the
device if the UI seemed to crash too much, too often.
This server-side test validates this logic by deploying an
associated client-side test to the DUT that drives it into
the failure state.

CQ-DEPEND=CL:65243
BUG=chromium:261784
TEST=run this autotest, and it should succeed.
TEST=run this autotest and while it's running, echo $(date +%s) > /var/lib/ui/reboot-timestamps. The test should fail.

Change-Id: I68af57e0266e85434ad63e1f646e90a41a6d393d
Reviewed-on: https://gerrit.chromium.org/gerrit/65218
Tested-by: Chris Masone <cmasone@chromium.org>
Reviewed-by: Richard Barnette <jrbarnette@chromium.org>
Commit-Queue: Chris Masone <cmasone@chromium.org>
diff --git a/client/bin/site_utils.py b/client/bin/site_utils.py
index 88e2384..e79986c 100644
--- a/client/bin/site_utils.py
+++ b/client/bin/site_utils.py
@@ -117,6 +117,9 @@
     except Exception as e:
         logging.error(e)
         return
+    if pid is None:
+        raise error.AutoservPidAlreadyDeadError(
+            'No process matching %s.' % name)
     if with_prejudice:
         utils.nuke_pid(pid, [signal.SIGKILL])
     else:
diff --git a/client/cros/cros_ui.py b/client/cros/cros_ui.py
index 4226e6d..fa193a8 100644
--- a/client/cros/cros_ui.py
+++ b/client/cros/cros_ui.py
@@ -117,7 +117,7 @@
         True upon successfully stopping the UI and all chrome processes exiting.
         False otherwise.
     """
-    status = utils.system("stop ui", ignore_status=True)
+    status = stop(allow_fail=True)
     if status:
         logging.error('stop ui returned non-zero status: %s', status)
         return False
diff --git a/client/site_tests/desktopui_CrashyReboot/control b/client/site_tests/desktopui_CrashyReboot/control
new file mode 100644
index 0000000..edad2ac
--- /dev/null
+++ b/client/site_tests/desktopui_CrashyReboot/control
@@ -0,0 +1,28 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+AUTHOR = "cmasone, jrbarnette"
+NAME = "desktopui_CrashyReboot"
+TIME = "MEDIUM"
+TEST_TYPE = "client"
+
+DOC = """
+This test exercises the logic we put in place to reboot the device in the
+face of a too-crashy UI:
+
+If the UI crashes too much too fast, the device will eventually reboot
+to attempt to mitigate the problem. If the device determines that it's
+already tried that some number of times, it will shut down the UI and
+remain up.
+
+This test will crash the browser repeatedly and, in the case that it
+has not rebooted before, the test will cause the device to reboot. If
+it has rebooted within a certain window of time, the test will expect
+the device to stay up.
+
+This test cannot be used on its own, but is intended to be used in
+conjunction with the server-side desktopui_CrashyRebootServer test.
+"""
+
+job.run_test('desktopui_CrashyReboot')
diff --git a/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py b/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py
new file mode 100644
index 0000000..c4b27e7
--- /dev/null
+++ b/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+from autotest_lib.client.bin import test, utils
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.cros import constants, cros_ui
+
+
+class UIStopped(Exception):
+    """Raised when the UI seems to have stopped respawning."""
+
+
+class desktopui_CrashyReboot(test.test):
+    """Drive device to handle a too-crashy UI.
+
+    Run by desktopui_CrashyRebootServer.
+    """
+    version = 1
+
+    # NOTE(review): not referenced anywhere in this file -- confirm whether
+    # this constant is still needed.
+    UNREASONABLY_HIGH_RESPAWN_COUNT = 90
+
+
+    def _nuke_browser_with_prejudice_and_check_for_ui_stop(self):
+        """Nuke the browser with prejudice, check to see if the UI is down.
+
+        @return True if the UI is not up after the kill attempt.
+        """
+        try:
+            utils.nuke_process_by_name(constants.BROWSER, with_prejudice=True)
+        except error.AutoservPidAlreadyDeadError:
+            # Nothing left to kill; still report whether the UI is up.
+            pass
+        return not cros_ui.is_up()
+
+
+    def _nuke_browser_until_ui_goes_down(self):
+        """Nuke the browser continuously until it stops respawning.
+
+        @raises utils.TimeoutError if the ui doesn't stop respawning.
+        """
+        utils.poll_for_condition(
+            condition=self._nuke_browser_with_prejudice_and_check_for_ui_stop,
+            timeout=60,
+            desc='ui to stop respawning, or the device to reboot')
+
+
+    def run_once(self, expect_reboot=False):
+        """Crash the browser repeatedly until the UI stops respawning.
+
+        @param expect_reboot: True if the device is expected to reboot
+                              rather than stop respawning the UI.
+        @raises error.TestFail if the UI never goes down, or if it stops
+                respawning when a reboot was expected instead.
+        """
+        # Ensure the UI is running.
+        logging.debug('Restarting UI to ensure that it\'s running.')
+        cros_ui.stop(allow_fail=True)
+        cros_ui.start(wait_for_login_prompt=True)
+
+        # Since there is no 100% reliable way to determine that the
+        # browser process we're interested in is gone, we need to use
+        # a polling interval to continuously send KILL signals. This
+        # puts the test code in an unavoidable race with the UI
+        # respawning logic being tested. If the UI is down at the
+        # instant we check, it could mean that the UI is done
+        # respawning, the UI is about to respawn, or the device could
+        # already be rebooting. In all likelihood, the UI is coming
+        # back and we'll need to kill it all over again. This is why
+        # the code below polls the UI status for a number of seconds:
+        # to be more confident that the UI went down and is staying down.
+        try:
+            while True:
+                utils.poll_for_condition(condition=cros_ui.is_up,
+                                         timeout=5,
+                                         exception=UIStopped('As expected'))
+                self._nuke_browser_until_ui_goes_down()
+        except UIStopped:
+            # The UI did not come back within the polling window; we're done.
+            pass
+        except utils.TimeoutError as te:
+            raise error.TestFail(te)
+
+        if expect_reboot:
+            raise error.TestFail('UI stopped respawning instead of rebooting.')
+
+
+    def cleanup(self):
+        """Bring the UI back up after the test."""
+        # If the UI is already up, we want to tolerate that.
+        cros_ui.start(allow_fail=True)
diff --git a/client/site_tests/desktopui_Respawn/desktopui_Respawn.py b/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
index c7b47f2..c8ae23b 100644
--- a/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
+++ b/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
@@ -11,7 +11,8 @@
     """Validate that the UI will cease respawning after a certain number of
        attempts in a time window. By design, this test does _not_ attempt to
        ensure that these values remain the same over time. The values are
-       somewhat arbitrary anyhow, so enforcing them is simply an over-constraint.
+       somewhat arbitrary anyhow, so enforcing them is simply an
+       over-constraint.
     """
     version = 1
 
diff --git a/server/site_tests/desktopui_CrashyRebootServer/control b/server/site_tests/desktopui_CrashyRebootServer/control
new file mode 100644
index 0000000..1f5de44
--- /dev/null
+++ b/server/site_tests/desktopui_CrashyRebootServer/control
@@ -0,0 +1,32 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# NOTE(review): NAME lacks the "desktopui_" prefix used by the test class and
+# its directory -- confirm this is intentional.
+NAME = "CrashyRebootServer"
+AUTHOR = "cmasone, jrbarnette"
+SUITE = "gpu_hang"
+TIME = "MEDIUM"
+TEST_TYPE = "server"
+EXPERIMENTAL = "True"
+
+DOC = """
+This test verifies that the logic we put in place to reboot the device in the
+face of a too-crashy UI functions as intended:
+
+If the UI crashes too much too fast, the device will eventually reboot
+to attempt to mitigate the problem. If the device determines that it's
+already tried that some number of times, it will shut down the UI and
+remain up.
+
+This test deploys the client test desktopui_CrashyReboot in order to drive the
+device into the desired states.
+"""
+
+def run_crashy_reboot_test(machine):
+    """Run desktopui_CrashyRebootServer against a single machine."""
+    host = hosts.create_host(machine)
+    job.run_test("desktopui_CrashyRebootServer", host=host)
+
+parallel_simple(run_crashy_reboot_test, machines)
diff --git a/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py b/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py
new file mode 100644
index 0000000..9bc3344
--- /dev/null
+++ b/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+from autotest_lib.client.bin import utils
+from autotest_lib.client.common_lib import error
+from autotest_lib.server import test, autotest
+
+class desktopui_CrashyRebootServer(test.test):
+    """Validate logic for mitigating too-crashy UI.
+
+    If the UI crashes too much too fast, the device will eventually
+    reboot to attempt to mitigate the problem. If the device
+    determines that it's already tried that once, it will shut down
+    the UI and remain up.
+
+    This test deploys the client test desktopui_CrashyReboot in order
+    to drive the device into the desired states.
+    """
+    version = 1
+
+    CRASHY_DEVICE_TIMEOUT_SECONDS = 30
+    CLIENT_TEST = 'desktopui_CrashyReboot'
+
+    def run_once(self, host=None):
+        """Crash the UI until the DUT reboots, then check that it stays up.
+
+        @param host: host object for the DUT. Must be supplied; the None
+                     default is dereferenced immediately.
+        @raises error.TestError if the DUT does not go down, or does not
+                come back up, within CRASHY_DEVICE_TIMEOUT_SECONDS.
+        """
+        # NOTE(review): presumably this clears the record of earlier
+        # crash-triggered reboots so the first run may reboot -- confirm.
+        host.run('rm -f /var/lib/ui/reboot-timestamps')
+
+        # Run a client-side test that crashes the UI a bunch, and
+        # expect a reboot.  We need to run this test in the background in
+        # order to prevent the reboot from causing autotest to auto-fail
+        # the entire test. This means we also need to handle collecting
+        # and parsing results manually if it doesn't work.
+        logging.info('CrashyRebootServer: start client test')
+        client_at = autotest.Autotest(host)
+        client_at.run_test(self.CLIENT_TEST, expect_reboot=True, tag='reboot',
+                           background=True)
+
+        logging.info('Client test now running in background.')
+        # Prepare for result gathering.
+        collector = autotest.log_collector(host, None, '.')
+        host.job.add_client_log(host.hostname,
+                                collector.client_results_dir,
+                                collector.server_results_dir)
+        job_record_context = host.job.get_record_context()
+
+        logging.info('Waiting for host to go down.')
+        if not host.wait_down(timeout=self.CRASHY_DEVICE_TIMEOUT_SECONDS):
+            # Gather results to determine why device didn't reboot.
+            collector.collect_client_job_results()
+            collector.remove_redundant_client_logs()
+            host.job.remove_client_log(host.hostname,
+                                       collector.client_results_dir,
+                                       collector.server_results_dir)
+            job_record_context.restore()
+            raise error.TestError('Host should have rebooted!')
+
+        logging.info('Waiting for host to come back up.')
+        try:
+            # wait_up() issues an ssh connection attempt and then spends
+            # the entire given timeout waiting for it to succeed. If it
+            # does this before the device is ready to accept ssh
+            # connections, it will decide that the device never came up,
+            # even if it is ready and waiting. To combat this, loop with
+            # a short timeout.
+            utils.poll_for_condition(lambda: host.wait_up(5),
+                                     timeout=self.CRASHY_DEVICE_TIMEOUT_SECONDS)
+        except utils.TimeoutError:
+            raise error.TestError('Host never came back!')
+
+        # NB: If we change the reboot-attempt threshold in
+        # /etc/init/ui-respawn.conf to be >1, this will start failing
+        # and need to be updated.
+        client_at.run_test(self.CLIENT_TEST, expect_reboot=False)
diff --git a/suite_scheduler.ini b/suite_scheduler.ini
index 8c833f0..c44de90 100644
--- a/suite_scheduler.ini
+++ b/suite_scheduler.ini
@@ -50,6 +50,12 @@
 branch_specs: >=R22
 pool: snow
 
+[GpuHang]
+run_on: new_build
+suite: gpu_hang
+branch_specs: >=R30
+pool: suites
+
 [NightlyHwQual]
 run_on: nightly
 suite: hwqual
diff --git a/test_suites/control.gpu_hang b/test_suites/control.gpu_hang
new file mode 100644
index 0000000..64cd850
--- /dev/null
+++ b/test_suites/control.gpu_hang
@@ -0,0 +1,37 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+AUTHOR = "Chrome OS Team"
+NAME = "gpu_hang"
+
+TIME = "MEDIUM"
+TEST_CATEGORY = "General"
+TEST_CLASS = "suite"
+TEST_TYPE = "Server"
+
+DOC = """
+This suite runs one test, designed to validate the logic that reboots a device
+in the case that a GPU hang has caused the UI to crash over and over.
+
+It's in its own suite because that's the only way to run this test in a
+per-branch manner.
+
+@param build: The name of the image to test.
+              Ex: x86-mario-release/R17-1412.33.0-a1-b29
+@param board: The board to test on. Ex: x86-mario
+@param pool: The pool of machines to utilize for scheduling. If pool=None
+             board is used.
+@param check_hosts: require appropriate live hosts to exist in the lab.
+@param SKIP_IMAGE: (optional) If present and True, don't re-image devices.
+"""
+
+import common
+from autotest_lib.server.cros.dynamic_suite import dynamic_suite
+
+
+dynamic_suite.reimage_and_run(
+    build=build, board=board, name='gpu_hang', job=job, pool=pool,
+    check_hosts=check_hosts, add_experimental=True, num=num,
+    file_bugs=file_bugs, skip_reimage=dynamic_suite.skip_reimage(globals()),
+    max_runtime_mins=60, devserver_url=devserver_url)