Add desktopui_CrashyReboot{Server} autotest
To address some GPU hanging problems, we added logic to reboot the
device if the UI seemed to crash too much, too often.
This server-side test validates this logic by deploying an
associated client-side test to the DUT that drives it into
the failure state.
CQ-DEPEND=CL:65243
BUG=chromium:261784
TEST=run this autotest, and it should succeed.
TEST=run this autotest and while it's running, echo $(date +%s) > /var/lib/ui/reboot-timestamps. The test should fail.
Change-Id: I68af57e0266e85434ad63e1f646e90a41a6d393d
Reviewed-on: https://gerrit.chromium.org/gerrit/65218
Tested-by: Chris Masone <cmasone@chromium.org>
Reviewed-by: Richard Barnette <jrbarnette@chromium.org>
Commit-Queue: Chris Masone <cmasone@chromium.org>
diff --git a/client/bin/site_utils.py b/client/bin/site_utils.py
index 88e2384..e79986c 100644
--- a/client/bin/site_utils.py
+++ b/client/bin/site_utils.py
@@ -117,6 +117,9 @@
except Exception as e:
logging.error(e)
return
+ if pid is None:
+ raise error.AutoservPidAlreadyDeadError(
+ 'No process matching %s.' % name)
if with_prejudice:
utils.nuke_pid(pid, [signal.SIGKILL])
else:
diff --git a/client/cros/cros_ui.py b/client/cros/cros_ui.py
index 4226e6d..fa193a8 100644
--- a/client/cros/cros_ui.py
+++ b/client/cros/cros_ui.py
@@ -117,7 +117,7 @@
True upon successfully stopping the UI and all chrome processes exiting.
False otherwise.
"""
- status = utils.system("stop ui", ignore_status=True)
+ status = stop(allow_fail=True)
if status:
logging.error('stop ui returned non-zero status: %s', status)
return False
diff --git a/client/site_tests/desktopui_CrashyReboot/control b/client/site_tests/desktopui_CrashyReboot/control
new file mode 100644
index 0000000..edad2ac
--- /dev/null
+++ b/client/site_tests/desktopui_CrashyReboot/control
@@ -0,0 +1,29 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+AUTHOR = "cmasone, jrbarnette"
+NAME = "desktopui_CrashyReboot"
+TIME = "MEDIUM"
+TEST_TYPE = "client"
+
+DOC = """
+This test exercises the logic we put in place to reboot the device in the
+face of a too-crashy UI:
+
+If the UI crashes too much too fast, the device will eventually reboot
+to attempt to mitigate the problem. If the device determines that it's
+already tried that some number of times, it will shut down the UI and
+remain up.
+
+This test will crash the browser repeatedly and, in the case that it
+has not rebooted before, the test will cause the device to reboot. If
+it has rebooted within a certain window of time, the test will expect
+the device to stay up.
+
+This test cannot be used on its own, but is intended to be used in
+conjunction with the server-side desktopui_CrashyRebootServer test.
+"""
+
+job.run_test('desktopui_CrashyReboot')  # expect_reboot defaults to False.
+
diff --git a/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py b/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py
new file mode 100644
index 0000000..c4b27e7
--- /dev/null
+++ b/client/site_tests/desktopui_CrashyReboot/desktopui_CrashyReboot.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+from autotest_lib.client.bin import test, utils
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.cros import constants, cros_ui
+
+
+class UIStopped(Exception):
+    """Raised when polling finds the UI is no longer coming back up."""
+    pass
+
+
+class desktopui_CrashyReboot(test.test):
+    """Drive device to handle a too-crashy UI.
+
+    Run by desktopui_CrashyRebootServer.
+    """
+    version = 1
+    # NOTE(review): unused within this class -- confirm it is still needed.
+    UNREASONABLY_HIGH_RESPAWN_COUNT=90
+
+
+    def _nuke_browser_with_prejudice_and_check_for_ui_stop(self):
+        """Nuke the browser with prejudice, check to see if the UI is down."""
+        try:
+            utils.nuke_process_by_name(constants.BROWSER, with_prejudice=True)
+        except error.AutoservPidAlreadyDeadError:
+            pass  # No matching browser process right now; nothing to kill.
+        return not cros_ui.is_up()
+
+
+    def _nuke_browser_until_ui_goes_down(self):
+        """Nuke the browser continuously until it stops respawning.
+
+        @raises utils.TimeoutError if the ui doesn't stop respawning.
+        """
+        utils.poll_for_condition(
+            condition=self._nuke_browser_with_prejudice_and_check_for_ui_stop,
+            timeout=60,
+            desc='ui to stop respawning, or the device to reboot')
+
+
+    def run_once(self, expect_reboot=False):
+        """Kill the browser until the UI stays down or the device reboots.
+
+        @param expect_reboot: if True, the device should reboot before the
+                UI stops respawning, so getting to the end is a failure.
+        @raises error.TestFail on nuke-loop timeout or a missed reboot.
+        """
+        logging.debug('Restarting UI to ensure that it\'s running.')
+        cros_ui.stop(allow_fail=True)
+        cros_ui.start(wait_for_login_prompt=True)
+
+        # There is no 100% reliable way to tell that the browser process
+        # we care about is gone, so we send KILL signals on a polling
+        # interval. That races with the respawn logic under test: a UI
+        # that is down when we check may be done respawning, about to
+        # respawn (the likely case), or the device may be rebooting. So
+        # poll the UI status for a few seconds before trusting "down".
+        try:
+            while True:
+                utils.poll_for_condition(condition=cros_ui.is_up,
+                                         timeout=5,
+                                         exception=UIStopped('As expected'))
+                self._nuke_browser_until_ui_goes_down()
+        except UIStopped:
+            pass  # Expected way out: the UI did not come up within 5s.
+        except utils.TimeoutError as te:
+            raise error.TestFail(te)
+
+        if expect_reboot:
+            raise error.TestFail('UI stopped respawning instead of rebooting.')
+
+
+    def cleanup(self):
+        """Start the UI again, tolerating the case where it is already up."""
+        cros_ui.start(allow_fail=True)
diff --git a/client/site_tests/desktopui_Respawn/desktopui_Respawn.py b/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
index c7b47f2..c8ae23b 100644
--- a/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
+++ b/client/site_tests/desktopui_Respawn/desktopui_Respawn.py
@@ -11,7 +11,8 @@
"""Validate that the UI will cease respawning after a certain number of
attempts in a time window. By design, this test does _not_ attempt to
ensure that these values remain the same over time. The values are
- somewhat arbitrary anyhow, so enforcing them is simply an over-constraint.
+ somewhat arbitrary anyhow, so enforcing them is simply an
+ over-constraint.
"""
version = 1
diff --git a/server/site_tests/desktopui_CrashyRebootServer/control b/server/site_tests/desktopui_CrashyRebootServer/control
new file mode 100644
index 0000000..1f5de44
--- /dev/null
+++ b/server/site_tests/desktopui_CrashyRebootServer/control
@@ -0,0 +1,29 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+NAME = "CrashyRebootServer"
+AUTHOR = "cmasone, jrbarnette"
+SUITE = "gpu_hang"
+TIME = "MEDIUM"
+TEST_TYPE = "server"
+EXPERIMENTAL = "True"
+
+DOC = """
+This test verifies that the logic we put in place to reboot the device in the
+face of a too-crashy UI functions as intended:
+
+If the UI crashes too much too fast, the device will eventually reboot
+to attempt to mitigate the problem. If the device determines that it's
+already tried that some number of times, it will shut down the UI and
+remain up.
+
+This test deploys the client test desktopui_CrashyReboot in order to drive the
+device into the desired states.
+"""
+
+def run_bootperf(machine):  # NOTE(review): misnomer; runs CrashyRebootServer.
+    host = hosts.create_host(machine)
+    job.run_test("desktopui_CrashyRebootServer", host=host)
+
+parallel_simple(run_bootperf, machines)
diff --git a/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py b/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py
new file mode 100644
index 0000000..9bc3344
--- /dev/null
+++ b/server/site_tests/desktopui_CrashyRebootServer/desktopui_CrashyRebootServer.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging
+from autotest_lib.client.bin import utils
+from autotest_lib.client.common_lib import error
+from autotest_lib.server import test, autotest
+
+class desktopui_CrashyRebootServer(test.test):
+    """Validate logic for mitigating too-crashy UI.
+
+    If the UI crashes too much too fast, the device will eventually reboot
+    to attempt to mitigate the problem. If the device determines that it's
+    already tried that once, it will shut down the UI and remain up.
+
+    This test deploys the client test desktopui_CrashyReboot in order
+    to drive the device into the desired states.
+    """
+    version = 1
+
+    CRASHY_DEVICE_TIMEOUT_SECONDS = 30
+    CLIENT_TEST = 'desktopui_CrashyReboot'
+
+    def run_once(self, host=None):
+        """Crash the UI until the DUT reboots, then until the UI stays down.
+
+        @param host: the DUT to test; required despite the None default.
+        @raises error.TestError if the DUT never goes down or comes back.
+        """
+        host.run('rm -f /var/lib/ui/reboot-timestamps')
+
+        # Run a client-side test that crashes the UI a bunch, and expect a
+        # reboot. Run it in the background so the reboot doesn't make
+        # autotest auto-fail the entire test; the price is that we must
+        # collect and parse results ourselves if the reboot never happens.
+        logging.info('CrashyRebootServer: start client test')
+        client_at = autotest.Autotest(host)
+        client_at.run_test(self.CLIENT_TEST, expect_reboot=True, tag='reboot',
+                           background=True)
+
+        logging.info('Client test now running in background.')
+        # Prepare for result gathering.
+        collector = autotest.log_collector(host, None, '.')
+        host.job.add_client_log(host.hostname,
+                                collector.client_results_dir,
+                                collector.server_results_dir)
+        job_record_context = host.job.get_record_context()
+
+        logging.info('Waiting for host to go down.')
+        if not host.wait_down(timeout=self.CRASHY_DEVICE_TIMEOUT_SECONDS):
+            # Gather results to determine why device didn't reboot.
+            collector.collect_client_job_results()
+            collector.remove_redundant_client_logs()
+            host.job.remove_client_log(host.hostname,
+                                       collector.client_results_dir,
+                                       collector.server_results_dir)
+            job_record_context.restore()
+            raise error.TestError('Host should have rebooted!')
+
+        logging.info('Waiting for host to come back up.')
+        try:
+            # wait_up() issues an ssh connection attempt and then spends the
+            # entire given timeout waiting for it to succeed. If it does
+            # this before the device is ready to accept ssh connections, it
+            # decides the device never came up, even if it is ready and
+            # waiting. To combat this, loop with a short timeout.
+            utils.poll_for_condition(lambda: host.wait_up(5),
+                                     timeout=self.CRASHY_DEVICE_TIMEOUT_SECONDS)
+        except utils.TimeoutError:
+            raise error.TestError('Host never came back!')
+
+        # NB: If the reboot-attempt threshold in /etc/init/ui-respawn.conf
+        # is ever raised above 1, this will start failing and need updating.
+        client_at.run_test(self.CLIENT_TEST, expect_reboot=False)
diff --git a/suite_scheduler.ini b/suite_scheduler.ini
index 8c833f0..c44de90 100644
--- a/suite_scheduler.ini
+++ b/suite_scheduler.ini
@@ -50,6 +50,12 @@
branch_specs: >=R22
pool: snow
+[GpuHang]
+run_on: new_build
+suite: gpu_hang
+branch_specs: >=R30
+pool: suites
+
[NightlyHwQual]
run_on: nightly
suite: hwqual
diff --git a/test_suites/control.gpu_hang b/test_suites/control.gpu_hang
new file mode 100644
index 0000000..64cd850
--- /dev/null
+++ b/test_suites/control.gpu_hang
@@ -0,0 +1,37 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+AUTHOR = "Chrome OS Team"
+NAME = "gpu_hang"
+
+TIME = "MEDIUM"
+TEST_CATEGORY = "General"
+TEST_CLASS = "suite"
+TEST_TYPE = "Server"
+
+DOC = """
+This suite runs one test, designed to validate the logic that reboots a device
+in the case that a GPU hang has caused the UI to crash over and over.
+
+It's in its own suite because that's the only way to run this test in a
+per-branch manner.
+
+@param build: The name of the image to test.
+ Ex: x86-mario-release/R17-1412.33.0-a1-b29
+@param board: The board to test on. Ex: x86-mario
+@param pool: The pool of machines to utilize for scheduling. If pool=None
+ board is used.
+@param check_hosts: require appropriate live hosts to exist in the lab.
+@param SKIP_IMAGE: (optional) If present and True, don't re-image devices.
+"""
+
+import common
+from autotest_lib.server.cros.dynamic_suite import dynamic_suite
+
+# add_experimental=True: the only test in this suite is marked EXPERIMENTAL.
+dynamic_suite.reimage_and_run(
+    build=build, board=board, name='gpu_hang', job=job, pool=pool,
+    check_hosts=check_hosts, add_experimental=True, num=num,
+    file_bugs=file_bugs, skip_reimage=dynamic_suite.skip_reimage(globals()),
+    max_runtime_mins=60, devserver_url=devserver_url)