[autotest] Add autotest for browser hang detection

Add desktopui_HangDetector to verify that session_manager's hang detection
for the browser process works.
It will start in the 'regression' suite as an experimental test.

BUG=chromium-os:35729
TEST=run it with run_remote_tests.
CQ-DEPEND=I3e60e1cbda652a3e52126cd69b2ff4cd02cc6a74
CQ-DEPEND=Ied5c9e897485aaf75fdefd211445219712972bdf

Change-Id: I1ba7a4f5db97fb1b92fd5366a18c984a7f5494bc
Reviewed-on: https://gerrit.chromium.org/gerrit/38347
Tested-by: Chris Masone <cmasone@chromium.org>
Reviewed-by: Chris Sosa <sosa@chromium.org>
Commit-Ready: Chris Masone <cmasone@chromium.org>
diff --git a/client/bin/site_utils.py b/client/bin/site_utils.py
index 2a9cced..4e44bf4 100644
--- a/client/bin/site_utils.py
+++ b/client/bin/site_utils.py
@@ -43,9 +43,42 @@
         return lambda : self.cros_system_data[name]
 
 
+def get_oldest_pid_by_name(name):
+    """
+    Return the oldest pid of a process whose name perfectly matches |name|.
+
+    name is an egrep expression, which will be matched against the entire name
+    of processes on the system.  For example:
+
+      get_oldest_pid_by_name('chrome')
+
+    on a system running
+      8600 ?        00:00:04 chrome
+      8601 ?        00:00:00 chrome
+      8602 ?        00:00:00 chrome-sandbox
+
+    would return 8600, as that's the oldest process that matches.
+    chrome-sandbox would not be matched.
+
+    Arguments:
+      name: egrep expression to match.  Will be anchored at the beginning and
+            end of the match string.
+
+    Returns:
+      pid as an integer, or None if one cannot be found.
+
+    Raises:
+      ValueError if pgrep returns something odd.
+    """
+    # Quote and group the pattern so the shell and anchors treat it whole.
+    str_pid = utils.system_output(
+        "pgrep -o '^(%s)$'" % name, ignore_status=True).rstrip()
+    return int(str_pid) if str_pid else None
+
+
 def nuke_process_by_name(name, with_prejudice=False):
     try:
-        pid = int(utils.system_output('pgrep -o ^%s$' % name).split()[0])
+        pid = get_oldest_pid_by_name(name)
     except Exception as e:
         logging.error(e)
         return
@@ -275,4 +308,4 @@
                              '(?:\s*#.*)?$', line)
         if key_value:
             result[key_value.group('key')] = key_value.group('value')
-    return result
\ No newline at end of file
+    return result
diff --git a/client/cros/constants.py b/client/cros/constants.py
index e80626c..4baae80 100644
--- a/client/cros/constants.py
+++ b/client/cros/constants.py
@@ -56,6 +56,9 @@
 DISABLE_BROWSER_RESTART_MAGIC_FILE = '/var/run/disable_chrome_restart'
 DEFAULT_OWNERSHIP_TIMEOUT = 300  # Ownership is an inherently random process.
 
+ENABLE_BROWSER_HANG_DETECTION_FILE = \
+    '/var/run/session_manager/enable_hang_detection'
+
 FLIMFLAM_TEST_PATH = '/usr/lib/flimflam/test/'
 
 KEYGEN = 'keygen'
diff --git a/client/site_tests/desktopui_HangDetector/control b/client/site_tests/desktopui_HangDetector/control
new file mode 100644
index 0000000..b0a3280
--- /dev/null
+++ b/client/site_tests/desktopui_HangDetector/control
@@ -0,0 +1,17 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+AUTHOR = "cmasone@chromium.org"
+NAME = "desktopui_HangDetector"
+EXPERIMENTAL = "True"
+SUITE = "regression"
+TIME = "FAST"
+TEST_TYPE = "client"
+
+DOC = """
+Enable and verify session_manager-driven browser hang detection.
+"""
+
+job.run_test('desktopui_HangDetector')
+
diff --git a/client/site_tests/desktopui_HangDetector/desktopui_HangDetector.py b/client/site_tests/desktopui_HangDetector/desktopui_HangDetector.py
new file mode 100644
index 0000000..bac7368
--- /dev/null
+++ b/client/site_tests/desktopui_HangDetector/desktopui_HangDetector.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import logging, os, signal, time
+
+import common
+from autotest_lib.client.bin import test, utils
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.cros import constants, cros_logging, cros_ui, login
+
+class desktopui_HangDetector(test.test):
+    """
+    This class enables browser process hang detection, simulates a hang
+    by sending a SIGSTOP to the browser, and then checks to see that it
+    got killed and restarted successfully -- without the UI getting bounced.
+    """
+    version = 1
+
+
+    def initialize(self):
+        self._pauser = cros_logging.LogRotationPauser()
+        self._pauser.begin()
+
+
+    def _get_oldest_pid_by_name(self, name):
+        """Return the oldest pid named |name|; raise TestError if none."""
+        try:
+            pid = utils.get_oldest_pid_by_name(name)
+        except error.CmdError as e:
+            raise error.TestError('Could not find pid of %s: %r' % (name, e))
+        except ValueError as e:
+            raise error.TestError('Got bad pid looking up %s: %r' % (name, e))
+        if not pid:
+            raise error.TestError('Got no pid looking up %s' % name)
+        logging.debug('Found %d for %s', pid, name)
+        return pid
+
+
+    def run_once(self):
+        # Create magic file to enable browser liveness checking and
+        # bounce the session manager to pick up the flag file.
+        cros_ui.stop()
+        os.mknod(constants.ENABLE_BROWSER_HANG_DETECTION_FILE)
+        cros_ui.start()
+
+        browser_pid = self._get_oldest_pid_by_name(constants.BROWSER)
+        sm_pid = self._get_oldest_pid_by_name(constants.SESSION_MANAGER)
+
+        # Reading the log is the best way to watch for the hang detector.
+        reader = cros_logging.LogReader()
+        reader.set_start_by_current()
+
+        # To simulate a hang, STOP the browser and wait for session_manager to
+        # hit it.  It won't actually exit until it gets a SIGCONT, though.
+        try:
+            os.kill(browser_pid, signal.SIGSTOP)  # Simulate hang.
+        except OSError as e:
+            raise error.TestError('Cannot STOP browser: %r' % e)
+
+        # Watch for hang detection.
+        utils.poll_for_condition(
+            condition=lambda: reader.can_find('Aborting browser process.'),
+            exception=utils.TimeoutError('Waiting for hang detector.'),
+            sleep_interval=5,
+            timeout=60)
+
+        try:
+            os.kill(browser_pid, signal.SIGCONT)  # Allow browser to die.
+        except OSError as e:
+            raise error.TestError('Cannot CONT browser: %r' % e)
+
+        # Wait for old browser process to be gone.
+        utils.poll_for_condition(
+            condition=lambda: not utils.pid_is_alive(browser_pid),
+            exception=utils.TimeoutError(
+                'Browser does not seem to have restarted!'),
+            timeout=60)
+
+        # Wait for new browser to come up.
+        login.wait_for_browser()
+        if sm_pid != self._get_oldest_pid_by_name(constants.SESSION_MANAGER):
+            raise error.TestFail('session_manager seems to have restarted')
+
+
+    def cleanup(self):
+        if os.path.exists(constants.ENABLE_BROWSER_HANG_DETECTION_FILE):
+            os.remove(constants.ENABLE_BROWSER_HANG_DETECTION_FILE)
+        self._pauser.end()