graphics_GLBench: thermal fixes.

This change makes measuring temperatures from glbench more robust.
It also decouples -hasty from temperatures for more robustness.
Finally, it tries to better detect unexpected test termination.

TEST=Ran on nyan_big.
BUG=chrome-os-partner:23243

Change-Id: Iec65cb9aaa42534148c3a65043368492baca1d54
Reviewed-on: https://chromium-review.googlesource.com/197605
Tested-by: Ilja Friedel <ihf@chromium.org>
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Commit-Queue: Ilja Friedel <ihf@chromium.org>
diff --git a/client/bin/site_utils.py b/client/bin/site_utils.py
index 36cc6cd..d092e69 100644
--- a/client/bin/site_utils.py
+++ b/client/bin/site_utils.py
@@ -500,11 +500,35 @@
 
 
 def wait_for_cool_machine():
-    # TODO(ihf): Implement this. The concept of a cool machine is very
-    # architecture specific. We either need a good heuristic or a table of
-    # board specific temperatures.
-    time.sleep(1.0)
-    return True
+    """
+    Waits for the machine to cool down, using a simple staged heuristic.
+    Ambient temperature and machine characteristics are unknown, so the
+    thresholds are 'magic': below 50C at once, below 62C after 1 minute,
+    below 65C after 3 minutes. Returns True if cool enough, else False.
+    """
+    temperature = get_current_temperature_max()
+    # We got here with a cold machine, return immediately. This should be the
+    # most common case.
+    if temperature < 50:
+        return True
+    logging.info('Got a hot machine of %dC. Sleeping 1 minute.', temperature)
+    # A modest wait should cool the machine.
+    time.sleep(60.0)
+    temperature = get_current_temperature_max()
+    # Atoms idle below 60 and everyone else should be even lower.
+    if temperature < 62:
+        return True
+    # This should be rare.
+    logging.info('Did not cool down (%dC). Sleeping 2 minutes.', temperature)
+    time.sleep(120.0)
+    temperature = get_current_temperature_max()
+    # A temperature over 65'C doesn't give us much headroom to the critical
+    # temperatures that start at 85'C (and PerfControl as of today will fail at
+    # critical - 10'C).
+    if temperature < 65:
+        return True
+    logging.warning('Did not cool down (%dC), giving up.', temperature)
+    return False
 
 
 # System paths for machine performance state.
@@ -561,6 +585,9 @@
     return int(match, 16)
 
 
+# Cache of 'find' results keyed by file pattern; sysfs paths don't change.
+_hwmon_paths = {}
+
 def _get_hwmon_paths(file_pattern):
     """
     Returns a list of paths to the temperature sensors.
@@ -570,9 +597,13 @@
     #    /sys/class/hwmon/hwmon*/
     #    /sys/devices/virtual/hwmon/hwmon*/
     #    /sys/devices/platform/coretemp.0/
-    cmd = 'find /sys/ -name "' + file_pattern + '"'
-    paths = utils.run(cmd, verbose=False).stdout.splitlines()
-    return paths
+    # The result depends on file_pattern, so cache one list per pattern; a
+    # single shared list would be wrong for any second pattern.
+    if file_pattern not in _hwmon_paths:
+        cmd = 'find /sys/ -name "' + file_pattern + '"'
+        output = utils.run(cmd, verbose=False).stdout
+        _hwmon_paths[file_pattern] = output.splitlines()
+    return _hwmon_paths[file_pattern]
 
 
 def get_temperature_critical():
@@ -631,6 +659,17 @@
     return temperatures
 
 
+def get_current_temperature_max():
+    """
+    Returns the highest reported board temperature (all sensors) in Celsius.
+    """
+    temperature = get_temperature_input_max()
+    ec_temperatures = get_ec_temperatures()
+    if ec_temperatures:  # max() of an empty sequence would raise.
+        temperature = max(max(ec_temperatures), temperature)
+    return temperature
+
+
 def get_cpu_cache_size():
     """
     Returns the last level CPU cache size in kBytes.