[autotest] Make the servo_host update work with labstation as well.
Added a new AFE rpc (retrieve_hosts_by_host_attribute) that returns a list
of dut hostnames that all share the same host attribute values (helper
function to find all duts using the same servo host).
Added a new context manager to lock duts to ensure no duts will
be impacted while rebooting the servo host.
Added a new test servohost_Reboot to handle a synchronized reboot for a
servo host.
CQ-DEPEND=CL:388674
BUG=chromium:599533
TEST=locally on moblab with 2 duts with the labstation primed for a
reboot. Checked that:
- no more than 1 servohost_Reboot test got scheduled.
- servohost_Reboot waited for the locked dut to go idle before rebooting
the servo host.
- all duts were unlocked regardless of failure in servohost_Reboot.
Change-Id: Ie9e1b0ccbf76e1cbcacbbe9737797dbac6559ef8
Reviewed-on: https://chromium-review.googlesource.com/375961
Commit-Ready: Kevin Cheng <kevcheng@chromium.org>
Tested-by: Kevin Cheng <kevcheng@chromium.org>
Reviewed-by: Kevin Cheng <kevcheng@chromium.org>
diff --git a/server/site_utils.py b/server/site_utils.py
index 08d11d9..496a22c 100644
--- a/server/site_utils.py
+++ b/server/site_utils.py
@@ -19,6 +19,7 @@
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import host_queue_entry_states
+from autotest_lib.client.common_lib import host_states
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import job_status
@@ -48,6 +49,9 @@
'CROS', 'enable_drone_in_restricted_subnet', type=bool,
default=False)
+# Default timeout, in seconds (10 minutes), to wait for duts to go idle.
+IDLE_DUT_WAIT_TIMEOUT = 600
+
class TestLabException(Exception):
"""Exception raised when the Test Lab blocks a test or suite."""
pass
@@ -763,3 +767,64 @@
def TrivialContextManager():
"""Context manager that does nothing."""
yield
+
+
+def wait_for_idle_duts(duts, afe, max_wait=IDLE_DUT_WAIT_TIMEOUT):
+    """Wait for the hosts to all go idle.
+
+    @param duts: List of dut hostnames to check; it is not modified.
+    @param afe: afe instance.
+    @param max_wait: Max wait time in seconds.
+
+    @returns Boolean True if all hosts are idle or False if any hosts did not
+             go idle within max_wait.
+    """
+    start_time = time.time()
+    # Work on a copy so the caller's list is left untouched.
+    active_dut_list = list(duts)
+    while active_dut_list:
+        # Let's rate-limit how often we hit the AFE.
+        time.sleep(1)
+
+        # Check if we've waited too long.
+        if (time.time() - start_time) > max_wait:
+            return False
+
+        # Get the status for the duts and see if they're in the idle state.
+        afe_hosts = afe.get_hosts(active_dut_list)
+        idle_duts = set(afe_host.hostname for afe_host in afe_hosts
+                        if afe_host.status in host_states.IDLE_STATES)
+
+        # Drop idle duts so we don't recheck them. Filtering, unlike list.remove,
+        # cannot raise ValueError if the AFE reports a hostname not in our list.
+        active_dut_list = [dut for dut in active_dut_list
+                           if dut not in idle_duts]
+
+        if active_dut_list:
+            logging.info('still waiting for following duts to go idle: %s',
+                         active_dut_list)
+    return True
+
+
+@contextlib.contextmanager
+def lock_duts_and_wait(duts, afe, lock_msg='default lock message',
+                       max_wait=IDLE_DUT_WAIT_TIMEOUT):
+    """Context manager to lock the duts and wait for them to go idle.
+
+    @param duts: Duts to lock (list is not modified); lock failures are skipped.
+    @param afe: afe instance.
+    @param lock_msg: Lock reason passed to afe.lock_host.
+    @param max_wait: Max time in seconds to wait for locked duts to go idle.
+
+    @returns Yields True if the duts locked here went idle, False on timeout.
+    """
+    locked_duts = []
+    try:
+        for dut in sorted(duts):
+            if afe.lock_host(dut, lock_msg, fail_if_locked=True):
+                locked_duts.append(dut)
+            else:
+                logging.info('%s already locked', dut)
+        yield wait_for_idle_duts(locked_duts, afe, max_wait)
+    finally:
+        afe.unlock_hosts(locked_duts)