#!/bin/bash

# Force a repair special task for any host that hasn't seen activity in
# the past day.
#
# Various scripts/cron jobs look for DUTs that aren't working.  To be
# conservative, those scripts assume that a DUT that hasn't run any jobs
# within a reasonable time interval isn't working, since some of the
# ways a DUT may be unavailable manifest as inactivity.
#
# In some cases, we'd like to be more certain as to a DUT's status.
# This script goes through the entire AFE hosts table, and identifies
# unlocked hosts that would otherwise be flagged as "not working due to
# lack of activity", and forces a repair task.
#
# We use a repair task (as opposed to verify) for various reasons:
#   + If a DUT is working, repair and verify perform the same checks,
#     and generally run in the same time.
#   + If a DUT is broken, a verify task will fail and invoke repair,
#     which will take longer than just repair alone.
#   + Repair tasks that pass update labels; without this, labels could
#     become out-of-date simply because a DUT is idle.
#
# Locked hosts are skipped because they can't run jobs and because we
# want them to show up as suspicious anyway.
#
# Exit status:
#   0         nothing needed repair, or no managed hosts were found
#   non-zero  a helper tool failed, or the status of contrib/repair_hosts

# Make a pipeline fail when any stage fails, so a broken `atest` or
# `dut_status.py` is not masked by a successful `awk`.
set -o pipefail

# Print an error message to stderr, prefixed with the script name, and
# exit with status 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Every tool below is invoked by a path relative to the parent of this
# script's directory, so failing to get there must be fatal.
cd "$(dirname "$0")/.." || die "cannot cd to parent of $(dirname "$0")"

# Value handed to `dut_status.py -d` (the history window to examine).
# NOTE(review): presumably hours; the header says "past day" while this
# is 19 -- confirm against `site_utils/dut_status.py --help`.
readonly HISTORY_DURATION=19

# Gather all the hosts under supervision of the lab techs.
# Basically, that's any host in any managed pool.

readonly GET_HOSTS='
  /pool:(suites|bvt|cq|continuous|cts|arc-presubmit|crosperf|performance)/ {
    print $1
  }
'
host_list=$(cli/atest host list --unlocked | awk "$GET_HOSTS") \
  || die "failed to list unlocked hosts"

# With no hosts there is nothing to diagnose; do not call dut_status.py
# without host arguments.
if [[ -z "$host_list" ]]; then
  printf '%s: no unlocked hosts found in managed pools\n' "${0##*/}" >&2
  exit 0
fi
# One hostname per line (awk prints $1 for each matching line).
mapfile -t HOSTS <<<"$host_list"


# Go through the gathered hosts, and use dut_status to find the
# ones with unknown state (anything without a positive "OK" or
# "NO" diagnosis).

readonly NEED_CHECK='
  /OK/ || /NO/ { next }
  /^chromeos/ { print $1 }
'
check_list=$(
  site_utils/dut_status.py -d "$HISTORY_DURATION" "${HOSTS[@]}" \
    | awk "$NEED_CHECK"
) || die "failed to query DUT status"

# Every host already has a diagnosis: the normal, quiet case.  Skip the
# repair request instead of invoking repair_hosts with no arguments.
if [[ -z "$check_list" ]]; then
  exit 0
fi
mapfile -t CHECK <<<"$check_list"

# Last command: its exit status becomes the script's exit status.
contrib/repair_hosts "${CHECK[@]}"