# blob: c4f1eb08278fb0b67a70e7e09b211d903012d2a1 [file] [log] [blame]
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Expects to be run in an environment with sudo and no interactive password
# prompt, such as within the Chromium OS development chroot.
"""This file provides core logic for servo verify/repair process."""
import httplib
import logging
import socket
import time
import xmlrpclib
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import lsbrelease_utils
from autotest_lib.client.common_lib.cros import autoupdater
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.common_lib.cros.network import ping_runner
from autotest_lib.client.cros import constants as client_constants
from autotest_lib.server import site_utils as server_site_utils
from autotest_lib.server.cros import dnsname_mangler
from autotest_lib.server.cros.servo import servo
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import ssh_host
from autotest_lib.site_utils.rpm_control_system import rpm_client
# Names of the host attributes in the database that represent the values for
# the servo_host and servo_port for a servo connected to the DUT.
SERVO_HOST_ATTR = 'servo_host'
SERVO_PORT_ATTR = 'servo_port'
# Shared handle to the global autotest configuration. The code in this module
# reads values from its 'CROS' and 'SSP' sections.
_CONFIG = global_config.global_config
class ServoHostException(error.AutoservError):
    """Common base class of every exception raised by ServoHost."""
class ServoHostVerifyFailure(ServoHostException):
    """Signals that verification of the servo host did not pass."""
class ServoHostRepairFailure(ServoHostException):
    """Signals that one repair method could not repair the servo host."""
class ServoHostRepairMethodNA(ServoHostException):
    """Signals that a repair method does not apply to the servo host."""
class ServoHostRepairTotalFailure(ServoHostException):
    """Signals that every attempted repair of the servo host failed."""
def make_servo_hostname(dut_hostname):
    """Derive the servo host name that corresponds to a DUT host name.

    '-servo' is appended to the first dot-separated label of the DUT name;
    everything after the first dot is kept unchanged.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.
    """
    short_name, dot, domain = dut_hostname.partition('.')
    return ''.join([short_name, '-servo', dot, domain])
class ServoHost(ssh_host.SSHHost):
    """Host class for a host that controls a servo, e.g. beaglebone."""

    # Timeout, in seconds, for getting the value of 'pwr_button' from servod.
    PWR_BUTTON_CMD_TIMEOUT_SECS = 15
    # Timeout, in seconds, for rebooting servo host.
    REBOOT_TIMEOUT_SECS = 90
    # Passed as `down_timeout` when the servo host is rebooted for repair.
    HOST_DOWN_TIMEOUT_SECS = 60
    # Delay, in seconds, after rebooting for servod to become fully
    # functional.
    REBOOT_DELAY_SECS = 20
    # Servod process name.
    SERVOD_PROCESS = 'servod'
    # Timeout, in seconds, for initializing servo signals.
    INITIALIZE_SERVO_TIMEOUT_SECS = 30
    # Number of failed power-cycle attempts after which power-cycle repair
    # gives up.
    _MAX_POWER_CYCLE_ATTEMPTS = 3
    # Stats timer whose `decorate` wraps the timed methods of this class.
    _timer = autotest_stats.Timer('servo_host')
def _initialize(self, servo_host='localhost', servo_port=9999,
                required_by_test=True, is_in_lab=None, *args, **dargs):
    """Set up a ServoHost and run an initial verification.

    A ServoHost instance represents a host that controls a servo. Once all
    attributes are set, verify() is called. If it fails and the servo is
    required by the test, the error is re-raised for a non-lab host and
    repair_full() is attempted for a lab host. If the servo is not required
    by the test, the failure is ignored.

    @param servo_host: Name of the host where the servod process
                       is running.
    @param servo_port: Port the servod process is listening on.
    @param required_by_test: True if servo is required by test.
    @param is_in_lab: True if the servo host is in Cros Lab. When None (the
                      default), utils.host_is_in_lab_zone is called to
                      find out.
    """
    super(ServoHost, self)._initialize(hostname=servo_host,
                                       *args, **dargs)
    self._is_in_lab = (utils.host_is_in_lab_zone(self.hostname)
                       if is_in_lab is None else is_in_lab)
    self._is_localhost = (self.hostname == 'localhost')
    self._servod_server = xmlrpclib.ServerProxy(
            'http://%s:%s' % (self.hostname, servo_port))

    # Commands on the servo host must be run by the superuser. Our account
    # on Beaglebone is root, but locally we might be running as a
    # different user. If so - `sudo ' will have to be added to the
    # commands.
    self._sudo_required = (self._is_localhost and
                           utils.system_output('id -u') != '0')

    # The Servo cache is reset and verification is started last, so that
    # every attribute verify() relies on is already in place.
    self._servo = None
    self.required_by_test = required_by_test
    try:
        self.verify()
    except Exception:
        if required_by_test:
            if self.is_in_lab():
                self.repair_full()
            else:
                raise
def is_in_lab(self):
    """Report whether this servo host is treated as a lab device.

    @returns: The flag recorded at initialization: True if the servo host
              is in Cros Lab, otherwise False.
    """
    return self._is_in_lab
def is_localhost(self):
    """Report whether this servo host is 'localhost'.

    @returns: The flag recorded at initialization: True if the host name
              is 'localhost', otherwise False.
    """
    return self._is_localhost
def get_servod_server_proxy(self):
    """Give access to the XML-RPC proxy used to talk to servod.

    @returns: The xmlrpclib.ServerProxy created at initialization for the
              servod server on this host.
    """
    return self._servod_server
def get_wait_up_processes(self):
    """List the processes that wait_up should wait for.

    Overrides get_wait_up_processes in
    autotest_lib.client.common_lib.hosts.base_classes.Host so that the
    base class waits for the servod process when rebooting the device.

    @returns: A new list holding only the servod process name.
    """
    return [self.SERVOD_PROCESS]
def _is_cros_host(self):
    """Tell whether the servo host appears to run chromeos.

    Greps /etc/lsb-release on the host for 'CHROMEOS'.

    @return: True if the grep succeeds. False if it does not, or if the
             command could not be run (run error or SSH timeout), i.e. we
             don't have enough information.
    """
    try:
        grep_result = self.run('grep -q CHROMEOS /etc/lsb-release',
                               ignore_status=True, timeout=10)
    except (error.AutoservRunError, error.AutoservSSHTimeout):
        is_cros = False
    else:
        is_cros = grep_result.exit_status == 0
    return is_cros
def make_ssh_command(self, user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the ssh command line used for the servo host.

    Overrides the default make_ssh_command with a fixed set of tuned
    options:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
        connection failure. Consistency with remote_access.py.
      - ServerAliveInterval=180; SSH pings the connection every 180
        seconds. Together with ServerAliveCountMax this ensures that if
        the connection dies, Autotest will bail out quickly.
      - ServerAliveCountMax=3; consistency with remote_access.py.
      - ConnectionAttempts=4; reduce flakiness in connection errors;
        consistency with remote_access.py.
      - UserKnownHostsFile=/dev/null; we don't care about the keys.
      - SSH protocol forced to 2; needed for ServerAliveInterval.

    @param user User name to use for the ssh connection.
    @param port Port on the target host to use for ssh connection.
    @param opts Additional options to the ssh command.
    @param hosts_file Ignored.
    @param connect_timeout Ignored.
    @param alive_interval Ignored.

    @returns: An ssh command with the requested settings.
    """
    ssh_template = (
            '/usr/bin/ssh -a -x %s'
            ' -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
            ' -o BatchMode=yes -o ConnectTimeout=30'
            ' -o ServerAliveInterval=180 -o ServerAliveCountMax=3'
            ' -o ConnectionAttempts=4 -o Protocol=2'
            ' -l %s -p %d')
    ssh_command = ssh_template % (opts, user, port)
    return ssh_command
def _make_scp_cmd(self, sources, dest):
    """Build the scp command line for copying files.

    Overrides _make_scp_cmd in AbstractSSHHost to allow additional ssh
    options. The source paths are joined with single spaces and the
    destination is wrapped in double quotes; remote paths must be
    pre-encoded by the caller.

    @param sources: A list of source paths to copy from.
    @param dest: Destination path to copy to.

    @returns: An scp command that copies |sources| on local machine to
              |dest| on the remote servo host.
    """
    scp_template = ('scp -rq %s'
                    ' -o BatchMode=yes -o StrictHostKeyChecking=no'
                    ' -o UserKnownHostsFile=/dev/null'
                    ' -P %d %s "%s"')
    fields = (self.master_ssh_option, self.port, ' '.join(sources), dest)
    return scp_template % fields
def run(self, command, timeout=3600, ignore_status=False,
        stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
        connect_timeout=30, options='', stdin=None, verbose=True, args=()):
    """Run a command on the servo host.

    Extends method `run` in SSHHost. For a remote servo host the call is
    forwarded to `run` in SSHHost. For 'localhost' the command is executed
    with utils.run, prefixed with `sudo -n` when superuser rights are
    needed.

    @param command: The command line string.
    @param timeout: Time limit in seconds before attempting to
                    kill the running process. The run() function
                    will take a few seconds longer than 'timeout'
                    to complete if it has to kill the process.
    @param ignore_status: Do not raise an exception, no matter
                          what the exit code of the command is.
    @param stdout_tee/stderr_tee: Where to tee the stdout/stderr.
    @param connect_timeout: SSH connection timeout (in seconds)
                            Ignored if host is 'localhost'.
    @param options: String with additional ssh command options
                    Ignored if host is 'localhost'.
    @param stdin: Stdin to pass (a string) to the executed command.
    @param verbose: Log the commands.
    @param args: Sequence of strings to pass as arguments to command by
                 quoting them in " and escaping their contents if necessary.

    @returns: A utils.CmdResult object.

    @raises AutoservRunError if the command failed.
    @raises AutoservSSHTimeout SSH connection has timed out. Only applies
            when servo host is not 'localhost'.
    """
    run_args = dict(command=command, timeout=timeout,
                    ignore_status=ignore_status, stdout_tee=stdout_tee,
                    stderr_tee=stderr_tee, stdin=stdin,
                    verbose=verbose, args=args)

    if not self.is_localhost():
        # Remote host: the ssh-specific settings only matter here.
        run_args.update(connect_timeout=connect_timeout, options=options)
        return super(ServoHost, self).run(**run_args)

    if self._sudo_required:
        run_args['command'] = 'sudo -n %s' % command
    try:
        return utils.run(**run_args)
    except error.CmdError as e:
        # Local failures are reported with the same exception type that
        # the remote path raises.
        logging.error(e)
        raise error.AutoservRunError('command execution error',
                                     e.result_obj)
@_timer.decorate
def _check_servod(self):
    """Check that servod answers a basic query.

    Asks servod for the value of 'pwr_button', allowing
    PWR_BUTTON_CMD_TIMEOUT_SECS seconds for the reply.

    @raises ServoHostVerifyFailure if the request times out, or fails with
            a socket error, an XML-RPC error or a bad HTTP status line.
    """
    prefix = 'Servod error: %s'
    failure = None
    try:
        timed_out, _ = retry.timeout(
                self._servod_server.get, args=('pwr_button', ),
                timeout_sec=self.PWR_BUTTON_CMD_TIMEOUT_SECS)
    except (socket.error, xmlrpclib.Error, httplib.BadStatusLine) as e:
        failure = prefix % e
    else:
        if timed_out:
            failure = prefix % 'Request timed out.'

    if failure:
        raise ServoHostVerifyFailure(failure)
def _check_servo_config(self):
    """Check that the servod config file exists on the servo host.

    If servod config file does not exist, there is no need to verify if
    servo is working. The servo could be attached to a board not supported
    yet. Nothing is checked when the servo host is localhost, and a failed
    check is ignored when the host is not known to run chromeos.

    @raises ServoHostVerifyFailure if /var/lib/servod/config does not exist.
    """
    if self._is_localhost:
        return

    try:
        self.run('test -f /var/lib/servod/config')
    except (error.AutoservRunError, error.AutoservSSHTimeout) as e:
        if self._is_cros_host():
            raise ServoHostVerifyFailure(
                    'Servo config file check failed for %s: %s' %
                    (self.hostname, e))
        logging.info('Ignoring servo config check failure, either %s '
                     'is not running chromeos or we cannot find enough '
                     'information about the host.', self.hostname)
def _check_servod_status(self):
    """Check that a servod process is running on the servo host.

    If servod is not running, there is no need to verify if servo is
    working. Checking the process before making any servod call avoids
    the long timeout that such a call would eventually fail with.

    If the servo host is set to localhost, failure of servod status check
    will be ignored, as servo call may use ssh tunnel.

    @raises ServoHostVerifyFailure if servod process does not exist.
    """
    try:
        pgrep_output = self.run('pgrep servod').stdout
        pids = [str(int(line))
                for line in pgrep_output.strip().split('\n')]
        logging.info('servod is running, PID=%s', ','.join(pids))
    except (error.AutoservRunError, error.AutoservSSHTimeout) as e:
        if not self._is_localhost:
            raise ServoHostVerifyFailure(
                    'Servod status check failed for %s: %s' %
                    (self.hostname, e))
        logging.info('Ignoring servod status check failure. servo host '
                     'is set to localhost, servo call may use ssh '
                     'tunnel to go through.')
def get_release_version(self):
    """Read CHROMEOS_RELEASE_VERSION from the host's lsb-release file.

    @returns The version string in lsb-release, under attribute
             CHROMEOS_RELEASE_VERSION.
    """
    read_lsb_release = 'cat "%s"' % client_constants.LSB_RELEASE
    lsb_release_content = self.run(read_lsb_release).stdout.strip()
    return lsbrelease_utils.get_chromeos_release_version(
            lsb_release_content=lsb_release_content)
@_timer.decorate
def _update_image(self):
    """Update the image on the servo host, if needed.

    This method recognizes the following cases:
      * If the Host is not running Chrome OS, do nothing.
      * If lsbrelease_utils.is_moblab() reports moblab, do nothing.
      * If a previously triggered update is now complete, reboot
        to the new version.
      * If the host is processing a previously triggered update,
        do nothing.
      * If the host is running a version of Chrome OS different
        from the default for servo Hosts, trigger an update, but
        don't wait for it to complete.

    A failure to stage the update payload is logged and the update is
    abandoned for this cycle; a RootFSUpdateError while triggering the
    update is logged and counted. Neither is raised to the caller.

    @raises dev_server.DevServerException: If all the devservers are down.
    @raises site_utils.ParseBuildNameException: If the devserver returns
        an invalid build name.
    @raises autoupdater.ChromiumOSError: If something goes wrong in the
        checking update engine client status or applying an update.
    @raises AutoservRunError: If the update_engine_client isn't present on
        the host, and the host is a cros_host.
    @raises AutoservHostError: If the host does not come back within 120
        seconds after being rebooted to finish an update.
    """
    # servod could be running in a Ubuntu workstation.
    if not self._is_cros_host():
        logging.info('Not attempting an update, either %s is not running '
                     'chromeos or we cannot find enough information about '
                     'the host.', self.hostname)
        return

    if lsbrelease_utils.is_moblab():
        logging.info('Not attempting an update, %s is running moblab.',
                     self.hostname)
        return

    # Work out the target build: the stable version the AFE reports for the
    # configured servo board, expanded through the configured build
    # pattern.
    board = _CONFIG.get_config_value(
            'CROS', 'servo_board')
    afe = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10)
    target_version = afe.run('get_stable_version', board=board)
    build_pattern = _CONFIG.get_config_value(
            'CROS', 'stable_build_pattern')
    target_build = build_pattern % (board, target_version)
    # Element 3 of the parsed build name is compared against the release
    # version read from the host below.
    target_build_number = server_site_utils.ParseBuildName(
            target_build)[3]
    ds = dev_server.ImageServer.resolve(self.hostname)
    url = ds.get_update_url(target_build)

    updater = autoupdater.ChromiumOSUpdater(update_url=url, host=self)
    current_build_number = self.get_release_version()
    status = updater.check_update_status()

    if status == autoupdater.UPDATER_NEED_REBOOT:
        logging.info('Rebooting beaglebone host %s with build %s',
                     self.hostname, current_build_number)
        kwargs = {
            'reboot_cmd': 'sleep 1 ; reboot & sleep 10; reboot -f',
            'fastsync': True,
            'label': None,
            'wait': False,
        }
        # Do not wait for reboot to complete. Otherwise, self.reboot call
        # will log reboot failure if servo does not come back. The logged
        # reboot failure will lead to test job failure. If the test does not
        # require servo, we don't want servo failure to fail the test with
        # error: `Host did not return from reboot` in status.log
        # If servo does not come back after reboot, exception needs to be
        # raised, so test requires servo should fail.
        self.reboot(**kwargs)
        if self.wait_up(timeout=120):
            current_build_number = self.get_release_version()
            logging.info('servo host %s back from reboot, with build %s',
                         self.hostname, current_build_number)
        else:
            raise error.AutoservHostError(
                    'servo host %s failed to come back from reboot.' %
                    self.hostname)

    # NOTE(review): `status` is not re-read after the reboot above, so the
    # checks below use the value obtained before rebooting, together with
    # the refreshed `current_build_number`.
    if status in autoupdater.UPDATER_PROCESSING_UPDATE:
        logging.info('servo host %s already processing an update, update '
                     'engine client status=%s', self.hostname, status)
    elif current_build_number != target_build_number:
        logging.info('Using devserver url: %s to trigger update on '
                     'servo host %s, from %s to %s', url, self.hostname,
                     current_build_number, target_build_number)
        try:
            ds.stage_artifacts(target_build,
                               artifacts=['full_payload'])
        except Exception as e:
            # Best effort: a staging failure only postpones the update.
            logging.error('Staging artifacts failed: %s', str(e))
            logging.error('Abandoning update for this cycle.')
        else:
            # The payload is staged; start the update without waiting for
            # it to finish.
            try:
                updater.trigger_update()
            except autoupdater.RootFSUpdateError as e:
                trigger_download_status = 'failed with %s' % str(e)
                autotest_stats.Counter(
                        'servo_host.RootFSUpdateError').increment()
            else:
                trigger_download_status = 'passed'
            logging.info('Triggered download and update %s for %s, '
                         'update engine currently in status %s',
                         trigger_download_status, self.hostname,
                         updater.check_update_status())
    else:
        logging.info('servo host %s does not require an update.',
                     self.hostname)
def verify_software(self):
    """Update the servo host and verify it's in a good state.

    It overrides the base class function for verify_software.
    If an update is available, downloads and applies it. Then verifies:
    1) Whether basic servo command can run successfully.
    2) Whether USB is in a good state. crbug.com/225932

    A Servo object is cached on this host only after it has been
    initialized successfully. If initialization times out or raises, the
    cache stays empty, so the next verification initializes the servo
    again and get_servo() does not hand out an uninitialized object.

    @raises ServoHostVerifyFailure if servo host does not pass the checks.
    """
    logging.info('Applying an update to the servo host, if necessary.')
    self._update_image()
    self._check_servo_config()
    self._check_servod_status()

    # If servo is already initialized, we don't need to do it again, call
    # _check_servod should be enough.
    if self._servo:
        self._check_servod()
    else:
        # Keep the new object in a local until initialize_dut has
        # finished, so that a failure here cannot leave it in the cache.
        new_servo = servo.Servo(servo_host=self)
        timeout, _ = retry.timeout(
                new_servo.initialize_dut,
                timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS)
        if timeout:
            raise ServoHostVerifyFailure('Servo initialize timed out.')
        self._servo = new_servo

    logging.info('Sanity checks pass on servo host %s', self.hostname)
def _repair_with_sysrq_reboot(self):
    """Reboot the servo host with the magic SysRq key, then wait.

    After the reboot call returns, sleeps REBOOT_DELAY_SECS seconds to
    give servod time to become functional.
    """
    reboot_args = {
        'timeout': self.REBOOT_TIMEOUT_SECS,
        'label': None,
        'down_timeout': self.HOST_DOWN_TIMEOUT_SECS,
        'reboot_cmd': 'echo "b" > /proc/sysrq-trigger &',
        'fastsync': True,
    }
    self.reboot(**reboot_args)
    time.sleep(self.REBOOT_DELAY_SECS)
def has_power(self):
    """Tell whether the servo host is considered to be powered by PoE.

    @returns: The result of is_in_lab().
    """
    # TODO(fdeng): See crbug.com/302791
    # Until that is resolved, every servo host in the lab is assumed to
    # have power.
    return self.is_in_lab()
def power_cycle(self):
    """Cycle power to this host via PoE if it is a lab device.

    When has_power() is false, the power cycle is skipped and only a log
    message is written.

    @raises ServoHostRepairFailure if it fails to power cycle the
            servo host.
    """
    if not self.has_power():
        logging.info('Skipping power cycling, not a lab device.')
        return

    try:
        rpm_client.set_power(self.hostname, 'CYCLE')
    except (socket.error, xmlrpclib.Error,
            httplib.BadStatusLine,
            rpm_client.RemotePowerException) as e:
        raise ServoHostRepairFailure(
                'Power cycling %s failed: %s' % (self.hostname, e))
def _powercycle_to_repair(self):
    """Power cycle the servo host using PoE until it comes back up.

    Each power cycle is followed by a wait of up to REBOOT_TIMEOUT_SECS
    seconds for the host. After _MAX_POWER_CYCLE_ATTEMPTS failed waits the
    repair gives up. On success, sleeps REBOOT_DELAY_SECS seconds.

    @raises ServoHostRepairFailure if it fails to fix the servo host.
    @raises ServoHostRepairMethodNA if it does not support power.
    """
    if not self.has_power():
        raise ServoHostRepairMethodNA('%s does not support power.' %
                                      self.hostname)

    logging.info('Attempting repair via PoE powercycle.')
    failures = 0
    while True:
        self.power_cycle()
        if self.wait_up(timeout=self.REBOOT_TIMEOUT_SECS):
            break
        failures += 1
        if failures >= self._MAX_POWER_CYCLE_ATTEMPTS:
            raise ServoHostRepairFailure(
                    'Powercycled host %s %d times; device did not come back'
                    ' online.' % (self.hostname, failures))

    logging.info('Powercycling was successful after %d failures.',
                 failures)
    # Allow some time for servod to get started.
    time.sleep(self.REBOOT_DELAY_SECS)
def repair_full(self):
    """Attempt to repair servo host.

    This overrides the base class function for repair.
    Note if the host is not in Cros Lab, the repair procedure
    will be skipped. Otherwise each repair method is tried in turn,
    followed by verify(); the first one that passes ends the repair.

    @raises ServoHostRepairTotalFailure if all attempts fail.
    """
    if not self.is_in_lab():
        logging.warning('Skip repairing servo host %s: Not a lab device.',
                        self.hostname)
        return

    logging.info('Attempting to repair servo host %s.', self.hostname)
    # Dropping the cached Servo guarantees that the verification below
    # initializes the servo again.
    self._servo = None
    # TODO(dshi): add self._powercycle_to_repair back to repair_funcs
    # after crbug.com/336606 is fixed.
    repair_methods = (self._repair_with_sysrq_reboot,)
    failure_messages = []
    for method in repair_methods:
        stat_prefix = 'servo_host_repair.%s.' % method.__name__
        try:
            method()
            self.verify()
            autotest_stats.Counter(stat_prefix + 'SUCCEEDED').increment()
            return
        except ServoHostRepairMethodNA as e:
            logging.warning('Repair method NA: %s', e)
            autotest_stats.Counter(stat_prefix + 'RepairNA').increment()
            failure_messages.append(str(e))
        except Exception as e:
            logging.warning('Failed to repair servo: %s', e)
            autotest_stats.Counter(stat_prefix + 'FAILED').increment()
            failure_messages.append(str(e))

    total_failure_counter = autotest_stats.Counter(
            'servo_host_repair.Full_Repair_Failed')
    total_failure_counter.increment()
    raise ServoHostRepairTotalFailure(
            'All attempts at repairing the servo failed:\n%s' %
            '\n'.join(failure_messages))
def get_servo(self):
    """Give access to the servo.Servo object cached on this host.

    @return: The cached servo.Servo object; the cache holds None until a
             Servo has been stored in it.
    """
    return self._servo
def create_servo_host(dut, servo_args, try_lab_servo=False):
    """Create a ServoHost object.

    The `servo_args` parameter is a dictionary specifying optional
    Servo client parameter overrides (i.e. a specific host or port).
    When specified, the caller requires that an exception be raised
    unless both the ServoHost and the Servo are successfully
    created.

    There are three possible cases:
    1. If the DUT is in the Cros test lab then the ServoHost object
       is only created for the host in the lab.  Alternate host or
       port settings in `servo_host` will be ignored.
    2. When not case 1., but `servo_args` is not `None`, then create
       a ServoHost object using `servo_args`.
    3. Otherwise, return `None`.

    When the `try_lab_servo` parameter is false, it indicates that a
    ServoHost should not be created for a device in the Cros test
    lab.  The setting of `servo_args` takes precedence over the
    setting of `try_lab_servo`.

    On moblab, servo host and port attributes stored for the DUT in the
    AFE, when present, replace the `servo_args` that were passed in.

    @param dut: host name of the host that servo connects. It can be used to
                lookup the servo in test lab using naming convention.
    @param servo_args: A dictionary that contains args for creating
                       a ServoHost object,
                       e.g. {'servo_host': '172.11.11.111',
                             'servo_port': 9999}.
                       See comments above.
    @param try_lab_servo: Boolean. Whether to create ServoHost for a device
                          in test lab. See above.

    @returns: A ServoHost object or None. See comments above.
    """
    # Whether the caller asked for a servo is decided before `servo_args`
    # can be replaced below.
    required_by_test = servo_args is not None

    if utils.is_in_container():
        is_moblab = _CONFIG.get_config_value(
                'SSP', 'is_moblab', type=bool, default=False)
    else:
        is_moblab = utils.is_moblab()

    is_in_lab = False
    if is_moblab:
        # Servos on Moblab are not in the actual lab.
        afe = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10)
        hosts = afe.get_hosts(hostname=dut)
        attributes = hosts[0].attributes if hosts else {}
        if SERVO_HOST_ATTR in attributes:
            servo_args = {
                SERVO_HOST_ATTR: attributes[SERVO_HOST_ATTR],
                SERVO_PORT_ATTR: attributes.get(SERVO_PORT_ATTR, 9999),
            }
            if (utils.is_in_container() and
                servo_args[SERVO_HOST_ATTR] in ['localhost', '127.0.0.1']):
                servo_args[SERVO_HOST_ATTR] = _CONFIG.get_config_value(
                        'SSP', 'host_container_ip', type=str, default=None)
    elif not dnsname_mangler.is_ip_address(dut):
        # Only a DUT given by host name can be mapped to a lab servo.
        lab_servo_hostname = make_servo_hostname(dut)
        is_in_lab = utils.host_is_in_lab_zone(lab_servo_hostname)

    if not is_in_lab:
        if not required_by_test:
            return None
        return ServoHost(required_by_test=True, is_in_lab=False, **servo_args)

    if servo_args is None and not try_lab_servo:
        return None

    # Technically, this duplicates the SSH ping done early in the servo
    # proxy initialization code.  However, this ping ends in a couple
    # seconds when if fails, rather than the 60 seconds it takes to decide
    # that an SSH ping has timed out.  Specifically, that timeout happens
    # when our servo DNS name resolves, but there is no host at that IP.
    # TODO(dshi): crbug.com/380773 Remove this ping check once the bug is
    #             fixed. Autotest should not try to verify servo if servo is
    #             not required for the test.
    ping_config = ping_runner.PingConfig(
            lab_servo_hostname, count=3,
            ignore_result=True, ignore_status=True)
    logging.info('Pinging servo at %s', lab_servo_hostname)
    host_is_up = ping_runner.PingRunner().ping(ping_config).received > 0
    if not host_is_up:
        return None
    return ServoHost(servo_host=lab_servo_hostname, is_in_lab=is_in_lab,
                     required_by_test=required_by_test)