contrib/sheriff_servo.py - platform/external/autotest - Gitiles

 #!/usr/bin/env python

 import collections
 import multiprocessing.pool
 import os
 import re
 import subprocess
 import sys
 import tempfile
 import urllib2

 import common
 from autotest_lib.server import frontend
 from autotest_lib.server import utils


 SERVERS = ['cautotest', 'cautotest-cq']
 RESULT_URL_FMT = 'http://%s/tko/retrieve_logs.cgi?job=/results'
 LOG_PATH_FMT = 'hosts/%s/%d-repair'
 AUTOSERV_DEBUG_LOG = 'debug/autoserv.DEBUG'
 GS_URI =  'gs://chromeos-autotest-results'

 REPAIR_FAILURE_PATTERN = re.compile('.*Failed to repair .*: (.*)')
 REPAIR_ACTION_PATTERN = re.compile('.*Attempting (.*)')
 NEW_LINE_PATTERN = re.compile('^\d\d/\d\d\s\d\d:\d\d:\d\d')
 COMMAND_PATTERN = re.compile('[^"]*("[^"]*").*')

 def ssh_command(host, cmd):
     return ['ssh', '-o PasswordAuthentication=no', '-o ConnectTimeout=1',
             '-o ConnectionAttempts=1', '-o StrictHostKeyChecking=no',
             '-q', 'root@'+host, '--', cmd]


 class Result(object):
     def __init__(self, host, ping, ssh, servod, logs):
         self.host = host
         self.ping = ping
         self.ssh = ssh
         self.servod = servod
         self.logs = logs


 class FailureReason(object):
     """Object used to store the detail of an attempt to repair.
     """
     def __init__(self, action, error, hostname):
         """Initialize the detail of an attempt to repair.

         @param action: Action taken to repair the DUT.
         @param error: Error message.
         @param hostname: Hostname of the repair failed DUT.
         """
         self.action = action
         self.error = error
         self.hostname = hostname
         self.key = self.get_key()


     def get_key(self):
         """Get a reason string that does not contain unnecessary information.

         @param reason: A string that explains repair failure.
         @param host: An attribute dictionary of host that failed to be repaired.
         @return: A simplified string that does not contain unnecessary
                  information, e.g., hostname etc.
         """
         # Merge error into a single line.
         error = self.error.replace('\n', ' ')
         # Remove repeated empty spaces.
         error = re.sub('\s\s+' , ' ', error)

         if '/usr/bin/ssh -a -x -o ControlPath=' in error:
             # Extract the command, and rebuild reason string.
             error = ('Failed to run command: %s' %
                      extract_match_text(error, COMMAND_PATTERN))

         reason = 'Action: %s. Error: %s' % (self.action, error)

         # Remove hostname from reason string.
         reason = reason.replace(self.hostname, '')

         # Trim reason string as some might be really long.
         reason = reason[:136]

         return reason


 class Failure(object):
     """Object used to store the details of why a host is failed to be repaired.
     """
     def __init__(self, server, host, repair_job_id):
         """Initialize details of failure

         @param server: Name of Autotest instance.
         @param host: CrosHost object of the DUT.
         @param repair_job_id is the job ID of last repair job.
         """
         self.host = host
         self.repair_job_id = repair_job_id
         log_path = LOG_PATH_FMT % (host['hostname'], repair_job_id)
         self.log_url = '%s/%s' % (RESULT_URL_FMT % server, log_path)

         self.reasons = self.get_repair_fail_reasons()


     @classmethod
     def get_debug_log(self, autoserv_log_url, autoserv_log_path):
         """Get a list of strings or a stream for autoserv.DEBUG log file.

         @param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
         @param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
                             hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
         @return: A list of string if debug log was moved to GS already, or a
                  stream of the autoserv.DEBUG file.
         """
         url = urllib2.urlopen(autoserv_log_url).geturl()
         if not 'accounts.google.com' in url:
             return urllib2.urlopen(url)
         else:
             # The file was moved to Google storage already, download it to /tmp
             debug_log_link = '%s/%s' % (GS_URI, autoserv_log_path)
             cmd = 'gsutil cat %s' % debug_log_link
             proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
             stdout, stderr = proc.communicate()
             if proc.returncode == 0:
                 return stdout.split(os.linesep)
             else:
                 print 'Failed to read %s: %s' % (debug_log_link, stderr)


     @classmethod
     def is_log_entry(self, line):
         """Check if the given line is a log entry started with a timestamp.

         @param line: A single line in the log file.
         @return: True if the line is a new log entry that starts with a
                  timestamp.
         """
         return NEW_LINE_PATTERN.match(line) != None


     @classmethod
     def get_action(self, line):
         """Parse the action used to repair DUT.

         @param line: A single line in the log file.
         @return: Action used to repair DUT, None if the line does not have such
                  information.
         """
         action = extract_match_text(line, REPAIR_ACTION_PATTERN)
         # Trim `to `, some repair attempt (RPM) does not use that word.
         if action and action.startswith('to '):
             action = action[3:]
         # Ignore the line `Attempting full repair`
         if action == 'full repair':
             action = None
         return action


     def get_repair_fail_reasons(self):
         """Get a list of reasons for repair failure.

         Read through autoserv.DEBUG log to find each attempt to repair the DUT
         and the reason it failed. The sequence in the log looks like:
         Attempting full repair
         Attempting to repair servo host chromeos2-row5-rack1-host4-servo.
         Failed to repair servo: Host did not return from reboot
         Attempting recover servo enabled device by powering it off and on
         Failed to repair device: DUT did not boot after long_press.
         Attempting recovery servo enabled device with servo_repair_reinstall
         Failed to repair device: Download image to usb failed.
         Attempting repair via RPM powercycle.
         Failed to repair device: Powercycled host 6 times; device did not...

         @param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
         @param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
                             hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
         @param hostname: Hostname of the repair failed DUT.
         @return: A list of reason strings.
         """
         hostname = self.host['hostname']
         log_path = LOG_PATH_FMT % (hostname, self.repair_job_id)
         autoserv_log_path = '%s/%s' % (log_path, AUTOSERV_DEBUG_LOG)
         autoserv_log_url = '%s/%s' % (self.log_url, AUTOSERV_DEBUG_LOG)

         # Parse log debug/autoserv.DEBUG to find all reasons that repair failed.
         reasons = []
         debug_log = self.get_debug_log(autoserv_log_url, autoserv_log_path)

         # Some message might take multiple lines.
         error = None
         action = None
         for line in debug_log:
             if self.is_log_entry(line):
                 if error:
                     reasons.append(FailureReason(action, error, hostname))
                     action = None
                 if not action:
                     action = self.get_action(line)
                 error = extract_match_text(line, REPAIR_FAILURE_PATTERN)
             elif error:
                 error += line
         if error:
             reasons.append(FailureReason(action, error, hostname))

         # Close the stream if debug_log is not a list of string read from
         # command 'gsutil cat'.
         if not isinstance(debug_log, list):
             debug_log.close()

         return reasons


     def log(self):
         """Print the details about why DUT was failed to be repaired.
         """
         print '\nExamining failure reason for %s ...' % self.host['hostname']
         print '    Log url: %s' % self.log_url
         print '    Repair failed reasons:'
         print '\n'.join(['        ' + reason.key for reason in self.reasons])


 def extract_match_text(line, pattern):
     """Extract the text information that matches the pattern.

     @param line: A single line in the log file.
     @return: Text that matches the given pattern, None if the line does not
              match the pattern.
     """
     m = pattern.match(line)
     if m:
         return m.group(1)
     else:
         return None


 def log_result(result):
     print "Examining %s ..." % result.host

     if result.ping:
         print "  PING = UP"
     else:
         print "  PING = DOWN\n"
         return

     if result.ssh:
         print "  SSH = UP"
     else:
         print "  SSH = DOWN\n"
         return

     print "  SERVOD = %s" % ('UP' if result.servod else 'DOWN',)
     print "  LOGS = \n%s" % (result.logs,)


 def check_servo(servo):
     r = Result(servo, None, None, None, None)

     r.ping = (utils.ping(servo, tries=5, deadline=5) == 0)
     if not r.ping:
         return r

     try:
         subprocess.check_output(ssh_command(servo, "true"))
     except subprocess.CalledProcessError:
         r.ssh = False
         return r
     else:
         r.ssh = True

     try:
         output = subprocess.check_output(ssh_command(servo, "pgrep servod"))
     except subprocess.CalledProcessError:
         r.servod = False
     else:
         r.servod = (output != "")

     try:
         output = subprocess.check_output(
             ssh_command(servo, "tail /var/log/servod.log"))
     except subprocess.CalledProcessError:
         r.logs = ""
     else:
         r.logs = output

     return r


 def get_last_repair_job(afe, host):
     """Get the job ID of last repair job.

     @param afe: rpc interface.
     @param host: Attribute dictionary of host that failed to be repaired.
     @return: Job ID of last repair job.
     """
     try:
         tasks = afe.run('get_special_tasks',
                         host__hostname=host['hostname'],
                         task='Repair',
                         time_started__isnull=False,
                         is_complete=True,
                         is_aborted=False,
                         sort_by=['-time_started'],
                         query_limit=1)
         return tasks[0]['id']
     except Exception as e:
         print ('Failed to get repair job id for host %s: %s' %
                (host['hostname'], e))
         return None


 def check_host_failure(input):
     """Check why host failed to be repaired.

     @param input: A dictionary of input arguments, i.e.,
                   {server, host, repair_job_id}, where:
                   server is the name of autotest instance
                   host is an attribute dictionary of host that failed to be
                       repaired.
                   repair_job_id is the job ID of last repair job.
     """
     return Failure(server=input['server'], host=input['host'],
                    repair_job_id=input['repair_job_id'])


 def redeploy_hdctools(host):
     try:
         subprocess.check_output(
             ssh_command(host, "/home/chromeos-test/hdctools/beaglebone/deploy"),
             stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError:
         return False
     else:
         return True


 def install_package(package):
     def curry(host):
         try:
             subprocess.check_output(
                 ssh_command(host, "apt-get install %s" % package),
                 stderr=subprocess.STDOUT)
             subprocess.check_output(
                 ssh_command(host, "start servod"),
                 stderr=subprocess.STDOUT)
         except subprocess.CalledProcessError:
             return False
         else:
             return True
     curry.__name__ = "install_package(%s)" % package
     return curry


 def manual_intervention(reason):
     def curry(_):
           return False

     curry.__name__ = 'MANUAL(%s)' % reason
     return curry


 Fix = collections.namedtuple('Fix', ['host', 'method', 'success'])

 # I don't know if these failures are one-time or repeating, so I'm adding code
 # here for now.  If these are seen and fixed by this frequently, then this code
 # should be moved into servo_host's repair()
 def diagnose_failure(r):
     method = None

     if r.logs and 'ImportError: Entry point' in r.logs:
         method = redeploy_hdctools

     if r.logs and 'ImportError: No module named serial' in r.logs:
         method = install_package('python-serial')

     if not r.ping or not r.ssh:
         method = manual_intervention('servo is unresponsive on network')

     if r.logs and 'No usb device connected to servo' in r.logs:
         method = manual_intervention("servo doesn't see USB drive")

     if r.logs and 'discover_servo - No servos found' in r.logs:
         method = manual_intervention("beaglebone doesn't see servo")

     if method:
         return Fix(r.host, method.__name__, method(r.host))
     else:
         return None


 def get_board(host):
     """Get name of the board from a host object.

     @param host: An attribute dictionary of host.
     @return: Name of the board, return None if board label is not found.
     """
     for label in host['labels']:
         if label.startswith('board:'):
             return label[6:]
     return None


 def main():
     pool = multiprocessing.pool.ThreadPool()
     all_results = []
     all_repair_failures = []

     for server in SERVERS:
         afe = frontend.AFE(server=server)
         hosts = afe.run('get_hosts', multiple_labels=['servo'],
                         status='Repair Failed')
         print ('%s: %s hosts with servo label are in Repair Failed.' %
                (server, len(hosts)))

         servos = [h['hostname']+'-servo.cros' for h in hosts]

         results = pool.imap_unordered(check_servo, servos)
         for result in results:
             log_result(result)
             all_results.append(result)

         inputs = [{'server': server, 'host': host,
                    'repair_job_id': get_last_repair_job(afe, host)}
                    for host in hosts]

         repair_failures = pool.imap_unordered(check_host_failure, inputs)
         for failure in repair_failures:
             failure.log()
             all_repair_failures.append(failure)

     failure_collection = []
     for failure in all_repair_failures:
         failure_collection.extend([(reason, get_board(failure.host))
                                    for reason in failure.reasons
                                    if 'servo' in reason.action])

     def compare_failure(f1, f2):
         return (cmp(f1[0].key, f2[0].key) or cmp(f1[1], f2[1]) or
                 cmp(f1[0].hostname, f2[0].hostname))
     failure_collection.sort(cmp=compare_failure)
     print '====================================='
     print 'DUTs failed to be repaired by servo:'
     print '====================================='
     current_key = None
     for (reason, board) in failure_collection:
         if reason.key != current_key:
             print reason.key
             current_key = reason.key
             current_board = None
         if board != current_board:
             print '    %s' % board
             current_board = board
         print '        %s' % reason.hostname

     # fix 'em if you can?
     fixes = filter(None, pool.imap_unordered(diagnose_failure, all_results))
     for fix in fixes:
       print ("Fixing %(host)s via %(method)s resulted in %(success)s" %
              dict(host=fix.host, method=fix.method,
                   success='SUCCESS' if fix.success else 'FAILURE'))
     return 0

 if __name__ == '__main__':
     sys.exit(main())
	#!/usr/bin/env python

	import collections
	import multiprocessing.pool
	import os
	import re
	import subprocess
	import sys
	import tempfile
	import urllib2

	import common
	from autotest_lib.server import frontend
	from autotest_lib.server import utils


	SERVERS = ['cautotest', 'cautotest-cq']
	RESULT_URL_FMT = 'http://%s/tko/retrieve_logs.cgi?job=/results'
	LOG_PATH_FMT = 'hosts/%s/%d-repair'
	AUTOSERV_DEBUG_LOG = 'debug/autoserv.DEBUG'
	GS_URI = 'gs://chromeos-autotest-results'

	REPAIR_FAILURE_PATTERN = re.compile('.Failed to repair .: (.*)')
	REPAIR_ACTION_PATTERN = re.compile('.Attempting (.)')
	NEW_LINE_PATTERN = re.compile('^\d\d/\d\d\s\d\d:\d\d:\d\d')
	COMMAND_PATTERN = re.compile('[^"]("[^"]").*')

	def ssh_command(host, cmd):
	return ['ssh', '-o PasswordAuthentication=no', '-o ConnectTimeout=1',
	'-o ConnectionAttempts=1', '-o StrictHostKeyChecking=no',
	'-q', 'root@'+host, '--', cmd]


	class Result(object):
	def __init__(self, host, ping, ssh, servod, logs):
	self.host = host
	self.ping = ping
	self.ssh = ssh
	self.servod = servod
	self.logs = logs


	class FailureReason(object):
	"""Object used to store the detail of an attempt to repair.
	"""
	def __init__(self, action, error, hostname):
	"""Initialize the detail of an attempt to repair.

	@param action: Action taken to repair the DUT.
	@param error: Error message.
	@param hostname: Hostname of the repair failed DUT.
	"""
	self.action = action
	self.error = error
	self.hostname = hostname
	self.key = self.get_key()


	def get_key(self):
	"""Get a reason string that does not contain unnecessary information.

	@param reason: A string that explains repair failure.
	@param host: An attribute dictionary of host that failed to be repaired.
	@return: A simplified string that does not contain unnecessary
	information, e.g., hostname etc.
	"""
	# Merge error into a single line.
	error = self.error.replace('\n', ' ')
	# Remove repeated empty spaces.
	error = re.sub('\s\s+' , ' ', error)

	if '/usr/bin/ssh -a -x -o ControlPath=' in error:
	# Extract the command, and rebuild reason string.
	error = ('Failed to run command: %s' %
	extract_match_text(error, COMMAND_PATTERN))

	reason = 'Action: %s. Error: %s' % (self.action, error)

	# Remove hostname from reason string.
	reason = reason.replace(self.hostname, '')

	# Trim reason string as some might be really long.
	reason = reason[:136]

	return reason


	class Failure(object):
	"""Object used to store the details of why a host is failed to be repaired.
	"""
	def __init__(self, server, host, repair_job_id):
	"""Initialize details of failure

	@param server: Name of Autotest instance.
	@param host: CrosHost object of the DUT.
	@param repair_job_id is the job ID of last repair job.
	"""
	self.host = host
	self.repair_job_id = repair_job_id
	log_path = LOG_PATH_FMT % (host['hostname'], repair_job_id)
	self.log_url = '%s/%s' % (RESULT_URL_FMT % server, log_path)

	self.reasons = self.get_repair_fail_reasons()


	@classmethod
	def get_debug_log(self, autoserv_log_url, autoserv_log_path):
	"""Get a list of strings or a stream for autoserv.DEBUG log file.

	@param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
	@param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
	hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
	@return: A list of string if debug log was moved to GS already, or a
	stream of the autoserv.DEBUG file.
	"""
	url = urllib2.urlopen(autoserv_log_url).geturl()
	if not 'accounts.google.com' in url:
	return urllib2.urlopen(url)
	else:
	# The file was moved to Google storage already, download it to /tmp
	debug_log_link = '%s/%s' % (GS_URI, autoserv_log_path)
	cmd = 'gsutil cat %s' % debug_log_link
	proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
	stderr=subprocess.PIPE)
	stdout, stderr = proc.communicate()
	if proc.returncode == 0:
	return stdout.split(os.linesep)
	else:
	print 'Failed to read %s: %s' % (debug_log_link, stderr)


	@classmethod
	def is_log_entry(self, line):
	"""Check if the given line is a log entry started with a timestamp.

	@param line: A single line in the log file.
	@return: True if the line is a new log entry that starts with a
	timestamp.
	"""
	return NEW_LINE_PATTERN.match(line) != None


	@classmethod
	def get_action(self, line):
	"""Parse the action used to repair DUT.

	@param line: A single line in the log file.
	@return: Action used to repair DUT, None if the line does not have such
	information.
	"""
	action = extract_match_text(line, REPAIR_ACTION_PATTERN)
	# Trim `to `, some repair attempt (RPM) does not use that word.
	if action and action.startswith('to '):
	action = action[3:]
	# Ignore the line `Attempting full repair`
	if action == 'full repair':
	action = None
	return action


	def get_repair_fail_reasons(self):
	"""Get a list of reasons for repair failure.

	Read through autoserv.DEBUG log to find each attempt to repair the DUT
	and the reason it failed. The sequence in the log looks like:
	Attempting full repair
	Attempting to repair servo host chromeos2-row5-rack1-host4-servo.
	Failed to repair servo: Host did not return from reboot
	Attempting recover servo enabled device by powering it off and on
	Failed to repair device: DUT did not boot after long_press.
	Attempting recovery servo enabled device with servo_repair_reinstall
	Failed to repair device: Download image to usb failed.
	Attempting repair via RPM powercycle.
	Failed to repair device: Powercycled host 6 times; device did not...

	@param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
	@param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
	hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
	@param hostname: Hostname of the repair failed DUT.
	@return: A list of reason strings.
	"""
	hostname = self.host['hostname']
	log_path = LOG_PATH_FMT % (hostname, self.repair_job_id)
	autoserv_log_path = '%s/%s' % (log_path, AUTOSERV_DEBUG_LOG)
	autoserv_log_url = '%s/%s' % (self.log_url, AUTOSERV_DEBUG_LOG)

	# Parse log debug/autoserv.DEBUG to find all reasons that repair failed.
	reasons = []
	debug_log = self.get_debug_log(autoserv_log_url, autoserv_log_path)

	# Some message might take multiple lines.
	error = None
	action = None
	for line in debug_log:
	if self.is_log_entry(line):
	if error:
	reasons.append(FailureReason(action, error, hostname))
	action = None
	if not action:
	action = self.get_action(line)
	error = extract_match_text(line, REPAIR_FAILURE_PATTERN)
	elif error:
	error += line
	if error:
	reasons.append(FailureReason(action, error, hostname))

	# Close the stream if debug_log is not a list of string read from
	# command 'gsutil cat'.
	if not isinstance(debug_log, list):
	debug_log.close()

	return reasons


	def log(self):
	"""Print the details about why DUT was failed to be repaired.
	"""
	print '\nExamining failure reason for %s ...' % self.host['hostname']
	print ' Log url: %s' % self.log_url
	print ' Repair failed reasons:'
	print '\n'.join([' ' + reason.key for reason in self.reasons])


	def extract_match_text(line, pattern):
	"""Extract the text information that matches the pattern.

	@param line: A single line in the log file.
	@return: Text that matches the given pattern, None if the line does not
	match the pattern.
	"""
	m = pattern.match(line)
	if m:
	return m.group(1)
	else:
	return None


	def log_result(result):
	print "Examining %s ..." % result.host

	if result.ping:
	print " PING = UP"
	else:
	print " PING = DOWN\n"
	return

	if result.ssh:
	print " SSH = UP"
	else:
	print " SSH = DOWN\n"
	return

	print " SERVOD = %s" % ('UP' if result.servod else 'DOWN',)
	print " LOGS = \n%s" % (result.logs,)


	def check_servo(servo):
	r = Result(servo, None, None, None, None)

	r.ping = (utils.ping(servo, tries=5, deadline=5) == 0)
	if not r.ping:
	return r

	try:
	subprocess.check_output(ssh_command(servo, "true"))
	except subprocess.CalledProcessError:
	r.ssh = False
	return r
	else:
	r.ssh = True

	try:
	output = subprocess.check_output(ssh_command(servo, "pgrep servod"))
	except subprocess.CalledProcessError:
	r.servod = False
	else:
	r.servod = (output != "")

	try:
	output = subprocess.check_output(
	ssh_command(servo, "tail /var/log/servod.log"))
	except subprocess.CalledProcessError:
	r.logs = ""
	else:
	r.logs = output

	return r


	def get_last_repair_job(afe, host):
	"""Get the job ID of last repair job.

	@param afe: rpc interface.
	@param host: Attribute dictionary of host that failed to be repaired.
	@return: Job ID of last repair job.
	"""
	try:
	tasks = afe.run('get_special_tasks',
	host__hostname=host['hostname'],
	task='Repair',
	time_started__isnull=False,
	is_complete=True,
	is_aborted=False,
	sort_by=['-time_started'],
	query_limit=1)
	return tasks[0]['id']
	except Exception as e:
	print ('Failed to get repair job id for host %s: %s' %
	(host['hostname'], e))
	return None


	def check_host_failure(input):
	"""Check why host failed to be repaired.

	@param input: A dictionary of input arguments, i.e.,
	{server, host, repair_job_id}, where:
	server is the name of autotest instance
	host is an attribute dictionary of host that failed to be
	repaired.
	repair_job_id is the job ID of last repair job.
	"""
	return Failure(server=input['server'], host=input['host'],
	repair_job_id=input['repair_job_id'])


	def redeploy_hdctools(host):
	try:
	subprocess.check_output(
	ssh_command(host, "/home/chromeos-test/hdctools/beaglebone/deploy"),
	stderr=subprocess.STDOUT)
	except subprocess.CalledProcessError:
	return False
	else:
	return True


	def install_package(package):
	def curry(host):
	try:
	subprocess.check_output(
	ssh_command(host, "apt-get install %s" % package),
	stderr=subprocess.STDOUT)
	subprocess.check_output(
	ssh_command(host, "start servod"),
	stderr=subprocess.STDOUT)
	except subprocess.CalledProcessError:
	return False
	else:
	return True
	curry.__name__ = "install_package(%s)" % package
	return curry


	def manual_intervention(reason):
	def curry(_):
	return False

	curry.__name__ = 'MANUAL(%s)' % reason
	return curry


	Fix = collections.namedtuple('Fix', ['host', 'method', 'success'])

	# I don't know if these failures are one-time or repeating, so I'm adding code
	# here for now. If these are seen and fixed by this frequently, then this code
	# should be moved into servo_host's repair()
	def diagnose_failure(r):
	method = None

	if r.logs and 'ImportError: Entry point' in r.logs:
	method = redeploy_hdctools

	if r.logs and 'ImportError: No module named serial' in r.logs:
	method = install_package('python-serial')

	if not r.ping or not r.ssh:
	method = manual_intervention('servo is unresponsive on network')

	if r.logs and 'No usb device connected to servo' in r.logs:
	method = manual_intervention("servo doesn't see USB drive")

	if r.logs and 'discover_servo - No servos found' in r.logs:
	method = manual_intervention("beaglebone doesn't see servo")

	if method:
	return Fix(r.host, method.__name__, method(r.host))
	else:
	return None


	def get_board(host):
	"""Get name of the board from a host object.

	@param host: An attribute dictionary of host.
	@return: Name of the board, return None if board label is not found.
	"""
	for label in host['labels']:
	if label.startswith('board:'):
	return label[6:]
	return None


	def main():
	pool = multiprocessing.pool.ThreadPool()
	all_results = []
	all_repair_failures = []

	for server in SERVERS:
	afe = frontend.AFE(server=server)
	hosts = afe.run('get_hosts', multiple_labels=['servo'],
	status='Repair Failed')
	print ('%s: %s hosts with servo label are in Repair Failed.' %
	(server, len(hosts)))

	servos = [h['hostname']+'-servo.cros' for h in hosts]

	results = pool.imap_unordered(check_servo, servos)
	for result in results:
	log_result(result)
	all_results.append(result)

	inputs = [{'server': server, 'host': host,
	'repair_job_id': get_last_repair_job(afe, host)}
	for host in hosts]

	repair_failures = pool.imap_unordered(check_host_failure, inputs)
	for failure in repair_failures:
	failure.log()
	all_repair_failures.append(failure)

	failure_collection = []
	for failure in all_repair_failures:
	failure_collection.extend([(reason, get_board(failure.host))
	for reason in failure.reasons
	if 'servo' in reason.action])

	def compare_failure(f1, f2):
	return (cmp(f1[0].key, f2[0].key) or cmp(f1[1], f2[1]) or
	cmp(f1[0].hostname, f2[0].hostname))
	failure_collection.sort(cmp=compare_failure)
	print '====================================='
	print 'DUTs failed to be repaired by servo:'
	print '====================================='
	current_key = None
	for (reason, board) in failure_collection:
	if reason.key != current_key:
	print reason.key
	current_key = reason.key
	current_board = None
	if board != current_board:
	print ' %s' % board
	current_board = board
	print ' %s' % reason.hostname

	# fix 'em if you can?
	fixes = filter(None, pool.imap_unordered(diagnose_failure, all_results))
	for fix in fixes:
	print ("Fixing %(host)s via %(method)s resulted in %(success)s" %
	dict(host=fix.host, method=fix.method,
	success='SUCCESS' if fix.success else 'FAILURE'))
	return 0

	if __name__ == '__main__':
	sys.exit(main())