contrib/log_distiller.py - platform/external/autotest - Gitiles

 #!/usr/bin/python
 """
 Usage: ./cron_scripts/log_distiller.py job_id path_to_logfile
     If the job_id is a suite it will find all subjobs.
 You need to change the location of the log it will parse.
 The job_id needs to be in the afe database.
 """
 import abc
 import datetime
 import os
 import re
 import pprint
 import subprocess
 import sys
 import time

 import common
 from autotest_lib.server import frontend


 LOGFIE = './logs/scheduler.log.2014-04-17-16.51.47'
 # logfile name format: scheduler.log.2014-02-14-18.10.56
 time_format = '%Y-%m-%d-%H.%M.%S'
 logfile_regex = r'scheduler.log.([0-9,.,-]+)'
 logdir = os.path.join('/usr/local/autotest', 'logs')

 class StateMachineViolation(Exception):
     pass


 class LogLineException(Exception):
     pass


 def should_process_log(time_str, time_format, cutoff_days=7):
     """Returns true if the logs was created after cutoff days.

     @param time_str: A string representing the time.
         eg: 2014-02-14-18.10.56
     @param time_format: A string representing the format of the time string.
         ref: http://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
     @param cutoff_days: Int representind the cutoff in days.

     @return: Returns True if time_str has aged more than cutoff_days.
     """
     log_time = datetime.datetime.strptime(time_str, time_format)
     now = datetime.datetime.strptime(time.strftime(time_format), time_format)
     cutoff = now - datetime.timedelta(days=cutoff_days)
     return log_time < cutoff


 def apply_regex(regex, line):
     """Simple regex applicator.

     @param regex: Regex to apply.
     @param line: The line to apply regex on.

     @return: A tuple with the matching groups, if there was a match.
     """
     log_match  = re.match(regex, line)
     if log_match:
         return log_match.groups()


 class StateMachineParser(object):
     """Abstract class that enforces state transition ordering.

     Classes inheriting from StateMachineParser need to define an
     expected_transitions dictionary. The SMP will pop 'to' states
     from the dictionary as they occur, so you cannot same state transitions
     unless you specify 2 of them.
     """
     __metaclass__ = abc.ABCMeta


     @abc.abstractmethod
     def __init__(self):
         self.visited_states = []
         self.expected_transitions = {}


     def advance_state(self, from_state, to_state):
         """Checks that a transition is valid.

         @param from_state: A string representind the state the host is leaving.
         @param to_state: The state The host is going to, represented as a string.

         @raises LogLineException: If an invalid state transition was
             detected.
         """
         # TODO: Updating to the same state is a waste of bw.
         if from_state and from_state == to_state:
             return ('Updating to the same state is a waste of BW: %s->%s' %
                     (from_state, to_state))
             return

         if (from_state in self.expected_transitions and
             to_state in self.expected_transitions[from_state]):
             self.expected_transitions[from_state].remove(to_state)
             self.visited_states.append(to_state)
         else:
             return (from_state, to_state)


 class SingleJobHostSMP(StateMachineParser):
     def __init__(self):
         self.visited_states = []
         self.expected_transitions = {
                 'Ready': ['Resetting', 'Verifying', 'Pending', 'Provisioning'],
                 'Resetting': ['Ready', 'Provisioning'],
                 'Pending': ['Running'],
                 'Provisioning': ['Repairing'],
                 'Running': ['Ready']
         }


     def check_transitions(self, hostline):
         if hostline.line_info['field'] == 'status':
             self.advance_state(hostline.line_info['state'],
                     hostline.line_info['value'])


 class SingleJobHqeSMP(StateMachineParser):
     def __init__(self):
         self.visited_states = []
         self.expected_transitions = {
                 'Queued': ['Starting', 'Resetting', 'Aborted'],
                 'Resetting': ['Pending', 'Provisioning'],
                 'Provisioning': ['Pending', 'Queued', 'Repairing'],
                 'Pending': ['Starting'],
                 'Starting': ['Running'],
                 'Running': ['Gathering', 'Parsing'],
                 'Gathering': ['Parsing'],
                 'Parsing': ['Completed', 'Aborted']
         }


     def check_transitions(self, hqeline):
         invalid_states = self.advance_state(
                 hqeline.line_info['from_state'], hqeline.line_info['to_state'])
         if not invalid_states:
             return

         # Deal with repair.
         if (invalid_states[0] == 'Queued' and
             'Running' in self.visited_states):
             raise StateMachineViolation('Unrecognized state transition '
                     '%s->%s, expected transitions are %s' %
                     (invalid_states[0], invalid_states[1],
                      self.expected_transitions))


 class LogLine(object):
     """Line objects.

     All classes inheriting from LogLine represent a line of some sort.
     A line is responsible for parsing itself, and invoking an SMP to
     validate state transitions. A line can be part of several state machines.
     """
     line_format = '%s'


     def __init__(self, state_machine_parsers):
         """
         @param state_machine_parsers: A list of smp objects to use to validate
             state changes on these types of lines..
         """
         self.smps = state_machine_parsers

         # Because, this is easier to flush.
         self.line_info = {}


     def parse_line(self, line):
         """Apply a line regex and save any information the parsed line contains.

         @param line: A string representing a line.
         """
         # Regex for all the things.
         line_rgx = '(.*)'
         parsed_line = apply_regex(line_rgx, line)
         if parsed_line:
             self.line_info['line'] = parsed_line[0]


     def flush(self):
         """Call any state machine parsers, persist line info if needed.
         """
         for smp in self.smps:
             smp.check_transitions(self)
         # TODO: persist this?
         self.line_info={}


     def format_line(self):
         try:
             return self.line_format % self.line_info
         except KeyError:
             return self.line_info['line']


 class TimeLine(LogLine):
     """Filters timestamps for scheduler logs.
     """

     def parse_line(self, line):
         super(TimeLine, self).parse_line(line)

         # Regex for isolating the date and time from scheduler logs, eg:
         # 02/16 16:04:36.573 INFO |scheduler_:0574|...
         line_rgx = '([0-9,/,:,., ]+)(.*)'
         parsed_line = apply_regex(line_rgx, self.line_info['line'])
         if parsed_line:
             self.line_info['time'] = parsed_line[0]
             self.line_info['line'] = parsed_line[1]


 class HostLine(TimeLine):
     """Manages hosts line parsing.
     """
     line_format = (' \t\t %(time)s %(host)s, currently in %(state)s, '
                 'updated %(field)s->%(value)s')


     def record_state_transition(self, line):
         """Apply the state_transition_rgx to a line and record state changes.

         @param line: The line we're expecting to contain a state transition.
         """
         state_transition_rgx = ".* ([a-zA-Z]+) updating {'([a-zA-Z]+)': ('[a-zA-Z]+'|[0-9])}.*"
         match = apply_regex(state_transition_rgx, line)
         if match:
             self.line_info['state'] = match[0]
             self.line_info['field'] = match[1]
             self.line_info['value'] = match[2].replace("'", "")


     def parse_line(self, line):
         super(HostLine, self).parse_line(line)

         # Regex for getting host status. Eg:
         # 172.22.4 in Running updating {'status': 'Running'}
         line_rgx = '.*Host (([0-9,.,a-z,-]+).*)'
         parsed_line = apply_regex(line_rgx, self.line_info['line'])
         if parsed_line:
             self.line_info['line'] = parsed_line[0]
             self.line_info['host'] = parsed_line[1]
             self.record_state_transition(self.line_info['line'])
             return self.format_line()


 class HQELine(TimeLine):
     """Manages HQE line parsing.
     """
     line_format = ('%(time)s %(hqe)s, currently in %(from_state)s, '
             'updated to %(to_state)s. Flags: %(flags)s')


     def record_state_transition(self, line):
         """Apply the state_transition_rgx to a line and record state changes.

         @param line: The line we're expecting to contain a state transition.
         """
         # Regex for getting hqe status. Eg:
         # status:Running [active] -> Gathering
         state_transition_rgx = ".*status:([a-zA-Z]+)( \[[a-z\,]+\])? -> ([a-zA-Z]+)"
         match = apply_regex(state_transition_rgx, line)
         if match:
             self.line_info['from_state'] = match[0]
             self.line_info['flags'] = match[1]
             self.line_info['to_state'] = match[2]


     def parse_line(self, line):
         super(HQELine, self).parse_line(line)
         line_rgx = r'.*\| HQE: (([0-9]+).*)'
         parsed_line = apply_regex(line_rgx, self.line_info['line'])
         if parsed_line:
             self.line_info['line'] = parsed_line[0]
             self.line_info['hqe'] = parsed_line[1]
             self.record_state_transition(self.line_info['line'])
             return self.format_line()


 class LogCrawler(object):
     """Crawl logs.

     Log crawlers are meant to apply some basic preprocessing to a log, and crawl
     the output validating state changes. They manage line and state machine
     creation. The initial filtering applied to the log needs to be grab all lines
     that match an action, such as the running of a job.
     """

     def __init__(self, log_name):
         self.log = log_name
         self.filter_command = 'cat %s' % log_name


     def preprocess_log(self):
         """Apply some basic filtering to the log.
         """
         proc = subprocess.Popen(self.filter_command,
                 shell=True, stdout=subprocess.PIPE)
         out, err = proc.communicate()
         return out


 class SchedulerLogCrawler(LogCrawler):
     """A log crawler for the scheduler logs.

     This crawler is only capable of processing information about a single job.
     """

     def __init__(self, log_name, **kwargs):
         super(SchedulerLogCrawler, self).__init__(log_name)
         self.job_id = kwargs['job_id']
         self.line_processors = [HostLine([SingleJobHostSMP()]),
                 HQELine([SingleJobHqeSMP()])]
         self.filter_command = ('%s | grep "for job: %s"' %
                 (self.filter_command, self.job_id))


     def parse_log(self):
         """Parse each line of the preprocessed log output.

         Pass each line through each possible line_processor. The one that matches
         will populate itself, call flush, this will walk the state machine of that
         line to the next step.
         """
         out = self.preprocess_log()
         response = []
         for job_line in out.split('\n'):
             parsed_line = None
             for processor in self.line_processors:
                 line = processor.parse_line(job_line)
                 if line and parsed_line:
                     raise LogLineException('Multiple Parsers claiming the line %s: '
                             'previous parsing: %s, current parsing: %s ' %
                             (job_line, parsed_line, line))
                 elif line:
                     parsed_line = line
                     try:
                         processor.flush()
                     except StateMachineViolation as e:
                         response.append(str(e))
                         raise StateMachineViolation(response)
             response.append(parsed_line if parsed_line else job_line)
         return response


 def process_logs():
     if len(sys.argv) < 2:
         print ('Usage: ./cron_scripts/log_distiller.py 0 8415620 '
                'You need to change the location of the log it will parse.'
                 'The job_id needs to be in the afe database.')
         sys.exit(1)

     job_id = int(sys.argv[1])
     rpc = frontend.AFE()
     suite_jobs = rpc.run('get_jobs', id=job_id)
     if not suite_jobs[0]['parent_job']:
         suite_jobs = rpc.run('get_jobs', parent_job=job_id)
     try:
         logfile = sys.argv[2]
     except Exception:
         logfile = LOGFILE

     for job in suite_jobs:
         log_crawler = SchedulerLogCrawler(logfile, job_id=job['id'])
         for line in log_crawler.parse_log():
             print line
     return


 if __name__ == '__main__':
     process_logs()
	#!/usr/bin/python
	"""
	Usage: ./cron_scripts/log_distiller.py job_id path_to_logfile
	If the job_id is a suite it will find all subjobs.
	You need to change the location of the log it will parse.
	The job_id needs to be in the afe database.
	"""
	import abc
	import datetime
	import os
	import re
	import pprint
	import subprocess
	import sys
	import time

	import common
	from autotest_lib.server import frontend


	LOGFIE = './logs/scheduler.log.2014-04-17-16.51.47'
	# logfile name format: scheduler.log.2014-02-14-18.10.56
	time_format = '%Y-%m-%d-%H.%M.%S'
	logfile_regex = r'scheduler.log.([0-9,.,-]+)'
	logdir = os.path.join('/usr/local/autotest', 'logs')

	class StateMachineViolation(Exception):
	pass


	class LogLineException(Exception):
	pass


	def should_process_log(time_str, time_format, cutoff_days=7):
	"""Returns true if the logs was created after cutoff days.

	@param time_str: A string representing the time.
	eg: 2014-02-14-18.10.56
	@param time_format: A string representing the format of the time string.
	ref: http://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
	@param cutoff_days: Int representind the cutoff in days.

	@return: Returns True if time_str has aged more than cutoff_days.
	"""
	log_time = datetime.datetime.strptime(time_str, time_format)
	now = datetime.datetime.strptime(time.strftime(time_format), time_format)
	cutoff = now - datetime.timedelta(days=cutoff_days)
	return log_time < cutoff


	def apply_regex(regex, line):
	"""Simple regex applicator.

	@param regex: Regex to apply.
	@param line: The line to apply regex on.

	@return: A tuple with the matching groups, if there was a match.
	"""
	log_match = re.match(regex, line)
	if log_match:
	return log_match.groups()


	class StateMachineParser(object):
	"""Abstract class that enforces state transition ordering.

	Classes inheriting from StateMachineParser need to define an
	expected_transitions dictionary. The SMP will pop 'to' states
	from the dictionary as they occur, so you cannot same state transitions
	unless you specify 2 of them.
	"""
	__metaclass__ = abc.ABCMeta


	@abc.abstractmethod
	def __init__(self):
	self.visited_states = []
	self.expected_transitions = {}


	def advance_state(self, from_state, to_state):
	"""Checks that a transition is valid.

	@param from_state: A string representind the state the host is leaving.
	@param to_state: The state The host is going to, represented as a string.

	@raises LogLineException: If an invalid state transition was
	detected.
	"""
	# TODO: Updating to the same state is a waste of bw.
	if from_state and from_state == to_state:
	return ('Updating to the same state is a waste of BW: %s->%s' %
	(from_state, to_state))
	return

	if (from_state in self.expected_transitions and
	to_state in self.expected_transitions[from_state]):
	self.expected_transitions[from_state].remove(to_state)
	self.visited_states.append(to_state)
	else:
	return (from_state, to_state)


	class SingleJobHostSMP(StateMachineParser):
	def __init__(self):
	self.visited_states = []
	self.expected_transitions = {
	'Ready': ['Resetting', 'Verifying', 'Pending', 'Provisioning'],
	'Resetting': ['Ready', 'Provisioning'],
	'Pending': ['Running'],
	'Provisioning': ['Repairing'],
	'Running': ['Ready']
	}


	def check_transitions(self, hostline):
	if hostline.line_info['field'] == 'status':
	self.advance_state(hostline.line_info['state'],
	hostline.line_info['value'])


	class SingleJobHqeSMP(StateMachineParser):
	def __init__(self):
	self.visited_states = []
	self.expected_transitions = {
	'Queued': ['Starting', 'Resetting', 'Aborted'],
	'Resetting': ['Pending', 'Provisioning'],
	'Provisioning': ['Pending', 'Queued', 'Repairing'],
	'Pending': ['Starting'],
	'Starting': ['Running'],
	'Running': ['Gathering', 'Parsing'],
	'Gathering': ['Parsing'],
	'Parsing': ['Completed', 'Aborted']
	}


	def check_transitions(self, hqeline):
	invalid_states = self.advance_state(
	hqeline.line_info['from_state'], hqeline.line_info['to_state'])
	if not invalid_states:
	return

	# Deal with repair.
	if (invalid_states[0] == 'Queued' and
	'Running' in self.visited_states):
	raise StateMachineViolation('Unrecognized state transition '
	'%s->%s, expected transitions are %s' %
	(invalid_states[0], invalid_states[1],
	self.expected_transitions))


	class LogLine(object):
	"""Line objects.

	All classes inheriting from LogLine represent a line of some sort.
	A line is responsible for parsing itself, and invoking an SMP to
	validate state transitions. A line can be part of several state machines.
	"""
	line_format = '%s'


	def __init__(self, state_machine_parsers):
	"""
	@param state_machine_parsers: A list of smp objects to use to validate
	state changes on these types of lines..
	"""
	self.smps = state_machine_parsers

	# Because, this is easier to flush.
	self.line_info = {}


	def parse_line(self, line):
	"""Apply a line regex and save any information the parsed line contains.

	@param line: A string representing a line.
	"""
	# Regex for all the things.
	line_rgx = '(.*)'
	parsed_line = apply_regex(line_rgx, line)
	if parsed_line:
	self.line_info['line'] = parsed_line[0]


	def flush(self):
	"""Call any state machine parsers, persist line info if needed.
	"""
	for smp in self.smps:
	smp.check_transitions(self)
	# TODO: persist this?
	self.line_info={}


	def format_line(self):
	try:
	return self.line_format % self.line_info
	except KeyError:
	return self.line_info['line']


	class TimeLine(LogLine):
	"""Filters timestamps for scheduler logs.
	"""

	def parse_line(self, line):
	super(TimeLine, self).parse_line(line)

	# Regex for isolating the date and time from scheduler logs, eg:
	# 02/16 16:04:36.573 INFO \|scheduler_:0574\|...
	line_rgx = '([0-9,/,:,., ]+)(.*)'
	parsed_line = apply_regex(line_rgx, self.line_info['line'])
	if parsed_line:
	self.line_info['time'] = parsed_line[0]
	self.line_info['line'] = parsed_line[1]


	class HostLine(TimeLine):
	"""Manages hosts line parsing.
	"""
	line_format = (' \t\t %(time)s %(host)s, currently in %(state)s, '
	'updated %(field)s->%(value)s')


	def record_state_transition(self, line):
	"""Apply the state_transition_rgx to a line and record state changes.

	@param line: The line we're expecting to contain a state transition.
	"""
	state_transition_rgx = ".* ([a-zA-Z]+) updating {'([a-zA-Z]+)': ('[a-zA-Z]+'\|[0-9])}.*"
	match = apply_regex(state_transition_rgx, line)
	if match:
	self.line_info['state'] = match[0]
	self.line_info['field'] = match[1]
	self.line_info['value'] = match[2].replace("'", "")


	def parse_line(self, line):
	super(HostLine, self).parse_line(line)

	# Regex for getting host status. Eg:
	# 172.22.4 in Running updating {'status': 'Running'}
	line_rgx = '.Host (([0-9,.,a-z,-]+).)'
	parsed_line = apply_regex(line_rgx, self.line_info['line'])
	if parsed_line:
	self.line_info['line'] = parsed_line[0]
	self.line_info['host'] = parsed_line[1]
	self.record_state_transition(self.line_info['line'])
	return self.format_line()


	class HQELine(TimeLine):
	"""Manages HQE line parsing.
	"""
	line_format = ('%(time)s %(hqe)s, currently in %(from_state)s, '
	'updated to %(to_state)s. Flags: %(flags)s')


	def record_state_transition(self, line):
	"""Apply the state_transition_rgx to a line and record state changes.

	@param line: The line we're expecting to contain a state transition.
	"""
	# Regex for getting hqe status. Eg:
	# status:Running [active] -> Gathering
	state_transition_rgx = ".*status:([a-zA-Z]+)( \[[a-z\,]+\])? -> ([a-zA-Z]+)"
	match = apply_regex(state_transition_rgx, line)
	if match:
	self.line_info['from_state'] = match[0]
	self.line_info['flags'] = match[1]
	self.line_info['to_state'] = match[2]


	def parse_line(self, line):
	super(HQELine, self).parse_line(line)
	line_rgx = r'.\\| HQE: (([0-9]+).)'
	parsed_line = apply_regex(line_rgx, self.line_info['line'])
	if parsed_line:
	self.line_info['line'] = parsed_line[0]
	self.line_info['hqe'] = parsed_line[1]
	self.record_state_transition(self.line_info['line'])
	return self.format_line()


	class LogCrawler(object):
	"""Crawl logs.

	Log crawlers are meant to apply some basic preprocessing to a log, and crawl
	the output validating state changes. They manage line and state machine
	creation. The initial filtering applied to the log needs to be grab all lines
	that match an action, such as the running of a job.
	"""

	def __init__(self, log_name):
	self.log = log_name
	self.filter_command = 'cat %s' % log_name


	def preprocess_log(self):
	"""Apply some basic filtering to the log.
	"""
	proc = subprocess.Popen(self.filter_command,
	shell=True, stdout=subprocess.PIPE)
	out, err = proc.communicate()
	return out


	class SchedulerLogCrawler(LogCrawler):
	"""A log crawler for the scheduler logs.

	This crawler is only capable of processing information about a single job.
	"""

	def __init__(self, log_name, **kwargs):
	super(SchedulerLogCrawler, self).__init__(log_name)
	self.job_id = kwargs['job_id']
	self.line_processors = [HostLine([SingleJobHostSMP()]),
	HQELine([SingleJobHqeSMP()])]
	self.filter_command = ('%s \| grep "for job: %s"' %
	(self.filter_command, self.job_id))


	def parse_log(self):
	"""Parse each line of the preprocessed log output.

	Pass each line through each possible line_processor. The one that matches
	will populate itself, call flush, this will walk the state machine of that
	line to the next step.
	"""
	out = self.preprocess_log()
	response = []
	for job_line in out.split('\n'):
	parsed_line = None
	for processor in self.line_processors:
	line = processor.parse_line(job_line)
	if line and parsed_line:
	raise LogLineException('Multiple Parsers claiming the line %s: '
	'previous parsing: %s, current parsing: %s ' %
	(job_line, parsed_line, line))
	elif line:
	parsed_line = line
	try:
	processor.flush()
	except StateMachineViolation as e:
	response.append(str(e))
	raise StateMachineViolation(response)
	response.append(parsed_line if parsed_line else job_line)
	return response


	def process_logs():
	if len(sys.argv) < 2:
	print ('Usage: ./cron_scripts/log_distiller.py 0 8415620 '
	'You need to change the location of the log it will parse.'
	'The job_id needs to be in the afe database.')
	sys.exit(1)

	job_id = int(sys.argv[1])
	rpc = frontend.AFE()
	suite_jobs = rpc.run('get_jobs', id=job_id)
	if not suite_jobs[0]['parent_job']:
	suite_jobs = rpc.run('get_jobs', parent_job=job_id)
	try:
	logfile = sys.argv[2]
	except Exception:
	logfile = LOGFILE

	for job in suite_jobs:
	log_crawler = SchedulerLogCrawler(logfile, job_id=job['id'])
	for line in log_crawler.parse_log():
	print line
	return


	if __name__ == '__main__':
	process_logs()