site_utils/job_directories.py - platform/external/autotest - Gitiles

 import abc
 import datetime
 import glob
 import json
 import logging
 import os
 import re
 import shutil

 import common
 from autotest_lib.client.common_lib import time_utils
 from autotest_lib.client.common_lib import utils
 from autotest_lib.server.cros.dynamic_suite import constants
 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

 try:
     from chromite.lib import metrics
 except ImportError:
     metrics = utils.metrics_mock


 # Regex matching the results directory of a special task, e.g.
 # .../hosts/chromeos1-rack5-host6/1343-cleanup; group 1 captures the task id.
 # Written as a raw string so that `\d` is not parsed as a string escape.
 SPECIAL_TASK_PATTERN = r'.*/hosts/[^/]+/(\d+)-[^/]+'

 def is_job_expired(age_limit, timestamp):
     """Decide whether a job has aged past the given limit.

     @param age_limit: Number of days a job must age before it expires.
                       Zero or a negative value expires every job
                       without looking at `timestamp`.
     @param timestamp: Timestamp of the job, as a string in the
                       time_utils.TIME_FMT format.

     @returns True if `age_limit` is not positive or at least `age_limit`
              days have passed since `timestamp`; False otherwise.
     """
     if age_limit <= 0:
         # A non-positive limit expires everything unconditionally.
         return True
     started = time_utils.time_string_to_datetime(timestamp)
     deadline = started + datetime.timedelta(days=age_limit)
     return deadline <= datetime.datetime.now()


 def get_job_id_or_task_id(result_dir):
     """Extract job id or special task id from result_dir.

     @param result_dir: path to the result dir.
             For test job:
             /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
             The hostname at the end is optional.
             For special task:
             /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
             For swarming task:
             .../results/swarming-3e4391423c3a4311

     @returns: str representing the job id or task id. Returns None if fail
         to parse job or task id from the result_dir.
     """
     if not result_dir:
         return None
     result_dir = os.path.abspath(result_dir)
     # Try to get the job ID from the last pattern of number-text. This avoids
     # issue with path like 123-results/456-debug_user, in which 456 is the real
     # job ID.
     m_job = re.findall(r'.*/(\d+)-[^/]+', result_dir)
     if m_job:
         return m_job[-1]
     m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
     if m_special_task:
         return m_special_task.group(1)
     # Result folder for job running inside container has only job id, so a
     # bare numeric folder is only accepted when running in a container.
     m_ssp_job = re.match(r'.*/(\d+)$', result_dir)
     if m_ssp_job and utils.is_in_container():
         return m_ssp_job.group(1)
     m_swarming_task = re.match(r'.*/swarming-([0-9a-fA-F]+)$', result_dir)
     if m_swarming_task:
         return m_swarming_task.group(1)
     return None


 def get_job_folder_name(result_dir):
     """Extract folder name of a job from result_dir.

     @param result_dir: path to the result dir.
             For test job:
             /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
             The hostname at the end is optional.
             For special task:
             /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
             For swarming task:
             .../results/swarming-3e4391423c3a4311

     @returns: The name of the folder of a job. Returns None if directory path
             does not match supported job directories patterns.
     """
     if not result_dir:
         return None
     # Pattern of a job folder, e.g., 123-debug_user, where 123 is job id and
     # debug_user is the name of the user who started the job.  The last match
     # is used, so 123-results/456-debug_user yields 456-debug_user.
     m_job = re.findall(r'.*/(\d+-[^/]+)', result_dir)
     if m_job:
         return m_job[-1]
     m_swarming_task = re.match(r'.*/(swarming-[0-9a-fA-F]+)$', result_dir)
     if m_swarming_task:
         return m_swarming_task.group(1)
     return None


 class _JobDirectory(object):
   """State associated with a job to be offloaded.

   The full life-cycle of a job (including failure events that
   normally don't occur) looks like this:
    1. The job's results directory is discovered by
       `get_job_directories()`, and a job instance is created for it.
    2. Calls to `offload()` have no effect so long as the job
       isn't complete in the database and the job isn't expired
       according to the `age_limit` parameter.
    3. Eventually, the job is both finished and expired.  The next
       call to `offload()` makes the first attempt to offload the
       directory to GS.  Offload is attempted, but fails to complete
       (e.g. because of a GS problem).
    4. Finally, a call to `offload()` succeeds, and the directory no
       longer exists.  Now `is_offloaded()` is true, so the job
       instance is deleted, and future failures will not mention this
       directory any more.

   Only steps 1. and 4. are guaranteed to occur.  The others depend
   on the timing of calls to `offload()`, and on the reliability of
   the actual offload process.

   """

   __metaclass__ = abc.ABCMeta

   GLOB_PATTERN = None   # must be redefined in subclass

   def __init__(self, resultsdir):
     self.dirname = resultsdir
     self._id = get_job_id_or_task_id(resultsdir)
     self.offload_count = 0
     self.first_offload_start = 0

   @classmethod
   def get_job_directories(cls):
     """Return a list of directories of jobs that need offloading."""
     return [d for d in glob.glob(cls.GLOB_PATTERN) if os.path.isdir(d)]

   @abc.abstractmethod
   def get_timestamp_if_finished(self):
     """Return this job's timestamp from the database.

     If the database has not marked the job as finished, return
     `None`.  Otherwise, return a timestamp for the job.  The
     timestamp is to be used to determine expiration in
     `is_job_expired()`.

     @return Return `None` if the job is still running; otherwise
             return a string with a timestamp in the appropriate
             format.
     """
     raise NotImplementedError("_JobDirectory.get_timestamp_if_finished")

   def process_gs_instructions(self):
     """Process any gs_offloader instructions for this special task.

     @returns True/False if there is anything left to offload.
     """
     # Default support is to still offload the directory.
     return True


 # Text of the note left in a log directory whose contents were deleted
 # instead of being offloaded (written by _remove_log_directory_contents()).
 NO_OFFLOAD_README = """These results have been deleted rather than offloaded.
 This is the expected behavior for passing jobs from the Commit Queue."""


 class RegularJobDirectory(_JobDirectory):
     """_JobDirectory specialization covering regular test jobs."""

     GLOB_PATTERN = '[0-9]*-*'

     def process_gs_instructions(self):
         """Apply the gs_offloader instructions found in this job's tests.

         Each immediate subdirectory may hold an instructions file.  When
         that file sets the no-offload flag, the subdirectory is cleared
         with _remove_log_directory_contents().  If the job directory
         exists and is empty afterwards, it is removed.

         @returns False if the job directory was empty and has been
                  removed; True if there may still be something to offload.
         """
         instruction_glob = os.path.join(
                 self.dirname, '*', constants.GS_OFFLOADER_INSTRUCTIONS)
         for instructions_path in glob.glob(instruction_glob):
             with open(instructions_path, 'r') as instructions_file:
                 instructions = json.load(instructions_file)
             if not instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
                 continue
             _remove_log_directory_contents(os.path.dirname(instructions_path))

         # Nothing is left to offload only when the directory exists and
         # has no entries; in that case remove the directory itself too.
         nothing_left = (os.path.exists(self.dirname)
                         and not os.listdir(self.dirname))
         if not nothing_left:
             return True
         shutil.rmtree(self.dirname)
         return False

     def get_timestamp_if_finished(self):
         """Get the timestamp to use for finished jobs.

         @returns None if the AFE does not report the job as finished.
                  Otherwise the latest hqe finished_on time, or the job's
                  created_on time if no hqe has a finished_on time.
         """
         finished_jobs = _cached_afe().get_jobs(id=self._id, finished=True)
         if not finished_jobs:
             return None
         finished_hqes = _cached_afe().get_host_queue_entries(
                 finished_on__isnull=False, job_id=self._id)
         if finished_hqes:
             # While most Jobs have 1 HQE, some can have multiple, so the
             # latest finish time across all of them is used.
             return max(hqe.finished_on for hqe in finished_hqes)
         return finished_jobs[0].created_on


 def _remove_log_directory_contents(dirpath):
     """Empty out a log directory, leaving only an explanatory note.

     The directory is deleted and recreated, then a readme file holding
     NO_OFFLOAD_README is written into it.

     @param dirpath: Path to log directory.
     """
     shutil.rmtree(dirpath)
     os.mkdir(dirpath)
     readme_path = os.path.join(dirpath, 'logs-removed-readme.txt')
     with open(readme_path, 'w') as readme:
         readme.write(NO_OFFLOAD_README)


 class SpecialJobDirectory(_JobDirectory):
     """_JobDirectory specialization for special (per-host) jobs."""

     GLOB_PATTERN = 'hosts/*/[0-9]*-*'

     def __init__(self, resultsdir):
         super(SpecialJobDirectory, self).__init__(resultsdir)

     def get_timestamp_if_finished(self):
         """Get the timestamp to use for a completed special task.

         @returns the time_finished of the first matching completed
                  special task, or None if the AFE returns none.
         """
         tasks = _cached_afe().get_special_tasks(id=self._id, is_complete=True)
         if not tasks:
             return None
         return tasks[0].time_finished


 # Name of the file inside a swarming results directory that
 # SwarmingJobDirectory reads to obtain the job's timestamp.
 _OFFLOAD_MARKER = ".ready_for_offload"
 # Counter incremented when the contents of an offload marker fail to parse.
 _marker_parse_error_metric = metrics.Counter(
     'chromeos/autotest/gs_offloader/offload_marker_parse_errors',
     description='Errors parsing the offload marker file')


 class SwarmingJobDirectory(_JobDirectory):
     """Subclass of _JobDirectory for Skylab swarming jobs."""

     # .../results/swarming-3e4391423c3a4311
     GLOB_PATTERN = 'swarming-[a-f0-9]*'

     def get_timestamp_if_finished(self):
         """Get the timestamp to use for finished jobs.

         The timestamp comes from the _OFFLOAD_MARKER file in the job
         directory, whose contents are parsed as an integer and converted
         with time_utils.epoch_time_to_date_string().

         @returns the converted marker timestamp, or None if the marker
                  file cannot be read or its contents raise ValueError.
         """
         marker_path = os.path.join(self.dirname, _OFFLOAD_MARKER)
         try:
             with open(marker_path) as f:
                 ts_string = f.read().strip()
         except (OSError, IOError):
             # No readable marker: report the job as not finished.
             return None
         try:
             ts = int(ts_string)
             return time_utils.epoch_time_to_date_string(ts)
         except ValueError as e:
             logging.debug('Error parsing %s for %s: %s',
                           _OFFLOAD_MARKER, self.dirname, e)
             _marker_parse_error_metric.increment()
             return None


 # Module-level AFE client, created on first use by _cached_afe().
 _AFE = None


 def _cached_afe():
     """Return the shared RetryingAFE instance, creating it on first call."""
     global _AFE
     if _AFE is not None:
         return _AFE
     _AFE = frontend_wrappers.RetryingAFE()
     return _AFE
	import abc
	import datetime
	import glob
	import json
	import logging
	import os
	import re
	import shutil

	import common
	from autotest_lib.client.common_lib import time_utils
	from autotest_lib.client.common_lib import utils
	from autotest_lib.server.cros.dynamic_suite import constants
	from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

	try:
	from chromite.lib import metrics
	except ImportError:
	metrics = utils.metrics_mock


	# Regex matching the results directory of a special task, e.g.
	# .../hosts/chromeos1-rack5-host6/1343-cleanup; group 1 captures the task id.
	# Written as a raw string so that `\d` is not parsed as a string escape.
	SPECIAL_TASK_PATTERN = r'.*/hosts/[^/]+/(\d+)-[^/]+'

	def is_job_expired(age_limit, timestamp):
	    """Check whether a job timestamp is older than an age limit.

	    @param age_limit: Minimum age, measured in days. If the value is
	                      not positive, the job is always expired.
	    @param timestamp: Timestamp of the job whose age we are checking.
	                      The format must match time_utils.TIME_FMT.

	    @returns True iff the job is old enough to be expired.
	    """
	    if age_limit <= 0:
	        # A non-positive limit expires everything unconditionally.
	        return True
	    job_time = time_utils.time_string_to_datetime(timestamp)
	    expiration = job_time + datetime.timedelta(days=age_limit)
	    return datetime.datetime.now() >= expiration


	def get_job_id_or_task_id(result_dir):
	    """Extract job id or special task id from result_dir.

	    @param result_dir: path to the result dir.
	            For test job:
	            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
	            The hostname at the end is optional.
	            For special task:
	            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
	            For swarming task:
	            .../results/swarming-3e4391423c3a4311

	    @returns: str representing the job id or task id. Returns None if fail
	        to parse job or task id from the result_dir.
	    """
	    if not result_dir:
	        return None
	    result_dir = os.path.abspath(result_dir)
	    # Try to get the job ID from the last pattern of number-text. This avoids
	    # issue with path like 123-results/456-debug_user, in which 456 is the real
	    # job ID.
	    m_job = re.findall(r'.*/(\d+)-[^/]+', result_dir)
	    if m_job:
	        return m_job[-1]
	    m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
	    if m_special_task:
	        return m_special_task.group(1)
	    # Result folder for job running inside container has only job id, so a
	    # bare numeric folder is only accepted when running in a container.
	    m_ssp_job = re.match(r'.*/(\d+)$', result_dir)
	    if m_ssp_job and utils.is_in_container():
	        return m_ssp_job.group(1)
	    m_swarming_task = re.match(r'.*/swarming-([0-9a-fA-F]+)$', result_dir)
	    if m_swarming_task:
	        return m_swarming_task.group(1)
	    return None


	def get_job_folder_name(result_dir):
	    """Extract folder name of a job from result_dir.

	    @param result_dir: path to the result dir.
	            For test job:
	            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
	            The hostname at the end is optional.
	            For special task:
	            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
	            For swarming task:
	            .../results/swarming-3e4391423c3a4311

	    @returns: The name of the folder of a job. Returns None if directory path
	            does not match supported job directories patterns.
	    """
	    if not result_dir:
	        return None
	    # Pattern of a job folder, e.g., 123-debug_user, where 123 is job id and
	    # debug_user is the name of the user who started the job.  The last match
	    # is used, so 123-results/456-debug_user yields 456-debug_user.
	    m_job = re.findall(r'.*/(\d+-[^/]+)', result_dir)
	    if m_job:
	        return m_job[-1]
	    m_swarming_task = re.match(r'.*/(swarming-[0-9a-fA-F]+)$', result_dir)
	    if m_swarming_task:
	        return m_swarming_task.group(1)
	    return None


	class _JobDirectory(object):
	"""State associated with a job to be offloaded.

	The full life-cycle of a job (including failure events that
	normally don't occur) looks like this:
	1. The job's results directory is discovered by
	`get_job_directories()`, and a job instance is created for it.
	2. Calls to `offload()` have no effect so long as the job
	isn't complete in the database and the job isn't expired
	according to the `age_limit` parameter.
	3. Eventually, the job is both finished and expired. The next
	call to `offload()` makes the first attempt to offload the
	directory to GS. Offload is attempted, but fails to complete
	(e.g. because of a GS problem).
	4. Finally, a call to `offload()` succeeds, and the directory no
	longer exists. Now `is_offloaded()` is true, so the job
	instance is deleted, and future failures will not mention this
	directory any more.

	Only steps 1. and 4. are guaranteed to occur. The others depend
	on the timing of calls to `offload()`, and on the reliability of
	the actual offload process.

	"""

	__metaclass__ = abc.ABCMeta

	GLOB_PATTERN = None # must be redefined in subclass

	def __init__(self, resultsdir):
	self.dirname = resultsdir
	self._id = get_job_id_or_task_id(resultsdir)
	self.offload_count = 0
	self.first_offload_start = 0

	@classmethod
	def get_job_directories(cls):
	"""Return a list of directories of jobs that need offloading."""
	return [d for d in glob.glob(cls.GLOB_PATTERN) if os.path.isdir(d)]

	@abc.abstractmethod
	def get_timestamp_if_finished(self):
	"""Return this job's timestamp from the database.

	If the database has not marked the job as finished, return
	`None`. Otherwise, return a timestamp for the job. The
	timestamp is to be used to determine expiration in
	`is_job_expired()`.

	@return Return `None` if the job is still running; otherwise
	return a string with a timestamp in the appropriate
	format.
	"""
	raise NotImplementedError("_JobDirectory.get_timestamp_if_finished")

	def process_gs_instructions(self):
	"""Process any gs_offloader instructions for this special task.

	@returns True/False if there is anything left to offload.
	"""
	# Default support is to still offload the directory.
	return True


	# Text of the note left in a log directory whose contents were deleted
	# instead of being offloaded (written by _remove_log_directory_contents()).
	NO_OFFLOAD_README = """These results have been deleted rather than offloaded.
	This is the expected behavior for passing jobs from the Commit Queue."""


	class RegularJobDirectory(_JobDirectory):
	    """Subclass of _JobDirectory for regular test jobs."""

	    # Matches job folders of the form <job id>-<user>, e.g. 2032-chromeos-test.
	    GLOB_PATTERN = '[0-9]*-*'

	    def process_gs_instructions(self):
	        """Process any gs_offloader instructions for this job.

	        Each immediate subdirectory may hold an instructions file.  When
	        that file sets the no-offload flag, the subdirectory is cleared
	        with _remove_log_directory_contents().  If the job directory
	        exists and is empty afterwards, it is removed.

	        @returns True/False if there is anything left to offload.
	        """
	        # Go through the gs_offloader instructions file for each test in
	        # this job.
	        for path in glob.glob(os.path.join(
	                self.dirname, '*', constants.GS_OFFLOADER_INSTRUCTIONS)):
	            with open(path, 'r') as f:
	                gs_off_instructions = json.load(f)
	            if gs_off_instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
	                dirname = os.path.dirname(path)
	                _remove_log_directory_contents(dirname)

	        # Finally check if there's anything left to offload.
	        if os.path.exists(self.dirname) and not os.listdir(self.dirname):
	            shutil.rmtree(self.dirname)
	            return False
	        return True

	    def get_timestamp_if_finished(self):
	        """Get the timestamp to use for finished jobs.

	        @returns None if the AFE does not report the job as finished.
	                 Otherwise the latest hqe finished_on time, or the job's
	                 created_on time if no hqe has a finished_on time.
	        """
	        entry = _cached_afe().get_jobs(id=self._id, finished=True)
	        if not entry:
	            return None
	        hqes = _cached_afe().get_host_queue_entries(finished_on__isnull=False,
	                                                    job_id=self._id)
	        if not hqes:
	            return entry[0].created_on
	        # While most Jobs have 1 HQE, some can have multiple, so check them all.
	        return max(hqe.finished_on for hqe in hqes)


	def _remove_log_directory_contents(dirpath):
	    """Remove log directory contents.

	    The directory is deleted and recreated, then a note (the text of
	    NO_OFFLOAD_README) is left explaining what has happened to the logs.

	    @param dirpath: Path to log directory.
	    """
	    shutil.rmtree(dirpath)
	    os.mkdir(dirpath)
	    breadcrumb_name = os.path.join(dirpath, 'logs-removed-readme.txt')
	    with open(breadcrumb_name, 'w') as f:
	        f.write(NO_OFFLOAD_README)


	class SpecialJobDirectory(_JobDirectory):
	    """Subclass of _JobDirectory for special (per-host) jobs."""

	    # Matches task folders such as hosts/chromeos1-rack5-host6/1343-cleanup.
	    GLOB_PATTERN = 'hosts/*/[0-9]*-*'

	    def __init__(self, resultsdir):
	        super(SpecialJobDirectory, self).__init__(resultsdir)

	    def get_timestamp_if_finished(self):
	        """Get the timestamp to use for a completed special task.

	        @returns the time_finished of the first matching completed
	                 special task, or None if the AFE returns none.
	        """
	        entry = _cached_afe().get_special_tasks(id=self._id, is_complete=True)
	        return entry[0].time_finished if entry else None


	# Name of the file inside a swarming results directory that
	# SwarmingJobDirectory reads to obtain the job's timestamp.
	_OFFLOAD_MARKER = ".ready_for_offload"
	# Counter incremented when the contents of an offload marker fail to parse.
	_marker_parse_error_metric = metrics.Counter(
	'chromeos/autotest/gs_offloader/offload_marker_parse_errors',
	description='Errors parsing the offload marker file')


	class SwarmingJobDirectory(_JobDirectory):
	    """Subclass of _JobDirectory for Skylab swarming jobs."""

	    # .../results/swarming-3e4391423c3a4311
	    GLOB_PATTERN = 'swarming-[a-f0-9]*'

	    def get_timestamp_if_finished(self):
	        """Get the timestamp to use for finished jobs.

	        The timestamp comes from the _OFFLOAD_MARKER file in the job
	        directory, whose contents are parsed as an integer and converted
	        with time_utils.epoch_time_to_date_string().

	        @returns the converted marker timestamp, or None if the marker
	                 file cannot be read or its contents raise ValueError.
	        """
	        marker_path = os.path.join(self.dirname, _OFFLOAD_MARKER)
	        try:
	            with open(marker_path) as f:
	                ts_string = f.read().strip()
	        except (OSError, IOError):
	            # No readable marker: report the job as not finished.
	            return None
	        try:
	            ts = int(ts_string)
	            return time_utils.epoch_time_to_date_string(ts)
	        except ValueError as e:
	            logging.debug('Error parsing %s for %s: %s',
	                          _OFFLOAD_MARKER, self.dirname, e)
	            _marker_parse_error_metric.increment()
	            return None


	# Module-level AFE client, created on first use by _cached_afe().
	_AFE = None


	def _cached_afe():
	    """Return the shared RetryingAFE instance, creating it on first call."""
	    global _AFE
	    if _AFE is None:
	        _AFE = frontend_wrappers.RetryingAFE()
	    return _AFE