blob: 51ead644af04f204c7513cb609fabcc459b85f2a [file] [log] [blame]
J. Richard Barnetteea785362014-03-17 16:00:53 -07001import abc
2import datetime
3import glob
Simran Basi1e10e922015-04-16 15:09:56 -07004import json
Allen Li7402f092018-06-26 15:42:21 -07005import logging
J. Richard Barnetteea785362014-03-17 16:00:53 -07006import os
Dan Shicf4d2032015-03-12 15:04:21 -07007import re
Simran Basi1e10e922015-04-16 15:09:56 -07008import shutil
J. Richard Barnetteea785362014-03-17 16:00:53 -07009
10import common
Dan Shidfea3682014-08-10 23:38:40 -070011from autotest_lib.client.common_lib import time_utils
Dan Shi81800632015-09-29 12:16:48 -070012from autotest_lib.client.common_lib import utils
Simran Basi1e10e922015-04-16 15:09:56 -070013from autotest_lib.server.cros.dynamic_suite import constants
J. Richard Barnetteacdb0132014-09-03 16:44:12 -070014from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnetteea785362014-03-17 16:00:53 -070015
Allen Li7402f092018-06-26 15:42:21 -070016try:
17 from chromite.lib import metrics
18except ImportError:
19 metrics = utils.metrics_mock
20
J. Richard Barnetteea785362014-03-17 16:00:53 -070021
# Regex matching the results directory of a special (per-host) task, e.g.
#   /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
# Group 1 captures the numeric task id.  Written as a raw string so that
# `\d` reaches the regex engine instead of being an invalid string escape.
SPECIAL_TASK_PATTERN = r'.*/hosts/[^/]+/(\d+)-[^/]+'
Dan Shi1b4c7c32015-10-05 10:38:57 -070023
def is_job_expired(age_limit, timestamp):
    """Tell whether a job has aged past the given limit.

    @param age_limit: Minimum age, measured in days, before a job counts
                      as expired.  If the value is not positive, the job
                      is always expired (the timestamp is not parsed).
    @param timestamp: Timestamp of the job whose age we are checking.
                      It is parsed by
                      time_utils.time_string_to_datetime(), so the format
                      must match time_utils.TIME_FMT.

    @returns True iff the job is old enough to be expired.
    """
    if age_limit > 0:
        finished_at = time_utils.time_string_to_datetime(timestamp)
        deadline = finished_at + datetime.timedelta(days=age_limit)
        # Compared against the local wall clock at the time of the call.
        return deadline <= datetime.datetime.now()
    return True
39
40
def get_job_id_or_task_id(result_dir):
    """Extract job id or special task id from result_dir

    @param result_dir: path to the result dir.
            For test job:
            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
            The hostname at the end is optional.
            For special task:
            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
            For a job running inside a container, where the folder name
            is only the job id (honored only if utils.is_in_container()):
            .../results/2032
            For a swarming task:
            .../results/swarming-3e4391423c3a4311

    @returns: str representing the job id or task id. Returns None if fail
              to parse job or task id from the result_dir.
    """
    if not result_dir:
        return None
    result_dir = os.path.abspath(result_dir)
    # Result folder for job running inside container has only job id.
    ssp_job_pattern = r'.*/(\d+)$'
    # Try to get the job ID from the last pattern of number-text. This avoids
    # issue with path like 123-results/456-debug_user, in which 456 is the real
    # job ID.
    m_job = re.findall(r'.*/(\d+)-[^/]+', result_dir)
    if m_job:
        return m_job[-1]
    m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
    if m_special_task:
        return m_special_task.group(1)
    m_ssp_job_pattern = re.match(ssp_job_pattern, result_dir)
    # The bare-number layout is only trusted when running in a container.
    if m_ssp_job_pattern and utils.is_in_container():
        return m_ssp_job_pattern.group(1)
    m_swarming_task = re.match(r'.*/swarming-([0-9a-fA-F]+)$', result_dir)
    if m_swarming_task:
        return m_swarming_task.group(1)
    return None
Dan Shicf4d2032015-03-12 15:04:21 -070074
75
class _JobDirectory(object):
    """Bookkeeping for one results directory that is to be offloaded.

    The full life-cycle of a job (including failure events that
    normally don't occur) looks like this:
     1. `get_job_directories()` discovers the job's results directory,
        and a job instance is created for it.
     2. While the job isn't complete in the database and isn't expired
        according to the `age_limit` parameter, calls to `offload()`
        have no effect.
     3. Eventually, the job is both finished and expired.  The next
        call to `offload()` makes the first attempt to offload the
        directory to GS.  Offload is attempted, but fails to complete
        (e.g. because of a GS problem).
     4. Finally, a call to `offload()` succeeds, and the directory no
        longer exists.  Now `is_offloaded()` is true, so the job
        instance is deleted, and future failures will not mention this
        directory any more.

    Only steps 1. and 4. are guaranteed to occur.  The others depend
    on the timing of calls to `offload()`, and on the reliability of
    the actual offload process.

    NOTE(review): `offload()` and `is_offloaded()` are not defined on
    this class; presumably they live in the code that drives the
    offload -- confirm.

    """

    __metaclass__ = abc.ABCMeta

    # Glob (relative to the current directory) that selects the results
    # directories handled by a concrete subclass.
    GLOB_PATTERN = None  # must be redefined in subclass

    def __init__(self, resultsdir):
        """Record the directory and derive its job/task id.

        @param resultsdir: Path to the results directory of the job.
        """
        # Offload bookkeeping starts zeroed; nothing in this class
        # updates these two fields.
        self.offload_count = 0
        self.first_offload_start = 0
        self.dirname = resultsdir
        # None when no id can be parsed from the path.
        self._id = get_job_id_or_task_id(resultsdir)

    @classmethod
    def get_job_directories(cls):
        """Return a list of directories of jobs that need offloading."""
        matches = glob.glob(cls.GLOB_PATTERN)
        # Plain files that happen to match the pattern are ignored.
        return list(filter(os.path.isdir, matches))

    @abc.abstractmethod
    def get_timestamp_if_finished(self):
        """Return this job's timestamp from the database.

        A job that the database has not marked as finished yields
        `None`.  Otherwise the result is a timestamp for the job, to be
        used to determine expiration in `is_job_expired()`.

        @return Return `None` if the job is still running; otherwise
                return a string with a timestamp in the appropriate
                format.
        """
        raise NotImplementedError("_JobDirectory.get_timestamp_if_finished")

    def process_gs_instructions(self):
        """Process any gs_offloader instructions for this special task.

        The base implementation handles no instructions at all.

        @returns True/False if there is anything left to offload.
        """
        # Default support is to still offload the directory.
        return True
138
J. Richard Barnetteea785362014-03-17 16:00:53 -0700139
# Text of the breadcrumb file written into a log directory whose contents
# were deleted instead of being offloaded.
NO_OFFLOAD_README = (
    'These results have been deleted rather than offloaded.\n'
    'This is the expected behavior for passing jobs from the Commit Queue.'
)
142
143
class RegularJobDirectory(_JobDirectory):
    """_JobDirectory specialization for regular test jobs."""

    GLOB_PATTERN = '[0-9]*-*'

    def process_gs_instructions(self):
        """Process any gs_offloader instructions for this job.

        Every immediate subdirectory of the job directory may carry an
        instructions file (constants.GS_OFFLOADER_INSTRUCTIONS, JSON).
        When that file sets constants.GS_OFFLOADER_NO_OFFLOAD, the
        subdirectory's contents are replaced by a readme.

        @returns True/False if there is anything left to offload.
        """
        # Go through the gs_offloader instructions file for each test in
        # this job.
        instructions_glob = os.path.join(
                self.dirname, '*', constants.GS_OFFLOADER_INSTRUCTIONS)
        for instructions_path in glob.glob(instructions_glob):
            with open(instructions_path, 'r') as instructions_file:
                instructions = json.load(instructions_file)
            if not instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
                continue
            _remove_log_directory_contents(
                    os.path.dirname(instructions_path))

        # Finally check if there's anything left to offload.
        if not os.path.exists(self.dirname) or os.listdir(self.dirname):
            return True
        # The job directory exists but is empty: drop it.
        shutil.rmtree(self.dirname)
        return False

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        @returns None if the AFE does not report the job as finished.
                 Otherwise the latest hqe finished_on time; if no hqe
                 has a finished_on time, the job's created_on time.
        """
        afe = _cached_afe()
        finished_jobs = afe.get_jobs(id=self._id, finished=True)
        if not finished_jobs:
            return None
        finished_hqes = afe.get_host_queue_entries(
                finished_on__isnull=False, job_id=self._id)
        if finished_hqes:
            # While most Jobs have 1 HQE, some can have multiple, so
            # check them all.
            return max(hqe.finished_on for hqe in finished_hqes)
        return finished_jobs[0].created_on
J. Richard Barnetteea785362014-03-17 16:00:53 -0700184
185
def _remove_log_directory_contents(dirpath):
    """Wipe a log directory, leaving only an explanatory readme behind.

    The directory tree is deleted and recreated empty, then a
    breadcrumb file holding NO_OFFLOAD_README is written into it to
    explain what has happened to the logs.

    @param dirpath: Path to log directory.
    """
    shutil.rmtree(dirpath)
    os.mkdir(dirpath)
    readme_path = os.path.join(dirpath, 'logs-removed-readme.txt')
    with open(readme_path, 'w') as readme:
        readme.write(NO_OFFLOAD_README)
198
199
class SpecialJobDirectory(_JobDirectory):
    """_JobDirectory specialization for special (per-host) jobs."""

    GLOB_PATTERN = 'hosts/*/[0-9]*-*'

    def __init__(self, resultsdir):
        """Initialize via the base class; nothing extra is stored.

        @param resultsdir: Path to the results directory of the task.
        """
        super(SpecialJobDirectory, self).__init__(resultsdir)

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for a completed special task.

        @returns the time_finished of the special task with this id if
                 the AFE reports it as complete; otherwise None.
        """
        completed = _cached_afe().get_special_tasks(id=self._id,
                                                    is_complete=True)
        if not completed:
            return None
        return completed[0].time_finished
Prathmesh Prabhua4557032018-05-22 22:47:37 -0700211
212
# Name of the file, inside a swarming results directory, whose contents
# SwarmingJobDirectory parses as the job's timestamp.
_OFFLOAD_MARKER = '.ready_for_offload'

# Incremented each time a marker file's contents fail to parse.
_marker_parse_error_metric = metrics.Counter(
        'chromeos/autotest/gs_offloader/offload_marker_parse_errors',
        description='Errors parsing the offload marker file')
217
218
class SwarmingJobDirectory(_JobDirectory):
    """Subclass of _JobDirectory for Skylab swarming jobs."""

    # .../results/swarming-3e4391423c3a4311
    GLOB_PATTERN = 'swarming-[a-f0-9]*'

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        Unlike the AFE-backed job types, the timestamp comes from the
        offload marker file (_OFFLOAD_MARKER) inside the results
        directory.  Its stripped contents are parsed as an integer and
        converted with time_utils.epoch_time_to_date_string()
        (presumably seconds since the epoch -- confirm against the
        marker's writer).

        @returns the marker's timestamp as a date string.  None if the
                 marker file cannot be opened or read, or if its
                 contents cannot be parsed; parse failures are logged
                 at debug level and counted in
                 _marker_parse_error_metric.
        """
        marker_path = os.path.join(self.dirname, _OFFLOAD_MARKER)
        try:
            with open(marker_path) as f:
                ts_string = f.read().strip()
        except (OSError, IOError):
            # No readable marker: report the job as not finished.
            return None
        try:
            ts = int(ts_string)
            return time_utils.epoch_time_to_date_string(ts)
        except ValueError as e:
            logging.debug('Error parsing %s for %s: %s',
                          _OFFLOAD_MARKER, self.dirname, e)
            _marker_parse_error_metric.increment()
            return None
245
246
# Lazily created AFE client shared by every caller of _cached_afe().
_AFE = None


def _cached_afe():
    """Return the module-wide RetryingAFE, creating it on first use."""
    global _AFE
    if _AFE is not None:
        return _AFE
    _AFE = frontend_wrappers.RetryingAFE()
    return _AFE