blob: b1c04a91bef043e2434a35aa8c8f722abfbf5b39 [file] [log] [blame]
J. Richard Barnetteea785362014-03-17 16:00:53 -07001import abc
2import datetime
3import glob
Simran Basi1e10e922015-04-16 15:09:56 -07004import json
Allen Li7402f092018-06-26 15:42:21 -07005import logging
J. Richard Barnetteea785362014-03-17 16:00:53 -07006import os
Dan Shicf4d2032015-03-12 15:04:21 -07007import re
Simran Basi1e10e922015-04-16 15:09:56 -07008import shutil
J. Richard Barnetteea785362014-03-17 16:00:53 -07009
10import common
Dan Shidfea3682014-08-10 23:38:40 -070011from autotest_lib.client.common_lib import time_utils
Dan Shi81800632015-09-29 12:16:48 -070012from autotest_lib.client.common_lib import utils
Simran Basi1e10e922015-04-16 15:09:56 -070013from autotest_lib.server.cros.dynamic_suite import constants
J. Richard Barnetteacdb0132014-09-03 16:44:12 -070014from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnetteea785362014-03-17 16:00:53 -070015
Allen Li7402f092018-06-26 15:42:21 -070016try:
17 from chromite.lib import metrics
18except ImportError:
19 metrics = utils.metrics_mock
20
J. Richard Barnetteea785362014-03-17 16:00:53 -070021
# Regular expression matching the results directory of a special task, e.g.
# /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
# Group 1 captures the numeric task id.  Written as a raw string so that `\d`
# reaches the regex engine verbatim instead of being an invalid string escape.
SPECIAL_TASK_PATTERN = r'.*/hosts/[^/]+/(\d+)-[^/]+'
Dan Shi1b4c7c32015-10-05 10:38:57 -070023
def is_job_expired(age_limit, timestamp):
    """Tell whether a job's timestamp lies further back than an age limit.

    @param age_limit: Minimum age, measured in days.  A value that is
                      zero or negative makes every job count as expired.
    @param timestamp: Timestamp of the job being checked, in the format
                      of time_utils.TIME_FMT.

    @returns True iff the job is old enough to be expired.
    """
    if age_limit <= 0:
        # No parsing needed: a non-positive limit expires everything.
        return True
    finished_at = time_utils.time_string_to_datetime(timestamp)
    expires_at = finished_at + datetime.timedelta(days=age_limit)
    return expires_at <= datetime.datetime.now()
39
40
def get_job_id_or_task_id(result_dir):
    """Extract job id or special task id from result_dir.

    @param result_dir: path to the result dir.
            For test job:
            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
            The hostname at the end is optional.
            For special task:
            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
            For a job running inside a container, a path whose last
            component is just the numeric job id.
            For a swarming task, a path whose last component is
            swarming-<hexadecimal id>.

    @returns: str representing the job id or task id. Returns None if fail
            to parse job or task id from the result_dir.
    """
    if not result_dir:
        return None
    result_dir = os.path.abspath(result_dir)
    # Try to get the job ID from the last pattern of number-text. This avoids
    # issue with path like 123-results/456-debug_user, in which 456 is the real
    # job ID.
    m_job = re.findall(r'.*/(\d+)-[^/]+', result_dir)
    if m_job:
        return m_job[-1]
    m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
    if m_special_task:
        return m_special_task.group(1)
    # Result folder for job running inside container has only job id, so a
    # bare number is only trusted as an id when actually inside a container.
    m_ssp_job = re.match(r'.*/(\d+)$', result_dir)
    if m_ssp_job and utils.is_in_container():
        return m_ssp_job.group(1)
    m_swarming_task = re.match(r'.*/swarming-([0-9a-fA-F]+)$', result_dir)
    if m_swarming_task:
        return m_swarming_task.group(1)
    return None
Dan Shicf4d2032015-03-12 15:04:21 -070074
75
def get_job_folder_name(result_dir):
    """Extract folder name of a job from result_dir.

    @param result_dir: path to the result dir.
            For test job:
            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
            The hostname at the end is optional.
            For special task:
            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
            For a swarming task, a path whose last component is
            swarming-<hexadecimal id>.

    @returns: The name of the folder of a job. Returns None if directory path
            does not match supported job directories patterns.
    """
    if not result_dir:
        return None
    # Pattern of a job folder, e.g., 123-debug_user, where 123 is job id and
    # debug_user is the name of user starts the job.  The last match is used,
    # so for 123-results/456-debug_user the folder is 456-debug_user.
    test_job_pattern = r'.*/(\d+-[^/]+)'
    m_job = re.findall(test_job_pattern, result_dir)
    if m_job:
        return m_job[-1]
    m_swarming_task = re.match(r'.*/(swarming-[0-9a-fA-F]+)$', result_dir)
    if m_swarming_task:
        return m_swarming_task.group(1)
    return None
Dan Shiafa63872016-02-23 15:32:31 -0800100
101
class _JobDirectory(object):
    """Bookkeeping for one job results directory awaiting offload.

    The full life-cycle of a job (including failure events that
    normally don't occur) looks like this:
     1. `get_job_directories()` discovers the job's results directory,
        and a job instance is created for it.
     2. As long as the job isn't complete in the database and isn't
        expired according to the `age_limit` parameter, calls to
        `offload()` have no effect.
     3. Eventually, the job is both finished and expired.  The next
        call to `offload()` makes the first attempt to offload the
        directory to GS.  Offload is attempted, but fails to complete
        (e.g. because of a GS problem).
     4. Finally, a call to `offload()` succeeds, and the directory no
        longer exists.  Now `is_offloaded()` is true, so the job
        instance is deleted, and future failures will not mention this
        directory any more.

    Only steps 1. and 4. are guaranteed to occur.  The others depend
    on the timing of calls to `offload()`, and on the reliability of
    the actual offload process.

    NOTE(review): `offload()` and `is_offloaded()` are not defined on
    this class; presumably they belong to the offloader that uses it.

    """

    __metaclass__ = abc.ABCMeta

    # Glob used to discover candidate directories; every concrete
    # subclass has to supply its own value.
    GLOB_PATTERN = None

    def __init__(self, resultsdir):
        # Path of the results directory this instance describes.
        self.dirname = resultsdir
        # Job or special-task id parsed from the path (None if unparsable).
        self._id = get_job_id_or_task_id(resultsdir)
        # Offload bookkeeping, both starting at zero.
        self.offload_count = 0
        self.first_offload_start = 0

    @classmethod
    def get_job_directories(cls):
        """Return a list of directories of jobs that need offloading."""
        return list(filter(os.path.isdir, glob.glob(cls.GLOB_PATTERN)))

    @abc.abstractmethod
    def get_timestamp_if_finished(self):
        """Look up the timestamp of this job if it has finished.

        When the job has not been marked as finished, the result is
        `None`.  Otherwise the result is a timestamp for the job, meant
        to be fed to `is_job_expired()` to decide on expiration.

        @return `None` if the job is still running; otherwise a string
                with a timestamp in the appropriate format.
        """
        raise NotImplementedError('_JobDirectory.get_timestamp_if_finished')

    def process_gs_instructions(self):
        """Apply any gs_offloader instructions left for this directory.

        The base implementation handles no instructions, so the
        directory is always still to be offloaded.

        @returns True/False if there is anything left to offload.
        """
        return True
164
J. Richard Barnetteea785362014-03-17 16:00:53 -0700165
# Text of the note written into a log directory whose contents were removed
# instead of being offloaded.
NO_OFFLOAD_README = (
    'These results have been deleted rather than offloaded.\n'
    'This is the expected behavior for passing jobs from the Commit Queue.'
)
168
169
class RegularJobDirectory(_JobDirectory):
    """Job directory of a regular test job."""

    GLOB_PATTERN = '[0-9]*-*'

    def process_gs_instructions(self):
        """Apply the gs_offloader instructions found in this job.

        Every immediate subdirectory holding an instructions file is
        inspected; when the file asks for no offload, that
        subdirectory's logs are removed and replaced by a short note.

        @returns True/False if there is anything left to offload.
        """
        # One instructions file may exist per test of this job.
        instructions_glob = os.path.join(
                self.dirname, '*', constants.GS_OFFLOADER_INSTRUCTIONS)
        for instructions_path in glob.glob(instructions_glob):
            with open(instructions_path, 'r') as instructions_file:
                instructions = json.load(instructions_file)
            if not instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
                continue
            _remove_log_directory_contents(os.path.dirname(instructions_path))

        # An existing but empty job directory has nothing to offload, so
        # it is removed outright.
        is_empty_dir = (os.path.exists(self.dirname)
                        and not os.listdir(self.dirname))
        if not is_empty_dir:
            return True
        shutil.rmtree(self.dirname)
        return False

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        @returns None if the AFE reports no finished job with this id.
                 Otherwise the latest hqe finished_on time, or the job's
                 created_on time when no hqe has a finished_on time.
        """
        afe = _cached_afe()
        finished_jobs = afe.get_jobs(id=self._id, finished=True)
        if not finished_jobs:
            return None
        finished_hqes = afe.get_host_queue_entries(finished_on__isnull=False,
                                                   job_id=self._id)
        if finished_hqes:
            # While most Jobs have 1 HQE, some can have multiple, so the
            # latest finish time among all of them is used.
            return max(hqe.finished_on for hqe in finished_hqes)
        return finished_jobs[0].created_on
J. Richard Barnetteea785362014-03-17 16:00:53 -0700210
211
def _remove_log_directory_contents(dirpath):
    """Empty a log directory, leaving only an explanatory note behind.

    The directory is deleted and recreated, then a readme is written
    into it explaining what has happened to the logs.

    @param dirpath: Path to log directory.
    """
    shutil.rmtree(dirpath)
    os.mkdir(dirpath)
    readme_path = os.path.join(dirpath, 'logs-removed-readme.txt')
    with open(readme_path, 'w') as readme:
        readme.write(NO_OFFLOAD_README)
224
225
class SpecialJobDirectory(_JobDirectory):
    """Job directory of a special (per-host) job."""

    GLOB_PATTERN = 'hosts/*/[0-9]*-*'

    def __init__(self, resultsdir):
        super(SpecialJobDirectory, self).__init__(resultsdir)

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for a completed special task.

        @returns the time_finished of the first matching special task
                 reported as complete by the AFE, or None if there is
                 no such task.
        """
        completed_tasks = _cached_afe().get_special_tasks(
                id=self._id, is_complete=True)
        if not completed_tasks:
            return None
        return completed_tasks[0].time_finished
Prathmesh Prabhua4557032018-05-22 22:47:37 -0700237
238
# Name of the marker file looked up inside a swarming job's results
# directory; its contents are read as the job's timestamp.
_OFFLOAD_MARKER = '.ready_for_offload'
# Counter incremented whenever a marker file's contents fail to parse.
_marker_parse_error_metric = metrics.Counter(
        'chromeos/autotest/gs_offloader/offload_marker_parse_errors',
        description='Errors parsing the offload marker file')
243
244
class SwarmingJobDirectory(_JobDirectory):
    """Job directory of a Skylab swarming job."""

    # .../results/swarming-3e4391423c3a4311
    GLOB_PATTERN = 'swarming-[a-f0-9]*'

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        The timestamp comes from the offload marker file in the job
        directory, whose contents are parsed as an integer epoch time.

        @returns the marker's time converted to a date string, or None
                 if the marker file cannot be read or its contents
                 cannot be parsed.
        """
        marker_path = os.path.join(self.dirname, _OFFLOAD_MARKER)
        try:
            with open(marker_path) as marker_file:
                raw_timestamp = marker_file.read().strip()
        except (OSError, IOError) as e:
            logging.debug('Error opening %s for %s: %s',
                          _OFFLOAD_MARKER, self.dirname, e)
            return None
        try:
            return time_utils.epoch_time_to_date_string(int(raw_timestamp))
        except ValueError as e:
            # A readable but malformed marker is also counted in a metric.
            logging.debug('Error parsing %s for %s: %s',
                          _OFFLOAD_MARKER, self.dirname, e)
            _marker_parse_error_metric.increment()
            return None
273
274
# Module-wide AFE client, created on first use by _cached_afe().
_AFE = None


def _cached_afe():
    """Return the shared RetryingAFE instance, creating it on first call.

    @returns the frontend_wrappers.RetryingAFE object stored in _AFE.
    """
    global _AFE
    if _AFE is not None:
        return _AFE
    _AFE = frontend_wrappers.RetryingAFE()
    return _AFE