blob: 98392240bca37bbe7b2f3ad9bb03b533201705d0 [file] [log] [blame]
J. Richard Barnetteea785362014-03-17 16:00:53 -07001import abc
2import datetime
3import glob
Simran Basi1e10e922015-04-16 15:09:56 -07004import json
J. Richard Barnetteea785362014-03-17 16:00:53 -07005import os
Dan Shicf4d2032015-03-12 15:04:21 -07006import re
Simran Basi1e10e922015-04-16 15:09:56 -07007import shutil
J. Richard Barnetteea785362014-03-17 16:00:53 -07008import time
9
10import common
Dan Shidfea3682014-08-10 23:38:40 -070011from autotest_lib.client.common_lib import time_utils
Dan Shi81800632015-09-29 12:16:48 -070012from autotest_lib.client.common_lib import utils
Simran Basi1e10e922015-04-16 15:09:56 -070013from autotest_lib.server.cros.dynamic_suite import constants
J. Richard Barnetteacdb0132014-09-03 16:44:12 -070014from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnetteea785362014-03-17 16:00:53 -070015
J. Richard Barnetteea785362014-03-17 16:00:53 -070016
Dan Shidfea3682014-08-10 23:38:40 -070017_AFE = frontend_wrappers.RetryingAFE()
J. Richard Barnetteea785362014-03-17 16:00:53 -070018
Dan Shi1b4c7c32015-10-05 10:38:57 -070019SPECIAL_TASK_PATTERN = '.*/hosts/[^/]+/(\d+)-[^/]+'
20JOB_PATTERN = '.*/(\d+)-[^/]+'
21
J. Richard Barnetteea785362014-03-17 16:00:53 -070022def _is_job_expired(age_limit, timestamp):
23 """Check whether a job timestamp is older than an age limit.
24
25 @param age_limit: Minimum age, measured in days. If the value is
26 not positive, the job is always expired.
27 @param timestamp: Timestamp of the job whose age we are checking.
Dan Shidfea3682014-08-10 23:38:40 -070028 The format must match time_utils.TIME_FMT.
J. Richard Barnetteea785362014-03-17 16:00:53 -070029
30 @returns True iff the job is old enough to be expired.
31 """
32 if age_limit <= 0:
33 return True
Dan Shidfea3682014-08-10 23:38:40 -070034 job_time = time_utils.time_string_to_datetime(timestamp)
J. Richard Barnetteea785362014-03-17 16:00:53 -070035 expiration = job_time + datetime.timedelta(days=age_limit)
36 return datetime.datetime.now() >= expiration
37
38
Dan Shicf4d2032015-03-12 15:04:21 -070039def get_job_id_or_task_id(result_dir):
40 """Extract job id or special task id from result_dir
41
42 @param result_dir: path to the result dir.
43 For test job:
44 /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
45 The hostname at the end is optional.
46 For special task:
47 /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
48
Dan Shi81800632015-09-29 12:16:48 -070049 @returns: integer representing the job id or task id. Returns None if fail
50 to parse job or task id from the result_dir.
Dan Shicf4d2032015-03-12 15:04:21 -070051 """
52 if not result_dir:
53 return
54 result_dir = os.path.abspath(result_dir)
Dan Shi81800632015-09-29 12:16:48 -070055 # Result folder for job running inside container has only job id.
56 ssp_job_pattern = '.*/(\d+)$'
Dan Shicf4d2032015-03-12 15:04:21 -070057 # Try to get the job ID from the last pattern of number-text. This avoids
58 # issue with path like 123-results/456-debug_user, in which 456 is the real
59 # job ID.
Dan Shi1b4c7c32015-10-05 10:38:57 -070060 m_job = re.findall(JOB_PATTERN, result_dir)
Dan Shi81800632015-09-29 12:16:48 -070061 if m_job:
62 return int(m_job[-1])
Dan Shi1b4c7c32015-10-05 10:38:57 -070063 m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
Dan Shi81800632015-09-29 12:16:48 -070064 if m_special_task:
65 return int(m_special_task.group(1))
66 m_ssp_job_pattern = re.match(ssp_job_pattern, result_dir)
67 if m_ssp_job_pattern and utils.is_in_container():
68 return int(m_ssp_job_pattern.group(1))
Dan Shicf4d2032015-03-12 15:04:21 -070069
70
J. Richard Barnetteea785362014-03-17 16:00:53 -070071class _JobDirectory(object):
72 """State associated with a job to be offloaded.
73
74 The full life-cycle of a job (including failure events that
75 normally don't occur) looks like this:
76 1. The job's results directory is discovered by
77 `get_job_directories()`, and a job instance is created for it.
78 2. Calls to `offload()` have no effect so long as the job
79 isn't complete in the database and the job isn't expired
80 according to the `age_limit` parameter.
81 3. Eventually, the job is both finished and expired. The next
82 call to `offload()` makes the first attempt to offload the
83 directory to GS. Offload is attempted, but fails to complete
84 (e.g. because of a GS problem).
85 4. After the first failed offload `is_offloaded()` is false,
86 but `is_reportable()` is also false, so the failure is not
87 reported.
88 5. Another call to `offload()` again tries to offload the
89 directory, and again fails.
90 6. After a second failure, `is_offloaded()` is false and
91 `is_reportable()` is true, so the failure generates an e-mail
92 notification.
93 7. Finally, a call to `offload()` succeeds, and the directory no
94 longer exists. Now `is_offloaded()` is true, so the job
95 instance is deleted, and future failures will not mention this
96 directory any more.
97
98 Only steps 1. and 7. are guaranteed to occur. The others depend
99 on the timing of calls to `offload()`, and on the reliability of
100 the actual offload process.
101
102 """
103
104 __metaclass__ = abc.ABCMeta
105
106 GLOB_PATTERN = None # must be redefined in subclass
107
108 def __init__(self, resultsdir):
109 self._dirname = resultsdir
Dan Shicf4d2032015-03-12 15:04:21 -0700110 self._id = get_job_id_or_task_id(resultsdir)
J. Richard Barnetteea785362014-03-17 16:00:53 -0700111 self._offload_count = 0
112 self._first_offload_start = 0
113
114 @classmethod
115 def get_job_directories(cls):
116 """Return a list of directories of jobs that need offloading."""
117 return [d for d in glob.glob(cls.GLOB_PATTERN) if os.path.isdir(d)]
118
119 @abc.abstractmethod
120 def get_timestamp_if_finished(self):
121 """Return this job's timestamp from the database.
122
123 If the database has not marked the job as finished, return
124 `None`. Otherwise, return a timestamp for the job. The
125 timestamp is to be used to determine expiration in
126 `_is_job_expired()`.
127
128 @return Return `None` if the job is still running; otherwise
129 return a string with a timestamp in the appropriate
130 format.
131 """
132 raise NotImplementedError("_JobDirectory.get_timestamp_if_finished")
133
134 def enqueue_offload(self, queue, age_limit):
135 """Enqueue the job for offload, if it's eligible.
136
137 The job is eligible for offloading if the database has marked
138 it finished, and the job is older than the `age_limit`
139 parameter.
140
141 If the job is eligible, offload processing is requested by
142 passing the `queue` parameter's `put()` method a sequence with
J. Richard Barnette2c72ddd2014-05-20 12:17:37 -0700143 the job's `_dirname` attribute and its directory name.
J. Richard Barnetteea785362014-03-17 16:00:53 -0700144
145 @param queue If the job should be offloaded, put the offload
146 parameters into this queue for processing.
147 @param age_limit Minimum age for a job to be offloaded. A value
148 of 0 means that the job will be offloaded as
149 soon as it is finished.
150
151 """
152 if not self._offload_count:
153 timestamp = self.get_timestamp_if_finished()
154 if not timestamp:
J. Richard Barnetteea785362014-03-17 16:00:53 -0700155 return
156 if not _is_job_expired(age_limit, timestamp):
J. Richard Barnetteea785362014-03-17 16:00:53 -0700157 return
158 self._first_offload_start = time.time()
J. Richard Barnetteea785362014-03-17 16:00:53 -0700159 self._offload_count += 1
Simran Basi1e10e922015-04-16 15:09:56 -0700160 if self.process_gs_instructions():
161 queue.put([self._dirname, os.path.dirname(self._dirname)])
J. Richard Barnetteea785362014-03-17 16:00:53 -0700162
163 def is_offloaded(self):
164 """Return whether this job has been successfully offloaded."""
165 return not os.path.exists(self._dirname)
166
167 def is_reportable(self):
168 """Return whether this job has a reportable failure."""
169 return self._offload_count > 1
170
171 def get_failure_time(self):
172 """Return the time of the first offload failure."""
173 return self._first_offload_start
174
175 def get_failure_count(self):
176 """Return the number of times this job has failed to offload."""
177 return self._offload_count
178
179 def get_job_directory(self):
180 """Return the name of this job's results directory."""
181 return self._dirname
182
Simran Basi1e10e922015-04-16 15:09:56 -0700183 def process_gs_instructions(self):
184 """Process any gs_offloader instructions for this special task.
185
186 @returns True/False if there is anything left to offload.
187 """
188 # Default support is to still offload the directory.
189 return True
190
J. Richard Barnetteea785362014-03-17 16:00:53 -0700191
192class RegularJobDirectory(_JobDirectory):
193 """Subclass of _JobDirectory for regular test jobs."""
194
195 GLOB_PATTERN = '[0-9]*-*'
196
Simran Basi1e10e922015-04-16 15:09:56 -0700197 def process_gs_instructions(self):
198 """Process any gs_offloader instructions for this job.
199
200 @returns True/False if there is anything left to offload.
201 """
202 # Go through the gs_offloader instructions file for each test in this job.
203 for path in glob.glob(os.path.join(self._dirname, '*',
204 constants.GS_OFFLOADER_INSTRUCTIONS)):
205 with open(path, 'r') as f:
206 gs_off_instructions = json.load(f)
207 if gs_off_instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
208 shutil.rmtree(os.path.dirname(path))
209
210 # Finally check if there's anything left to offload.
211 if not os.listdir(self._dirname):
212 shutil.rmtree(self._dirname)
213 return False
214 return True
215
216
J. Richard Barnetteea785362014-03-17 16:00:53 -0700217 def get_timestamp_if_finished(self):
Simran Basifb98e462014-08-18 12:35:44 -0700218 """Get the timestamp to use for finished jobs.
219
220 @returns the latest hqe finished_on time. If the finished_on times are null
221 returns the job's created_on time.
222 """
J. Richard Barnettedd0227d2015-04-10 15:18:48 -0700223 entry = _AFE.get_jobs(id=self._id, finished=True)
Simran Basifb98e462014-08-18 12:35:44 -0700224 if not entry:
225 return None
J. Richard Barnettedd0227d2015-04-10 15:18:48 -0700226 hqes = _AFE.get_host_queue_entries(finished_on__isnull=False,
227 job_id=self._id)
Simran Basifb98e462014-08-18 12:35:44 -0700228 if not hqes:
J. Richard Barnettedd0227d2015-04-10 15:18:48 -0700229 return entry[0].created_on
Simran Basifb98e462014-08-18 12:35:44 -0700230 # While most Jobs have 1 HQE, some can have multiple, so check them all.
J. Richard Barnettedd0227d2015-04-10 15:18:48 -0700231 return max([hqe.finished_on for hqe in hqes])
J. Richard Barnetteea785362014-03-17 16:00:53 -0700232
233
234class SpecialJobDirectory(_JobDirectory):
235 """Subclass of _JobDirectory for special (per-host) jobs."""
236
237 GLOB_PATTERN = 'hosts/*/[0-9]*-*'
238
239 def __init__(self, resultsdir):
240 super(SpecialJobDirectory, self).__init__(resultsdir)
J. Richard Barnetteea785362014-03-17 16:00:53 -0700241
242 def get_timestamp_if_finished(self):
J. Richard Barnettedd0227d2015-04-10 15:18:48 -0700243 entry = _AFE.get_special_tasks(id=self._id, is_complete=True)
244 return entry[0].time_finished if entry else None