blob: 78173fc36a759f2e484c5b5337d09ca3624dd259 [file] [log] [blame]
Scott Zawalski20a9b582011-11-21 11:49:40 -08001#!/usr/bin/python
2#
Scott Zawalskicb2e2b72012-04-17 12:10:05 -04003# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Scott Zawalski20a9b582011-11-21 11:49:40 -08004# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
J. Richard Barnetteea785362014-03-17 16:00:53 -07007"""Script to archive old Autotest results to Google Storage.
Scott Zawalski20a9b582011-11-21 11:49:40 -08008
J. Richard Barnetteea785362014-03-17 16:00:53 -07009Uses gsutil to archive files to the configured Google Storage bucket.
10Upon successful copy, the local results directory is deleted.
Scott Zawalski20a9b582011-11-21 11:49:40 -080011"""
12
Simran Basibd9ded02013-11-04 15:49:11 -080013import datetime
Simran Basi9523eaa2012-06-28 17:18:45 -070014import logging
Scott Zawalski20a9b582011-11-21 11:49:40 -080015import os
Scott Zawalski20a9b582011-11-21 11:49:40 -080016import shutil
Simran Basi9523eaa2012-06-28 17:18:45 -070017import signal
Simran Basi981a9272012-11-14 10:46:03 -080018import socket
Scott Zawalski20a9b582011-11-21 11:49:40 -080019import subprocess
20import sys
Simran Basi9523eaa2012-06-28 17:18:45 -070021import tempfile
22import time
Scott Zawalskicb2e2b72012-04-17 12:10:05 -040023
Simran Basi7d9a1492012-10-25 13:51:54 -070024from optparse import OptionParser
25
Simran Basi981a9272012-11-14 10:46:03 -080026import common
27
Alex Millerc900b342014-06-09 16:52:07 -070028try:
29 # Does not exist, nor is needed, on moblab.
30 import psutil
31except ImportError:
32 psutil = None
33
J. Richard Barnetteea785362014-03-17 16:00:53 -070034import job_directories
Simran Basi981a9272012-11-14 10:46:03 -080035from autotest_lib.client.common_lib import global_config
36from autotest_lib.scheduler import email_manager
Fang Deng970b6a72013-04-09 11:59:16 -070037from chromite.lib import parallel
Scott Zawalski20a9b582011-11-21 11:49:40 -080038
# Google Storage bucket URI to store results in.
GS_URI = global_config.global_config.get_config_value(
        'CROS', 'results_storage_server')
# Template for a destination URI: the bucket URI followed directly by
# the destination path (filled in by get_cmd_list()).
GS_URI_PATTERN = GS_URI + '%s'

# Nice setting for process, the higher the number the lower the priority.
NICENESS = 10

# Maximum number of seconds to allow for offloading a single
# directory.
OFFLOAD_TIMEOUT_SECS = 60 * 60

# Sleep time per loop.
SLEEP_TIME_SECS = 5

# Minimum number of seconds between e-mail reports.
REPORT_INTERVAL_SECS = 60 * 60

# Location of Autotest results on disk.
RESULTS_DIR = '/usr/local/autotest/results'

# Hosts sub-directory that contains cleanup, verify and repair jobs.
HOSTS_SUB_DIR = 'hosts'

# Directory that receives the offloader's log files.
LOG_LOCATION = '/usr/local/autotest/logs/'
# Log file name template; main() fills in the offloader type
# ('all', 'hosts' or 'jobs') and a start-up timestamp.
LOG_FILENAME_FORMAT = 'gs_offloader_%s_log_%s.txt'
# time.strftime() format for the timestamp in the log file name.
LOG_TIMESTAMP_FORMAT = '%Y%m%d_%H%M%S'
# Record format handed to logging.basicConfig().
LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# Recipient of the failure reports sent by report_offload_failures();
# falls back to the empty string when the setting is absent.
# pylint: disable=E1120
NOTIFY_ADDRESS = global_config.global_config.get_config_value(
        'SCHEDULER', 'notify_email', default='')

# Subject line of the failure report; '%s' is the local host name.
ERROR_EMAIL_SUBJECT_FORMAT = 'GS Offloader notifications from %s'
# Fixed header of the failure report; one line per failed job
# (see ERROR_EMAIL_DIRECTORY_FORMAT) is appended after it.
ERROR_EMAIL_REPORT_FORMAT = '''\
gs_offloader is failing to offload results directories.

First failure Count Directory name
=================== ====== ==============================
'''
# --+----1----+---- ----+ ----+----1----+----2----+----3

# Per-job report line: first failure time, failure count, directory.
ERROR_EMAIL_DIRECTORY_FORMAT = '%19s %5d %-1s\n'
# strftime() format for the first-failure time column.
ERROR_EMAIL_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
Simran Basi9523eaa2012-06-28 17:18:45 -070083
class TimeoutException(Exception):
    """Signals that an offload attempt exceeded its time limit.

    Raised by `timeout_handler` when SIGALRM is delivered.
    """
87
88
def timeout_handler(_signum, _frame):
    """Signal handler that turns SIGALRM into a TimeoutException.

    Both arguments are supplied by the signal machinery and are
    ignored here.

    @param _signum: Number of the signal that was caught (14 for
                    SIGALRM).
    @param _frame: Stack frame that was executing when the signal
                   arrived.
    @raise TimeoutException: Always, so that the try/except wrapped
                             around the Popen call sees the timeout.

    """
    timed_out = TimeoutException('Process Timed Out')
    raise timed_out
100
101
def get_cmd_list(dir_entry, relative_path):
    """Build the gsutil command line that uploads one directory.

    @param dir_entry: Local directory entry/path to be offloaded.
    @param relative_path: Location in google storage, relative to the
                          configured bucket, that receives the
                          directory.

    @return: The command as an argument list suitable for Popen.

    """
    command = ['gsutil', '-m', 'cp', '-eR', '-a', 'project-private']
    command.append(dir_entry)
    command.append(GS_URI_PATTERN % relative_path)
    return command
Simran Basi9523eaa2012-06-28 17:18:45 -0700115
116
def offload_dir(dir_entry, dest_path):
    """Offload the specified directory entry to Google storage.

    Runs the command from `get_cmd_list()` with its output captured in
    temporary files.  If the command exits with status 0 the local
    directory is deleted.  If it exits non-zero, or does not finish
    within `OFFLOAD_TIMEOUT_SECS`, the captured output is logged and
    the directory is left in place.

    The time limit relies on a SIGALRM handler that raises
    `TimeoutException` (see `timeout_handler`).

    @param dir_entry: Directory entry to offload.
    @param dest_path: Location in google storage where we will offload
                      the directory.

    """
    # State is initialised and the capture files are opened before the
    # `try`, so the cleanup below can never touch an unbound name and
    # the files are closed on every exit path.
    error = False
    process = None
    with tempfile.TemporaryFile('w+') as stdout_file, \
            tempfile.TemporaryFile('w+') as stderr_file:
        try:
            signal.alarm(OFFLOAD_TIMEOUT_SECS)
            process = subprocess.Popen(get_cmd_list(dir_entry, dest_path),
                                       stdout=stdout_file,
                                       stderr=stderr_file)
            process.wait()
            # Cancel the alarm before the delete so the timeout only
            # ever covers the upload itself.
            signal.alarm(0)
            if process.returncode == 0:
                shutil.rmtree(dir_entry)
            else:
                error = True
        except TimeoutException:
            # If we finished the call to Popen(), we may need to
            # terminate the child process.  We don't bother calling
            # process.poll(); that inherently races because the child
            # can die any time it wants.
            if process is not None:
                try:
                    process.terminate()
                except OSError:
                    # We don't expect any error other than "No such
                    # process".
                    pass
            logging.error('Offloading %s timed out after waiting %d seconds.',
                          dir_entry, OFFLOAD_TIMEOUT_SECS)
            error = True
        finally:
            # Make sure no alarm stays pending, whatever happened above.
            signal.alarm(0)
            if error:
                # Rewind the log files for stdout and stderr and log
                # their contents.
                stdout_file.seek(0)
                stderr_file.seek(0)
                logging.error('Error occurred when offloading %s:', dir_entry)
                logging.error('Stdout:\n%s \nStderr:\n%s',
                              stdout_file.read(), stderr_file.read())
165
Scott Zawalski20a9b582011-11-21 11:49:40 -0800166
def delete_files(dir_entry, dest_path):
    """Remove `dir_entry` from the local filesystem without uploading.

    Takes the same arguments as `offload_dir` so that it can be used in
    its place on systems that only want to delete results instead of
    offloading them.

    @param dir_entry: Directory entry to delete.
    @param dest_path: NOT USED.
    """
    # pylint: disable=unused-argument
    shutil.rmtree(dir_entry, ignore_errors=False)
177
178
def report_offload_failures(joblist):
    """Send an e-mail report describing failed offloads.

    Every job in `joblist` contributes one line holding its first
    failure time, failure count and directory; the lines are sorted
    before being appended to the report header.

    @param joblist List of jobs to be reported in the message.

    """
    def _report_line(job):
        first_failure = datetime.datetime.fromtimestamp(
                job.get_failure_time())
        return ERROR_EMAIL_DIRECTORY_FORMAT % (
                first_failure.strftime(ERROR_EMAIL_TIME_FORMAT),
                job.get_failure_count(),
                job.get_job_directory())

    report_lines = sorted([_report_line(job) for job in joblist])
    subject = ERROR_EMAIL_SUBJECT_FORMAT % socket.gethostname()
    body = ERROR_EMAIL_REPORT_FORMAT + ''.join(report_lines)
    email_manager.manager.send_email(NOTIFY_ADDRESS, subject, body)
Simran Basi9523eaa2012-06-28 17:18:45 -0700199
Scott Zawalski20a9b582011-11-21 11:49:40 -0800200
class Offloader(object):
    """State of the offload process.

    Contains the following member fields:
      * _offload_func:  Function to call for each attempt to offload
        a job directory.
      * _jobdir_classes:  List of classes of job directory to be
        offloaded.
      * _processes:  Maximum number of outstanding offload processes
        to allow during an offload cycle.
      * _age_limit:  Minimum age in days at which a job may be
        offloaded.
      * _open_jobs:  a dictionary mapping directory paths to Job
        objects.
      * _next_report_time:  Earliest time that we should send e-mail
        if there are failures to be reported.

    """

    def __init__(self, options):
        """Configure the offloader from parsed command line options.

        @param options: Options object as returned by
                        `parse_options()`; the attributes read are
                        `delete_only`, `process_hosts_only`,
                        `process_all`, `parallelism` and `days_old`.

        """
        if options.delete_only:
            self._offload_func = delete_files
        else:
            self._offload_func = offload_dir
        classlist = []
        if options.process_hosts_only or options.process_all:
            classlist.append(job_directories.SpecialJobDirectory)
        if not options.process_hosts_only:
            classlist.append(job_directories.RegularJobDirectory)
        self._jobdir_classes = classlist
        # Every combination of the two flags selects at least one class.
        assert self._jobdir_classes
        self._processes = options.parallelism
        self._age_limit = options.days_old
        self._open_jobs = {}
        # Allow the first report to go out immediately.
        self._next_report_time = time.time()

    def _add_new_jobs(self):
        """Find new job directories that need offloading.

        Go through the file system looking for valid job directories
        that are currently not in `self._open_jobs`, and add them in.

        """
        new_job_count = 0
        for cls in self._jobdir_classes:
            for resultsdir in cls.get_job_directories():
                if resultsdir in self._open_jobs:
                    continue
                self._open_jobs[resultsdir] = cls(resultsdir)
                new_job_count += 1
        logging.debug("Start of offload cycle - found %d new jobs",
                      new_job_count)

    def _remove_offloaded_jobs(self):
        """Remove offloaded jobs from `self._open_jobs`."""
        # Collect the keys first: deleting entries while iterating
        # over the dictionary itself is only safe when items() returns
        # a list, and raises RuntimeError on a dictionary view.
        offloaded_keys = [jobkey
                          for jobkey, job in self._open_jobs.items()
                          if job.is_offloaded()]
        for jobkey in offloaded_keys:
            del self._open_jobs[jobkey]
        logging.debug("End of offload cycle - cleared %d new jobs, "
                      "carrying %d open jobs",
                      len(offloaded_keys), len(self._open_jobs))

    def _have_reportable_errors(self):
        """Return whether any jobs need reporting via e-mail.

        @returns True if there are reportable jobs in `self._open_jobs`,
                 or False otherwise.
        """
        return any(job.is_reportable()
                   for job in self._open_jobs.values())

    def _update_offload_results(self):
        """Check and report status after attempting offload.

        This function processes all jobs in `self._open_jobs`, assuming
        an attempt has just been made to offload all of them.

        Any jobs that have been successfully offloaded are removed.

        If any jobs have reportable errors, and we haven't generated
        an e-mail report in the last `REPORT_INTERVAL_SECS` seconds,
        send new e-mail describing the failures.

        """
        self._remove_offloaded_jobs()
        if self._have_reportable_errors():
            # N.B. We include all jobs that have failed at least once,
            # which may include jobs that aren't otherwise reportable.
            failed_jobs = [j for j in self._open_jobs.values()
                           if j.get_failure_time()]
            logging.debug("Currently there are %d jobs with offload failures",
                          len(failed_jobs))
            if time.time() >= self._next_report_time:
                logging.debug("Reporting failures by e-mail")
                report_offload_failures(failed_jobs)
                self._next_report_time = time.time() + REPORT_INTERVAL_SECS

    def offload_once(self):
        """Perform one offload cycle.

        Find all job directories for new jobs that we haven't seen
        before.  Then, attempt to offload the directories for any
        jobs that have finished running.  Offload of multiple jobs
        is done in parallel, up to `self._processes` at a time.

        After we've tried uploading all directories, go through the list
        checking the status of all uploaded directories.  If necessary,
        report failures via e-mail.

        """
        self._add_new_jobs()
        with parallel.BackgroundTaskRunner(
                self._offload_func, processes=self._processes) as queue:
            for job in self._open_jobs.values():
                job.enqueue_offload(queue, self._age_limit)
        self._update_offload_results()
Scott Zawalski20a9b582011-11-21 11:49:40 -0800321
322
def parse_options():
    """Parse the args passed into gs_offloader.

    Prints the help text plus an error message and exits with status 1
    when both --all and --hosts are given, since the two are mutually
    exclusive.

    @return: The options object produced by OptionParser, with the
             attributes `process_all`, `process_hosts_only`,
             `parallelism`, `delete_only` and `days_old`.

    """
    defaults = 'Defaults:\n Destination: %s\n Results Path: %s' % (GS_URI,
                                                                   RESULTS_DIR)
    usage = 'usage: %prog [options]\n' + defaults
    parser = OptionParser(usage)
    parser.add_option('-a', '--all', dest='process_all', action='store_true',
                      help='Offload all files in the results directory.')
    # Adjacent string literals are concatenated as-is, so the first
    # one must end with a space.
    parser.add_option('-s', '--hosts', dest='process_hosts_only',
                      action='store_true',
                      help='Offload only the special tasks result files '
                           'located in the results/hosts subdirectory')
    parser.add_option('-p', '--parallelism', dest='parallelism', type='int',
                      default=1, help='Number of parallel workers to use.')
    parser.add_option('-o', '--delete_only', dest='delete_only',
                      action='store_true',
                      help='GS Offloader will only delete the directories '
                           'and will not offload them to google storage.',
                      default=False)
    parser.add_option('-d', '--days_old', dest='days_old',
                      help='Minimum job age in days before a result can be '
                           'offloaded.', type='int', default=0)
    # Positional arguments are ignored; only the options are used.
    options = parser.parse_args()[0]
    if options.process_all and options.process_hosts_only:
        parser.print_help()
        print('Cannot process all files and only the hosts subdirectory. '
              'Please remove an argument.')
        sys.exit(1)
    return options
Scott Zawalskicb2e2b72012-04-17 12:10:05 -0400352
Simran Basi9523eaa2012-06-28 17:18:45 -0700353
def main():
    """Entry point: configure the process, then offload forever."""
    options = parse_options()

    # The offloader type only affects the name of the log file.
    offloader_type = 'jobs'
    if options.process_all:
        offloader_type = 'all'
    elif options.process_hosts_only:
        offloader_type = 'hosts'

    log_basename = LOG_FILENAME_FORMAT % (
            offloader_type, time.strftime(LOG_TIMESTAMP_FORMAT))
    logging.basicConfig(filename=os.path.join(LOG_LOCATION, log_basename),
                        level=logging.DEBUG,
                        format=LOGGING_FORMAT)

    # Lower our CPU priority (inherited by subprocesses) so that
    # offloading doesn't overload the system.
    logging.debug('Set process to nice value: %d', NICENESS)
    os.nice(NICENESS)
    if psutil:
        # psutil is optional; when present, lower our I/O priority too.
        this_process = psutil.Process()
        logging.debug('Set process to ionice IDLE')
        this_process.ionice(psutil.IOPRIO_CLASS_IDLE)

    # Work from inside the results directory so that relative
    # directory names can be used without joining paths on each loop.
    logging.debug('Offloading Autotest results in %s', RESULTS_DIR)
    os.chdir(RESULTS_DIR)

    # Let SIGALRM raise TimeoutException.
    signal.signal(signal.SIGALRM, timeout_handler)

    offloader = Offloader(options)
    while True:
        offloader.offload_once()
        time.sleep(SLEEP_TIME_SECS)
Scott Zawalskicb2e2b72012-04-17 12:10:05 -0400391
392
# Run the offloader only when this file is executed as a script.
if __name__ == '__main__':
    main()