[Autotest] merge cleanup and verify
The objective of this CL is to merge cleanup and verify into a single job to
reduce the run time of each test. In the existing design, by default, a cleanup
job is scheduled after a test finishes, and a verify job is scheduled before a
test starts. By merging these two jobs together, the total run time of the two
jobs is reduced from about 47s to 37s, a saving of around 10s.
That does not include the saving from the scheduler only having to schedule one
job instead of two, which may be another 5-10s.
The design is to create a new special task, reset, which runs at the beginning
of a job by default. The verify task is changed to not run by default before a
job starts. The cleanup job will only be run if a job is scheduled to reboot
and any test in that job failed.
BUG=chromium:220679
TEST=tested with run_suite on a local machine
DEPLOY=afe,apache,scheduler,change all users' preference on reboot_after to
Never, sql: |update chromeos_autotest_db.afe_users set reboot_after=0|
Change-Id: Ia38baf6b73897b7e09fdf635eadedc752b5eba2f
Reviewed-on: https://gerrit.chromium.org/gerrit/48685
Commit-Queue: Dan Shi <dshi@chromium.org>
Reviewed-by: Dan Shi <dshi@chromium.org>
Tested-by: Dan Shi <dshi@chromium.org>
diff --git a/server/autoserv b/server/autoserv
index 1cfde22..10c2875 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -83,6 +83,7 @@
repair = parser.options.repair
cleanup = parser.options.cleanup
provision = parser.options.provision
+ reset = parser.options.reset
no_tee = parser.options.no_tee
parse_job = parser.options.parse_job
execution_tag = parser.options.execution_tag
@@ -106,7 +107,7 @@
parser.parser.error("Cannot specify provisioning and client!")
is_special_task = (verify or repair or cleanup or collect_crashinfo or
- provision)
+ provision or reset)
if len(parser.args) < 1 and not is_special_task:
parser.parser.error("Missing argument: control file")
@@ -160,6 +161,8 @@
job.verify()
elif provision:
job.provision(provision)
+ elif reset:
+ job.reset()
else:
job.run(cleanup, install_before, install_after,
verify_job_repo_url=verify_job_repo_url,
diff --git a/server/autoserv_parser.py b/server/autoserv_parser.py
index 12d405d..8469bab 100644
--- a/server/autoserv_parser.py
+++ b/server/autoserv_parser.py
@@ -1,3 +1,5 @@
+# pylint: disable-msg=C0111
+
import os, sys, optparse
from autotest_lib.client.common_lib import host_protections, utils
@@ -76,6 +78,10 @@
help="cleanup all machines after the job")
self.parser.add_option("--provision", action="store",
help="Labels to provision the machine to.")
+ self.parser.add_option("-T", "--reset", action="store_true",
+ default=False,
+ help="Reset (cleanup and verify) all machines "
+ "after the job")
self.parser.add_option("-n", action="store_true",
dest="no_tee", default=False,
help="no teeing the status to stdout/err")
diff --git a/server/control_segments/cleanup b/server/control_segments/cleanup
index ad0c814..d63f7ac 100644
--- a/server/control_segments/cleanup
+++ b/server/control_segments/cleanup
@@ -6,6 +6,9 @@
host = hosts.create_host(machine, initialize=False, auto_monitor=False)
timer = stats.Timer('cleanup_time.%s' % host._get_board_from_afe())
timer.start()
+ log_dir = os.path.join(job.resultdir, machine)
+ os.makedirs(log_dir)
+ host.get_file('/var/log/', log_dir, preserve_symlinks=True)
host.cleanup()
finally:
if timer:
diff --git a/server/control_segments/reset b/server/control_segments/reset
new file mode 100644
index 0000000..90dd651
--- /dev/null
+++ b/server/control_segments/reset
@@ -0,0 +1,27 @@
+import sys
+
+from autotest_lib.site_utils.graphite import stats
+
+def reset(machine):
+ print 'Starting to reset host ' + machine
+ timer = None
+ try:
+ host = hosts.create_host(machine, initialize=False, auto_monitor=False)
+ timer = stats.Timer('reset_time.%s' %
+ host._get_board_from_afe())
+ timer.start()
+ # Assume cleanup always runs first.
+ host.cleanup()
+ host.verify()
+ job.record('GOOD', None, 'reset',
+ '%s is reset successfully' % machine)
+ except Exception as e:
+ msg = 'reset failed: %s' % e
+ job.record('FAIL', None, 'reset', msg)
+ raise
+ finally:
+ if timer:
+ timer.stop()
+
+
+job.parallel_simple(reset, machines)
diff --git a/server/server_job.py b/server/server_job.py
index fcf7d77..2f50492 100644
--- a/server/server_job.py
+++ b/server/server_job.py
@@ -1,3 +1,5 @@
+# pylint: disable-msg=C0111
+
# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
@@ -39,6 +41,7 @@
REPAIR_CONTROL_FILE = _control_segment_path('repair')
PROVISION_CONTROL_FILE = _control_segment_path('provision')
VERIFY_JOB_REPO_URL_CONTROL_FILE = _control_segment_path('verify_job_repo_url')
+RESET_CONTROL_FILE = _control_segment_path('reset')
# by default provide a stub that generates no site data
@@ -334,6 +337,7 @@
def verify(self):
+ """Verify machines are all ssh-able."""
if not self.machines:
raise error.AutoservError('No machines specified to verify')
if self.resultdir:
@@ -350,6 +354,26 @@
raise
+ def reset(self):
+ """Reset machines by first cleanup then verify each machine."""
+ if not self.machines:
+ raise error.AutoservError('No machines specified to reset.')
+ if self.resultdir:
+ os.chdir(self.resultdir)
+
+ try:
+ namespace = {'machines' : self.machines, 'job' : self,
+ 'ssh_user' : self._ssh_user,
+ 'ssh_port' : self._ssh_port,
+ 'ssh_pass' : self._ssh_pass}
+ self._execute_code(RESET_CONTROL_FILE, namespace, protect=False)
+ except Exception as e:
+ msg = ('Reset failed\n' + str(e) + '\n' +
+ traceback.format_exc())
+ self.record('ABORT', None, None, msg)
+ raise
+
+
def repair(self, host_protection):
if not self.machines:
raise error.AutoservError('No machines specified to repair')
@@ -878,6 +902,10 @@
@param update_func - a function that updates the list of uncollected
logs. Should take one parameter, the list to be updated.
"""
+ # Skip log collection if file _uncollected_log_file does not exist.
+ if not (self._uncollected_log_file and
+ os.path.exists(self._uncollected_log_file)):
+ return
if self._uncollected_log_file:
log_file = open(self._uncollected_log_file, "r+")
fcntl.flock(log_file, fcntl.LOCK_EX)