"""The main job wrapper
This is the core infrastructure.
"""
__author__ = """Copyright Andy Whitcroft, Martin J. Bligh 2006"""
# standard stuff
import os, sys, re, pickle, shutil, time, traceback
# autotest stuff
from autotest_utils import *
from parallel import *
from common.error import *
from common import barrier
import kernel, xen, test, profilers, filesystem, fd_stack, boottool
import harness, config
import sysinfo
import cpuset
class job:
"""The actual job against which we do everything.
Properties:
autodir
The top level autotest directory (/usr/local/autotest).
Comes from os.environ['AUTODIR'].
bindir
<autodir>/bin/
libdir
<autodir>/lib/
testdir
<autodir>/tests/
site_testdir
<autodir>/site_tests/
profdir
<autodir>/profilers/
tmpdir
<autodir>/tmp/
resultdir
<autodir>/results/<jobtag>
stdout
fd_stack object for stdout
stderr
fd_stack object for stderr
profilers
the profilers object for this job
harness
the server harness object for this job
config
the job configuration for this job
"""
DEFAULT_LOG_FILENAME = "status"
def __init__(self, control, jobtag, cont, harness_type=None):
"""
control
The control file (pathname of)
jobtag
The job tag string (eg "default")
cont
Whether this is a continuation of a previous run of this job
harness_type
An alternative server harness
"""
self.autodir = os.environ['AUTODIR']
self.bindir = os.path.join(self.autodir, 'bin')
self.libdir = os.path.join(self.autodir, 'lib')
self.testdir = os.path.join(self.autodir, 'tests')
self.site_testdir = os.path.join(self.autodir, 'site_tests')
self.profdir = os.path.join(self.autodir, 'profilers')
self.tmpdir = os.path.join(self.autodir, 'tmp')
self.resultdir = os.path.join(self.autodir, 'results', jobtag)
self.sysinfodir = os.path.join(self.resultdir, 'sysinfo')
self.control = os.path.abspath(control)
if not cont:
df_root = system_output('df -m / | tail -1').split()
self.free_space_mb_root_before = int(df_root[3])
self.usage_percent_root_before = int(df_root[4].rstrip('%'))
if (self.free_space_mb_root_before < 100 or
self.usage_percent_root_before > 90):
self.record('WARN', 'check free space on root', 'free space on / is less than 100 MB or under 10%')
pickle.dump(self.free_space_mb_root_before, file(self.control + '.fs', 'w'))
if os.path.exists(self.tmpdir):
system('umount -f %s > /dev/null 2> /dev/null'%\
self.tmpdir, ignorestatus=True)
system('rm -rf ' + self.tmpdir)
os.mkdir(self.tmpdir)
results = os.path.join(self.autodir, 'results')
if not os.path.exists(results):
os.mkdir(results)
download = os.path.join(self.testdir, 'download')
if os.path.exists(download):
system('rm -rf ' + download)
os.mkdir(download)
if os.path.exists(self.resultdir):
system('rm -rf ' + self.resultdir)
os.mkdir(self.resultdir)
os.mkdir(self.sysinfodir)
os.mkdir(os.path.join(self.resultdir, 'debug'))
os.mkdir(os.path.join(self.resultdir, 'analysis'))
shutil.copyfile(self.control,
os.path.join(self.resultdir, 'control'))
else:
self.free_space_mb_root_before = pickle.load(file(self.control + '.fs', 'r'))
self.control = control
self.jobtag = jobtag
self.log_filename = self.DEFAULT_LOG_FILENAME
self.container = None
self.stdout = fd_stack.fd_stack(1, sys.stdout)
self.stderr = fd_stack.fd_stack(2, sys.stderr)
self.group_level = 0
self.config = config.config(self)
self.harness = harness.select(harness_type, self)
self.profilers = profilers.profilers(self)
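# The bootloader is optional: if boottool is unavailable or its
# configuration cannot be read, the failure below is deliberately
# swallowed and the job runs without a bootloader object.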
try:
tool = self.config_get('boottool.executable')
self.bootloader = boottool.boottool(tool)
except:
pass
sysinfo.log_per_reboot_data(self.sysinfodir)
if not cont:
self.record('START', None, None)
self.group_level = 1
self.harness.run_start()
def relative_path(self, path):
"""\
Return a path relative to the job results directory
"""
head = len(self.resultdir) + 1 # strip the '/' in between
return path[head:]
def control_get(self):
return self.control
def control_set(self, control):
self.control = os.path.abspath(control)
def harness_select(self, which):
self.harness = harness.select(which, self)
def config_set(self, name, value):
self.config.set(name, value)
def config_get(self, name):
return self.config.get(name)
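# Illustrative control-file usage (the value shown is an example only):
#     job.config_set('boottool.executable', '/usr/sbin/boottool')
#     tool = job.config_get('boottool.executable')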
def setup_dirs(self, results_dir, tmp_dir):
if not tmp_dir:
tmp_dir = os.path.join(self.tmpdir, 'build')
if not os.path.exists(tmp_dir):
os.mkdir(tmp_dir)
if not os.path.isdir(tmp_dir):
e_msg = "Temp dir (%s) is not a dir - args backwards?" % self.tmpdir
raise ValueError(e_msg)
# We label the first build "build" and then subsequent ones
# as "build.2", "build.3", etc. Whilst this is a little bit
# inconsistent, 99.9% of jobs will only have one build
# (apart from jobs like kernbench, sparse, or buildtest),
# so it works out much cleaner. One of life's compromises.
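# e.g. the first build goes in <resultdir>/build, the second in
# <resultdir>/build.2, the third in <resultdir>/build.3, and so on.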
if not results_dir:
results_dir = os.path.join(self.resultdir, 'build')
i = 2
while os.path.exists(results_dir):
results_dir = os.path.join(self.resultdir, 'build.%d' % i)
i += 1
if not os.path.exists(results_dir):
os.mkdir(results_dir)
return (results_dir, tmp_dir)
def xen(self, base_tree, results_dir = '', tmp_dir = '', leave = False, \
kjob = None ):
"""Summon a xen object"""
(results_dir, tmp_dir) = self.setup_dirs(results_dir, tmp_dir)
build_dir = 'xen'
return xen.xen(self, base_tree, results_dir, tmp_dir, build_dir, leave, kjob)
def kernel(self, base_tree, results_dir = '', tmp_dir = '', leave = False):
"""Summon a kernel object"""
(results_dir, tmp_dir) = self.setup_dirs(results_dir, tmp_dir)
build_dir = 'linux'
return kernel.auto_kernel(self, base_tree, results_dir,
tmp_dir, build_dir, leave)
def barrier(self, *args, **kwds):
"""Create a barrier object"""
return barrier.barrier(*args, **kwds)
def setup_dep(self, deps):
"""Set up the dependencies for this test.
deps is a list of libraries required for this test.
"""
for dep in deps:
try:
os.chdir(os.path.join(self.autodir, 'deps', dep))
system('./' + dep + '.py')
except:
error = "setting up dependency " + dep + "\n"
raise UnhandledError(error)
def __runtest(self, url, tag, args, dargs):
try:
l = lambda : test.runtest(self, url, tag, args, dargs)
pid = fork_start(self.resultdir, l)
fork_waitfor(self.resultdir, pid)
except AutotestError:
raise
except:
raise UnhandledError('running test ' + url + "\n")
def run_test(self, url, *args, **dargs):
"""Summon a test object and run it.
url
url of the test to run
tag
optional tag to append to the test name (keyword argument)
container
optional dict selecting a cpuset container
(keys: name, mbytes, cpus, root)
"""
if not url:
raise TypeError("Test name is invalid. Switched arguments?")
(group, testname) = test.testname(url)
tag = dargs.pop('tag', None)
container = dargs.pop('container', None)
subdir = testname
if tag:
subdir += '.' + tag
if container:
cname = container.get('name', None)
if not cname: # get old name
cname = container.get('container_name', None)
mbytes = container.get('mbytes', None)
if not mbytes: # get old name
mbytes = container.get('mem', None)
cpus = container.get('cpus', None)
if not cpus: # get old name
cpus = container.get('cpu', None)
root = container.get('root', None)
self.new_container(mbytes=mbytes, cpus=cpus,
root=root, name=cname)
# We are running in a container now...
def group_func():
try:
self.__runtest(url, tag, args, dargs)
except Exception, detail:
self.record('FAIL', subdir, testname,
str(detail))
raise
else:
self.record('GOOD', subdir, testname,
'completed successfully')
result, exc_info = self.__rungroup(subdir, group_func)
if container:
self.release_container()
if exc_info and isinstance(exc_info[1], TestError):
return False
elif exc_info:
raise exc_info[0], exc_info[1], exc_info[2]
else:
return True
def __rungroup(self, name, function, *args, **dargs):
"""\
name:
name of the group
function:
subroutine to run
*args:
arguments for the function
Returns a 2-tuple (result, exc_info) where result
is the return value of function, and exc_info is
the sys.exc_info() of the exception thrown by the
function (which may be None).
"""
result, exc_info = None, None
try:
self.record('START', None, name)
self.group_level += 1
result = function(*args, **dargs)
self.group_level -= 1
self.record('END GOOD', None, name)
except Exception, e:
exc_info = sys.exc_info()
self.group_level -= 1
err_msg = str(e) + '\n' + format_error()
self.record('END FAIL', None, name, err_msg)
return result, exc_info
def run_group(self, function, *args, **dargs):
"""\
function:
subroutine to run
*args:
arguments for the function
"""
# Allow the tag for the group to be specified
name = function.__name__
tag = dargs.pop('tag', None)
if tag:
name = tag
result, exc_info = self.__rungroup(name, function,
*args, **dargs)
# if there was a non-TestError exception, raise it
if exc_info and not isinstance(exc_info[1], TestError):
err = ''.join(traceback.format_exception(*exc_info))
raise TestError(name + ' failed\n' + err)
# pass back the actual return value from the function
return result
def new_container(self, mbytes=None, cpus=None, root=None, name=None):
if not grep('cpuset', '/proc/filesystems'):
print "Containers not enabled by latest reboot"
return # containers weren't enabled in this kernel boot
pid = os.getpid()
if not root:
root = 'sys'
if not name:
name = 'test%d' % pid # make arbitrary unique name
self.container = cpuset.cpuset(name, job_size=mbytes,
job_pid=pid, cpus=cpus, root=root, cleanup=1)
# This job's python shell is now running in the new container
# and all forked test processes will inherit that container
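# Illustrative call (values are examples only):
#     job.new_container(mbytes=512, cpus=[0, 1], name='mytest')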
def release_container(self):
if self.container:
self.container.release(job_pid=os.getpid())
self.container = None
def cpu_count(self):
if self.container:
return len(self.container.cpus)
return count_cpus() # use total system count
# Check the passed kernel identifier against the command line
# and the running kernel; abort the job on mismatch.
def kernel_check_ident(self, expected_when, expected_id, expected_cl, subdir, type = 'src'):
print "POST BOOT: checking booted kernel mark=%d identity='%s' changelist=%s type='%s'" \
% (expected_when, expected_id, expected_cl, type)
running_id = running_os_ident()
cmdline = read_one_line("/proc/cmdline")
find_sum = re.compile(r'.*IDENT=(\d+)')
m = find_sum.match(cmdline)
cmdline_when = -1
if m:
cmdline_when = int(m.groups()[0])
cl_re = re.compile(r'\d{7,}')
cl_match = cl_re.search(system_output('uname -v').split()[1])
if cl_match:
current_cl = cl_match.group()
else:
current_cl = None
# We have all the facts, see if they indicate we
# booted the requested kernel or not.
bad = False
if ((type == 'src' and expected_id != running_id) or
(type == 'rpm' and not running_id.startswith(expected_id + '::'))):
print "check_kernel_ident: kernel identifier mismatch"
bad = True
if expected_when != cmdline_when:
print "check_kernel_ident: kernel command line mismatch"
bad = True
if expected_cl and current_cl and str(expected_cl) != current_cl:
print 'check_kernel_ident: kernel changelist mismatch'
bad = True
if bad:
print " Expected Ident: " + expected_id
print " Running Ident: " + running_id
print " Expected Mark: %d" % (expected_when)
print "Command Line Mark: %d" % (cmdline_when)
print " Expected P4 CL: %s" % expected_cl
print " P4 CL: %s" % current_cl
print " Command Line: " + cmdline
raise JobError("boot failure", "reboot.verify")
self.record('GOOD', subdir, 'reboot.verify')
def filesystem(self, device, mountpoint = None, loop_size = 0):
if not mountpoint:
mountpoint = self.tmpdir
return filesystem.filesystem(self, device, mountpoint, loop_size)
def reboot(self, tag='autotest'):
self.record('GOOD', None, 'reboot.start')
self.harness.run_reboot()
default = self.config_get('boot.set_default')
if default:
self.bootloader.set_default(tag)
else:
self.bootloader.boot_once(tag)
system("(sleep 5; reboot) </dev/null >/dev/null 2>&1 &")
self.quit()
def noop(self, text):
print "job: noop: " + text
def parallel(self, *tasklist):
"""Run tasks in parallel"""
pids = []
old_log_filename = self.log_filename
for i, task in enumerate(tasklist):
self.log_filename = old_log_filename + (".%d" % i)
task_func = lambda: task[0](*task[1:])
pids.append(fork_start(self.resultdir, task_func))
old_log_path = os.path.join(self.resultdir, old_log_filename)
old_log = open(old_log_path, "a")
exceptions = []
for i, pid in enumerate(pids):
# wait for the task to finish
try:
fork_waitfor(self.resultdir, pid)
except Exception, e:
exceptions.append(e)
# copy the logs from the subtask into the main log
new_log_path = old_log_path + (".%d" % i)
if os.path.exists(new_log_path):
new_log = open(new_log_path)
old_log.write(new_log.read())
new_log.close()
old_log.flush()
os.remove(new_log_path)
old_log.close()
self.log_filename = old_log_filename
# handle any exceptions raised by the parallel tasks
if exceptions:
msg = "%d task(s) failed" % len(exceptions)
raise JobError(msg, str(exceptions), exceptions)
def quit(self):
# XXX: should have a better name.
self.harness.run_pause()
raise JobContinue("more to come")
def complete(self, status):
"""Clean up and exit"""
# We are about to exit 'complete' so clean up the control file.
try:
os.unlink(self.control + '.state')
except:
pass
try:
os.unlink(self.control + '.fs')
except:
pass
self.harness.run_complete()
sys.exit(status)
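# Pending steps for the stepping engine; persisted to <control>.state by
# next_step()/next_step_prepend() so an interrupted job can pick up where
# it left off.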
steps = []
def next_step(self, step):
"""Define the next step"""
if not isinstance(step[0], basestring):
step[0] = step[0].__name__
self.steps.append(step)
pickle.dump(self.steps, open(self.control + '.state', 'w'))
def next_step_prepend(self, step):
"""Insert a new step, executing first"""
if not isinstance(step[0], basestring):
step[0] = step[0].__name__
self.steps.insert(0, step)
pickle.dump(self.steps, open(self.control + '.state', 'w'))
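# Illustrative control-file usage of the stepping engine (function and
# test names are examples only):
#     def step_init():
#         job.next_step([step_two])
#         job.run_test('sleeptest')
#     def step_two():
#         job.run_test('dbench')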
def step_engine(self):
"""the stepping engine -- if the control file defines
step_init we will be using this engine to drive multiple runs.
"""
"""Do the next step"""
lcl = dict({'job': self})
str = """
from common.error import *
from autotest_utils import *
"""
exec(str, lcl, lcl)
execfile(self.control, lcl, lcl)
state = self.control + '.state'
# If there is a mid-job state file load that in and continue
# where it indicates. Otherwise start stepping at the passed
# entry.
try:
self.steps = pickle.load(open(state, 'r'))
except:
if 'step_init' in lcl:
self.next_step([lcl['step_init']])
# Run the step list.
while len(self.steps) > 0:
step = self.steps.pop(0)
pickle.dump(self.steps, open(state, 'w'))
cmd = step.pop(0)
lcl['__args'] = step
exec(cmd + "(*__args)", lcl, lcl)
def record(self, status_code, subdir, operation, status = ''):
"""
Record job-level status
The intent is to make this file both machine parseable and
human readable. That involves a little more complexity, but
really isn't all that bad ;-)
Format is <status code>\t<subdir>\t<operation>\t<timestamp>\t<localtime>\t<status>
status code: (GOOD|WARN|FAIL|ABORT)
or START
or END (GOOD|WARN|FAIL|ABORT)
subdir: MUST be a relevant subdirectory in the results,
or None, which will be represented as '----'
operation: description of what you ran (e.g. "dbench", or
"mkfs -t foobar /dev/sda9")
status: error message or "completed successfully"
------------------------------------------------------------
Initial tabs indicate indent levels for grouping; the depth
is governed by self.group_level.
Multiline messages have secondary lines prefaced by a double
space ('  ')
"""
if subdir:
if re.search(r'[\n\t]', subdir):
raise ValueError("Invalid character in subdir string")
substr = subdir
else:
substr = '----'
if not re.match(r'(START|(END )?(GOOD|WARN|FAIL|ABORT))$', \
status_code):
raise ValueError("Invalid status code supplied: %s" % status_code)
if not operation:
operation = '----'
if re.search(r'[\n\t]', operation):
raise ValueError("Invalid character in operation string")
operation = operation.rstrip()
status = status.rstrip()
status = re.sub(r"\t", " ", status)
# Ensure any continuation lines are marked so we can
# detect them in the status file to ensure it is parsable.
status = re.sub(r"\n", "\n" + "\t" * self.group_level + " ", status)
# Generate timestamps for inclusion in the logs
epoch_time = int(time.time()) # seconds since epoch, in UTC
local_time = time.localtime(epoch_time)
epoch_time_str = "timestamp=%d" % (epoch_time,)
local_time_str = time.strftime("localtime=%b %d %H:%M:%S",
local_time)
msg = '\t'.join(str(x) for x in (status_code, substr, operation,
epoch_time_str, local_time_str,
status))
msg = '\t' * self.group_level + msg
msg_tag = ""
if "." in self.log_filename:
msg_tag = self.log_filename.split(".", 1)[1]
self.harness.test_status_detail(status_code, substr, operation,
status, msg_tag)
self.harness.test_status(msg, msg_tag)
# log to stdout (if enabled)
#if self.log_filename == self.DEFAULT_LOG_FILENAME:
print msg
# log to the "root" status log
status_file = os.path.join(self.resultdir, self.log_filename)
open(status_file, "a").write(msg + "\n")
# log to the subdir status log (if subdir is set)
if subdir:
dir = os.path.join(self.resultdir, subdir)
if not os.path.exists(dir):
os.mkdir(dir)
status_file = os.path.join(dir,
self.DEFAULT_LOG_FILENAME)
open(status_file, "a").write(msg + "\n")
def runjob(control, cont = False, tag = "default", harness_type = ''):
"""The main interface to this module
control
The control file to use for this job.
cont
Whether this is the continuation of a previously started job
tag
The job tag string (used to name the results directory)
harness_type
An alternative server harness to use
"""
control = os.path.abspath(control)
state = control + '.state'
# instantiate the job object ready for the control file.
myjob = None
try:
# Check that the control file is valid
if not os.path.exists(control):
raise JobError(control + ": control file not found")
# When continuing, the job is complete when there is no
# state file, ensure we don't try and continue.
if cont and not os.path.exists(state):
raise JobComplete("all done")
if cont == False and os.path.exists(state):
os.unlink(state)
myjob = job(control, tag, cont, harness_type)
# Load in the user's control file, which may do any one of:
# 1) execute in toto
# 2) define steps, and select the first via next_step()
myjob.step_engine()
except JobContinue:
sys.exit(5)
except JobComplete:
sys.exit(1)
except JobError, instance:
print "JOB ERROR: " + instance.args[0]
if myjob:
command = None
if len(instance.args) > 1:
command = instance.args[1]
myjob.group_level = 0
myjob.record('ABORT', None, command, instance.args[0])
myjob.record('END ABORT', None, None)
myjob.complete(1)
else:
sys.exit(1)
except Exception, e:
msg = str(e) + '\n' + format_error()
print "JOB ERROR: " + msg
if myjob:
myjob.group_level = 0
myjob.record('ABORT', None, None, msg)
myjob.record('END ABORT', None, None)
myjob.complete(1)
else:
sys.exit(1)
# If we get here, then we assume the job is complete and good.
myjob.group_level = 0
myjob.record('END GOOD', None, None)
df_root = system_output('df -m / | tail -1').split()
free_space_mb_root_after = int(df_root[3])
if myjob.free_space_mb_root_before - free_space_mb_root_after > 5:
myjob.record('WARN', None, 'disk_usage',
'job used more than 5 MB of disk space on /')
myjob.complete(0)