[autotest] Scheduler, drone_manager, drone_utility stats.
Adds some useful stats to the drone manager, handle agents and the
drone utility. These stats should help us track processes,
figure out where the drone_manager latency is coming from,
and draw correlations between the number of agents scheduled
and drone refresh time.
This CL also moves site_drone_utility's kill_process method
into drone_utility, and modifies the nuke_pids function to only
wait on and kill processes that haven't already died.
TEST=Ran suites.
BUG=chromium:400486
DEPLOY=scheduler
Change-Id: I56e6ee05fa2ae1935435dbc2055d7f99a9a89e5e
Reviewed-on: https://chromium-review.googlesource.com/211769
Reviewed-by: Prashanth B <beeps@chromium.org>
Commit-Queue: Prashanth B <beeps@chromium.org>
Tested-by: Prashanth B <beeps@chromium.org>
diff --git a/client/common_lib/site_utils.py b/client/common_lib/site_utils.py
index d5fe764..8f44a9e 100644
--- a/client/common_lib/site_utils.py
+++ b/client/common_lib/site_utils.py
@@ -222,9 +222,17 @@
@param pid_list: List of PID's to kill.
@param signal_queue: Queue of signals to send the PID's to terminate them.
+
+ @return: A mapping of the signal name to the number of processes it
+ was sent to.
"""
+ sig_count = {}
+ # Though this is slightly hacky it beats hardcoding names anyday.
+ sig_names = dict((k, v) for v, k in signal.__dict__.iteritems()
+ if v.startswith('SIG'))
for sig in signal_queue:
logging.debug('Sending signal %s to the following pids:', sig)
+ sig_count[sig_names.get(sig, 'unknown_signal')] = len(pid_list)
for pid in pid_list:
logging.debug('Pid %d', pid)
try:
@@ -233,10 +241,13 @@
# The process may have died from a previous signal before we
# could kill it.
pass
+ pid_list = [pid for pid in pid_list if base_utils.pid_is_alive(pid)]
+ if not pid_list:
+ break
time.sleep(CHECK_PID_IS_ALIVE_TIMEOUT)
failed_list = []
if signal.SIGKILL in signal_queue:
- return
+ return sig_count
for pid in pid_list:
if base_utils.pid_is_alive(pid):
failed_list.append('Could not kill %d for process name: %s.' % pid,
@@ -244,6 +255,7 @@
if failed_list:
raise error.AutoservRunError('Following errors occured: %s' %
failed_list, None)
+ return sig_count
def externalize_host(host):
@@ -410,4 +422,4 @@
try:
return int(base_utils.system_output('pgrep -o ^X$')) > 0
except Exception:
- return False
\ No newline at end of file
+ return False