blob: ae88379ebed0e14560b5c0689165555a81e125f9 [file] [log] [blame]
mblighc0e24fb2008-10-02 20:17:37 +00001#!/usr/bin/python -u
showard136e6dc2009-06-10 19:38:49 +00002import os, sys, signal, time, subprocess, logging
mblighc0e24fb2008-10-02 20:17:37 +00003from optparse import OptionParser
showard701f6262009-04-16 03:10:11 +00004import common
5from autotest_lib.client.common_lib import error, global_config, utils
showard136e6dc2009-06-10 19:38:49 +00006from autotest_lib.client.common_lib import logging_manager
7from autotest_lib.scheduler import babysitter_logging_config
8from autotest_lib.scheduler import scheduler_logging_config
mblighc0e24fb2008-10-02 20:17:37 +00009
# Seconds slept between successive health checks of the scheduler process.
PAUSE_LENGTH = 60
# Seconds the scheduler log may keep the same size before the scheduler is
# treated as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option("-r", action="store_true", dest="recover")
options, args = parser.parse_args()

# All paths are derived from the parent of the directory holding this script.
autodir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir))
results_dir = os.path.join(autodir, 'results')
monitor_db = os.path.join(autodir, 'scheduler/monitor_db.py')
# optparse leaves the value as None when -r is absent; normalise to a bool.
recover = bool(options.recover)

# load logging settings
logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())

# Positional arguments are not accepted: show the usage text and bail out.
if args:
    usage_lines = (
        "Usage: %s [options]" % __file__,
        " -r Run recovery mode. (Note: recovery is implicit after",
        " any crash!)",
        "",
    )
    sys.stdout.write("\n".join(usage_lines) + "\n")
    sys.exit(1)
32
33
def run_banner_output(cmd):
    """Run a shell command and return its output under a banner line.

    The returned string has the form "------ CMD ------\\nOUTPUT\\n\\n", where
    the banner is cmd centered in a 60-character line padded with dashes and
    OUTPUT is the command's stdout followed by its stderr.

    The command is run through utils.run with ignore_status=True and a
    30 second timeout. If utils.run raises error.CmdError, the output part is
    the literal text 'Timed out'.

    cmd: command line to execute.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Build the result in one formatting pass. Formatting the banner first
    # and then applying '%' again to the result would treat any '%' inside
    # cmd (for example in an interpolated password) as a conversion
    # specifier and raise or corrupt the output.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
45
46
def kill_all_monitors():
    """Stop every monitor_db.py process: SIGINT first, then a plain killall.

    Signal 2 is sent first so the processes get a chance to shut down. If
    killall reports success for that step, wait 30 seconds before the final
    killall (default signal) removes whatever is still around.
    """
    logging.info("Killing all monitor_dbs")
    # try shutdown first
    graceful_status = os.system("killall -2 monitor_db.py")
    any_signalled = (graceful_status == 0)  # were any killed?
    if any_signalled:
        # give them some time to shutdown
        time.sleep(30)
    # kill any that are left
    os.system("killall monitor_db.py")
56
57
def handle_sigterm(signum, frame):
    """SIGTERM handler: log it, kill all monitor_db processes, exit status 1.

    signum, frame: the standard signal-handler arguments; neither is used.
    """
    logging.info('Caught SIGTERM')
    kill_all_monitors()
    raise SystemExit(1)


# Make sure terminating the babysitter also takes the schedulers down.
signal.signal(signal.SIGTERM, handle_sigterm)
64
65
class MonitorProc:
    """A monitor_db child process plus the state needed to health-check it.

    Attributes:
        log_path: full path of the scheduler log file being watched.
        log_size: size of that log (bytes) at the last observed change.
        last_log_change: time.time() value of the last observed size change.
        proc: the subprocess.Popen object for the running monitor_db.
    """

    def __init__(self, do_recovery=False):
        """Kill any existing monitor_db processes and start a new one.

        do_recovery: when true, "--recover-hosts" is passed to monitor_db.
        """
        args = [monitor_db]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_all_monitors()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Set in our own environment so the child started below inherits it.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor; closing ours
            # avoids leaking one file handle per scheduler restart.
            devnull.close()


    def is_running(self):
        """Return True if monitor_db is alive and its log is still growing.

        Returns False when the child process has exited, or when the log file
        size has not changed for more than STALL_TIMEOUT seconds (in which
        case diagnostic information is collected first).
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # A missing or unreadable log must not crash the babysitter;
            # count it as "no change" so the stall timer keeps running.
            logging.warning("Could not read size of log file %s",
                            self.log_path)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to "<log_path>.stall_info".

        Runs uptime, ps and iostat, plus a mysqladmin processlist when the
        BACKUP user and password are present in the global config, and writes
        each command's banner and output to the stall-info file (overwriting
        any previous one).
        """
        info_to_collect = ['uptime',
                           'ps auxwww',
                           'iostat -k -x 2 4',
                           ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
        except global_config.ConfigError:
            # Best effort: without credentials the database query is skipped.
            pass
        else:
            # NOTE(review): the full command line, password included, becomes
            # the banner text written to the stall-info file below -- consider
            # passing the credentials another way.
            info_to_collect.append(db_cmd % (user, password))

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in info_to_collect:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if one of the commands or writes fails.
            log.close()
128
129
logging.info("initializing")

# Refuse to babysit the scheduler with root privileges.
running_as_root = (os.getuid() == 0)
if running_as_root:
    logging.critical("running as root, aborting!")
    sys.exit(1)

utils.write_pid("monitor_db_babysitter")

# Babysit forever: start a scheduler, poll it every PAUSE_LENGTH seconds, and
# start a replacement as soon as it is reported dead or stalled.
while True:
    proc = MonitorProc(do_recovery=recover)
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")
    # The -r request only applies to the first launch.
    recover = False