blob: 0ae0a7b584f59397faf00c9a85c7f55db3ce045c [file] [log] [blame]
mblighc0e24fb2008-10-02 20:17:37 +00001#!/usr/bin/python -u
showard50e463b2009-04-07 18:13:45 +00002import os, sys, signal, time, subprocess, logging, logging.config
mblighc0e24fb2008-10-02 20:17:37 +00003from optparse import OptionParser
showard701f6262009-04-16 03:10:11 +00004import common
5from autotest_lib.client.common_lib import error, global_config, utils
mblighc0e24fb2008-10-02 20:17:37 +00006
# Seconds to sleep between two liveness checks of the scheduler.
PAUSE_LENGTH = 60
# Seconds the scheduler log may stay unchanged before the scheduler is
# considered stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option("-r", dest="recover", action="store_true")
options, args = parser.parse_args()

# The autotest root is the parent of the directory containing this script.
_script_dir = os.path.dirname(__file__)
autodir = os.path.abspath(os.path.join(_script_dir, '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db = os.path.join(autodir, 'scheduler/monitor_db.py')
# options.recover is None unless -r was given; normalise to a real boolean.
recover = bool(options.recover)
18
# load logging settings
scheduler_dir = os.path.join(autodir, 'scheduler')
AUTOTEST_BABYSITTER_LOG_DIR = os.path.join(autodir, 'logs')
# The log name follows the same timestamp convention as autoserv's results
# directory.
babysitter_log_name = 'babysitter.log.%s' % time.strftime('%Y-%m-%d-%H.%M.%S')
# Export the log location through the environment before the logging config
# is loaded (presumably debug_babysitter.ini reads these -- verify).
os.environ.update({
    'AUTOTEST_BABYSITTER_LOG_DIR': AUTOTEST_BABYSITTER_LOG_DIR,
    'AUTOTEST_BABYSITTER_LOG_NAME': babysitter_log_name,
})
_babysitter_ini = os.path.join(scheduler_dir, 'debug_babysitter.ini')
logging.config.fileConfig(_babysitter_ini)
mblighc0e24fb2008-10-02 20:17:37 +000028
# The babysitter takes no positional arguments: print usage and bail out if
# any were supplied.
if args:
    print("Usage: %s [options]" % __file__)
    print(" -r Run recovery mode. (Note: recovery is implicit after")
    print(" any crash!)")
    print("")
    sys.exit(1)
35
36
def run_banner_output(cmd):
    """Run cmd and return its output below a banner naming the command.

    The returned string is the command centered in a 60-column line of
    dashes, a newline, the command's stdout followed by its stderr, and
    two trailing newlines.  If utils.run raises error.CmdError the output
    part is the text 'Timed out'.

    @param cmd: command line string to execute (30 second timeout).
    @return: the banner and output as a single string.
    """
    banner = cmd.center(60, '-')
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format once with both values: building a template from the banner and
    # formatting it a second time breaks when cmd itself contains a '%'.
    return '%s\n%s\n\n' % (banner, command_output)
48
49
def kill_all_monitors():
    """Stop every monitor_db.py process, politely first and then with the
    default killall signal."""
    logging.info("Killing all monitor_dbs")
    # Ask for a shutdown first by sending signal 2 (SIGINT).
    if os.system("killall -2 monitor_db.py") != 0:
        # Non-zero status: nothing was signalled, so nothing is left to kill.
        return
    # Give the signalled processes some time to shut down ...
    time.sleep(30)
    # ... then kill any that are left.
    os.system("killall monitor_db.py")
59
60
def handle_sigterm(signum, frame):
    """SIGTERM handler: take down all monitor_db processes, then exit with
    status 1.

    @param signum: signal number delivered (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    kill_all_monitors()
    # Same effect as sys.exit(1).
    raise SystemExit(1)

# Make sure the scheduler does not outlive a terminated babysitter.
signal.signal(signal.SIGTERM, handle_sigterm)
67
68
class MonitorProc:
    """A monitor_db child process together with the bookkeeping needed to
    detect that it has died or stopped writing to its log."""

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db instances and start a fresh one.

        @param do_recovery: when true, pass --recover-hosts to monitor_db.
        """
        args = [monitor_db]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_all_monitors()

        # The child inherits os.environ, so the log name and directory are
        # handed to it through these variables.
        log_name = 'scheduler.log.%s' % time.strftime('%Y-%m-%d-%H.%M.%S')
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        os.environ['AUTOTEST_SCHEDULER_LOG_DIR'] = AUTOTEST_BABYSITTER_LOG_DIR
        self.log_path = os.path.join(autodir, 'logs', log_name)

        # Last observed size of the log and the time it last changed; used
        # by is_running() for stall detection.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(args, stdout=devnull)
        finally:
            # The child holds its own copy of the descriptor; closing ours
            # avoids leaking one file handle per scheduler restart.
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and making progress.

        @return: False if the process has exited, or if its log has not
                changed size for more than STALL_TIMEOUT seconds (stall
                diagnostics are collected in that case); True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # A missing or unreadable log must not crash the babysitter;
            # treat it as empty so the stall timer keeps running.
            logging.warning("Cannot stat log file %s, treating it as empty",
                            self.log_path)
            new_size = 0

        if new_size != self.log_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Runs uptime, ps and iostat, plus a mysqladmin processlist when the
        BACKUP user and password are present in the global config, and
        stores each command's banner and output in the stall-info file.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                    ]
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
        except global_config.ConfigError:
            # No database credentials configured: skip the DB process list.
            pass
        else:
            # NOTE(review): the password ends up on the command line and in
            # the banner written to the stall-info file.
            db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
            commands.append(db_cmd % (user, password))

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if a command or a write fails.
            log.close()
130
131
logging.info("initializing")

# Refuse to run with root privileges.
if not os.getuid():
    logging.critical("running as root, aborting!")
    sys.exit(1)

utils.write_pid("monitor_db_babysitter")

# Supervise forever: start monitor_db, poll it every PAUSE_LENGTH seconds,
# and start a new one as soon as it is reported dead or stalled.
while True:
    proc = MonitorProc(do_recovery=recover)
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")
    # The -r request only applies to the first launch.
    recover = False