mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 1 | #!/usr/bin/python -u |
showard | 50e463b | 2009-04-07 18:13:45 +0000 | [diff] [blame] | 2 | import os, sys, signal, time, subprocess, logging, logging.config |
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 3 | from optparse import OptionParser |
showard | 701f626 | 2009-04-16 03:10:11 +0000 | [diff] [blame] | 4 | import common |
| 5 | from autotest_lib.client.common_lib import error, global_config, utils |
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 6 | |
# Seconds slept between two consecutive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may keep the same size before monitor_db is
# declared stalled.
STALL_TIMEOUT = 2 * 60 * 60

# The only recognized option is -r; anything left over ends up in `args`.
parser = OptionParser()
parser.add_option("-r", action="store_true", dest="recover")
options, args = parser.parse_args()

# autodir is the parent of the directory that contains this script.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
monitor_db = os.path.join(autodir, 'scheduler/monitor_db.py')
results_dir = os.path.join(autodir, 'results')
# options.recover is None unless -r was passed, so normalize it to a bool.
recover = bool(options.recover)
| 18 | |
# Load the logging settings.
scheduler_dir = os.path.join(autodir, 'scheduler')
AUTOTEST_BABYSITTER_LOG_DIR = os.path.join(autodir, 'logs')
# The log name uses the same timestamp convention as autoserv's results
# directory.
babysitter_log_name = 'babysitter.log.%s' % time.strftime('%Y-%m-%d-%H.%M.%S')

# Both values are exported before the logging config file is loaded.
# NOTE(review): presumably debug_babysitter.ini reads them from the
# environment -- confirm against that file.
os.environ['AUTOTEST_BABYSITTER_LOG_DIR'] = AUTOTEST_BABYSITTER_LOG_DIR
os.environ['AUTOTEST_BABYSITTER_LOG_NAME'] = babysitter_log_name
logging.config.fileConfig(os.path.join(scheduler_dir, 'debug_babysitter.ini'))
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 28 | |
# No positional arguments are accepted: print the usage text and bail out.
if args:
    for usage_line in (
            "Usage: %s [options]" % __file__,
            " -r Run recovery mode. (Note: recovery is implicit after",
            " any crash!)",
            "",
    ):
        # Single-argument parenthesized form prints the same text whether
        # `print` is a statement or a function.
        print(usage_line)
    sys.exit(1)
| 35 | |
| 36 | |
def run_banner_output(cmd):
    """Run cmd and return its output underneath a banner naming the command.

    The banner is cmd centered in a 60 character wide row of dashes.  It is
    followed by a newline, the command's stdout and stderr concatenated, and
    two more newlines.

    @param cmd: command line string passed to utils.run() (status ignored,
            30 second timeout).
    @return: the banner and output as a single string.  When utils.run()
            raises error.CmdError the output part is the text 'Timed out'.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format in a single step.  Substituting cmd into a template and then
    # applying '%' to the result a second time would treat any '%' that is
    # part of cmd itself (e.g. inside a password) as a format directive.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
| 48 | |
| 49 | |
def kill_all_monitors():
    """Stop every process named monitor_db.py, politely first, then firmly.

    Signal 2 is sent through killall as a shutdown request.  If that killall
    invocation reports success, the processes get 30 seconds to exit.  A
    plain killall is then issued unconditionally for whatever is left.
    """
    logging.info("Killing all monitor_dbs")
    # Ask for a shutdown first.
    shutdown_status = os.system("killall -2 monitor_db.py")
    some_were_signalled = (shutdown_status == 0)
    if some_were_signalled:
        # Give them some time to shut down.
        time.sleep(30)
    # Kill any that are left.
    os.system("killall monitor_db.py")
| 59 | |
| 60 | |
def handle_sigterm(signum, frame):
    """SIGTERM handler: take the monitor_db processes down, then exit.

    @param signum: signal number supplied by the signal module (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    kill_all_monitors()
    # Exit status 1, raised as SystemExit exactly like sys.exit(1) does.
    raise SystemExit(1)


# Install the handler so that terminating the babysitter also stops the
# scheduler processes it started.
signal.signal(signal.SIGTERM, handle_sigterm)
| 67 | |
| 68 | |
class MonitorProc:
    """One launched monitor_db process plus the bookkeeping to watch it.

    Creating an instance kills any monitor_db.py that is already running and
    starts a fresh one.  is_running() then reports whether that process is
    still alive and whether its log file is still changing size.
    """

    def __init__(self, do_recovery=False):
        """Kill existing monitors and start a new monitor_db process.

        @param do_recovery: when true, "--recover-hosts" is passed to
                monitor_db ahead of the results directory.
        """
        args = [monitor_db]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_all_monitors()
        # The log name and directory are exported through the environment,
        # which the child process inherits.
        log_name = 'scheduler.log.%s' % time.strftime('%Y-%m-%d-%H.%M.%S')
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        os.environ['AUTOTEST_SCHEDULER_LOG_DIR'] = AUTOTEST_BABYSITTER_LOG_DIR
        self.log_path = os.path.join(autodir, 'logs', log_name)

        # Last observed size of the log and the time it last changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor once Popen
            # returns.  Closing ours keeps restarts from leaking one
            # descriptor each.
            devnull.close()


    def is_running(self):
        """Check that monitor_db is alive and still writing to its log.

        @return: False if the process has exited, or if the log size has not
                changed for more than STALL_TIMEOUT seconds (in which case
                diagnostics are collected first).  True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # The log is missing or unreadable.  Treat that as "no change"
            # so the stall timer keeps running instead of letting the
            # exception take the babysitter down.
            logging.warning("Could not stat log file %s", self.log_path)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Each command's output is written under a banner produced by
        run_banner_output().  The mysqladmin processlist command is only
        included when the BACKUP user and password are present in the
        global config.
        """
        info_to_collect = ['uptime',
                           'ps auxwww',
                           'iostat -k -x 2 4',
                          ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the banner written by run_banner_output() is
            # the command line itself, so the password ends up in the
            # stall_info file.
            info_to_collect.append(db_cmd % (user, password))
        except global_config.ConfigError:
            # Best effort: without DB credentials the processlist is
            # simply skipped.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in info_to_collect:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if collecting or writing fails.
            log.close()
| 130 | |
| 131 | |
logging.info("initializing")

# Refuse to run with uid 0.
if os.getuid() == 0:
    logging.critical("running as root, aborting!")
    sys.exit(1)

utils.write_pid("monitor_db_babysitter")

# Supervise forever: start monitor_db, poll it every PAUSE_LENGTH seconds,
# and start a new one as soon as is_running() reports a problem.
while True:
    proc = MonitorProc(do_recovery=recover)
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")
    # The -r request only applies to the first launch.
    recover = False