mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 1 | #!/usr/bin/python -u |
Simran Basi | 77edf61 | 2012-08-14 15:14:11 -0700 | [diff] [blame] | 2 | import os, socket, sys, signal, time, subprocess, logging |
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 3 | from optparse import OptionParser |
showard | 701f626 | 2009-04-16 03:10:11 +0000 | [diff] [blame] | 4 | import common |
showard | 043c62a | 2009-06-10 19:48:57 +0000 | [diff] [blame] | 5 | from autotest_lib.scheduler import babysitter_logging_config |
showard | 701f626 | 2009-04-16 03:10:11 +0000 | [diff] [blame] | 6 | from autotest_lib.client.common_lib import error, global_config, utils |
showard | 136e6dc | 2009-06-10 19:38:49 +0000 | [diff] [blame] | 7 | from autotest_lib.client.common_lib import logging_manager |
showard | 136e6dc | 2009-06-10 19:38:49 +0000 | [diff] [blame] | 8 | from autotest_lib.scheduler import scheduler_logging_config |
Simran Basi | 77edf61 | 2012-08-14 15:14:11 -0700 | [diff] [blame] | 9 | from autotest_lib.scheduler import status_server |
showard | 549afad | 2009-08-20 23:33:36 +0000 | [diff] [blame] | 10 | from autotest_lib.scheduler import monitor_db |
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 11 | |
# Seconds to sleep between two consecutive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may stay the same size before monitor_db is
# treated as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

# Command line: -r requests recovery on the first launch, --background
# daemonizes the babysitter.  Positional arguments are not accepted.
parser = OptionParser()
parser.add_option(
    "-r", dest="recover", action="store_true",
    help="run recovery mode (implicit after any crash)")
parser.add_option(
    "--background", dest="background", action="store_true", default=False,
    help="runs the scheduler monitor on background")
options, args = parser.parse_args()

if args:
    parser.print_help()
    sys.exit(1)

# options.recover is None unless -r was given; normalise it to a real bool.
recover = bool(options.recover)

# All paths are derived from the parent of the directory holding this script.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
results_dir = os.path.join(autodir, 'results')
| 31 | |
| 32 | |
def run_banner_output(cmd):
    """Run a command and return its output underneath a banner line.

    The banner is cmd centered in a 60 column line padded with '-'.  It is
    followed by a newline, the command's stdout and stderr concatenated (or
    the text 'Timed out' when utils.run raises error.CmdError), and two
    trailing newlines.

    @param cmd: command string passed to utils.run().
    @return: the banner and the output joined into a single string.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Build the result in a single formatting step.  Interpolating cmd into
    # an intermediate format string and formatting that again misparses any
    # '%' that cmd itself contains (e.g. inside a password).
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
| 44 | |
| 45 | |
def kill_monitor():
    """Stop the monitor_db process identified by monitor_db.PID_FILE_PREFIX.

    Sends SIGINT first so the scheduler can shut down on its own.  If the
    program is still reported alive, waits 30 seconds and then signals it
    again with utils.signal_program's default signal.
    """
    logging.info("Killing monitor_db")
    # try shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if utils.program_is_alive(monitor_db.PID_FILE_PREFIX):  # was it killed?
        # give it some time to shutdown
        time.sleep(30)
        # kill it
        # NOTE(review): this call used to be utils.signal_process(...), while
        # the SIGINT above passes the same pid-file prefix to
        # utils.signal_program.  Made consistent on the assumption that
        # signal_process was a misnamed helper -- confirm against utils.
        utils.signal_program(monitor_db.PID_FILE_PREFIX)
mbligh | c0e24fb | 2008-10-02 20:17:37 +0000 | [diff] [blame] | 55 | |
| 56 | |
def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, drop our pid file, exit status 1.

    @param signum: signal number supplied by the signal module (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    kill_monitor()
    babysitter_pid_prefix = monitor_db.BABYSITTER_PID_FILE_PREFIX
    utils.delete_pid_file_if_exists(babysitter_pid_prefix)
    raise SystemExit(1)


# Route SIGTERM through the handler so monitor_db is taken down with us.
signal.signal(signal.SIGTERM, handle_sigterm)
| 64 | |
| 65 | |
# Base class for MonitorProc, looked up through utils.import_site_class.
# NOTE(review): presumably this yields the site-specific SiteMonitorProc when
# site_monitor_db_babysitter exists and plain `object` otherwise -- confirm
# against utils.import_site_class.
SiteMonitorProc = utils.import_site_class(
    __file__,
    'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc',
    object)
| 69 | |
| 70 | |
class MonitorProc(SiteMonitorProc):
    """Launches one monitor_db process and watches it for death or stalls.

    Health is judged from two signals: the exit status of the child process
    and whether the scheduler log file keeps changing size.
    """

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db and prepare the next launch.

        @param do_recovery: when true, --recover-hosts is added to the
                monitor_db command line.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Exported through the environment, which the child started in
        # start() inherits.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Last observed log size and the time it last changed; is_running()
        # uses both to detect a stall.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Spawn monitor_db with its stdout sent to os.devnull."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # Close our handle explicitly instead of leaving it to garbage
            # collection; the babysitter creates a new MonitorProc on every
            # restart.
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and still writing its log.

        @return: False when the child has exited, or when the log size has
                not changed for more than STALL_TIMEOUT seconds (stall
                diagnostics are collected first); True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        new_size = os.path.getsize(self.log_path)
        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write diagnostics about a stalled scheduler to a side file.

        Runs a fixed list of system commands, plus a mysqladmin process
        listing when the BACKUP user and password can be read from the
        global config, and writes each command's banner and output to
        '<log_path>.stall_info', overwriting any previous file.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                    ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the password is part of the command string, so it
            # also appears in the banner written to the stall log below.
            commands.append(db_cmd % (user, password))
        except global_config.ConfigError:
            # No usable BACKUP credentials: skip the database listing.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if a command or a write raises.
            log.close()
| 141 | |
| 142 | |
# Refuse to run as root.
if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

# Only one babysitter may run at a time.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

if options.background:
    logging_manager.configure_logging(
         babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    # sys.exc_info() is used instead of an `except ..., e` clause so the
    # handlers parse on both Python 2 and Python 3.
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)  # exit from first parent
    except OSError:
        e = sys.exc_info()[1]
        sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)  # exit from second parent
    except OSError:
        e = sys.exc_info()[1]
        sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)
else:
    logging_manager.configure_logging(
         babysitter_logging_config.BabysitterLoggingConfig())

# Record our PID only after daemonizing.  Writing it before the forks stored
# the PID of the first parent, which exits immediately, so the "already
# running" check above could never see the real daemon; a failed fork also
# left a stale pid file behind.
# NOTE(review): this now runs after os.chdir("/"); it assumes write_pid
# resolves the pid file path independently of the working directory, as the
# delete_pid_file_if_exists call in handle_sigterm already has to -- confirm.
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)
| 182 | |
| 183 | |
# Supervision loop: free the status server port if something still holds it,
# (re)start monitor_db, then poll it every PAUSE_LENGTH seconds until it dies
# or stalls, and start over.
while True:
    sock = socket.socket()
    try:
        try:
            # Try to bind to the same port as the status_server.
            sock.bind(('localhost', status_server._PORT))
        except socket.error:
            # sys.exc_info() keeps this handler valid on Python 2 and 3.
            msg = sys.exc_info()[1]
            # If binding failed, open the port.
            logging.error('Failed to open socket with error:%s. Closing socket.',
                          msg)
            release_port_cmd_list = ['fuser', '-k', '-n', 'tcp',
                                     '%d' % status_server._PORT]
            process = subprocess.Popen(release_port_cmd_list)
            process.wait()
    finally:
        # Always release the probe socket, even when running fuser raises.
        sock.close()
    proc = MonitorProc(do_recovery=recover)
    proc.start()
    time.sleep(PAUSE_LENGTH)
    while proc.is_running():
        logging.info("Tick")
        time.sleep(PAUSE_LENGTH)
    # Only the first launch honours -r; later restarts do not pass
    # --recover-hosts.
    # NOTE(review): the -r help text says recovery is "implicit after any
    # crash" -- confirm that restarting without the flag is intended.
    recover = False