blob: c82c1d7a21d28a4c48b8806b6c03c51f194a6f69 [file] [log] [blame]
mblighc0e24fb2008-10-02 20:17:37 +00001#!/usr/bin/python -u
Simran Basi77edf612012-08-14 15:14:11 -07002import os, socket, sys, signal, time, subprocess, logging
mblighc0e24fb2008-10-02 20:17:37 +00003from optparse import OptionParser
showard701f6262009-04-16 03:10:11 +00004import common
showard043c62a2009-06-10 19:48:57 +00005from autotest_lib.scheduler import babysitter_logging_config
showard701f6262009-04-16 03:10:11 +00006from autotest_lib.client.common_lib import error, global_config, utils
showard136e6dc2009-06-10 19:38:49 +00007from autotest_lib.client.common_lib import logging_manager
showard136e6dc2009-06-10 19:38:49 +00008from autotest_lib.scheduler import scheduler_logging_config
Simran Basi77edf612012-08-14 15:14:11 -07009from autotest_lib.scheduler import status_server
showard549afad2009-08-20 23:33:36 +000010from autotest_lib.scheduler import monitor_db
mblighc0e24fb2008-10-02 20:17:37 +000011
# Seconds to sleep between consecutive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may stay unchanged before monitor_db is treated
# as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option(
    "-r",
    dest="recover",
    action="store_true",
    help="run recovery mode (implicit after any crash)")
parser.add_option(
    "--background",
    dest="background",
    action="store_true",
    default=False,
    help="runs the scheduler monitor on background")
options, args = parser.parse_args()

# All paths are derived from the directory one level above this file.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
# options.recover is None unless -r was given; normalise it to a real bool.
recover = bool(options.recover)

# The babysitter takes no positional arguments.
if args:
    parser.print_help()
    sys.exit(1)
31
32
def run_banner_output(cmd):
    """Run a command and return its output underneath a banner line.

    The returned string consists of `cmd` centred in a 60-column line padded
    with '-', a newline, the command's stdout followed by its stderr, and two
    trailing newlines.

    The command is run through utils.run() with ignore_status=True and a
    timeout of 30 (presumably seconds -- confirm against utils.run). If
    utils.run() raises error.CmdError, the text 'Timed out' is used in place
    of the command output.

    @param cmd: the command line to run, as a string.
    @return: the banner and the captured output, as one string.
    """
    banner = cmd.center(60, '-')
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format in a single pass. Interpolating the banner first and the output
    # second re-reads any '%' contained in `cmd` (e.g. in a password) as a
    # format directive and raises.
    return '%s\n%s\n\n' % (banner, command_output)
44
45
def kill_monitor():
    """Stop the running monitor_db, politely first and then forcefully.

    SIGINT is sent to the process recorded under monitor_db.PID_FILE_PREFIX.
    If the process is still reported alive right afterwards, it is given 30
    seconds to shut down and is then signalled again with the default signal
    of utils.signal_program().
    """
    logging.info("Killing monitor_db")
    # try shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if utils.program_is_alive(monitor_db.PID_FILE_PREFIX):  # was it killed?
        # give it some time to shutdown
        time.sleep(30)
        # kill it
        # NOTE(review): this previously called utils.signal_process(), which
        # does not match the PID-file based helper used above; assumed to be
        # a typo for signal_program() -- confirm against utils.
        utils.signal_program(monitor_db.PID_FILE_PREFIX)
mblighc0e24fb2008-10-02 20:17:37 +000055
56
def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, remove our PID file and exit.

    Both arguments come from the signal machinery and are not used. The
    process exits with status 1 once monitor_db has been killed and the
    babysitter's PID file has been deleted (if it exists).
    """
    logging.info('Caught SIGTERM')
    # Take the scheduler down before we go away ourselves.
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    sys.exit(1)
62
# Route SIGTERM through handle_sigterm so that monitor_db is stopped and the
# babysitter PID file is removed before this process exits.
signal.signal(signal.SIGTERM, handle_sigterm)


# Base class for MonitorProc, looked up from the optional site-specific
# module. `object` is passed as the last argument -- presumably the base
# used when no site module exists (confirm against utils.import_site_class).
SiteMonitorProc = utils.import_site_class(
    __file__,
    'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc',
    object)
69
70
class MonitorProc(SiteMonitorProc):
    """One supervised monitor_db process.

    Construction kills any monitor_db that is still running and prepares the
    command line; start() launches the process; is_running() reports whether
    it is still alive and still writing to its log.
    """

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db and prepare to launch a new one.

        @param do_recovery: when true, '--recover-hosts' is added to the
                monitor_db command line.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Exported through the environment, which the child process inherits.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Stall detection state: last observed log size and when it changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Launch monitor_db with its stdout discarded."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor by the time
            # Popen() returns, so ours can be released right away.
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and making progress.

        Progress is measured by the size of the scheduler log: if it has not
        changed for more than STALL_TIMEOUT seconds the process is considered
        stalled, and diagnostics are collected before returning.

        @return: False if the process has exited or is stalled, True
                otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # A missing or unreadable log must not take the babysitter down;
            # count it as "no progress" so the stall timer keeps running.
            logging.warning("Unable to read size of log file %s",
                            self.log_path)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        The output of uptime, ps and iostat is always recorded. A mysqladmin
        process list is added when the BACKUP user and password are present
        in the global config.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                    ]
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
        except global_config.ConfigError:
            # No database credentials configured: skip the process list.
            pass
        else:
            # NOTE(review): the password is part of the command line, so it
            # also appears in the banner written to the stall log.
            commands.append(
                '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
                % (user, password))

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if one of the commands fails unexpectedly.
            log.close()
141
142
# Refuse to run as root.
if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

# Only one babysitter may run at a time.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

if options.background:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)  # exit from first parent
    except OSError as e:
        sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)  # exit from second parent
    except OSError as e:
        sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)

    # The PID file written above was created by the original process, which
    # has exited by now. Record the daemon's own PID so the "already running"
    # check keeps working. (Assumes write_pid() records the calling process
    # and overwrites an existing file -- confirm against utils.)
    utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)
else:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())
182
183
# Supervise forever: each pass frees the status_server port if necessary,
# launches monitor_db and polls it until it dies or stalls.
while True:
    sock = socket.socket()
    try:
        # Try to bind to the same port as the status_server.
        sock.bind(('localhost', status_server._PORT))
    except socket.error as msg:
        # If binding failed, open the port.
        logging.error('Failed to open socket with error:%s. Closing socket.',
                      msg)
        # fuser -k kills whatever process is holding the TCP port.
        subprocess.call(['fuser', '-k', '-n', 'tcp',
                         '%d' % status_server._PORT])
    sock.close()

    proc = MonitorProc(do_recovery=recover)
    proc.start()
    # Always wait one full pause before each health check.
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")
    # Only the first launch honours the -r option.
    recover = False