blob: a8ac8a5457726100826b589bcd20898e6c13fc11 [file] [log] [blame]
Dan Shib03ea9d2013-08-15 17:13:27 -07001import os, time, logging, shutil
jadmanski96b78072009-05-21 22:21:04 +00002
Dale Curtiscb7bfaf2011-06-07 16:21:57 -07003from autotest_lib.client.common_lib import global_config
Dan Shib03ea9d2013-08-15 17:13:27 -07004from autotest_lib.client.cros import constants
mbligh15971eb2009-12-29 02:55:23 +00005from autotest_lib.server import utils
Dan Shib03ea9d2013-08-15 17:13:27 -07006from autotest_lib.site_utils.graphite import stats
jadmanski96b78072009-05-21 22:21:04 +00007
8
# Look up the optional site-specific hooks for crashdump and crashinfo
# collection. Each lookup is handed a lambda that ignores its arguments and
# returns None (presumably the fallback used when the site module does not
# define the hook -- see utils.import_site_function).
_SITE_CRASHCOLLECT_MODULE = "autotest_lib.server.site_crashcollect"

get_site_crashdumps = utils.import_site_function(
    __file__,
    _SITE_CRASHCOLLECT_MODULE,
    "get_site_crashdumps",
    lambda host, test_start_time: None,
)
get_site_crashinfo = utils.import_site_function(
    __file__,
    _SITE_CRASHCOLLECT_MODULE,
    "get_site_crashinfo",
    lambda host, test_start_time: None,
)
16
17
def get_crashdumps(host, test_start_time):
    """Collect crashdumps by delegating to the site-specific hook.

    @param host: The host object to collect crashdumps from
    @param test_start_time: Passed through unchanged to the site hook
    """
    get_site_crashdumps(host, test_start_time)
20
21
def get_crashinfo(host, test_start_time):
    """Collect crash information from a host into its crashinfo directory.

    Crashdumps are always requested first. If the host is (or becomes)
    reachable, the site-specific crashinfo hook is run and then the system
    messages, the output of dmesg, any uncollected client logs and the
    contents of constants.LOG_DIR are gathered. If the host never comes back
    nothing further is collected.

    @param host: The RemoteHost to collect crash information from
    @param test_start_time: Passed through to the crashdump/crashinfo hooks
    """
    logging.info("Collecting crash information...")

    # include crashdumps as part of the general crashinfo
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host)
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        # The crashinfo directory is reused when it already exists, so 'var'
        # may be left over from an earlier collection for this host; an
        # unconditional makedirs would raise OSError in that case.
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)
41
jadmanski663d55a2009-05-21 22:54:28 +000042
# Default number of hours to wait for a host to come back before giving up on
# crash collection. Read from the 'crash_collection_hours_to_wait' option of
# the SERVER section of the global config, with 4.0 passed as the default.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
        'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
46
47
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Block until a (possibly down) machine is reachable again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine is up or comes back up, False otherwise
    """
    started_at = time.strftime("%b %d %H:%M:%S", time.localtime())

    # Nothing to wait for when the host already responds.
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, started_at)
    timeout_seconds = hours_to_wait * 3600
    if host.wait_up(timeout=timeout_seconds):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    # Timed out: count the failure before reporting it.
    stats.Counter('collect_crashinfo_timeout').increment()
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
jadmanski96b78072009-05-21 22:21:04 +000071
jadmanski96b78072009-05-21 22:21:04 +000072
def get_crashinfo_dir(host):
    """Return the crashinfo directory for a host, creating it if missing.

    The directory is named "crashinfo.<hostname>" and lives under the
    resultdir of the host's job when one is available, otherwise under the
    current working directory.

    @param host: The RemoteHost object that crashinfo will be collected from

    @returns: The path to an existing directory for writing crashinfo into
    """
    job = getattr(host, "job", None)
    # Fall back to the working directory when there is no job or the job has
    # no (non-empty) resultdir.
    parent_dir = (getattr(job, "resultdir", None)
                  or os.path.abspath(os.getcwd()))
    crash_dir = os.path.join(parent_dir, "crashinfo.%s" % host.hostname)
    if not os.path.exists(crash_dir):
        os.mkdir(crash_dir)
    return crash_dir
jadmanski96b78072009-05-21 22:21:04 +000089
jadmanski96b78072009-05-21 22:21:04 +000090
def collect_log_file(host, log_path, dest_path):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Collection is best-effort: any failure is logged as a warning (including
    the reason) and is not propagated to the caller.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    """
    logging.info("Collecting %s...", log_path)
    try:
        host.get_file(log_path, dest_path, preserve_perm=False)
    except Exception as e:
        # Keep going, but record why the copy failed so it can be diagnosed.
        logging.warning("Collection of %s failed: %s", log_path, e)
107
108
109
def collect_command(host, command, dest_path):
    """Run a command on the remote machine and save its standard output.

    The standard output of the command is written into the destination
    path, which is assumed to be a filename and not a directory. Failures
    while running the command or writing the file are logged as a warning
    and not propagated.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
            the output from.
    @param dest_path: A file path to write the command output into
    """
    logging.info("Collecting '%s' ...", command)
    # /dev/null is handed to host.run as stdout_tee; the output itself is
    # taken from the result object.
    with open("/dev/null", "w") as sink:
        try:
            output = host.run(command, stdout_tee=sink).stdout
            utils.open_write_close(dest_path, output)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
132
jadmanski663d55a2009-05-21 22:54:28 +0000133
def collect_uncollected_logs(host):
    """Fetch any client logs the job still lists for this host.

    Does nothing when the host has no job. Otherwise every entry returned by
    host.job.get_client_logs() whose hostname matches this host is copied
    from its remote path to its local path. The first error stops the
    remaining transfers and is logged as a warning rather than raised.

    @param host: The RemoteHost to collect from
    """
    if not host.job:
        return

    try:
        for owner, remote_dir, local_dir in host.job.get_client_logs():
            # Entries that belong to other hosts are skipped.
            if owner != host.hostname:
                continue
            logging.info("Retrieving logs from %s:%s into %s",
                         owner, remote_dir, local_dir)
            host.get_file(remote_dir + "/", local_dir + "/")
    except Exception as e:
        logging.warning("Error while trying to collect stranded "
                        "Autotest client logs: %s", e)
jadmanski4900b3b2009-07-02 22:12:08 +0000150
151
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    Any error is logged as a warning and not propagated.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host)

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        size_at_start = 0
        if os.path.exists(messages_at_start):
            # the first line of the messages at start should match the
            # first line of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start == first_line_now:
                size_at_start = os.path.getsize(messages_at_start)

        # Context managers close both files even when the seek or the copy
        # raises, so no handle is leaked on the error path.
        with open(messages_raw) as raw_messages_file:
            with open(messages, "w") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)