blob: a30bf6119fad225704ecb757d9b84fba28d3e530 [file] [log] [blame]
Dan Shib03ea9d2013-08-15 17:13:27 -07001import os, time, logging, shutil
jadmanski96b78072009-05-21 22:21:04 +00002
Dale Curtiscb7bfaf2011-06-07 16:21:57 -07003from autotest_lib.client.common_lib import global_config
Michael Liangda8c60a2014-06-03 13:24:51 -07004from autotest_lib.client.common_lib.cros.graphite import stats
Dan Shib03ea9d2013-08-15 17:13:27 -07005from autotest_lib.client.cros import constants
mbligh15971eb2009-12-29 02:55:23 +00006from autotest_lib.server import utils
jadmanski96b78072009-05-21 22:21:04 +00007
8
# Bind the site-specific crashdump/crashinfo collection hooks from
# autotest_lib.server.site_crashcollect. The trailing lambda handed to
# import_site_function is presumably the fallback used when the site module
# does not provide the named function -- confirm against
# utils.import_site_function. Both fallbacks accept (host, test_start_time)
# and return None.
get_site_crashdumps = utils.import_site_function(
    __file__,
    "autotest_lib.server.site_crashcollect",
    "get_site_crashdumps",
    lambda host, test_start_time: None,
)
get_site_crashinfo = utils.import_site_function(
    __file__,
    "autotest_lib.server.site_crashcollect",
    "get_site_crashinfo",
    lambda host, test_start_time: None,
)


# Timer applied (via _timer.decorate) to the crash collection entry points.
_timer = stats.Timer('crash_collection')
19
@_timer.decorate
def get_crashdumps(host, test_start_time):
    """Run the site-specific crashdump collection hook.

    @param host: The host object handed through to get_site_crashdumps
    @param test_start_time: Passed through unchanged to get_site_crashdumps
    """
    get_site_crashdumps(host, test_start_time)
23
24
@_timer.decorate
def get_crashinfo(host, test_start_time):
    """Collect crashdumps and assorted crash information from a host.

    Crashdumps are collected first, without waiting for the host. All other
    collection only happens if wait_for_machine_to_recover reports that the
    host is reachable.

    @param host: The RemoteHost to collect crash information from
    @param test_start_time: Passed through to the site-specific hooks
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host)
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        # get_crashinfo_dir reuses an existing crashinfo directory, so 'var'
        # may already be there from an earlier collection; os.makedirs would
        # raise OSError in that case and abort the rest of the collection.
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
        collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)
61
jadmanski663d55a2009-05-21 22:54:28 +000062
# Default number of hours to wait for a host before giving up on crash
# collection. Read from the 'crash_collection_hours_to_wait' key of the
# SERVER section of the global config; 4.0 when the key is not set.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER',
    'crash_collection_hours_to_wait',
    default=4.0,
    type=float,
)
66
67
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Block until a (possibly down) machine is reachable or a timeout hits.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine is up or comes back up, False otherwise
    """
    # Captured up front so the "Waiting ..." message reports when we started.
    started_at = time.strftime("%b %d %H:%M:%S", time.localtime())

    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, started_at)
    came_back = host.wait_up(timeout=hours_to_wait * 3600)
    if came_back:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    # Timed out: record it in stats and tell the caller to skip collection.
    stats.Counter('collect_crashinfo_timeout').increment()
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
jadmanski96b78072009-05-21 22:21:04 +000091
jadmanski96b78072009-05-21 22:21:04 +000092
def get_crashinfo_dir(host):
    """Return the crashinfo directory for a host, creating it if missing.

    The directory is named "crashinfo.<hostname>" and lives under the
    resultdir of host.job when that is available, otherwise under the
    current working directory.

    @param host: The RemoteHost object that crashinfo will be collected from

    @returns: The path to an existing directory for writing crashinfo into
    """
    job = getattr(host, "job", None)
    # Fall back to the current directory when there is no job or its
    # resultdir is unset/empty.
    base_dir = (getattr(job, "resultdir", None)
                or os.path.abspath(os.getcwd()))
    crashinfo_path = os.path.join(base_dir, "crashinfo.%s" % host.hostname)
    if os.path.exists(crashinfo_path):
        return crashinfo_path
    os.mkdir(crashinfo_path)
    return crashinfo_path
jadmanski96b78072009-05-21 22:21:04 +0000109
jadmanski96b78072009-05-21 22:21:04 +0000110
def collect_log_file(host, log_path, dest_path, use_tmp=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Any failure is logged as a warning and otherwise ignored; this function
    does not raise on collection errors.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.

    """
    logging.info('Collecting %s...', log_path)
    tmpdir = None
    try:
        source_path = log_path
        if use_tmp:
            # Close the local sink as soon as mktemp has run instead of
            # leaking the file handle.
            with open(os.devnull, 'w') as devnull:
                tmpdir = host.run('mktemp -d',
                                  stdout_tee=devnull).stdout.strip()
            host.run('cp -rp %s %s' % (log_path, tmpdir))
            source_path = os.path.join(tmpdir, os.path.basename(log_path))
        host.get_file(source_path, dest_path, preserve_perm=False)
    except Exception as e:
        logging.warning('Collection of %s failed: %s', log_path, e)
    finally:
        # Remove the remote temporary directory even when the copy or the
        # download failed, so failed collections do not leave it behind.
        if tmpdir:
            try:
                host.run('rm -rf %s' % tmpdir)
            except Exception as e:
                logging.warning('Cleanup of %s failed: %s', tmpdir, e)
jadmanski663d55a2009-05-21 22:54:28 +0000138
139
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    A failure to run the command or to write the result is logged as a
    warning and otherwise ignored.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    # The null device is handed to host.run as stdout_tee; the command output
    # itself is taken from the returned result's stdout.
    with open(os.devnull, "w") as devnull:
        try:
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
162
jadmanski663d55a2009-05-21 22:54:28 +0000163
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Does nothing when the host has no job. Errors during retrieval are
    logged as a warning and otherwise ignored.

    @param host: The RemoteHost to collect from
    """
    # Tolerate hosts without a job attribute, as get_crashinfo_dir does.
    job = getattr(host, "job", None)
    if not job:
        return

    try:
        for hostname, remote_path, local_path in job.get_client_logs():
            # The job's log list may cover other hosts; only fetch ours.
            if hostname != host.hostname:
                continue
            logging.info("Retrieving logs from %s:%s into %s",
                         hostname, remote_path, local_path)
            host.get_file(remote_path + "/", local_path + "/")
    except Exception as e:
        logging.warning("Error while trying to collect stranded "
                        "Autotest client logs: %s", e)
jadmanski4900b3b2009-07-02 22:12:08 +0000180
181
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    The result is written to "messages" inside the host's crashinfo
    directory. Any error is logged as a warning and otherwise ignored.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host)

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first line of the messages at start should match the
            # first line of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # 'with' guarantees both handles are closed even if the seek or the
        # copy fails part-way through.
        with open(messages_raw) as raw_messages_file:
            with open(messages, "w") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages; this is only reached on
        # success, so the raw copies survive a failure above
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)