blob: dca2f5ec71a895d61e5c5490086c39e3151c1585 [file] [log] [blame]
Allen Li45da8472016-08-25 17:39:07 -07001import collections
Allen Li5889a8a2016-08-17 11:54:09 -07002import logging
3import os
4import pipes
Allen Li45da8472016-08-25 17:39:07 -07005import random
Allen Li5889a8a2016-08-17 11:54:09 -07006import shutil
7import time
jadmanski96b78072009-05-21 22:21:04 +00008
Allen Li45da8472016-08-25 17:39:07 -07009from autotest_lib.client.common_lib import error
Dale Curtiscb7bfaf2011-06-07 16:21:57 -070010from autotest_lib.client.common_lib import global_config
Hsinyu Chaoe0b08e62015-08-11 10:50:37 +000011from autotest_lib.client.cros import constants
mbligh15971eb2009-12-29 02:55:23 +000012from autotest_lib.server import utils
jadmanski96b78072009-05-21 22:21:04 +000013
Dan Shi5e2efb72017-02-07 11:40:23 -080014try:
15 from chromite.lib import metrics
16except ImportError:
17 metrics = utils.metrics_mock
18
jadmanski96b78072009-05-21 22:21:04 +000019
Allen Li45da8472016-08-25 17:39:07 -070020# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10
21# then we would collect a file with size 20 half the time, and a file with size
22# 40 a quarter of the time, so that in the long run we are collecting files
23# with this max size.
24_MAX_FILESIZE = 64 * (2 ** 20) # 64 MiB
25
26
27class _RemoteTempDir(object):
28
29 """Context manager for temporary directory on remote host."""
30
31 def __init__(self, host):
32 self.host = host
33 self.tmpdir = None
34
35 def __repr__(self):
36 return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
37 cls=type(self).__name__, this=self)
38
39 def __enter__(self):
40 self.tmpdir = (self.host
41 .run('mktemp -d', stdout_tee=None)
42 .stdout.strip())
43 return self.tmpdir
44
45 def __exit__(self, exc_type, exc_value, exc_tb):
46 self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))
47
48
Allen Li5889a8a2016-08-17 11:54:09 -070049def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False):
50 """Collects a log file from the remote machine.
51
52 Log files are collected from the remote machine and written into the
53 destination path. If dest_path is a directory, the log file will be named
54 using the basename of the remote log path.
55
Allen Li45da8472016-08-25 17:39:07 -070056 Very large files will randomly not be collected, to alleviate network
57 traffic in the case of widespread crashes dumping large core files. Note
58 that this check only applies to the exact file passed as log_path. For
59 example, if this is a directory, the size of the contents will not be
60 checked.
61
Allen Li5889a8a2016-08-17 11:54:09 -070062 @param host: The RemoteHost to collect logs from
63 @param log_path: The remote path to collect the log file from
64 @param dest_path: A path (file or directory) to write the copies logs into
65 @param use_tmp: If True, will first copy the logs to a temporary directory
66 on the host and download logs from there.
67 @param clean: If True, remove dest_path after upload attempt even if it
68 failed.
69
70 """
71 logging.info('Collecting %s...', log_path)
Allen Li5889a8a2016-08-17 11:54:09 -070072 try:
Allen Li45da8472016-08-25 17:39:07 -070073 file_stats = _get_file_stats(host, log_path)
74 if random.random() > file_stats.collection_probability:
75 logging.warning('Collection of %s skipped:'
76 'size=%s, collection_probability=%s',
77 log_path, file_stats.size,
78 file_stats.collection_probability)
79 elif use_tmp:
80 _collect_log_file_with_tmpdir(host, log_path, dest_path)
81 else:
82 source_path = log_path
83 host.get_file(source_path, dest_path, preserve_perm=False)
Allen Li5889a8a2016-08-17 11:54:09 -070084 except Exception, e:
85 logging.warning('Collection of %s failed: %s', log_path, e)
86 finally:
Allen Li5889a8a2016-08-17 11:54:09 -070087 if clean:
88 host.run('rm -rf %s' % (pipes.quote(log_path),))
89
90
Allen Li45da8472016-08-25 17:39:07 -070091_FileStats = collections.namedtuple('_FileStats',
92 'size collection_probability')
93
94
95def _collect_log_file_with_tmpdir(host, log_path, dest_path):
96 """Collect log file from host through a temp directory on the host.
97
98 @param host: The RemoteHost to collect logs from.
99 @param log_path: The remote path to collect the log file from.
100 @param dest_path: A path (file or directory) to write the copies logs into.
101
102 """
103 with _RemoteTempDir(host) as tmpdir:
104 host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
105 source_path = os.path.join(tmpdir, os.path.basename(log_path))
106 host.get_file(source_path, dest_path, preserve_perm=False)
107
108
109def _get_file_stats(host, path):
110 """Get the stats of a file from host.
111
112 @param host: Instance of Host subclass with run().
113 @param path: Path of file to check.
114 @returns: _FileStats namedtuple with file size and collection probability.
115 """
116 cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
117 try:
118 file_size = int(host.run(cmd).stdout)
119 except error.CmdError as e:
120 logging.warning('Getting size of file %r on host %r failed: %s',
121 path, host, e)
122 file_size = 0
123 if file_size == 0:
124 return _FileStats(0, 1.0)
125 else:
126 collection_probability = _MAX_FILESIZE / float(file_size)
127 return _FileStats(file_size, collection_probability)
128
129
jadmanski96b78072009-05-21 22:21:04 +0000130# import any site hooks for the crashdump and crashinfo collection
131get_site_crashdumps = utils.import_site_function(
132 __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
133 lambda host, test_start_time: None)
134get_site_crashinfo = utils.import_site_function(
135 __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
136 lambda host, test_start_time: None)
Allen Liab020912016-09-19 18:07:41 -0700137report_crashdumps = utils.import_site_function(
138 __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
139 lambda host: None)
140fetch_orphaned_crashdumps = utils.import_site_function(
141 __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
142 lambda host, host_resultdir: None)
143get_host_infodir = utils.import_site_function(
144 __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
145 lambda host: None)
jadmanski96b78072009-05-21 22:21:04 +0000146
147
Allen Lib59a7602016-11-22 13:35:18 -0800148@metrics.SecondsTimerDecorator(
Prathmesh Prabhua7556a92017-02-01 14:18:41 -0800149 'chromeos/autotest/autoserv/get_crashdumps_duration')
jadmanski96b78072009-05-21 22:21:04 +0000150def get_crashdumps(host, test_start_time):
151 get_site_crashdumps(host, test_start_time)
152
153
Allen Lib59a7602016-11-22 13:35:18 -0800154@metrics.SecondsTimerDecorator(
Prathmesh Prabhua7556a92017-02-01 14:18:41 -0800155 'chromeos/autotest/autoserv/get_crashinfo_duration')
jadmanski96b78072009-05-21 22:21:04 +0000156def get_crashinfo(host, test_start_time):
157 logging.info("Collecting crash information...")
158
Fang Deng279d8a92014-04-16 17:16:50 -0700159 # get_crashdumps collects orphaned crashdumps and symbolicates all
160 # collected crashdumps. Symbolicating could happen
161 # during a postjob task as well, at which time some crashdumps could have
162 # already been pulled back from machine. So it doesn't necessarily need
163 # to wait for the machine to come up.
jadmanski96b78072009-05-21 22:21:04 +0000164 get_crashdumps(host, test_start_time)
165
jadmanski663d55a2009-05-21 22:54:28 +0000166 if wait_for_machine_to_recover(host):
167 # run any site-specific collection
168 get_site_crashinfo(host, test_start_time)
169
MK Ryu35d661e2014-09-25 17:44:10 -0700170 crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
jadmanski4900b3b2009-07-02 22:12:08 +0000171 collect_messages(host)
jadmanski663d55a2009-05-21 22:54:28 +0000172 collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
jadmanski663d55a2009-05-21 22:54:28 +0000173 collect_uncollected_logs(host)
174
Dan Shib03ea9d2013-08-15 17:13:27 -0700175 # Collect everything in /var/log.
176 log_path = os.path.join(crashinfo_dir, 'var')
177 os.makedirs(log_path)
178 collect_log_file(host, constants.LOG_DIR, log_path)
179
Fang Deng279d8a92014-04-16 17:16:50 -0700180 # Collect console-ramoops
181 log_path = os.path.join(
182 crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
Allen Li5889a8a2016-08-17 11:54:09 -0700183 collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path,
184 clean=True)
Fang Deng279d8a92014-04-16 17:16:50 -0700185 # Collect i915_error_state, only available on intel systems.
186 # i915 contains the Intel graphics state. It might contain useful data
187 # when a DUT hangs, times out or crashes.
188 log_path = os.path.join(
189 crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
190 collect_log_file(host, constants.LOG_I915_ERROR_STATE,
191 log_path, use_tmp=True)
192
jadmanski663d55a2009-05-21 22:54:28 +0000193
Dale Curtiscb7bfaf2011-06-07 16:21:57 -0700194# Load default for number of hours to wait before giving up on crash collection.
195HOURS_TO_WAIT = global_config.global_config.get_config_value(
196 'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
197
198
199def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
jadmanski663d55a2009-05-21 22:54:28 +0000200 """Wait for a machine (possibly down) to become accessible again.
201
202 @param host: A RemoteHost instance to wait on
203 @param hours_to_wait: Number of hours to wait before giving up
204
205 @returns: True if the machine comes back up, False otherwise
206 """
jadmanski96b78072009-05-21 22:21:04 +0000207 current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
mbligh67429f42009-11-06 03:00:37 +0000208 if host.is_up():
209 logging.info("%s already up, collecting crash info", host.hostname)
210 return True
211
Eric Li8a12e802011-02-17 14:24:13 -0800212 logging.info("Waiting %s hours for %s to come up (%s)",
213 hours_to_wait, host.hostname, current_time)
jadmanski663d55a2009-05-21 22:54:28 +0000214 if not host.wait_up(timeout=hours_to_wait * 3600):
Prathmesh Prabhua7556a92017-02-01 14:18:41 -0800215 (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
Allen Lib59a7602016-11-22 13:35:18 -0800216 .increment())
jadmanski96b78072009-05-21 22:21:04 +0000217 logging.warning("%s down, unable to collect crash info",
218 host.hostname)
jadmanski663d55a2009-05-21 22:54:28 +0000219 return False
jadmanski96b78072009-05-21 22:21:04 +0000220 else:
221 logging.info("%s is back up, collecting crash info", host.hostname)
jadmanski663d55a2009-05-21 22:54:28 +0000222 return True
jadmanski96b78072009-05-21 22:21:04 +0000223
jadmanski96b78072009-05-21 22:21:04 +0000224
MK Ryu35d661e2014-09-25 17:44:10 -0700225def get_crashinfo_dir(host, dir_prefix):
jadmanski663d55a2009-05-21 22:54:28 +0000226 """Find and if necessary create a directory to store crashinfo in.
227
228 @param host: The RemoteHost object that crashinfo will be collected from
MK Ryu35d661e2014-09-25 17:44:10 -0700229 @param dir_prefix: Prefix of directory name.
jadmanski663d55a2009-05-21 22:54:28 +0000230
231 @returns: The path to an existing directory for writing crashinfo into
232 """
jadmanski96b78072009-05-21 22:21:04 +0000233 host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
234 if host_resultdir:
235 infodir = host_resultdir
236 else:
237 infodir = os.path.abspath(os.getcwd())
MK Ryu35d661e2014-09-25 17:44:10 -0700238 infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
jadmanski96b78072009-05-21 22:21:04 +0000239 if not os.path.exists(infodir):
240 os.mkdir(infodir)
jadmanski663d55a2009-05-21 22:54:28 +0000241 return infodir
jadmanski96b78072009-05-21 22:21:04 +0000242
jadmanski96b78072009-05-21 22:21:04 +0000243
jadmanski663d55a2009-05-21 22:54:28 +0000244def collect_command(host, command, dest_path):
245 """Collects the result of a command on the remote machine.
246
247 The standard output of the command will be collected and written into the
248 desitionation path. The destination path is assumed to be filename and
249 not a directory.
250
251 @param host: The RemoteHost to collect from
252 @param command: A shell command to run on the remote machine and capture
253 the output from.
254 @param dest_path: A file path to write the results of the log into
255 """
256 logging.info("Collecting '%s' ...", command)
jadmanski96b78072009-05-21 22:21:04 +0000257 try:
Allen Li5889a8a2016-08-17 11:54:09 -0700258 result = host.run(command, stdout_tee=None).stdout
259 utils.open_write_close(dest_path, result)
260 except Exception, e:
261 logging.warning("Collection of '%s' failed:\n%s", command, e)
jadmanski96b78072009-05-21 22:21:04 +0000262
jadmanski663d55a2009-05-21 22:54:28 +0000263
jadmanski663d55a2009-05-21 22:54:28 +0000264def collect_uncollected_logs(host):
265 """Collects any leftover uncollected logs from the client.
266
267 @param host: The RemoteHost to collect from
268 """
mbligh0d0f67d2009-11-06 03:15:03 +0000269 if host.job:
jadmanski96b78072009-05-21 22:21:04 +0000270 try:
mbligh0d0f67d2009-11-06 03:15:03 +0000271 logs = host.job.get_client_logs()
jadmanski96b78072009-05-21 22:21:04 +0000272 for hostname, remote_path, local_path in logs:
273 if hostname == host.hostname:
Allen Li5889a8a2016-08-17 11:54:09 -0700274 logging.info('Retrieving logs from %s:%s into %s',
jadmanski663d55a2009-05-21 22:54:28 +0000275 hostname, remote_path, local_path)
Allen Li5889a8a2016-08-17 11:54:09 -0700276 collect_log_file(host, remote_path + '/', local_path + '/')
jadmanski96b78072009-05-21 22:21:04 +0000277 except Exception, e:
Allen Li5889a8a2016-08-17 11:54:09 -0700278 logging.warning('Error while trying to collect stranded '
279 'Autotest client logs: %s', e)
jadmanski4900b3b2009-07-02 22:12:08 +0000280
281
282def collect_messages(host):
283 """Collects the 'new' contents of /var/log/messages.
284
285 If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
286 the contents of /var/log/messages excluding whatever initial contents
287 are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
288 present, simply collects the entire contents of /var/log/messages.
289
290 @param host: The RemoteHost to collect from
291 """
MK Ryu35d661e2014-09-25 17:44:10 -0700292 crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
jadmanski4900b3b2009-07-02 22:12:08 +0000293
294 try:
295 # paths to the messages files
296 messages = os.path.join(crashinfo_dir, "messages")
297 messages_raw = os.path.join(crashinfo_dir, "messages.raw")
298 messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")
299
300 # grab the files from the remote host
301 collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
302 messages_at_start)
303 collect_log_file(host, "/var/log/messages", messages_raw)
304
305 # figure out how much of messages.raw to skip
306 if os.path.exists(messages_at_start):
307 # if the first lines of the messages at start should match the
308 # first lines of the current messages; if they don't then messages
309 # has been erase or rotated and we just grab all of it
310 first_line_at_start = utils.read_one_line(messages_at_start)
311 first_line_now = utils.read_one_line(messages_raw)
312 if first_line_at_start != first_line_now:
313 size_at_start = 0
314 else:
315 size_at_start = os.path.getsize(messages_at_start)
316 else:
317 size_at_start = 0
318 raw_messages_file = open(messages_raw)
319 messages_file = open(messages, "w")
320 raw_messages_file.seek(size_at_start)
321 shutil.copyfileobj(raw_messages_file, messages_file)
322 raw_messages_file.close()
323 messages_file.close()
324
325 # get rid of the "raw" versions of messages
326 os.remove(messages_raw)
327 if os.path.exists(messages_at_start):
328 os.remove(messages_at_start)
329 except Exception, e:
330 logging.warning("Error while collecting /var/log/messages: %s", e)