blob: 4f5803025676ecef9719b91b0186e88e966f0e01 [file] [log] [blame]
Allen Li45da8472016-08-25 17:39:07 -07001import collections
Allen Li5889a8a2016-08-17 11:54:09 -07002import logging
3import os
4import pipes
Allen Li45da8472016-08-25 17:39:07 -07005import random
Allen Li5889a8a2016-08-17 11:54:09 -07006import shutil
7import time
jadmanski96b78072009-05-21 22:21:04 +00008
Dan Shi4f8c0242017-07-07 15:34:49 -07009import common
10from autotest_lib.client.bin.result_tools import runner as result_tools_runner
Allen Li45da8472016-08-25 17:39:07 -070011from autotest_lib.client.common_lib import error
Dale Curtiscb7bfaf2011-06-07 16:21:57 -070012from autotest_lib.client.common_lib import global_config
Hsinyu Chaoe0b08e62015-08-11 10:50:37 +000013from autotest_lib.client.cros import constants
mbligh15971eb2009-12-29 02:55:23 +000014from autotest_lib.server import utils
jadmanski96b78072009-05-21 22:21:04 +000015
Dan Shi5e2efb72017-02-07 11:40:23 -080016try:
17 from chromite.lib import metrics
18except ImportError:
19 metrics = utils.metrics_mock
20
jadmanski96b78072009-05-21 22:21:04 +000021
Allen Li45da8472016-08-25 17:39:07 -070022# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10
23# then we would collect a file with size 20 half the time, and a file with size
24# 40 a quarter of the time, so that in the long run we are collecting files
25# with this max size.
26_MAX_FILESIZE = 64 * (2 ** 20) # 64 MiB
27
Allen Li45da8472016-08-25 17:39:07 -070028class _RemoteTempDir(object):
29
30 """Context manager for temporary directory on remote host."""
31
32 def __init__(self, host):
33 self.host = host
34 self.tmpdir = None
35
36 def __repr__(self):
37 return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
38 cls=type(self).__name__, this=self)
39
40 def __enter__(self):
41 self.tmpdir = (self.host
42 .run('mktemp -d', stdout_tee=None)
43 .stdout.strip())
44 return self.tmpdir
45
46 def __exit__(self, exc_type, exc_value, exc_tb):
47 self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))
48
49
Dan Shi4f8c0242017-07-07 15:34:49 -070050def _collect_log_file_with_summary(host, source_path, dest_path):
51 """Collects a log file from the remote machine with directory summary.
52
53 @param host: The RemoteHost to collect logs from.
54 @param source_path: The remote path to collect the log file from.
55 @param dest_path: A path (file or directory) to write the copies logs into.
56 """
57 # Build test result directory summary
58 summary_created = result_tools_runner.run_on_client(host, source_path)
59
60 skip_summary_collection = True
61 try:
62 host.get_file(source_path, dest_path, preserve_perm=False)
63 skip_summary_collection = False
64 finally:
65 if summary_created:
66 # If dest_path is a file, use its parent folder to store the
67 # directory summary file.
68 if os.path.isfile(dest_path):
69 dest_path = os.path.dirname(dest_path)
70 # If dest_path doesn't exist, that means get_file failed, there is
71 # no need to collect directory summary file.
72 skip_summary_collection |= not os.path.exists(dest_path)
73 result_tools_runner.collect_last_summary(
74 host, source_path, dest_path,
75 skip_summary_collection=skip_summary_collection)
76
77
Dan Shi5d1066a2017-08-18 12:15:08 -070078def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
79 clean_content=False):
Allen Li5889a8a2016-08-17 11:54:09 -070080 """Collects a log file from the remote machine.
81
82 Log files are collected from the remote machine and written into the
83 destination path. If dest_path is a directory, the log file will be named
84 using the basename of the remote log path.
85
Allen Li45da8472016-08-25 17:39:07 -070086 Very large files will randomly not be collected, to alleviate network
87 traffic in the case of widespread crashes dumping large core files. Note
88 that this check only applies to the exact file passed as log_path. For
89 example, if this is a directory, the size of the contents will not be
90 checked.
91
Allen Li5889a8a2016-08-17 11:54:09 -070092 @param host: The RemoteHost to collect logs from
93 @param log_path: The remote path to collect the log file from
94 @param dest_path: A path (file or directory) to write the copies logs into
95 @param use_tmp: If True, will first copy the logs to a temporary directory
96 on the host and download logs from there.
97 @param clean: If True, remove dest_path after upload attempt even if it
98 failed.
Dan Shi5d1066a2017-08-18 12:15:08 -070099 @param clean_content: If True, remove files and directories in dest_path
100 after upload attempt even if it failed.
Allen Li5889a8a2016-08-17 11:54:09 -0700101
102 """
103 logging.info('Collecting %s...', log_path)
Dan Shi9f92aa62017-07-27 17:07:05 -0700104 if not host.check_cached_up_status():
105 logging.warning('Host %s did not answer to ping, skip collecting log '
106 'file %s.', host.hostname, log_path)
107 return
Allen Li5889a8a2016-08-17 11:54:09 -0700108 try:
Allen Li45da8472016-08-25 17:39:07 -0700109 file_stats = _get_file_stats(host, log_path)
Dan Shic6b709e2017-10-06 14:34:50 -0700110 if not file_stats:
111 # Failed to get file stat, the file may not exist.
112 return
113
Dan Shi4f8c0242017-07-07 15:34:49 -0700114 if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
115 random.random() > file_stats.collection_probability):
Allen Li45da8472016-08-25 17:39:07 -0700116 logging.warning('Collection of %s skipped:'
117 'size=%s, collection_probability=%s',
118 log_path, file_stats.size,
119 file_stats.collection_probability)
120 elif use_tmp:
121 _collect_log_file_with_tmpdir(host, log_path, dest_path)
122 else:
Dan Shi4f8c0242017-07-07 15:34:49 -0700123 _collect_log_file_with_summary(host, log_path, dest_path)
124 except Exception as e:
125 logging.exception('Non-critical failure: collection of %s failed: %s',
126 log_path, e)
Allen Li5889a8a2016-08-17 11:54:09 -0700127 finally:
Dan Shi5d1066a2017-08-18 12:15:08 -0700128 if clean_content:
129 path_to_delete = os.path.join(pipes.quote(log_path), '*')
130 elif clean:
131 path_to_delete = pipes.quote(log_path)
132 if clean or clean_content:
Prathmesh Prabhu0c432d62017-08-22 11:43:41 -0700133 host.run('rm -rf %s' % path_to_delete, ignore_status=True)
Allen Li5889a8a2016-08-17 11:54:09 -0700134
135
Allen Li45da8472016-08-25 17:39:07 -0700136_FileStats = collections.namedtuple('_FileStats',
137 'size collection_probability')
138
139
def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect log file from host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copies logs into.

    """
    with _RemoteTempDir(host) as staging_dir:
        # Copy the logs into the staging directory, then collect the copy.
        copy_cmd = 'cp -rp %s %s' % (pipes.quote(log_path),
                                     pipes.quote(staging_dir))
        host.run(copy_cmd)
        staged_path = os.path.join(staging_dir, os.path.basename(log_path))
        _collect_log_file_with_summary(host, staged_path, dest_path)
Allen Li45da8472016-08-25 17:39:07 -0700153
154
def _get_file_stats(host, path):
    """Get the stats of a file from host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability,
              or None when the reported size cannot be parsed as an integer.
    """
    # Field 5 of `ls -ld` is the size in bytes.
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    size_text = None
    size = 0
    try:
        size_text = host.run(cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Default its size to 0', path, host, e)
    try:
        if size_text is not None:
            size = int(size_text)
    except ValueError:
        logging.warning('Failed to convert size string "%s" for %s on host %r. '
                        'File may not exist.', size_text, path, host)
        return

    if size == 0:
        # Empty (or unreadable) files are always collected.
        return _FileStats(0, 1.0)
    return _FileStats(size, _MAX_FILESIZE / float(size))
183
184
jadmanski96b78072009-05-21 22:21:04 +0000185# import any site hooks for the crashdump and crashinfo collection
186get_site_crashdumps = utils.import_site_function(
187 __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
188 lambda host, test_start_time: None)
189get_site_crashinfo = utils.import_site_function(
190 __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
191 lambda host, test_start_time: None)
Allen Liab020912016-09-19 18:07:41 -0700192report_crashdumps = utils.import_site_function(
193 __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
194 lambda host: None)
195fetch_orphaned_crashdumps = utils.import_site_function(
196 __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
197 lambda host, host_resultdir: None)
198get_host_infodir = utils.import_site_function(
199 __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
200 lambda host: None)
jadmanski96b78072009-05-21 22:21:04 +0000201
202
Allen Lib59a7602016-11-22 13:35:18 -0800203@metrics.SecondsTimerDecorator(
Prathmesh Prabhua7556a92017-02-01 14:18:41 -0800204 'chromeos/autotest/autoserv/get_crashdumps_duration')
jadmanski96b78072009-05-21 22:21:04 +0000205def get_crashdumps(host, test_start_time):
206 get_site_crashdumps(host, test_start_time)
207
208
Allen Lib59a7602016-11-22 13:35:18 -0800209@metrics.SecondsTimerDecorator(
Prathmesh Prabhua7556a92017-02-01 14:18:41 -0800210 'chromeos/autotest/autoserv/get_crashinfo_duration')
jadmanski96b78072009-05-21 22:21:04 +0000211def get_crashinfo(host, test_start_time):
212 logging.info("Collecting crash information...")
213
Fang Deng279d8a92014-04-16 17:16:50 -0700214 # get_crashdumps collects orphaned crashdumps and symbolicates all
215 # collected crashdumps. Symbolicating could happen
216 # during a postjob task as well, at which time some crashdumps could have
217 # already been pulled back from machine. So it doesn't necessarily need
218 # to wait for the machine to come up.
jadmanski96b78072009-05-21 22:21:04 +0000219 get_crashdumps(host, test_start_time)
220
jadmanski663d55a2009-05-21 22:54:28 +0000221 if wait_for_machine_to_recover(host):
222 # run any site-specific collection
223 get_site_crashinfo(host, test_start_time)
224
MK Ryu35d661e2014-09-25 17:44:10 -0700225 crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
jadmanski4900b3b2009-07-02 22:12:08 +0000226 collect_messages(host)
jadmanski663d55a2009-05-21 22:54:28 +0000227 collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
jadmanski663d55a2009-05-21 22:54:28 +0000228 collect_uncollected_logs(host)
229
Dan Shib03ea9d2013-08-15 17:13:27 -0700230 # Collect everything in /var/log.
231 log_path = os.path.join(crashinfo_dir, 'var')
232 os.makedirs(log_path)
233 collect_log_file(host, constants.LOG_DIR, log_path)
234
Mike Frysinger2dc4a412017-08-15 16:33:59 -0400235 # Collect console-ramoops. The filename has changed in linux-3.19,
236 # so collect all the files in the pstore dirs.
237 log_path = os.path.join(crashinfo_dir, 'pstore')
238 for pstore_dir in constants.LOG_PSTORE_DIRS:
Dan Shi5d1066a2017-08-18 12:15:08 -0700239 collect_log_file(host, pstore_dir, log_path, use_tmp=True,
240 clean_content=True)
Fang Deng279d8a92014-04-16 17:16:50 -0700241 # Collect i915_error_state, only available on intel systems.
242 # i915 contains the Intel graphics state. It might contain useful data
243 # when a DUT hangs, times out or crashes.
244 log_path = os.path.join(
245 crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
246 collect_log_file(host, constants.LOG_I915_ERROR_STATE,
247 log_path, use_tmp=True)
248
jadmanski663d55a2009-05-21 22:54:28 +0000249
Dale Curtiscb7bfaf2011-06-07 16:21:57 -0700250# Load default for number of hours to wait before giving up on crash collection.
251HOURS_TO_WAIT = global_config.global_config.get_config_value(
252 'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
253
254
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    started_at = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, started_at)
    recovered = host.wait_up(timeout=hours_to_wait * 3600)
    if recovered:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    # Record the timeout so widespread recovery failures are visible.
    (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
     .increment())
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
jadmanski96b78072009-05-21 22:21:04 +0000279
jadmanski96b78072009-05-21 22:21:04 +0000280
MK Ryu35d661e2014-09-25 17:44:10 -0700281def get_crashinfo_dir(host, dir_prefix):
jadmanski663d55a2009-05-21 22:54:28 +0000282 """Find and if necessary create a directory to store crashinfo in.
283
284 @param host: The RemoteHost object that crashinfo will be collected from
MK Ryu35d661e2014-09-25 17:44:10 -0700285 @param dir_prefix: Prefix of directory name.
jadmanski663d55a2009-05-21 22:54:28 +0000286
287 @returns: The path to an existing directory for writing crashinfo into
288 """
jadmanski96b78072009-05-21 22:21:04 +0000289 host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
290 if host_resultdir:
291 infodir = host_resultdir
292 else:
293 infodir = os.path.abspath(os.getcwd())
MK Ryu35d661e2014-09-25 17:44:10 -0700294 infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
jadmanski96b78072009-05-21 22:21:04 +0000295 if not os.path.exists(infodir):
296 os.mkdir(infodir)
jadmanski663d55a2009-05-21 22:54:28 +0000297 return infodir
jadmanski96b78072009-05-21 22:21:04 +0000298
jadmanski96b78072009-05-21 22:21:04 +0000299
jadmanski663d55a2009-05-21 22:54:28 +0000300def collect_command(host, command, dest_path):
301 """Collects the result of a command on the remote machine.
302
303 The standard output of the command will be collected and written into the
304 desitionation path. The destination path is assumed to be filename and
305 not a directory.
306
307 @param host: The RemoteHost to collect from
308 @param command: A shell command to run on the remote machine and capture
309 the output from.
310 @param dest_path: A file path to write the results of the log into
311 """
312 logging.info("Collecting '%s' ...", command)
jadmanski96b78072009-05-21 22:21:04 +0000313 try:
Allen Li5889a8a2016-08-17 11:54:09 -0700314 result = host.run(command, stdout_tee=None).stdout
315 utils.open_write_close(dest_path, result)
316 except Exception, e:
317 logging.warning("Collection of '%s' failed:\n%s", command, e)
jadmanski96b78072009-05-21 22:21:04 +0000318
jadmanski663d55a2009-05-21 22:54:28 +0000319
jadmanski663d55a2009-05-21 22:54:28 +0000320def collect_uncollected_logs(host):
321 """Collects any leftover uncollected logs from the client.
322
323 @param host: The RemoteHost to collect from
324 """
mbligh0d0f67d2009-11-06 03:15:03 +0000325 if host.job:
jadmanski96b78072009-05-21 22:21:04 +0000326 try:
mbligh0d0f67d2009-11-06 03:15:03 +0000327 logs = host.job.get_client_logs()
jadmanski96b78072009-05-21 22:21:04 +0000328 for hostname, remote_path, local_path in logs:
329 if hostname == host.hostname:
Allen Li5889a8a2016-08-17 11:54:09 -0700330 logging.info('Retrieving logs from %s:%s into %s',
jadmanski663d55a2009-05-21 22:54:28 +0000331 hostname, remote_path, local_path)
Allen Li5889a8a2016-08-17 11:54:09 -0700332 collect_log_file(host, remote_path + '/', local_path + '/')
jadmanski96b78072009-05-21 22:21:04 +0000333 except Exception, e:
Allen Li5889a8a2016-08-17 11:54:09 -0700334 logging.warning('Error while trying to collect stranded '
335 'Autotest client logs: %s', e)
jadmanski4900b3b2009-07-02 22:12:08 +0000336
337
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    Collection is best-effort: any failure is logged and swallowed.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first lines of the messages at start should match the
            # first lines of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # Fix: use context managers so both handles are closed even if the
        # copy fails partway; the previous open/close pairs leaked handles
        # on error.
        with open(messages_raw) as raw_messages_file, \
                open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    # Fix: 'except Exception, e' is Python-2-only syntax; the rest of this
    # file already uses the forward-compatible 'as e' form.
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)