blob: d6dba91f6de9c7f75fe37bf051d55428c4033d3a [file] [log] [blame]
Dan Shib03ea9d2013-08-15 17:13:27 -07001import os, time, logging, shutil
jadmanski96b78072009-05-21 22:21:04 +00002
Dale Curtiscb7bfaf2011-06-07 16:21:57 -07003from autotest_lib.client.common_lib import global_config
Dan Shib03ea9d2013-08-15 17:13:27 -07004from autotest_lib.client.cros import constants
mbligh15971eb2009-12-29 02:55:23 +00005from autotest_lib.server import utils
Dan Shib03ea9d2013-08-15 17:13:27 -07006from autotest_lib.site_utils.graphite import stats
jadmanski96b78072009-05-21 22:21:04 +00007
8
# Import any site hooks for the crashdump and crashinfo collection.
# The trailing lambda is the do-nothing callable handed to
# import_site_function alongside the site module and function names
# (presumably used when no site-specific implementation is available --
# confirm against utils.import_site_function).
get_site_crashdumps = utils.import_site_function(
    __file__,
    "autotest_lib.server.site_crashcollect",
    "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__,
    "autotest_lib.server.site_crashcollect",
    "get_site_crashinfo",
    lambda host, test_start_time: None)
16
17
# Shared stats timer applied (via its decorate hook) to the crash collection
# entry points below; presumably reports call durations under the
# 'crash_collection' key -- confirm against the graphite stats module.
_timer = stats.Timer('crash_collection')


@_timer.decorate
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from a host by delegating to the site hook.

    @param host: The RemoteHost to collect crashdumps from
    @param test_start_time: Passed through unchanged to the site hook
    """
    get_site_crashdumps(host, test_start_time)
23
24
@_timer.decorate
def get_crashinfo(host, test_start_time):
    """Collect crash information (crashdumps, logs, dmesg) from a host.

    Crashdumps are always requested first. Everything else is collected only
    if the host is reachable (or becomes reachable again within the wait
    period of wait_for_machine_to_recover).

    @param host: The RemoteHost to collect crash information from
    @param test_start_time: Passed through unchanged to the site hooks
    """
    logging.info("Collecting crash information...")

    # include crashdumps as part of the general crashinfo
    get_crashdumps(host, test_start_time)

    # everything below talks to the host, so give up if it never comes back
    if not wait_for_machine_to_recover(host):
        return

    # run any site-specific collection
    get_site_crashinfo(host, test_start_time)

    crashinfo_dir = get_crashinfo_dir(host)
    collect_messages(host)
    collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
    collect_uncollected_logs(host)

    # Collect everything in /var/log.
    log_path = os.path.join(crashinfo_dir, 'var')
    # The crashinfo directory may be reused (e.g. a repeated collection for
    # the same host), so only create the destination if it is missing;
    # an unconditional makedirs would raise OSError on the second run.
    if not os.path.isdir(log_path):
        os.makedirs(log_path)
    collect_log_file(host, constants.LOG_DIR, log_path)
45
jadmanski663d55a2009-05-21 22:54:28 +000046
# Default number of hours to wait for a host to come back before giving up on
# crash collection. Read from the 'crash_collection_hours_to_wait' key of the
# SERVER section of the global config, as a float, defaulting to 4.0.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER',
    'crash_collection_hours_to_wait',
    type=float,
    default=4.0)
50
51
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Block until a (possibly down) machine is reachable again.

    If the host does not come up in time, the 'collect_crashinfo_timeout'
    stats counter is incremented and a warning is logged.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine is up or comes back up, False otherwise
    """
    # Timestamp is taken up front so the log line shows when waiting began.
    wait_started = time.strftime("%b %d %H:%M:%S", time.localtime())

    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, wait_started)

    # wait_up takes its timeout in seconds
    timeout_seconds = hours_to_wait * 3600
    if host.wait_up(timeout=timeout_seconds):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    stats.Counter('collect_crashinfo_timeout').increment()
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
jadmanski96b78072009-05-21 22:21:04 +000075
jadmanski96b78072009-05-21 22:21:04 +000076
def get_crashinfo_dir(host):
    """Find and if necessary create a directory to store crashinfo in.

    The directory is named "crashinfo.<hostname>" and lives under the
    resultdir of the host's job when one is available, otherwise under the
    current working directory. Missing parent directories are created too.

    @param host: The RemoteHost object that crashinfo will be collected from

    @returns: The path to an existing directory for writing crashinfo into

    @raises OSError: If the directory cannot be created (including when the
            path exists but is not a directory)
    """
    # host.job and job.resultdir are both optional
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "crashinfo.%s" % host.hostname)

    # Create first and inspect afterwards: this avoids the race between an
    # exists() check and mkdir(), and guarantees the returned path really is
    # a directory as promised.
    try:
        os.makedirs(infodir)
    except OSError:
        if not os.path.isdir(infodir):
            raise
    return infodir
jadmanski96b78072009-05-21 22:21:04 +000093
jadmanski96b78072009-05-21 22:21:04 +000094
def collect_log_file(host, log_path, dest_path):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Collection is best-effort: any failure is logged as a warning (including
    the underlying error) and is not propagated to the caller.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    """
    logging.info("Collecting %s...", log_path)
    try:
        host.get_file(log_path, dest_path, preserve_perm=False)
    except Exception as e:
        # Report why the copy failed, matching collect_command's log format.
        logging.warning("Collection of %s failed:\n%s", log_path, e)
111
112
113
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    Collection is best-effort: a failure to run the command or to write the
    result is logged as a warning and is not propagated to the caller.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    # The null device is passed as stdout_tee so the command output is not
    # echoed anywhere else while it is being captured; the with-block
    # guarantees the handle is closed whatever happens.
    with open(os.devnull, "w") as devnull:
        try:
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
136
jadmanski663d55a2009-05-21 22:54:28 +0000137
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Only log entries recorded for this host's hostname are fetched. Does
    nothing when the host has no job. Failures are logged as a warning and
    are not propagated to the caller.

    @param host: The RemoteHost to collect from
    """
    # The job attribute is optional (get_crashinfo_dir treats it the same
    # way), so a host without one simply has nothing to collect.
    job = getattr(host, "job", None)
    if not job:
        return

    try:
        for hostname, remote_path, local_path in job.get_client_logs():
            if hostname != host.hostname:
                continue
            logging.info("Retrieving logs from %s:%s into %s",
                         hostname, remote_path, local_path)
            # trailing slashes are added to both the source and the
            # destination path before the copy
            host.get_file(remote_path + "/", local_path + "/")
    except Exception as e:
        logging.warning("Error while trying to collect stranded "
                        "Autotest client logs: %s", e)
jadmanski4900b3b2009-07-02 22:12:08 +0000154
155
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    The result is written to a file named "messages" inside the crashinfo
    directory of the host. Failures during collection are logged as a
    warning and are not propagated to the caller.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host)

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip. The first line of the
        # copy taken at start should match the first line of the current
        # messages; if it doesn't, then messages has been erased or rotated
        # and we just grab all of it.
        size_at_start = 0
        if os.path.exists(messages_at_start):
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start == first_line_now:
                size_at_start = os.path.getsize(messages_at_start)

        # size_at_start is a byte count, so copy in binary mode; the with
        # blocks make sure both handles are closed even if the copy fails.
        with open(messages_raw, "rb") as raw_messages_file:
            with open(messages, "wb") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)