Change the server-side profilers to use their own host objects instead
of trying to re-use existing ones. Re-using them made the profilers
unreliable: to avoid conflicts we skipped any Host object that had a
"normal" autotest install associated with it, so sometimes a host would
be profiled and sometimes not, for reasons that were not obvious from
the user's point of view. Instead, the profilers now look at what
hostnames are in use and create their own Host objects for them.
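
In rough terms the new flow is the following (an illustrative sketch
only; the filtering and install details are simplified from the real
change in the diff below):

    # derive the hostnames currently in use from job.hosts, then build
    # fresh Host objects for the profiler instead of borrowing the job's
    in_use_hosts = set(host.hostname for host in job.hosts)
    for hostname in in_use_hosts:
        host = hosts.create_host(hostname, auto_monitor=False)
        # ... install autotest under PROFILER_TMPDIR on this host ...
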
However, this introduced another problem: the host classes themselves
depend on the profiler module, which would create a circular
dependency. To break the cycle, I extracted the crashinfo and crashdump
collection into a separate module, server.crashcollect.
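
Concretely, callers now go through module-level functions instead of
Host methods, so the hosts package no longer needs to import profiler
(a sketch; this mirrors the control-segment changes in the diff below):

    from autotest_lib.server import crashcollect

    # before: host.get_crashinfo(test_start_time)  -- a Host method
    crashcollect.get_crashinfo(host, test_start_time)
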
Risk: High
Visibility: Makes server-side profilers run much more reliably.
Signed-off-by: John Admanski <jadmanski@google.com>
git-svn-id: http://test.kernel.org/svn/autotest/trunk@3165 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/server/control_segments/crashdumps b/server/control_segments/crashdumps
index 78b750c..7bad63d 100644
--- a/server/control_segments/crashdumps
+++ b/server/control_segments/crashdumps
@@ -1,9 +1,12 @@
+from autotest_lib.server import crashcollect
+
+
def crashdumps(machine):
hostname, user, passwd, port = parse_machine(machine, ssh_user,
ssh_port, ssh_pass)
host = hosts.create_host(hostname, user=user, port=port, initialize=False,
password=passwd, auto_monitor=False)
- host.get_crashdumps(test_start_time)
+ crashcollect.get_crashdumps(host, test_start_time)
job.parallel_simple(crashdumps, machines, log=False)
diff --git a/server/control_segments/crashinfo b/server/control_segments/crashinfo
index cde3dfc..c273620 100644
--- a/server/control_segments/crashinfo
+++ b/server/control_segments/crashinfo
@@ -1,9 +1,12 @@
+from autotest_lib.server import crashcollect
+
+
def crashinfo(machine):
hostname, user, passwd, port = parse_machine(machine, ssh_user,
ssh_port, ssh_pass)
host = hosts.create_host(hostname, user=user, port=port, initialize=False,
password=passwd, auto_monitor=False)
- host.get_crashinfo(test_start_time)
+ crashcollect.get_crashinfo(host, test_start_time)
job.parallel_simple(crashinfo, machines, log=False)
diff --git a/server/crashcollect.py b/server/crashcollect.py
new file mode 100644
index 0000000..eeadf2f
--- /dev/null
+++ b/server/crashcollect.py
@@ -0,0 +1,102 @@
+import os, time, pickle, logging
+
+from autotest_lib.server import utils, profiler
+
+
+# import any site hooks for the crashdump and crashinfo collection
+get_site_crashdumps = utils.import_site_function(
+ __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
+ lambda host, test_start_time: None)
+get_site_crashinfo = utils.import_site_function(
+ __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
+ lambda host, test_start_time: None)
+
+
+def get_crashdumps(host, test_start_time):
+ get_site_crashdumps(host, test_start_time)
+
+
+def get_crashinfo(host, test_start_time):
+ logging.info("Collecting crash information...")
+
+ # include crashdumps as part of the general crashinfo
+ get_crashdumps(host, test_start_time)
+
+ # wait for four hours, to see if the machine comes back up
+ current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
+ logging.info("Waiting four hours for %s to come up (%s)",
+ host.hostname, current_time)
+ if not host.wait_up(timeout=4*60*60):
+ logging.warning("%s down, unable to collect crash info",
+ host.hostname)
+ return
+ else:
+ logging.info("%s is back up, collecting crash info", host.hostname)
+
+ # run any site-specific crashinfo collection
+ get_site_crashinfo(host, test_start_time)
+
+ # find a directory to put the crashinfo into
+ host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
+ if host_resultdir:
+ infodir = host_resultdir
+ else:
+ infodir = os.path.abspath(os.getcwd())
+ infodir = os.path.join(infodir, "crashinfo.%s" % host.hostname)
+ if not os.path.exists(infodir):
+ os.mkdir(infodir)
+
+ # collect various log files
+ log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
+ for log in log_files:
+ logging.info("Collecting %s...", log)
+ try:
+ host.get_file(log, infodir, preserve_perm=False)
+ except Exception:
+ logging.warning("Collection of %s failed", log)
+
+ # collect dmesg
+ logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
+ devnull = open("/dev/null", "w")
+ try:
+ try:
+ result = host.run("dmesg", stdout_tee=devnull).stdout
+ file(os.path.join(infodir, "dmesg"), "w").write(result)
+ except Exception, e:
+ logging.warning("Collection of dmesg failed:\n%s", e)
+ finally:
+ devnull.close()
+
+ # collect any profiler data we can find
+ logging.info("Collecting any server-side profiler data lying around...")
+ try:
+ cmd = "ls %s" % profiler.PROFILER_TMPDIR
+ profiler_dirs = [path for path in host.run(cmd).stdout.split()
+ if path.startswith("autoserv-")]
+ for profiler_dir in profiler_dirs:
+ remote_path = profiler.get_profiler_results_dir(profiler_dir)
+ remote_exists = host.run("ls %s" % remote_path,
+ ignore_status=True).exit_status == 0
+ if not remote_exists:
+ continue
+ local_path = os.path.join(infodir, "profiler." + profiler_dir)
+ os.mkdir(local_path)
+ host.get_file(remote_path + "/", local_path)
+ except Exception, e:
+ logging.warning("Collection of profiler data failed with:\n%s", e)
+
+
+ # collect any uncollected logs we see (for this host)
+    if host.job and not host.job.uncollected_log_file:
+ host.job.uncollected_log_file = ''
+ if host.job and os.path.exists(host.job.uncollected_log_file):
+ try:
+ logs = pickle.load(open(host.job.uncollected_log_file))
+ for hostname, remote_path, local_path in logs:
+ if hostname == host.hostname:
+ logging.info("Retrieving logs from %s:%s into %s",
+ hostname, remote_path, local_path)
+ host.get_file(remote_path + "/", local_path + "/")
+ except Exception, e:
+ logging.warning("Error while trying to collect stranded "
+ "Autotest client logs: %s", e)
diff --git a/server/hosts/base_classes.py b/server/hosts/base_classes.py
index 589da8e..00565af 100644
--- a/server/hosts/base_classes.py
+++ b/server/hosts/base_classes.py
@@ -309,14 +309,6 @@
installableObject.install(self)
- def get_crashinfo(self, test_start_time):
- self.get_crashdumps(test_start_time)
-
-
- def get_crashdumps(self, test_start_time):
- pass
-
-
def get_autodir(self):
raise NotImplementedError('Get autodir not implemented!')
diff --git a/server/hosts/remote.py b/server/hosts/remote.py
index 6465289..2a80395 100644
--- a/server/hosts/remote.py
+++ b/server/hosts/remote.py
@@ -1,9 +1,9 @@
"""This class defines the Remote host class, mixing in the SiteHost class
if it is available."""
-import os, time, pickle, logging
+import os, logging
from autotest_lib.client.common_lib import error
-from autotest_lib.server import utils, profiler
+from autotest_lib.server import utils
from autotest_lib.server.hosts import base_classes, bootloader
@@ -213,91 +213,6 @@
return result.stdout.strip().split()[0]
- def get_crashinfo(self, test_start_time):
- logging.info("Collecting crash information...")
- super(RemoteHost, self).get_crashinfo(test_start_time)
-
- # wait for four hours, to see if the machine comes back up
- current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
- logging.info("Waiting four hours for %s to come up (%s)",
- self.hostname, current_time)
- if not self.wait_up(timeout=4*60*60):
- logging.warning("%s down, unable to collect crash info",
- self.hostname)
- return
- else:
- logging.info("%s is back up, collecting crash info", self.hostname)
-
- # find a directory to put the crashinfo into
- try:
- self.job.resultsdir
- except AttributeError:
- self.job.resultsdir = None
-
- if self.job.resultsdir:
- infodir = self.job.resultdir
- else:
- infodir = os.path.abspath(os.getcwd())
- infodir = os.path.join(infodir, "crashinfo.%s" % self.hostname)
- if not os.path.exists(infodir):
- os.mkdir(infodir)
-
- # collect various log files
- log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
- for log in log_files:
- logging.info("Collecting %s...", log)
- try:
- self.get_file(log, infodir, preserve_perm=False)
- except Exception:
- logging.warning("Collection of %s failed", log)
-
- # collect dmesg
- logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
- devnull = open("/dev/null", "w")
- try:
- try:
- result = self.run("dmesg", stdout_tee=devnull).stdout
- file(os.path.join(infodir, "dmesg"), "w").write(result)
- except Exception, e:
- logging.warning("Collection of dmesg failed:\n%s", e)
- finally:
- devnull.close()
-
- # collect any profiler data we can find
- logging.info("Collecting any server-side profiler data lying around...")
- try:
- cmd = "ls %s" % profiler.PROFILER_TMPDIR
- profiler_dirs = [path for path in self.run(cmd).stdout.split()
- if path.startswith("autoserv-")]
- for profiler_dir in profiler_dirs:
- remote_path = profiler.get_profiler_results_dir(profiler_dir)
- remote_exists = self.run("ls %s" % remote_path,
- ignore_status=True).exit_status == 0
- if not remote_exists:
- continue
- local_path = os.path.join(infodir, "profiler." + profiler_dir)
- os.mkdir(local_path)
- self.get_file(remote_path + "/", local_path)
- except Exception, e:
- logging.warning("Collection of profiler data failed with:\n%s", e)
-
-
- # collect any uncollected logs we see (for this host)
- if not self.job.uncollected_log_file:
- self.job.uncollected_log_file = ''
- if self.job and os.path.exists(self.job.uncollected_log_file):
- try:
- logs = pickle.load(open(self.job.uncollected_log_file))
- for hostname, remote_path, local_path in logs:
- if hostname == self.hostname:
- logging.info("Retrieving logs from %s:%s into %s",
- hostname, remote_path, local_path)
- self.get_file(remote_path + "/", local_path + "/")
- except Exception, e:
- logging.warning("Error while trying to collect stranded "
- "Autotest client logs: %s", e)
-
-
def are_wait_up_processes_up(self):
"""
Checks if any HOSTS waitup processes are running yet on the
diff --git a/server/profiler.py b/server/profiler.py
index 7508498..c7279e1 100644
--- a/server/profiler.py
+++ b/server/profiler.py
@@ -2,7 +2,7 @@
import common
from autotest_lib.client.common_lib import utils, error
-from autotest_lib.server import autotest
+from autotest_lib.server import autotest, hosts
PROFILER_TMPDIR = "/tmp/profilers"
@@ -82,18 +82,26 @@
def _install(self):
""" Install autotest on any current job hosts. """
- current_job_hosts = set(host for host in self.job.hosts
- if not host.get_autodir() or
- host.get_autodir().startswith(PROFILER_TMPDIR))
- current_profiler_hosts = set(self.installed_hosts.keys())
- # install autotest on any new hosts in job.hosts
- for host in current_job_hosts - current_profiler_hosts:
+    in_use_hosts = set(host.hostname for host in self.job.hosts
+                       if not (host.get_autodir() and
+                               host.get_autodir().startswith(
+                                   PROFILER_TMPDIR)))
+ profiler_hosts = set(self.installed_hosts.keys())
+
+ # install autotest on any new hosts in use
+ for hostname in in_use_hosts - profiler_hosts:
+ host = hosts.create_host(hostname, auto_monitor=False)
tmp_dir = host.get_tmp_dir(parent=PROFILER_TMPDIR)
at = autotest.Autotest(host)
at.install(autodir=tmp_dir)
self.installed_hosts[host] = (at, tmp_dir)
+
# drop any installs from hosts no longer in job.hosts
- for host in current_profiler_hosts - current_job_hosts:
+ hostnames_to_drop = profiler_hosts - in_use_hosts
+ hosts_to_drop = [host for host in self.installed_hosts.iterkeys()
+ if host.hostname in hostnames_to_drop]
+ for host in hosts_to_drop:
+ host.close()
del self.installed_hosts[host]