trappy: Speed up parsing by caching parsed traces
Pandas is extremely fast at parsing CSV into data frames. Astonishingly, it
takes < 1s to serialize/deserialize 100MB worth of traces with 430,000 events
to/from CSV. We leverage this and write the data frames out to CSV files when
they are first created; the next time, we read them back from the cache if it
exists. To make sure the cache isn't stale, we take the md5sum of the trace
file and also ensure all CSVs exist before reading from the cache. This brings
the parse time of a 100MB trace down from 16s to 1s.
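As an illustration, here is a minimal sketch of the scheme (load_with_cache,
parse_trace and events.csv are hypothetical names for this example only; the
real implementation below keeps one CSV per event class, plus an md5sum file,
in a .<trace>.cache directory next to the trace):

    import hashlib
    import os

    import pandas as pd

    def load_with_cache(trace_path, parse_trace):
        """parse_trace is a stand-in for any callable that parses the
        raw trace and returns a pandas DataFrame of events."""
        trace_dir = os.path.dirname(os.path.abspath(trace_path))
        cache_dir = os.path.join(
            trace_dir, '.' + os.path.basename(trace_path) + '.cache')
        csv_file = os.path.join(cache_dir, 'events.csv')
        md5_file = os.path.join(cache_dir, 'md5sum')

        with open(trace_path, 'rb') as f:
            trace_md5 = hashlib.md5(f.read()).hexdigest()

        # Cache hit: the stored md5sum matches and the CSV exists
        if os.path.exists(md5_file) and os.path.exists(csv_file):
            with open(md5_file) as f:
                if f.read() == trace_md5:
                    return pd.read_csv(csv_file, index_col=0)

        # Cache miss or stale cache: parse the trace and repopulate
        dfr = parse_trace(trace_path)
        if not os.path.isdir(cache_dir):
            os.mkdir(cache_dir)
        dfr.to_csv(csv_file)
        with open(md5_file, 'w') as f:
            f.write(trace_md5)
        return dfr

Storing the md5sum alongside the CSVs means that rewriting the trace file
invalidates the entire cache directory at once.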
Co-developed-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Reviewed-by: KP Singh <kpsingh@google.com>
diff --git a/tests/test_caching.py b/tests/test_caching.py
new file mode 100644
index 0000000..a8e3ac1
--- /dev/null
+++ b/tests/test_caching.py
@@ -0,0 +1,99 @@
+# Copyright 2015-2017 ARM Limited, Google and contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import unittest
+import utils_tests
+import trappy
+from trappy.ftrace import GenericFTrace
+
+class TestCaching(utils_tests.SetupDirectory):
+ def __init__(self, *args, **kwargs):
+ super(TestCaching, self).__init__(
+ [("trace_sched.txt", "trace.txt"),
+ ("trace_sched.txt", "trace.raw.txt")],
+ *args,
+ **kwargs)
+
+ def test_cache_created(self):
+ """Test cache creation when enabled"""
+ GenericFTrace.disable_cache = False
+ trace = trappy.FTrace()
+
+ trace_path = os.path.abspath(trace.trace_path)
+ trace_dir = os.path.dirname(trace_path)
+ trace_file = os.path.basename(trace_path)
+ cache_dir = '.' + trace_file + '.cache'
+
+        self.assertIn(cache_dir, os.listdir(trace_dir))
+
+ def test_cache_not_created(self):
+ """Test that cache should not be created when disabled """
+ GenericFTrace.disable_cache = True
+ trace = trappy.FTrace()
+
+ trace_path = os.path.abspath(trace.trace_path)
+ trace_dir = os.path.dirname(trace_path)
+ trace_file = os.path.basename(trace_path)
+ cache_dir = '.' + trace_file + '.cache'
+
+        self.assertNotIn(cache_dir, os.listdir(trace_dir))
+
+ def test_compare_cached_vs_uncached(self):
+ """ Test that the cached and uncached traces are same """
+ # Build the cache, but the actual trace will be parsed
+ # fresh since this is a first time parse
+ GenericFTrace.disable_cache = False
+ uncached_trace = trappy.FTrace()
+ uncached_dfr = uncached_trace.sched_wakeup.data_frame
+
+ # Now read from previously parsed cache by reusing the path
+ cached_trace = trappy.FTrace(uncached_trace.trace_path)
+ cached_dfr = cached_trace.sched_wakeup.data_frame
+
+        # Test whether the timestamps are the same:
+        # The cached and uncached versions of the timestamps differ
+        # slightly due to floating-point precision errors introduced by
+        # converting back and forth between CSV and DataFrame. For all
+        # practical purposes this is irrelevant, since such rounding
+        # doesn't affect the end result. Here's an example of the error:
+        # the actual normalized time, calculated by hand, is 0.081489,
+        # but the following is what's stored in the CSV for sched_wakeup
+        # events in this trace.
+        # When converting the index to strings (also what's in the CSV):
+        # cached:   ['0.0814890000001', '1.981491']
+        # uncached: ['0.0814890000001', '1.981491']
+        #
+        # Keeping the index as numpy.float64:
+        # cached:   [0.081489000000100009, 1.9814909999999999]
+        # uncached: [0.081489000000146916, 1.9814909999995507]
+        #
+        # To make the comparison testable, convert the timestamps to strings and compare those below.
+
+ cached_times = [str(r[0]) for r in cached_dfr.iterrows()]
+ uncached_times = [str(r[0]) for r in uncached_dfr.iterrows()]
+
+        self.assertEqual(cached_times, uncached_times)
+
+        # Compare the other columns as well
+        self.assertEqual([r[1].pid for r in cached_dfr.iterrows()],
+                         [r[1].pid for r in uncached_dfr.iterrows()])
+
+        self.assertEqual([r[1].comm for r in cached_dfr.iterrows()],
+                         [r[1].comm for r in uncached_dfr.iterrows()])
+
+        self.assertEqual([r[1].prio for r in cached_dfr.iterrows()],
+                         [r[1].prio for r in uncached_dfr.iterrows()])
diff --git a/tests/utils_tests.py b/tests/utils_tests.py
index 617cfa3..e13b868 100644
--- a/tests/utils_tests.py
+++ b/tests/utils_tests.py
@@ -19,6 +19,8 @@
import shutil
import subprocess
import tempfile
+import trappy
+from trappy.ftrace import GenericFTrace
TESTS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
@@ -36,6 +38,7 @@
def __init__(self, files_to_copy, *args, **kwargs):
self.files_to_copy = files_to_copy
super(SetupDirectory, self).__init__(*args, **kwargs)
+ GenericFTrace.disable_cache = True
def setUp(self):
self.previous_dir = os.getcwd()
diff --git a/trappy/base.py b/trappy/base.py
index 4f06af2..4f2fb21 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -113,6 +113,7 @@
self.pid_array = []
self.cpu_array = []
self.parse_raw = parse_raw
+ self.cached = False
def finalize_object(self):
pass
@@ -259,6 +260,14 @@
"""
self.data_frame.to_csv(fname)
+ def read_csv(self, fname):
+ """Read the csv data into a DataFrame
+
+ :param fname: The name of the CSV file
+ :type fname: str
+ """
+        self.data_frame = pd.read_csv(fname, index_col=0)
+
def normalize_time(self, basetime):
"""Substract basetime from the Time of the data frame
diff --git a/trappy/ftrace.py b/trappy/ftrace.py
index ea435f5..2176d30 100644
--- a/trappy/ftrace.py
+++ b/trappy/ftrace.py
@@ -21,6 +21,9 @@
import os
import re
import pandas as pd
+import hashlib
+import shutil
+import warnings
from trappy.bare_trace import BareTrace
from trappy.utils import listify
@@ -62,6 +65,49 @@
dynamic_classes = {}
+ disable_cache = False
+
+ def _trace_cache_path(self):
+ trace_file = self.trace_path
+ cache_dir = '.' + os.path.basename(trace_file) + '.cache'
+ tracefile_dir = os.path.dirname(os.path.abspath(trace_file))
+ cache_path = os.path.join(tracefile_dir, cache_dir)
+ return cache_path
+
+ def _check_trace_cache(self):
+ cache_path = self._trace_cache_path()
+ md5file = os.path.join(cache_path, 'md5sum')
+
+ if not os.path.exists(cache_path) or not os.path.exists(md5file):
+ return False
+
+ with open(md5file) as f:
+ cache_md5sum = f.read()
+ with open(self.trace_path, 'rb') as f:
+ trace_md5sum = hashlib.md5(f.read()).hexdigest()
+
+        # The trace file changed: the cache is stale, so remove it
+        if cache_md5sum != trace_md5sum:
+            shutil.rmtree(cache_path)
+            return False
+ return True
+
+    def _create_trace_cache(self):
+        cache_path = self._trace_cache_path()
+        md5file = os.path.join(cache_path, 'md5sum')
+
+        if os.path.exists(cache_path):
+            shutil.rmtree(cache_path)
+        os.mkdir(cache_path)
+        with open(self.trace_path, 'rb') as f:
+            md5sum = hashlib.md5(f.read()).hexdigest()
+        with open(md5file, 'w') as f:
+            f.write(md5sum)
+
+ def _get_csv_path(self, trace_class):
+ path = self._trace_cache_path()
+ return os.path.join(path, trace_class.__class__.__name__ + '.csv')
+
def __init__(self, name="", normalize_time=True, scope="all",
events=[], window=(0, None), abs_window=(0, None)):
super(GenericFTrace, self).__init__(name)
@@ -127,7 +173,35 @@
del scope_classes[name]
def _do_parse(self):
+        if not self.__class__.disable_cache and self._check_trace_cache():
+            # Read the cached CSVs into data frames
+            for trace_class in self.trace_classes:
+                try:
+                    csv_file = self._get_csv_path(trace_class)
+                    trace_class.read_csv(csv_file)
+                    trace_class.cached = True
+                except Exception:
+                    warnstr = "TRAPpy: Couldn't read {} from cache, reading it from trace".format(trace_class)
+                    warnings.warn(warnstr)
+
self.__parse_trace_file(self.trace_path)
+
+        if not self.__class__.disable_cache:
+            try:
+                # Recreate the cache directory if any event class wasn't cached
+                if not all([c.cached for c in self.trace_classes]):
+                    self._create_trace_cache()
+
+                # Write out only the events that weren't cached before
+                for trace_class in self.trace_classes:
+                    if trace_class.cached:
+                        continue
+                    csv_file = self._get_csv_path(trace_class)
+                    trace_class.write_csv(csv_file)
+            except OSError as err:
+                warnings.warn(
+                    "TRAPpy: Cache not created due to OS error: {0}".format(err))
+
self.finalize_objects()
if self.normalize_time:
@@ -265,6 +339,8 @@
cls_for_unique_word = {}
for trace_name in self.class_definitions.iterkeys():
trace_class = getattr(self, trace_name)
+ if trace_class.cached:
+ continue
unique_word = trace_class.unique_word
cls_for_unique_word[unique_word] = trace_class