trappy: Speed up parsing by caching parsed traces
Pandas is extremely fast at parsing CSV into data frames. Astonishingly, it
takes < 1s to serialize/deserialize 100MB worth of traces with 430,000 events
to/from CSV. We leverage this and write the data frames out to CSV files when
they are first created; the next time, we read them back from the cache if it
exists. To make sure the cache isn't stale, we take the md5sum of the trace
file and also ensure all CSVs exist before reading from the cache. This brings
the parse time of a 100MB trace down from 16s to 1s.
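As an illustration, here is a minimal sketch of the scheme (load_with_cache,
parse_trace and events.csv are hypothetical names for this example only; the
real implementation below keeps one CSV per event class, plus an md5sum file,
in a .<trace>.cache directory next to the trace):

    import hashlib
    import os

    import pandas as pd

    def load_with_cache(trace_path, parse_trace):
        """parse_trace is a stand-in for any callable that parses the
        raw trace and returns a pandas DataFrame of events."""
        trace_dir = os.path.dirname(os.path.abspath(trace_path))
        cache_dir = os.path.join(
            trace_dir, '.' + os.path.basename(trace_path) + '.cache')
        csv_file = os.path.join(cache_dir, 'events.csv')
        md5_file = os.path.join(cache_dir, 'md5sum')

        with open(trace_path, 'rb') as f:
            trace_md5 = hashlib.md5(f.read()).hexdigest()

        # Cache hit: the stored md5sum matches and the CSV exists
        if os.path.exists(md5_file) and os.path.exists(csv_file):
            with open(md5_file) as f:
                if f.read() == trace_md5:
                    return pd.read_csv(csv_file, index_col=0)

        # Cache miss or stale cache: parse the trace and repopulate
        dfr = parse_trace(trace_path)
        if not os.path.isdir(cache_dir):
            os.mkdir(cache_dir)
        dfr.to_csv(csv_file)
        with open(md5_file, 'w') as f:
            f.write(trace_md5)
        return dfr

Storing the md5sum alongside the CSVs means that rewriting the trace file
invalidates the entire cache directory at once.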
Co-developed-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Reviewed-by: KP Singh <kpsingh@google.com>
diff --git a/tests/test_caching.py b/tests/test_caching.py
new file mode 100644
index 0000000..a8e3ac1
--- /dev/null
+++ b/tests/test_caching.py
@@ -0,0 +1,99 @@
+# Copyright 2015-2017 ARM Limited, Google and contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import unittest
+import utils_tests
+import trappy
+from trappy.ftrace import GenericFTrace
+
+class TestCaching(utils_tests.SetupDirectory):
+ def __init__(self, *args, **kwargs):
+ super(TestCaching, self).__init__(
+ [("trace_sched.txt", "trace.txt"),
+ ("trace_sched.txt", "trace.raw.txt")],
+ *args,
+ **kwargs)
+
+ def test_cache_created(self):
+ """Test cache creation when enabled"""
+ GenericFTrace.disable_cache = False
+ trace = trappy.FTrace()
+
+ trace_path = os.path.abspath(trace.trace_path)
+ trace_dir = os.path.dirname(trace_path)
+ trace_file = os.path.basename(trace_path)
+ cache_dir = '.' + trace_file + '.cache'
+
+        self.assertIn(cache_dir, os.listdir(trace_dir))
+
+ def test_cache_not_created(self):
+ """Test that cache should not be created when disabled """
+ GenericFTrace.disable_cache = True
+ trace = trappy.FTrace()
+
+ trace_path = os.path.abspath(trace.trace_path)
+ trace_dir = os.path.dirname(trace_path)
+ trace_file = os.path.basename(trace_path)
+ cache_dir = '.' + trace_file + '.cache'
+
+        self.assertNotIn(cache_dir, os.listdir(trace_dir))
+
+ def test_compare_cached_vs_uncached(self):
+ """ Test that the cached and uncached traces are same """
+ # Build the cache, but the actual trace will be parsed
+ # fresh since this is a first time parse
+ GenericFTrace.disable_cache = False
+ uncached_trace = trappy.FTrace()
+ uncached_dfr = uncached_trace.sched_wakeup.data_frame
+
+ # Now read from previously parsed cache by reusing the path
+ cached_trace = trappy.FTrace(uncached_trace.trace_path)
+ cached_dfr = cached_trace.sched_wakeup.data_frame
+
+        # Test whether the timestamps are the same:
+        # The cached and uncached versions of the timestamps differ
+        # slightly due to floating-point precision errors introduced by
+        # converting back and forth between CSV and DataFrame. For all
+        # practical purposes this is irrelevant, since such rounding
+        # doesn't affect the end result. Here's an example of the error:
+        # the actual normalized time, calculated by hand, is 0.081489,
+        # but the following is what's stored in the CSV for sched_wakeup
+        # events in this trace.
+        # When converting the index to strings (also what's in the CSV):
+        # cached:   ['0.0814890000001', '1.981491']
+        # uncached: ['0.0814890000001', '1.981491']
+        #
+        # Keeping the index as numpy.float64:
+        # cached:   [0.081489000000100009, 1.9814909999999999]
+        # uncached: [0.081489000000146916, 1.9814909999995507]
+        #
+        # To make the comparison testable, convert the timestamps to strings and compare those below.
+
+ cached_times = [str(r[0]) for r in cached_dfr.iterrows()]
+ uncached_times = [str(r[0]) for r in uncached_dfr.iterrows()]
+
+        self.assertEqual(cached_times, uncached_times)
+
+        # Compare the other columns as well
+        self.assertEqual([r[1].pid for r in cached_dfr.iterrows()],
+                         [r[1].pid for r in uncached_dfr.iterrows()])
+
+        self.assertEqual([r[1].comm for r in cached_dfr.iterrows()],
+                         [r[1].comm for r in uncached_dfr.iterrows()])
+
+        self.assertEqual([r[1].prio for r in cached_dfr.iterrows()],
+                         [r[1].prio for r in uncached_dfr.iterrows()])
diff --git a/tests/utils_tests.py b/tests/utils_tests.py
index 617cfa3..e13b868 100644
--- a/tests/utils_tests.py
+++ b/tests/utils_tests.py
@@ -19,6 +19,8 @@
import shutil
import subprocess
import tempfile
+import trappy
+from trappy.ftrace import GenericFTrace
TESTS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
@@ -36,6 +38,7 @@
def __init__(self, files_to_copy, *args, **kwargs):
self.files_to_copy = files_to_copy
super(SetupDirectory, self).__init__(*args, **kwargs)
+ GenericFTrace.disable_cache = True
def setUp(self):
self.previous_dir = os.getcwd()
diff --git a/trappy/base.py b/trappy/base.py
index 4f06af2..4f2fb21 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -113,6 +113,7 @@
self.pid_array = []
self.cpu_array = []
self.parse_raw = parse_raw
+ self.cached = False
def finalize_object(self):
pass
@@ -259,6 +260,14 @@
"""
self.data_frame.to_csv(fname)
+ def read_csv(self, fname):
+ """Read the csv data into a DataFrame
+
+ :param fname: The name of the CSV file
+ :type fname: str
+ """
+        self.data_frame = pd.read_csv(fname, index_col=0)
+
def normalize_time(self, basetime):
"""Substract basetime from the Time of the data frame
diff --git a/trappy/ftrace.py b/trappy/ftrace.py
index ea435f5..2176d30 100644
--- a/trappy/ftrace.py
+++ b/trappy/ftrace.py
@@ -21,6 +21,9 @@
import os
import re
import pandas as pd
+import hashlib
+import shutil
+import warnings
from trappy.bare_trace import BareTrace
from trappy.utils import listify
@@ -62,6 +65,49 @@
dynamic_classes = {}
+ disable_cache = False
+
+ def _trace_cache_path(self):
+ trace_file = self.trace_path
+ cache_dir = '.' + os.path.basename(trace_file) + '.cache'
+ tracefile_dir = os.path.dirname(os.path.abspath(trace_file))
+ cache_path = os.path.join(tracefile_dir, cache_dir)
+ return cache_path
+
+ def _check_trace_cache(self):
+ cache_path = self._trace_cache_path()
+ md5file = os.path.join(cache_path, 'md5sum')
+
+ if not os.path.exists(cache_path) or not os.path.exists(md5file):
+ return False
+
+ with open(md5file) as f:
+ cache_md5sum = f.read()
+ with open(self.trace_path, 'rb') as f:
+ trace_md5sum = hashlib.md5(f.read()).hexdigest()
+
+        # The trace file changed: the cache is stale, so remove it
+        if cache_md5sum != trace_md5sum:
+            shutil.rmtree(cache_path)
+            return False
+ return True
+
+    def _create_trace_cache(self):
+        cache_path = self._trace_cache_path()
+        md5file = os.path.join(cache_path, 'md5sum')
+
+        if os.path.exists(cache_path):
+            shutil.rmtree(cache_path)
+        os.mkdir(cache_path)
+        with open(self.trace_path, 'rb') as f:
+            md5sum = hashlib.md5(f.read()).hexdigest()
+        with open(md5file, 'w') as f:
+            f.write(md5sum)
+
+ def _get_csv_path(self, trace_class):
+ path = self._trace_cache_path()
+ return os.path.join(path, trace_class.__class__.__name__ + '.csv')
+
def __init__(self, name="", normalize_time=True, scope="all",
events=[], window=(0, None), abs_window=(0, None)):
super(GenericFTrace, self).__init__(name)
@@ -127,7 +173,35 @@
del scope_classes[name]
def _do_parse(self):
+        if not self.__class__.disable_cache and self._check_trace_cache():
+            # Read the cached CSVs into data frames
+            for trace_class in self.trace_classes:
+                try:
+                    csv_file = self._get_csv_path(trace_class)
+                    trace_class.read_csv(csv_file)
+                    trace_class.cached = True
+                except Exception:
+                    warnstr = "TRAPpy: Couldn't read {} from cache, reading it from trace".format(trace_class)
+                    warnings.warn(warnstr)
+
self.__parse_trace_file(self.trace_path)
+
+        if not self.__class__.disable_cache:
+            try:
+                # Recreate the cache directory if any event class wasn't cached
+                if not all([c.cached for c in self.trace_classes]):
+                    self._create_trace_cache()
+
+                # Write out only the events that weren't cached before
+                for trace_class in self.trace_classes:
+                    if trace_class.cached:
+                        continue
+                    csv_file = self._get_csv_path(trace_class)
+                    trace_class.write_csv(csv_file)
+            except OSError as err:
+                warnings.warn(
+                    "TRAPpy: Cache not created due to OS error: {0}".format(err))
+
self.finalize_objects()
if self.normalize_time:
@@ -265,6 +339,8 @@
cls_for_unique_word = {}
for trace_name in self.class_definitions.iterkeys():
trace_class = getattr(self, trace_name)
+ if trace_class.cached:
+ continue
unique_word = trace_class.unique_word
cls_for_unique_word[unique_word] = trace_class