# Copyright 2015-2017 ARM Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Generic functions that can be used in multiple places in trappy
"""

import pandas as pd


def listify(to_select):
    """Utility function to handle both single and
    list inputs
    """

    if not isinstance(to_select, list):
        to_select = [to_select]

    return to_select
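

# Usage sketch for listify() (illustrative event names):
#
#     >>> listify("sched_switch")
#     ['sched_switch']
#     >>> listify(["sched_switch", "cpu_frequency"])
#     ['sched_switch', 'cpu_frequency']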


def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input
    :type data: :class:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` changes the duplicate values
    to:
    ::

        >>> import pandas
        >>> from trappy.utils import handle_duplicate_index
        >>> values = [0, 1, 2, 3, 4]
        >>> index = [0.0, 1.0, 1.0, 6.0, 7.0]
        >>> series = pandas.Series(values, index=index)
        >>> series = handle_duplicate_index(series)
        >>> print(series.index.values)
        [ 0. 1. 1.000001 6. 7. ]
    """

    index = data.index
    new_index = index.values.copy()

    # Index.get_duplicates() was removed from pandas; this spelling is
    # equivalent and yields each duplicated value once.
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # Leave one of the values intact
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate the delta that needs to be added to each duplicate
        # index
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # Add a delta to the others; the delta doubles for each
        # successive duplicate so the adjusted indices stay ordered
        dup_index_left += 1
        while dup_index_left <= dup_index_right:
            new_index[dup_index_left] += delta
            delta += delta
            dup_index_left += 1

    # Return a copy with the adjusted index.  The index is assigned
    # directly (rather than via reindex) because the adjusted values do
    # not exist as labels in the original, duplicated index.
    data = data.copy()
    data.index = pd.Index(new_index)
    return data
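

# Worked example of the delta logic above (illustrative numbers): for an
# index of [1.0, 1.0, 1.0, 2.0], dup == 1.0, num_dups == 3 and
# delta == (2.0 - 1.0) / 3, which is clamped to max_delta (1e-6).  The
# first 1.0 is left intact, the second becomes 1.000001 and, after the
# delta doubles, the third becomes 1.000002.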


# Iterate fast over all rows in a data frame and apply fn to each row
def apply_callback(df, fn, *args):
    iters = df.itertuples()
    event_tuple = next(iters, None)

    # Column names beginning with underscore will not be preserved in tuples
    # due to constraints on namedtuple field names, so store mappings from
    # column name to column number for each trace event.  Position 0 of
    # each tuple holds the data frame index, which trappy exposes as 'Time'.
    col_idxs = {name: idx for idx, name in
                enumerate(['Time'] + df.columns.tolist())}

    while event_tuple:
        event_dict = {col: event_tuple[idx] for col, idx in col_idxs.items()}

        # Any extra positional arguments are passed through to fn as a
        # single tuple, preserving the original calling convention.
        if args:
            fn(event_dict, args)
        else:
            fn(event_dict)

        event_tuple = next(iters, None)
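

# Minimal usage sketch for apply_callback() (hypothetical data): collect
# each row as a dict keyed by column name, with the index exposed as 'Time':
#
#     rows = []
#     df = pd.DataFrame({'load': [10, 20]}, index=[0.1, 0.2])
#     apply_callback(df, rows.append)
#     # rows == [{'Time': 0.1, 'load': 10}, {'Time': 0.2, 'load': 20}]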


def merge_dfs(pr_df, sec_df, pivot):
    """Merge two event data frames, interleaving them by their '__line'
    column and filling NaN columns of primary events with the most recent
    secondary event data for the same pivot column value.
    """
    # Keep track of the last secondary event seen for each pivot value
    pivot_map = {}

    # An array accumulating dicts with merged data
    merged_data = []

    def df_fn(data):
        # Store the latest secondary info
        if data['Time'][0] == 'secondary':
            pivot_map[data[pivot]] = data
            # Get rid of primary/secondary labels
            data['Time'] = data['Time'][1]
            return

        # Propagate latest secondary info
        for key, value in data.items():
            if key == pivot:
                continue
            # Fast check for whether value is NaN (faster than
            # np.isnan + try/except)
            if value != value and data[pivot] in pivot_map:
                data[key] = pivot_map[data[pivot]][key]

        # Get rid of primary/secondary labels
        data['Time'] = data['Time'][1]
        merged_data.append(data)

    df = pd.concat([pr_df, sec_df],
                   keys=['primary', 'secondary']).sort_values(by='__line')
    apply_callback(df, df_fn)

    merged_df = pd.DataFrame.from_dict(merged_data)
    merged_df.set_index('Time', inplace=True)

    return merged_df
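

# Minimal usage sketch for merge_dfs() (hypothetical trace data).  Primary
# events carry NaN placeholders that are filled from the most recent
# secondary event with the same pivot ('cpu') value:
#
#     import numpy as np
#
#     pr_df = pd.DataFrame({'cpu': [0, 0], 'load': [np.nan, np.nan],
#                           '__line': [0, 2]}, index=[0.1, 0.3])
#     sec_df = pd.DataFrame({'cpu': [0], 'load': [42.0],
#                            '__line': [1]}, index=[0.2])
#     merged = merge_dfs(pr_df, sec_df, 'cpu')
#     # merged['load'] is NaN at Time 0.1 (no secondary data seen yet)
#     # and 42.0 at Time 0.3 (propagated from the event at Time 0.2).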