blob: a06ff1d1cabbc10d51da885028dd13b45f51cf49 [file] [log] [blame]
Brendan Jackmane81fdcb2017-01-04 17:10:29 +00001# Copyright 2015-2017 ARM Limited
Javi Merinob95a4c52015-11-26 11:51:53 +00002#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15
Joel Fernandes89ce9a02017-07-08 13:38:55 -070016import pandas as pd
17import numpy as np
18
Javi Merinob95a4c52015-11-26 11:51:53 +000019"""Generic functions that can be used in multiple places in trappy
20"""
21
def listify(to_select):
    """Utility helper accepting either a single value or a list.

    :param to_select: a single item or a list of items
    :return: ``to_select`` unchanged if it is already a list,
        otherwise ``to_select`` wrapped in a one-element list
    """

    return to_select if isinstance(to_select, list) else [to_select]
Kapileshwar Singh6f3c26c2015-12-06 18:23:13 +000031
def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input
    :type data: :mod:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` changes the duplicate values
    to
    ::

        >>> import pandas
        >>> from trappy.utils import handle_duplicate_index

        >>> values = [0, 1, 2, 3, 4]
        >>> index = [0.0, 1.0, 1.0, 6.0, 7.0]
        >>> series = pandas.Series(values, index=index)
        >>> series = handle_duplicate_index(series)
        >>> print(series.index.values)
        [ 0.        1.        1.000001  6.        7.      ]

    """

    index = data.index
    # Work on a copy so the caller's index is never mutated in place.
    # (The previous implementation relied on mutating the underlying
    # index buffer, which silently modified the input series and is
    # not guaranteed to work on modern pandas, where .values may be
    # a copy.)
    new_index = index.values.copy()

    # Unique index values that occur more than once.  The index is
    # assumed to be sorted (as trace timestamps are), which
    # searchsorted below also relies on.
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # Positions of the first and last occurrence of this value
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate delta that needs to be added to each duplicate
        # index so the duplicates are spread evenly before the next
        # distinct index value
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # Leave the first occurrence intact and shift the k-th
        # duplicate by k * delta, keeping the index strictly
        # increasing.  (The previous code doubled delta on every
        # step, which for 4 or more duplicates could overshoot onto
        # the next index value and re-create duplicates.)
        for k in range(1, num_dups):
            new_index[dup_index_left + k] += k * delta

    # Return a copy carrying the de-duplicated index.  reindex() can
    # not be used here: the original (still duplicated) index would
    # not align with the new labels.
    new_data = data.copy()
    new_data.index = pd.Index(new_index)
    return new_data
Joel Fernandes89ce9a02017-07-08 13:38:55 -0700108
Joel Fernandes49f4c422017-07-10 19:33:09 -0700109# Iterate fast over all rows in a data frame and apply fn
110def apply_callback(df, fn, *kwargs):
111 iters = df.itertuples()
112 event_tuple = iters.next()
113
114 # Column names beginning with underscore will not be preserved in tuples
115 # due to constraints on namedtuple field names, so store mappings from
116 # column name to column number for each trace event.
117 col_idxs = { name: idx for idx, name in enumerate(['Time'] + df.columns.tolist()) }
118
119 while True:
120 if not event_tuple:
121 break
122 event_dict = { col: event_tuple[idx] for col, idx in col_idxs.iteritems() }
123
124 if kwargs:
125 fn(event_dict, kwargs)
126 else:
127 fn(event_dict)
128
129 event_tuple = next(iters, None)
130
131
Joel Fernandes89ce9a02017-07-08 13:38:55 -0700132def merge_dfs(pr_df, sec_df, pivot):
133 # Keep track of last secondary event
134 pivot_map = {}
135
136 # An array accumating dicts with merged data
137 merged_data = []
138 def df_fn(data):
139 # Store the latest secondary info
140 if data['Time'][0] == 'secondary':
141 pivot_map[data[pivot]] = data
142 # Get rid of primary/secondary labels
143 data['Time'] = data['Time'][1]
144 return
145
146 # Propogate latest secondary info
147 for key, value in data.iteritems():
148 if key == pivot:
149 continue
Joel Fernandeseffd7b52017-07-13 21:48:09 -0700150 # Fast check for if value is nan (faster than np.isnan + try/except)
151 if value != value and pivot_map.has_key(data[pivot]):
152 data[key] = pivot_map[data[pivot]][key]
Joel Fernandes89ce9a02017-07-08 13:38:55 -0700153
154 # Get rid of primary/secondary labels
155 data['Time'] = data['Time'][1]
156 merged_data.append(data)
157
Joel Fernandes6d691882017-07-15 00:59:27 -0700158 df = pd.concat([pr_df, sec_df], keys=['primary', 'secondary']).sort_values(by='__line')
Joel Fernandes49f4c422017-07-10 19:33:09 -0700159 apply_callback(df, df_fn)
Joel Fernandes89ce9a02017-07-08 13:38:55 -0700160 merged_df = pd.DataFrame.from_dict(merged_data)
161 merged_df.set_index('Time', inplace=True)
162
163 return merged_df