Brendan Jackman | e81fdcb | 2017-01-04 17:10:29 +0000 | [diff] [blame] | 1 | # Copyright 2015-2017 ARM Limited |
Javi Merino | b95a4c5 | 2015-11-26 11:51:53 +0000 | [diff] [blame] | 2 | # |
| 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | # you may not use this file except in compliance with the License. |
| 5 | # You may obtain a copy of the License at |
| 6 | # |
| 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | # |
| 9 | # Unless required by applicable law or agreed to in writing, software |
| 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | # See the License for the specific language governing permissions and |
| 13 | # limitations under the License. |
| 14 | # |
| 15 | |
Joel Fernandes | 89ce9a0 | 2017-07-08 13:38:55 -0700 | [diff] [blame] | 16 | import pandas as pd |
| 17 | import numpy as np |
| 18 | |
Javi Merino | b95a4c5 | 2015-11-26 11:51:53 +0000 | [diff] [blame] | 19 | """Generic functions that can be used in multiple places in trappy |
| 20 | """ |
| 21 | |
def listify(to_select):
    """Utility function to normalise single values and lists.

    :param to_select: a single object or a list of objects
    :return: ``to_select`` unchanged if it is already a list,
        otherwise ``[to_select]``
    """
    return to_select if isinstance(to_select, list) else [to_select]
Kapileshwar Singh | 6f3c26c | 2015-12-06 18:23:13 +0000 | [diff] [blame] | 31 | |
def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input
    :type data: :mod:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` changes the duplicate values
    to
    ::

        >>> import pandas
        >>> from trappy.utils import handle_duplicate_index

        >>> values = [0, 1, 2, 3, 4]
        >>> index = [0.0, 1.0, 1.0, 6.0, 7.0]
        >>> series = pandas.Series(values, index=index)
        >>> series = handle_duplicate_index(series)
        >>> print(series.index.values)
        [ 0.        1.        1.000001  6.        7.      ]

    """

    index = data.index
    new_index = index.values

    # Unique index values that occur more than once.
    # (Index.get_duplicates() was deprecated and then removed in
    # pandas 1.0; index[index.duplicated()].unique() is the
    # documented replacement.)
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # Leave the first of each run of duplicates intact.
        # searchsorted assumes the index is sorted, as did the
        # original implementation.
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate the delta that needs to be added to each duplicate
        # index so they spread evenly up to the next index value
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # Add i * delta to the i-th duplicate.  Linear spacing keeps
        # every adjusted value strictly below dup + num_dups * delta,
        # i.e. below the next index value.  (The previous code doubled
        # delta on every step, so with four or more duplicates the last
        # offset reached the next index value and re-created a
        # duplicate.)
        for i in range(1, num_dups):
            new_index[dup_index_left + i] += i * delta

    return data.reindex(new_index)
Joel Fernandes | 89ce9a0 | 2017-07-08 13:38:55 -0700 | [diff] [blame] | 108 | |
# Iterate fast over all rows in a data frame and apply fn
def apply_callback(df, fn, *kwargs):
    """Call ``fn`` once for every row of ``df``, in order.

    Each row is presented to ``fn`` as a dict mapping column name to
    value, with the row label stored under the key ``'Time'``.

    :param df: the data frame whose rows are visited
    :type df: :mod:`pandas.DataFrame`

    :param fn: callback invoked as ``fn(row_dict)``, or as
        ``fn(row_dict, extra_args_tuple)`` when extra positional
        arguments were supplied

    :param kwargs: optional extra arguments forwarded to ``fn`` as a
        single tuple (the name is kept for backward compatibility;
        these are positional arguments, not keyword arguments)
    """
    # Column names beginning with underscore will not be preserved in tuples
    # due to constraints on namedtuple field names, so store mappings from
    # column name to column number for each trace event.  Position 0 of
    # each tuple from itertuples() is the row label, exposed as 'Time'.
    col_idxs = {name: idx
                for idx, name in enumerate(['Time'] + df.columns.tolist())}

    iters = df.itertuples()
    # next(it, None) instead of it.next(): works on Python 3 and
    # handles an empty frame gracefully instead of raising
    event_tuple = next(iters, None)

    while event_tuple is not None:
        event_dict = {col: event_tuple[idx] for col, idx in col_idxs.items()}

        if kwargs:
            fn(event_dict, kwargs)
        else:
            fn(event_dict)

        event_tuple = next(iters, None)
| 131 | |
def merge_dfs(pr_df, sec_df, pivot):
    """Merge a primary and a secondary data frame row by row.

    Rows of both frames are interleaved in ``__line`` order.  Each
    primary row has its NaN columns filled in from the most recent
    secondary row that shares the same ``pivot`` value; secondary rows
    only update that per-pivot state and do not appear in the output.

    :param pr_df: the primary data frame; its (filled-in) rows form the
        result.  Must contain a ``__line`` column (assumed by the sort
        below — TODO confirm against callers).
    :type pr_df: :mod:`pandas.DataFrame`

    :param sec_df: the secondary data frame providing fill-in values
    :type sec_df: :mod:`pandas.DataFrame`

    :param pivot: name of the column used to match primary rows with
        secondary rows

    :return: a new data frame of the merged primary rows, indexed by
        ``Time``
    """
    # Keep track of the last secondary event seen for each pivot value
    pivot_map = {}

    # An array accumulating dicts with merged data
    merged_data = []

    def df_fn(data):
        # pd.concat(..., keys=[...]) below labels every row with a
        # ('primary'|'secondary', time) tuple in data['Time']
        if data['Time'][0] == 'secondary':
            # Store the latest secondary info
            pivot_map[data[pivot]] = data
            # Get rid of primary/secondary labels
            data['Time'] = data['Time'][1]
            return

        # Propagate latest secondary info into the primary row
        for key, value in data.items():
            if key == pivot:
                continue
            # value != value is a fast NaN check (NaN is the only value
            # not equal to itself; faster than np.isnan + try/except).
            # dict.has_key() was removed in Python 3 — use "in".
            if value != value and data[pivot] in pivot_map:
                data[key] = pivot_map[data[pivot]][key]

        # Get rid of primary/secondary labels
        data['Time'] = data['Time'][1]
        merged_data.append(data)

    df = pd.concat([pr_df, sec_df],
                   keys=['primary', 'secondary']).sort_values(by='__line')
    apply_callback(df, df_fn)
    merged_df = pd.DataFrame.from_dict(merged_data)
    merged_df.set_index('Time', inplace=True)

    return merged_df