Evelina Dumitrescu | c7faa09 | 2016-09-28 15:13:29 -0700 | [diff] [blame] | 1 | # Copyright 2016 The Chromium OS Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | """Utility functions for parsing pprof, CWP data and Chrome OS groups files.""" |
| 5 | |
| 6 | from collections import defaultdict |
| 7 | |
| 8 | import csv |
| 9 | import os |
| 10 | import re |
| 11 | |
# Regular expressions used when scanning the pprof text output.
#
# A separator line: a run of dashes, a plus sign, then another run of dashes.
SEPARATOR_REGEX = re.compile(r'-+\+-+')
# Five whitespace-separated fields; the 2nd, 3rd and 5th are followed by '%'.
FUNCTION_STATISTIC_REGEX = re.compile(
    r'(\S+)\s+(\S+)%\s+(\S+)%\s+(\S+)\s+(\S+)%')
# A decimal number immediately followed by '%'.
CHILD_FUNCTION_PERCENTAGE_REGEX = re.compile(r'([0-9.]+)%')
# The '|' column delimiter together with the whitespace that follows it.
FUNCTION_KEY_SEPARATOR_REGEX = re.compile(r'\|\s+')

# Markers telling whether a function is common to the pprof and CWP files.
COMMON_FUNCTION = 'common'
EXTRA_FUNCTION = 'extra'

# Delimiter between the parent and the child function names.
PARENT_CHILD_FUNCTIONS_SEPARATOR = ';;'

# Ordered (old, new) substitutions applied to file names so that the CWP and
# pprof data become consistent. The order matters: the more specific patterns
# come before the generic ' ../' and '../' ones.
FILE_NAME_REPLACING_PAIR_STRINGS = [
    ('gnawty', 'BOARD'),
    ('amd64-generic', 'BOARD'),
    (' ../sysdeps', ',sysdeps'),
    (' ../nptl', ',nptl'),
    (' aes-x86_64.s', ',aes-x86_64.s'),
    (' (inline)', ''),
    (' (partial-inline)', ''),
    (' ../', ','),
    ('../', ''),
]

# Separator used to delimit the function from the file name.
FUNCTION_FILE_SEPARATOR = ' /'
| 35 | |
| 36 | |
def MakeCWPAndPprofFileNamesConsistent(file_name):
  """Rewrites a file name so the CWP and pprof spellings agree.

  The path recorded for one and the same function can differ slightly between
  the CWP data and the pprof output. Every ', ' is first turned into '; ';
  afterwards each (old, new) pair of FILE_NAME_REPLACING_PAIR_STRINGS is
  applied in list order, replacing all occurrences of old with new.

  Args:
    file_name: A string representing the name of the file.

  Returns:
    A string representing the modified name of the file.
  """
  consistent_name = file_name.replace(', ', '; ')

  for old_text, new_text in FILE_NAME_REPLACING_PAIR_STRINGS:
    consistent_name = consistent_name.replace(old_text, new_text)

  return consistent_name
| 56 | |
def MakePprofFunctionKey(function_and_file_name):
  """Creates the function key from the function and file name.

  Parsing the pprof --top and --tree outputs is difficult because it is hard
  to tell the function name from the file name (function names can contain
  unexpected characters such as spaces, operators etc). For the moment,
  FUNCTION_FILE_SEPARATOR is used as the delimiter between the function and
  the file name. There are cases where the file name does not start with /
  (i.e ../sysdeps, ../nptl, aes-x86_64.s); those are handled only through the
  string substitutions of MakeCWPAndPprofFileNamesConsistent.

  Args:
    function_and_file_name: A string representing the function and the file
      name as it appears in the pprof output.

  Returns:
    A string representing the function key, composed from the function and
    file name, comma separated.
  """
  # TODO(evelinad): Use pprof --topproto instead of pprof --top to parse
  # protobuffers instead of text output. Investigate if there is an equivalent
  # for pprof --tree that gives protobuffer output.
  #
  # In the CWP output, we replace the , with ; as a workaround for parsing
  # csv files. We do the same for the pprof output.
  #
  # TODO(evelinad): Use dremel --csv_dialect=excel-tab in the queries for
  # replacing the , delimiter with tab.
  function_and_file_name = function_and_file_name.replace(', ', '; ')

  if FUNCTION_FILE_SEPARATOR not in function_and_file_name:
    # No recognizable file part: only the string substitutions are applied.
    return MakeCWPAndPprofFileNamesConsistent(function_and_file_name)

  # Split on the LAST separator only. The function name itself may contain
  # ' /' (for instance a division operator), and an unbounded split would then
  # yield more than two pieces and make the unpacking raise ValueError.
  # NOTE(review): this assumes the file path does not contain ' /' itself.
  function_name, file_name = function_and_file_name.rsplit(
      FUNCTION_FILE_SEPARATOR, 1)
  # The separator swallowed the leading '/', so it is put back before the
  # path is normalized and made consistent with the CWP spelling.
  file_name = MakeCWPAndPprofFileNamesConsistent(
      os.path.normpath('/' + file_name))

  return ','.join([function_name, file_name])
| 100 | |
Evelina Dumitrescu | 731ad07 | 2016-10-20 16:05:38 -0700 | [diff] [blame] | 101 | |
def ComputeCWPCummulativeInclusiveStatistics(cwp_inclusive_count_statistics):
  """Computes the cumulative inclusive count value of a function.

  A function might appear declared in multiple files or objects. When
  computing the fraction of the inclusive count value from a child function to
  the parent function, we take into consideration the sum of the inclusive
  count values from all the occurrences of that function.

  Args:
    cwp_inclusive_count_statistics: A dict containing the inclusive count
      statistics extracted by the ParseCWPInclusiveCountFile method. Each key
      is '<function>,<file>' and element 1 of each value is the inclusive
      count.

  Returns:
    A defaultdict(int) having as a key the name of the function and as a value
    the sum of the inclusive count values of the occurrences of the function
    from all the files and objects.

  Raises:
    ValueError: If a key contains no comma at all.
  """
  cwp_inclusive_count_statistics_cumulative = defaultdict(int)

  # items() exists on both Python 2 and Python 3, unlike iteritems().
  for function_key, function_statistics in \
      cwp_inclusive_count_statistics.items():
    # Only the last comma separates the function from the file; the function
    # name itself may contain commas (e.g. template arguments).
    function_name, _ = function_key.rsplit(',', 1)
    cwp_inclusive_count_statistics_cumulative[function_name] += \
        function_statistics[1]

  return cwp_inclusive_count_statistics_cumulative
| 129 | |
def ComputeCWPChildFunctionsFractions(cwp_inclusive_count_statistics_cumulative,
                                      cwp_pairwise_inclusive_count_statistics):
  """Computes the fractions of the inclusive count values for child functions.

  The fraction represents the inclusive count value of a child function over
  the one of the parent function.

  Args:
    cwp_inclusive_count_statistics_cumulative: A dict containing the
      cumulative inclusive count values of the CWP functions.
    cwp_pairwise_inclusive_count_statistics: A dict containing the inclusive
      count statistics for pairs of parent and child functions. The key is the
      parent function. The value is a dict with the key the name of the child
      function and the file name, comma separated, and the value is the
      inclusive count value of the pair of parent and child functions.

  Returns:
    A dict containing the inclusive count statistics for pairs of parent
    and child functions. The key is the parent function. The value is a
    dict with the key the name of the child function and the file name,
    comma separated, and the value is the inclusive count fraction of the
    child function out of the parent function. The fraction is 0.0 when the
    parent function is unknown or its cumulative inclusive count is zero.
  """
  pairwise_inclusive_count_fractions = {}

  # items() exists on both Python 2 and Python 3, unlike iteritems().
  for parent_function_key, child_functions_metrics in \
      cwp_pairwise_inclusive_count_statistics.items():
    # Converted to float so that two integer counts are never divided with
    # Python 2 floor division. get() is used so that a defaultdict passed as
    # the cumulative statistics does not grow a new entry.
    parent_function_inclusive_count = float(
        cwp_inclusive_count_statistics_cumulative.get(parent_function_key, 0))

    child_functions_fractions = {}
    for child_function_key, child_function_inclusive_count in \
        child_functions_metrics.items():
      if parent_function_inclusive_count:
        child_functions_fractions[child_function_key] = \
            child_function_inclusive_count / parent_function_inclusive_count
      else:
        # Unknown parent, or a parent whose count is zero: no meaningful
        # fraction exists and a division would raise ZeroDivisionError.
        child_functions_fractions[child_function_key] = 0.0

    pairwise_inclusive_count_fractions[parent_function_key] = \
        child_functions_fractions

  return pairwise_inclusive_count_fractions
| 175 | |
def ParseFunctionGroups(cwp_function_groups_lines):
  """Turns the lines of the function groups file into tuples.

  Args:
    cwp_function_groups_lines: A list of the lines contained in the CWP
      function groups file. A line contains the group name and the file path
      that describes the group, separated by a space.

  Returns:
    A list with one tuple per input line, holding the whitespace-separated
    fields of that line (the group name and the file path), in input order.
  """
  # The input order is kept on purpose: a function declared in a file belongs
  # to the first mentioned group whose path matches the one of the file.
  # Several paths may belong to the same group.
  function_groups = []

  for group_line in cwp_function_groups_lines:
    group_fields = group_line.split()
    function_groups.append(tuple(group_fields))

  return function_groups
| 192 | |
| 193 | |
def ParsePprofTopOutput(file_name):
  """Parses a file that contains the output of the pprof --top command.

  Lines that do not match FUNCTION_STATISTIC_REGEX (for instance blank lines)
  are skipped.

  Args:
    file_name: The name of the file containing the pprof --top output.

  Returns:
    A dict having as a key the name of the function and the file containing
    the declaration of the function, separated by a comma, and as a value
    a tuple containing the flat, flat percentage, sum percentage, cumulative
    and cumulative percentage values. The three percentages are stored as
    strings holding the fraction (the percentage divided by 100).
  """
  pprof_top_statistics = {}

  # The first six lines of the pprof top output are skipped; the statistics
  # of the functions come after them.
  with open(file_name) as input_file:
    pprof_top_content = input_file.readlines()[6:]

  for line in pprof_top_content:
    function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    if not function_statistic_match:
      # Not a statistics line; calling groups() on None would raise.
      continue

    flat, flat_p, sum_p, cum, cum_p = function_statistic_match.groups()
    flat_p = str(float(flat_p) / 100.0)
    sum_p = str(float(sum_p) / 100.0)
    cum_p = str(float(cum_p) / 100.0)
    # The function and file name start two characters after the last
    # statistic. Only the line terminator is stripped, so a final line
    # without a trailing newline does not lose its last character.
    lookup_index = function_statistic_match.end()
    function_and_file_name = line[lookup_index + 2:].rstrip('\n')
    key = MakePprofFunctionKey(function_and_file_name)
    pprof_top_statistics[key] = (flat, flat_p, sum_p, cum, cum_p)

  return pprof_top_statistics
| 225 | |
| 226 | |
def ParsePprofTreeOutput(file_name):
  """Parses a file that contains the output of the pprof --tree command.

  Lines that cannot be parsed (no percentage, or no '|' key separator) are
  skipped.

  Args:
    file_name: The name of the file containing the pprof --tree output.

  Returns:
    A dict including the statistics for pairs of parent and child functions.
    The key is the name of the parent function and the file where the
    function is declared, separated by a comma. The value is a dict having as
    a key the name of the child function and the file where the function is
    declared, comma separated and as a value the fraction of time the
    parent function spends in the child function.
  """
  # The first nine lines of the pprof output are skipped; the statistics of
  # the functions come after them.
  with open(file_name) as input_file:
    pprof_tree_content = input_file.readlines()[9:]

  pprof_tree_statistics = defaultdict(lambda: defaultdict(float))
  track_child_functions = False
  parent_function_key = None

  # The statistics of a given function, its parent and child functions are
  # included between two separator marks.
  # All the parent function statistics are above the line containing the
  # statistics of the given function.
  # All the statistics of a child function are below the statistics of the
  # given function.
  # The statistics of a parent or a child function contain the calls, calls
  # percentage, the function name and the file where the function is declared.
  # The statistics of the given function contain the flat, flat percentage,
  # sum percentage, cumulative, cumulative percentage, function name and the
  # name of the file containing the declaration of the function.
  for line in pprof_tree_content:
    if SEPARATOR_REGEX.search(line):
      # A separator closes the current function section.
      track_child_functions = False
      continue

    parent_function_statistic_match = FUNCTION_STATISTIC_REGEX.search(line)

    if parent_function_statistic_match:
      parent_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
          line, pos=parent_function_statistic_match.end())
      if parent_function_key_match:
        # Only the line terminator is stripped, so a last line without a
        # trailing newline keeps its final character.
        parent_function_key = MakePprofFunctionKey(
            line[parent_function_key_match.end():].rstrip('\n'))
        track_child_functions = True
      else:
        # The key of this function cannot be extracted. The lines that follow
        # must not be attributed to the previously seen function.
        track_child_functions = False
      continue

    if not track_child_functions:
      continue

    child_function_statistic_match = \
        CHILD_FUNCTION_PERCENTAGE_REGEX.search(line)
    if not child_function_statistic_match:
      # No percentage on this line (for instance a blank line).
      continue

    child_function_key_match = FUNCTION_KEY_SEPARATOR_REGEX.search(
        line, pos=child_function_statistic_match.end())
    if not child_function_key_match:
      continue

    child_function_percentage = \
        float(child_function_statistic_match.group(1))
    child_function_key = MakePprofFunctionKey(
        line[child_function_key_match.end():].rstrip('\n'))

    pprof_tree_statistics[parent_function_key][child_function_key] += \
        child_function_percentage / 100.0

  return pprof_tree_statistics
| 296 | |
| 297 | |
def ParseCWPInclusiveCountFile(file_name):
  """Parses the CWP inclusive count files.

  The file is read as comma separated values with a header row. The columns
  used are 'function', 'file', 'dso', 'inclusive_count' and
  'inclusive_count_fraction'. Rows in which any of these fields is empty
  (i.e rows that specify only the addresses of the functions and the
  inclusive count values) are ignored.

  Args:
    file_name: The file containing the inclusive count values of the CWP
      functions.

  Returns:
    A defaultdict containing the inclusive count statistics. The key is the
    name of the function and the file name, comma separated. The value
    represents a tuple with the object name containing the function
    declaration, the inclusive count and inclusive count fraction values, and
    a marker to identify if the function is present in one of the benchmark
    profiles. Every parsed function is marked as EXTRA_FUNCTION.
  """
  cwp_inclusive_count_statistics = defaultdict(lambda: ('', 0, 0.0, 0))

  with open(file_name) as input_file:
    statistics_reader = csv.DictReader(input_file, delimiter=',')

    for statistic in statistics_reader:
      function_name = statistic['function']
      raw_file_name = statistic['file']
      dso_name = statistic['dso']
      inclusive_count = statistic['inclusive_count']
      inclusive_count_fraction = statistic['inclusive_count_fraction']

      # The emptiness check must look at the raw 'file' field:
      # os.path.normpath('') returns '.', which is truthy, so a check done
      # after the normalization would never reject an empty file name.
      if not all([
          function_name, raw_file_name, dso_name, inclusive_count,
          inclusive_count_fraction
      ]):
        continue

      function_file_name = MakeCWPAndPprofFileNamesConsistent(
          os.path.normpath(raw_file_name))
      if not function_file_name:
        # The substitutions reduced the file name to an empty string.
        continue

      key = '%s,%s' % (function_name, function_file_name)

      # There might be situations where a function appears in multiple files
      # or objects. Such situations can occur when in the Dremel queries there
      # are not specified the Chrome OS version and the name of the board (i.e
      # the files can belong to different kernel or library versions). The
      # values of such occurrences are accumulated under the same key.
      _, previous_count, previous_fraction, _ = \
          cwp_inclusive_count_statistics[key]
      inclusive_count_sum = previous_count + int(inclusive_count)
      inclusive_count_fraction_sum = \
          previous_fraction + float(inclusive_count_fraction)

      # All the functions are initially marked as EXTRA_FUNCTION.
      cwp_inclusive_count_statistics[key] = (dso_name, inclusive_count_sum,
                                             inclusive_count_fraction_sum,
                                             EXTRA_FUNCTION)

  return cwp_inclusive_count_statistics
| 353 | |
| 354 | |
def ParseCWPPairwiseInclusiveCountFile(file_name):
  """Parses the CWP pairwise inclusive count files.

  The file is read as comma separated values with a header row. Each row
  holds a parent and a child function joined by
  PARENT_CHILD_FUNCTIONS_SEPARATOR ('parent_child_functions'), the name of
  the file where the child function is declared ('child_function_file') and
  the inclusive count of the pair ('inclusive_count').

  Args:
    file_name: The file containing the pairwise inclusive_count statistics of
      the CWP functions.

  Returns:
    A dict containing the statistics of the parent functions and each of
    their child functions. The key of the dict is the name of the parent
    function. The value is a dict having as a key the name of the child
    function with its file name separated by a ',' and as a value the
    inclusive count value of the parent-child function pair.
  """
  pairwise_inclusive_count_statistics = defaultdict(lambda: defaultdict(float))

  with open(file_name) as input_file:
    for row in csv.DictReader(input_file, delimiter=','):
      function_pair = row['parent_child_functions']
      parent_name, child_name = function_pair.split(
          PARENT_CHILD_FUNCTIONS_SEPARATOR)
      normalized_child_file = os.path.normpath(row['child_function_file'])
      child_file = MakeCWPAndPprofFileNamesConsistent(normalized_child_file)
      pair_count = row['inclusive_count']

      # The same child function can show up in several files or objects:
      # the Dremel queries may not pin the Chrome OS version and the board
      # (so the files can come from different kernel or library versions),
      # the child can be a template function declared in a header file, or
      # names can collide between multiple executable objects.
      # A parent-child pair seen more than once has its inclusive count
      # values added together.
      child_key = '%s,%s' % (child_name, child_file)
      children_of_parent = pairwise_inclusive_count_statistics[parent_name]
      children_of_parent[child_key] += float(pair_count)

  return pairwise_inclusive_count_statistics