# Copyright 2017 The PDFium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Classes that draw conclusions out of a comparison and represent them."""

from collections import Counter


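# ANSI escape sequence templates for colorizing terminal output. They are
# applied with str.format, e.g. FORMAT_RED.format('+10.0%') wraps the text in
# bold red.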
FORMAT_RED = '\033[01;31m{0}\033[00m'
FORMAT_GREEN = '\033[01;32m{0}\033[00m'
FORMAT_MAGENTA = '\033[01;35m{0}\033[00m'
FORMAT_CYAN = '\033[01;36m{0}\033[00m'
FORMAT_NORMAL = '{0}'

RATING_FAILURE = 'failure'
RATING_REGRESSION = 'regression'
RATING_IMPROVEMENT = 'improvement'
RATING_NO_CHANGE = 'no_change'
RATING_SMALL_CHANGE = 'small_change'

RATINGS = [
    RATING_FAILURE,
    RATING_REGRESSION,
    RATING_IMPROVEMENT,
    RATING_NO_CHANGE,
    RATING_SMALL_CHANGE
]

RATING_TO_COLOR = {
    RATING_FAILURE: FORMAT_MAGENTA,
    RATING_REGRESSION: FORMAT_RED,
    RATING_IMPROVEMENT: FORMAT_CYAN,
    RATING_NO_CHANGE: FORMAT_GREEN,
    RATING_SMALL_CHANGE: FORMAT_NORMAL,
}


class ComparisonConclusions(object):
  """All conclusions drawn from a comparison.

  This is initialized empty and then processes pairs of results for each test
  case, determining the rating for that case, which can be:
    "failure" if either or both runs for the case failed.
    "regression" if there is a significant increase in time for the test case.
    "improvement" if there is a significant decrease in time for the test case.
    "no_change" if the time for the test case did not change at all.
    "small_change" if the time for the test case changed, but within the
        threshold.
  """

  def __init__(self, threshold_significant):
    """Initializes an empty ComparisonConclusions.

    Args:
      threshold_significant: Float with the tolerance beyond which changes in
          measurements are considered significant.

          The change is considered as a multiplication rather than an addition
          of a fraction of the previous measurement, that is, a
          threshold_significant of 1.0 will flag test cases that became over
          100% slower (> 200% of the previous time measured) or over 100%
          faster (< 50% of the previous time measured).

          threshold_significant 0.02 -> 98.04% to 102% is not significant
          threshold_significant 0.1 -> 90.9% to 110% is not significant
          threshold_significant 0.25 -> 80% to 125% is not significant
          threshold_significant 1 -> 50% to 200% is not significant
          threshold_significant 4 -> 20% to 500% is not significant
    """
    self.threshold_significant = threshold_significant
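    # The negative counterpart below mirrors the threshold multiplicatively
    # rather than additively: e.g. for threshold_significant = 1.0 it is -0.5,
    # so only drops below 50% of the "before" time (matching the > 200% upper
    # bound) count as significant improvements.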
    self.threshold_significant_negative = (1 / (1 + threshold_significant)) - 1

    self.params = {'threshold': threshold_significant}
    self.summary = ComparisonSummary()
    self.case_results = {}

  def ProcessCase(self, case_name, before, after):
    """Feeds a test case's results to the ComparisonConclusions.

    Args:
      case_name: String identifying the case.
      before: Measurement for the "before" version of the code.
      after: Measurement for the "after" version of the code.
    """

    # Switch 0 to None to simplify the json dict output. All zeros are
    # considered failed runs, so they will be represented by "null".
    if not before:
      before = None
    if not after:
      after = None

    if not before or not after:
      ratio = None
      rating = RATING_FAILURE
    else:
      ratio = (float(after) / before) - 1.0
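      # E.g. before=100 and after=120 give ratio 0.2, i.e. a 20% slowdown.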
      if ratio > self.threshold_significant:
        rating = RATING_REGRESSION
      elif ratio < self.threshold_significant_negative:
        rating = RATING_IMPROVEMENT
      elif ratio == 0:
        rating = RATING_NO_CHANGE
      else:
        rating = RATING_SMALL_CHANGE

    case_result = CaseResult(case_name, before, after, ratio, rating)

    self.summary.ProcessCaseResult(case_result)
    self.case_results[case_name] = case_result

  def GetSummary(self):
    """Gets the ComparisonSummary with consolidated totals."""
    return self.summary

  def GetCaseResults(self):
    """Gets a dict mapping each test case identifier to its CaseResult."""
    return self.case_results

  def GetOutputDict(self):
    """Returns a conclusions dict with all the conclusions drawn.

    Returns:
      A serializable dict with the format illustrated below:
      {
        "version": 1,
        "params": {
          "threshold": 0.02
        },
        "summary": {
          "total": 123,
          "failure": 1,
          "regression": 2,
          "improvement": 1,
          "no_change": 100,
          "small_change": 19
        },
        "comparison_by_case": {
          "testing/resources/new_test.pdf": {
            "before": None,
            "after": 1000,
            "ratio": None,
            "rating": "failure"
          },
          "testing/resources/test1.pdf": {
            "before": 100,
            "after": 120,
            "ratio": 0.2,
            "rating": "regression"
          },
          "testing/resources/test2.pdf": {
            "before": 100,
            "after": 2000,
            "ratio": 19.0,
            "rating": "regression"
          },
          "testing/resources/test3.pdf": {
            "before": 1000,
            "after": 1005,
            "ratio": 0.005,
            "rating": "small_change"
          },
          "testing/resources/test4.pdf": {
            "before": 1000,
            "after": 1000,
            "ratio": 0.0,
            "rating": "no_change"
          },
          "testing/resources/test5.pdf": {
            "before": 1000,
            "after": 600,
            "ratio": -0.4,
            "rating": "improvement"
          }
        }
      }
    """
    output_dict = {}
    output_dict['version'] = 1
    output_dict['params'] = {'threshold': self.threshold_significant}
    output_dict['summary'] = self.summary.GetOutputDict()
    output_dict['comparison_by_case'] = {
        cr.case_name.decode('utf-8'): cr.GetOutputDict()
        for cr in self.GetCaseResults().values()
    }
    return output_dict

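# A minimal usage sketch (illustrative only; callers feed per-case timings in
# and read the consolidated conclusions back out):
#
#   conclusions = ComparisonConclusions(threshold_significant=0.02)
#   conclusions.ProcessCase('testing/resources/test1.pdf', 100, 120)
#   conclusions.ProcessCase('testing/resources/new_test.pdf', 0, 1000)
#   summary = conclusions.GetSummary()
#   print summary.GetCount(RATING_REGRESSION)  # 1 (test1 became 20% slower)
#   print summary.GetCount(RATING_FAILURE)     # 1 (new_test has no "before")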

class ComparisonSummary(object):
  """Totals computed for a comparison."""

  def __init__(self):
    self.rating_counter = Counter()

  def ProcessCaseResult(self, case_result):
    self.rating_counter[case_result.rating] += 1

  def GetTotal(self):
    """Gets the number of test cases processed."""
    return sum(self.rating_counter.values())

  def GetCount(self, rating):
    """Gets the number of test cases processed with a given rating."""
    return self.rating_counter[rating]

  def GetOutputDict(self):
    """Returns a dict that can be serialized with all the totals."""
    result = {'total': self.GetTotal()}
    for rating in RATINGS:
      result[rating] = self.GetCount(rating)
    return result

class CaseResult(object):
  """The conclusion for the comparison of a single test case."""

  def __init__(self, case_name, before, after, ratio, rating):
    """Initializes a CaseResult.

    Args:
      case_name: String identifying the case.
      before: Measurement for the "before" version of the code.
      after: Measurement for the "after" version of the code.
      ratio: Difference between |after| and |before| as a fraction of |before|.
      rating: Rating for this test case.
    """
    self.case_name = case_name
    self.before = before
    self.after = after
    self.ratio = ratio
    self.rating = rating

  def GetOutputDict(self):
    """Returns a dict with the test case's conclusions."""
    return {'before': self.before,
            'after': self.after,
            'ratio': self.ratio,
            'rating': self.rating}


def PrintConclusionsDictHumanReadable(conclusions_dict, colored, key=None):
  """Prints a conclusions dict in a human-readable way.

  Args:
    conclusions_dict: Dict to print.
    colored: Whether to color the output to highlight significant changes.
    key: String with the CaseResult dict key ('before', 'after', 'ratio' or
        'rating') by which to sort the cases.
  """
  # Print header
  print '=' * 80
  print '{0:>11s} {1:>15s} {2}'.format(
      '% Change',
      'Time after',
      'Test case')
  print '-' * 80

  color = FORMAT_NORMAL

  # Print cases
  if key is not None:
    case_pairs = sorted(conclusions_dict['comparison_by_case'].iteritems(),
                        key=lambda kv: kv[1][key])
  else:
    case_pairs = sorted(conclusions_dict['comparison_by_case'].iteritems())

  for case_name, case_dict in case_pairs:
    if colored:
      color = RATING_TO_COLOR[case_dict['rating']]

    if case_dict['rating'] == RATING_FAILURE:
      print u'{} to measure time for {}'.format(
          color.format('Failed'),
          case_name).encode('utf-8')
      continue

    print u'{0} {1:15,d} {2}'.format(
        color.format('{:+11.4%}'.format(case_dict['ratio'])),
        case_dict['after'],
        case_name).encode('utf-8')

  # Print totals
  totals = conclusions_dict['summary']
  print '=' * 80
  print 'Test cases run: %d' % totals['total']

  if colored:
    color = FORMAT_MAGENTA if totals[RATING_FAILURE] else FORMAT_GREEN
  print ('Failed to measure: %s'
         % color.format(totals[RATING_FAILURE]))

  if colored:
    color = FORMAT_RED if totals[RATING_REGRESSION] else FORMAT_GREEN
  print ('Regressions: %s'
         % color.format(totals[RATING_REGRESSION]))

  if colored:
    color = FORMAT_CYAN if totals[RATING_IMPROVEMENT] else FORMAT_GREEN
  print ('Improvements: %s'
         % color.format(totals[RATING_IMPROVEMENT]))