Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # encoding: utf-8 |
| 3 | |
| 4 | # Copyright 2017 Google Inc. |
| 5 | # |
| 6 | # Use of this source code is governed by a BSD-style license that can be found |
| 7 | # in the LICENSE file. |
| 8 | # |
| 9 | # This is an A/B test utility script used by calmbench.py |
| 10 | # |
| 11 | # For each bench, we get a distribution of min_ms measurements from nanobench. |
| 12 | # From that, we try to recover the 1/3 and 2/3 quantiles of the distribution. |
| 13 | # If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B, |
| 14 | # we report that as a regression. |
| 15 | # |
| 16 | # The more measurements we have for a bench, the more accurate our quantiles |
| 17 | # are. However, taking more measurements is time consuming. Hence we'll prune |
| 18 | # out benches and only take more measurements for benches whose current quantile |
| 19 | # ranges are disjoint. |
| 20 | # |
| 21 | # P.S. The current script is brute forcely translated from a ruby script. So it |
| 22 | # may be ugly... |
| 23 | |
| 24 | import re |
| 25 | import os |
| 26 | import sys |
| 27 | import time |
| 28 | import json |
| 29 | import subprocess |
| 30 | import shlex |
Yuqian Li | 4a577af | 2018-01-05 11:13:43 -0500 | [diff] [blame] | 31 | import multiprocessing |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 32 | import traceback |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 33 | from argparse import ArgumentParser |
| 34 | from multiprocessing import Process |
| 35 | from threading import Thread |
| 36 | from threading import Lock |
| 37 | from pdb import set_trace |
| 38 | |
| 39 | |
| 40 | HELP = """ |
| 41 | \033[31mPlease call calmbench.py to drive this script if you're not doing so. |
| 42 | This script is not supposed to be used by itself. (At least, it's not easy to |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 43 | use by itself. The calmbench bots may use this script directly.) |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 44 | \033[0m |
| 45 | """ |
| 46 | |
| 47 | FACTOR = 3 # lower/upper quantile factor |
| 48 | DIFF_T = 0.99 # different enough threshold |
| 49 | TERM = 10 # terminate after this no. of iterations without suspect changes |
| 50 | MAXTRY = 30 # max number of nanobench tries to narrow down suspects |
| 51 | |
| 52 | UNITS = "ns µs ms s".split() |
| 53 | |
| 54 | |
| 55 | timesLock = Lock() |
| 56 | timesA = {} |
| 57 | timesB = {} |
| 58 | |
| 59 | |
| 60 | def parse_args(): |
| 61 | parser = ArgumentParser(description=HELP) |
| 62 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 63 | parser.add_argument('outdir', type=str, help="output directory") |
| 64 | parser.add_argument('a', type=str, help="name of A") |
| 65 | parser.add_argument('b', type=str, help="name of B") |
| 66 | parser.add_argument('nano_a', type=str, help="path to A's nanobench binary") |
| 67 | parser.add_argument('nano_b', type=str, help="path to B's nanobench binary") |
| 68 | parser.add_argument('arg_a', type=str, help="args for A's nanobench run") |
| 69 | parser.add_argument('arg_b', type=str, help="args for B's nanobench run") |
| 70 | parser.add_argument('repeat', type=int, help="number of initial runs") |
| 71 | parser.add_argument('skip_b', type=str, help=("whether to skip running B" |
| 72 | " ('true' or 'false')")) |
| 73 | parser.add_argument('config', type=str, help="nanobenh config") |
| 74 | parser.add_argument('threads', type=int, help="number of threads to run") |
| 75 | parser.add_argument('noinit', type=str, help=("whether to skip running B" |
| 76 | " ('true' or 'false')")) |
| 77 | |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 78 | parser.add_argument('--concise', dest='concise', action="store_true", |
| 79 | help="If set, no verbose thread info will be printed.") |
| 80 | parser.set_defaults(concise=False) |
| 81 | |
Yuqian Li | 228da62 | 2017-10-26 15:38:30 -0400 | [diff] [blame] | 82 | # Additional args for bots |
| 83 | BHELP = "bot specific options" |
Yuqian Li | 61ffd53 | 2017-11-06 15:59:12 -0500 | [diff] [blame] | 84 | parser.add_argument('--githash', type=str, default="", help=BHELP) |
Yuqian Li | 228da62 | 2017-10-26 15:38:30 -0400 | [diff] [blame] | 85 | parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP) |
| 86 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 87 | args = parser.parse_args() |
| 88 | args.skip_b = args.skip_b == "true" |
| 89 | args.noinit = args.noinit == "true" |
| 90 | |
Yuqian Li | 4a577af | 2018-01-05 11:13:43 -0500 | [diff] [blame] | 91 | if args.threads == -1: |
| 92 | args.threads = 1 |
| 93 | if args.config in ["8888", "565"]: # multi-thread for CPU only |
| 94 | args.threads = max(1, multiprocessing.cpu_count() / 2) |
| 95 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 96 | return args |
| 97 | |
| 98 | def append_dict_sorted_array(dict_array, key, value): |
| 99 | if key not in dict_array: |
| 100 | dict_array[key] = [] |
| 101 | dict_array[key].append(value) |
| 102 | dict_array[key].sort() |
| 103 | |
| 104 | |
| 105 | def add_time(args, name, bench, t, unit): |
| 106 | normalized_t = t * 1000 ** UNITS.index(unit); |
| 107 | if name.startswith(args.a): |
| 108 | append_dict_sorted_array(timesA, bench, normalized_t) |
| 109 | else: |
| 110 | append_dict_sorted_array(timesB, bench, normalized_t) |
| 111 | |
| 112 | |
| 113 | def append_times_from_file(args, name, filename): |
| 114 | with open(filename) as f: |
| 115 | lines = f.readlines() |
| 116 | for line in lines: |
| 117 | items = line.split() |
| 118 | if len(items) > 10: |
| 119 | bench = items[10] |
| 120 | matches = re.search("([+-]?\d*.?\d+)(s|ms|µs|ns)", items[3]) |
| 121 | if (not matches or items[9] != args.config): |
| 122 | continue |
| 123 | time_num = matches.group(1) |
| 124 | time_unit = matches.group(2) |
| 125 | add_time(args, name, bench, float(time_num), time_unit) |
| 126 | |
| 127 | |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 128 | class ThreadWithException(Thread): |
| 129 | def __init__(self, target): |
| 130 | super(ThreadWithException, self).__init__(target = target) |
| 131 | self.exception = None |
| 132 | |
| 133 | def run(self): |
| 134 | try: |
| 135 | self._Thread__target(*self._Thread__args, **self._Thread__kwargs) |
| 136 | except BaseException as e: |
| 137 | self.exception = e |
| 138 | |
| 139 | def join(self, timeout=None): |
| 140 | super(ThreadWithException, self).join(timeout) |
| 141 | |
| 142 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 143 | class ThreadRunner: |
| 144 | """Simplest and stupidiest threaded executer.""" |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 145 | def __init__(self, args): |
| 146 | self.concise = args.concise |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 147 | self.threads = [] |
| 148 | |
| 149 | def add(self, args, fn): |
| 150 | if len(self.threads) >= args.threads: |
| 151 | self.wait() |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 152 | t = ThreadWithException(target = fn) |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 153 | t.daemon = True |
| 154 | self.threads.append(t) |
| 155 | t.start() |
| 156 | |
| 157 | def wait(self): |
| 158 | def spin(): |
| 159 | i = 0 |
| 160 | spinners = [". ", ".. ", "..."] |
| 161 | while len(self.threads) > 0: |
| 162 | timesLock.acquire() |
| 163 | sys.stderr.write( |
| 164 | "\r" + spinners[i % len(spinners)] + |
| 165 | " (%d threads running)" % len(self.threads) + |
| 166 | " \r" # spaces for erasing characters |
| 167 | ) |
| 168 | timesLock.release() |
| 169 | time.sleep(0.5) |
| 170 | i += 1 |
| 171 | |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 172 | if not self.concise: |
| 173 | ts = Thread(target = spin); |
| 174 | ts.start() |
| 175 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 176 | for t in self.threads: |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 177 | t.join() |
| 178 | |
| 179 | exceptions = [] |
| 180 | for t in self.threads: |
| 181 | if t.exception: |
| 182 | exceptions.append(t.exception) |
| 183 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 184 | self.threads = [] |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 185 | |
| 186 | if not self.concise: |
| 187 | ts.join() |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 188 | |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 189 | if len(exceptions): |
| 190 | for exc in exceptions: |
| 191 | print exc |
| 192 | raise exceptions[0] |
| 193 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 194 | |
| 195 | def split_arg(arg): |
| 196 | raw = shlex.split(arg) |
| 197 | result = [] |
| 198 | for r in raw: |
| 199 | if '~' in r: |
| 200 | result.append(os.path.expanduser(r)) |
| 201 | else: |
| 202 | result.append(r) |
| 203 | return result |
| 204 | |
| 205 | |
| 206 | def run(args, threadRunner, name, nano, arg, i): |
| 207 | def task(): |
| 208 | file_i = "%s/%s.out%d" % (args.outdir, name, i) |
| 209 | |
| 210 | should_run = not args.noinit and not (name == args.b and args.skip_b) |
| 211 | if i <= 0: |
| 212 | should_run = True # always run for suspects |
| 213 | |
| 214 | if should_run: |
| 215 | if i > 0: |
| 216 | timesLock.acquire() |
| 217 | print "Init run %d for %s..." % (i, name) |
| 218 | timesLock.release() |
| 219 | subprocess.check_call(["touch", file_i]) |
| 220 | with open(file_i, 'w') as f: |
| 221 | subprocess.check_call([nano] + split_arg(arg) + |
| 222 | ["--config", args.config], stderr=f, stdout=f) |
| 223 | |
| 224 | timesLock.acquire() |
| 225 | append_times_from_file(args, name, file_i) |
| 226 | timesLock.release() |
| 227 | |
| 228 | threadRunner.add(args, task) |
| 229 | |
| 230 | |
| 231 | def init_run(args): |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 232 | threadRunner = ThreadRunner(args) |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 233 | for i in range(1, max(args.repeat, args.threads / 2) + 1): |
| 234 | run(args, threadRunner, args.a, args.nano_a, args.arg_a, i) |
| 235 | run(args, threadRunner, args.b, args.nano_b, args.arg_b, i) |
| 236 | threadRunner.wait() |
| 237 | |
| 238 | |
| 239 | def get_lower_upper(values): |
| 240 | i = max(0, (len(values) - 1) / FACTOR) |
| 241 | return values[i], values[-i - 1] |
| 242 | |
| 243 | |
| 244 | def different_enough(lower1, upper2): |
| 245 | return upper2 < DIFF_T * lower1 |
| 246 | |
| 247 | |
Yuqian Li | 9127ea3 | 2018-05-02 17:26:40 -0400 | [diff] [blame] | 248 | # TODO(liyuqian): we used this hacky criteria mainly because that I didn't have |
| 249 | # time to study more rigorous statistical tests. We should adopt a more rigorous |
| 250 | # test in the future. |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 251 | def get_suspects(): |
| 252 | suspects = [] |
| 253 | for bench in timesA.keys(): |
| 254 | if bench not in timesB: |
| 255 | continue |
| 256 | lowerA, upperA = get_lower_upper(timesA[bench]) |
| 257 | lowerB, upperB = get_lower_upper(timesB[bench]) |
| 258 | if different_enough(lowerA, upperB) or different_enough(lowerB, upperA): |
| 259 | suspects.append(bench) |
| 260 | return suspects |
| 261 | |
| 262 | |
| 263 | def process_bench_pattern(s): |
| 264 | if ".skp" in s: # skp bench won't match their exact names... |
| 265 | return "^\"" + s[0:(s.index(".skp") + 3)] + "\"" |
| 266 | else: |
| 267 | return "^\"" + s + "\"$" |
| 268 | |
| 269 | |
| 270 | def suspects_arg(suspects): |
| 271 | patterns = map(process_bench_pattern, suspects) |
| 272 | return " --match " + (" ".join(patterns)) |
| 273 | |
| 274 | |
| 275 | def median(array): |
| 276 | return array[len(array) / 2] |
| 277 | |
| 278 | |
| 279 | def regression(bench): |
| 280 | a = median(timesA[bench]) |
| 281 | b = median(timesB[bench]) |
| 282 | if (a == 0): # bad bench, just return no regression |
| 283 | return 1 |
| 284 | return b / a |
| 285 | |
| 286 | |
| 287 | def percentage(x): |
| 288 | return (x - 1) * 100 |
| 289 | |
| 290 | |
| 291 | def format_r(r): |
| 292 | return ('%6.2f' % percentage(r)) + "%" |
| 293 | |
| 294 | |
Yuqian Li | 228da62 | 2017-10-26 15:38:30 -0400 | [diff] [blame] | 295 | def normalize_r(r): |
| 296 | if r > 1.0: |
| 297 | return r - 1.0 |
| 298 | else: |
| 299 | return 1.0 - 1/r |
| 300 | |
| 301 | |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 302 | def test(): |
| 303 | args = parse_args() |
| 304 | |
| 305 | init_run(args) |
| 306 | last_unchanged_iter = 0 |
| 307 | last_suspect_number = -1 |
| 308 | tryCnt = 0 |
| 309 | it = 0 |
| 310 | while tryCnt < MAXTRY: |
| 311 | it += 1 |
| 312 | suspects = get_suspects() |
| 313 | if len(suspects) != last_suspect_number: |
| 314 | last_suspect_number = len(suspects) |
| 315 | last_unchanged_iter = it |
| 316 | if (len(suspects) == 0 or it - last_unchanged_iter >= TERM): |
| 317 | break |
| 318 | |
| 319 | print "Number of suspects at iteration %d: %d" % (it, len(suspects)) |
Yuqian Li | 84366d2 | 2017-10-17 16:26:32 -0400 | [diff] [blame] | 320 | threadRunner = ThreadRunner(args) |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 321 | for j in range(1, max(1, args.threads / 2) + 1): |
| 322 | run(args, threadRunner, args.a, args.nano_a, |
| 323 | args.arg_a + suspects_arg(suspects), -j) |
| 324 | run(args, threadRunner, args.b, args.nano_b, |
| 325 | args.arg_b + suspects_arg(suspects), -j) |
| 326 | tryCnt += 1 |
| 327 | threadRunner.wait() |
| 328 | |
| 329 | suspects = get_suspects() |
| 330 | if len(suspects) == 0: |
| 331 | print ("%s and %s does not seem to have significant " + \ |
| 332 | "performance differences.") % (args.a, args.b) |
| 333 | else: |
| 334 | suspects.sort(key = regression) |
| 335 | print "%s (compared to %s) is likely" % (args.a, args.b) |
| 336 | for suspect in suspects: |
| 337 | r = regression(suspect) |
| 338 | if r < 1: |
| 339 | print "\033[31m %s slower in %s\033[0m" % \ |
| 340 | (format_r(1/r), suspect) |
| 341 | else: |
| 342 | print "\033[32m %s faster in %s\033[0m" % \ |
| 343 | (format_r(r), suspect) |
| 344 | |
| 345 | with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f: |
Yuqian Li | 228da62 | 2017-10-26 15:38:30 -0400 | [diff] [blame] | 346 | results = {} |
| 347 | for bench in timesA: |
| 348 | r = regression(bench) if bench in suspects else 1.0 |
| 349 | results[bench] = { |
| 350 | args.config: { |
| 351 | "signed_regression": normalize_r(r), |
| 352 | "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6, |
Yuqian Li | 9127ea3 | 2018-05-02 17:26:40 -0400 | [diff] [blame] | 353 | "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6, |
| 354 | "options": { |
| 355 | # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so |
| 356 | # nanobench could generate the json for us that's exactly the same |
| 357 | # as that being used by perf bots. Currently, we cannot guarantee |
| 358 | # that bench is the name (e.g., bench may have additional resolution |
| 359 | # information appended after name). |
| 360 | "name": bench |
| 361 | } |
Yuqian Li | 228da62 | 2017-10-26 15:38:30 -0400 | [diff] [blame] | 362 | } |
| 363 | } |
| 364 | |
| 365 | output = {"results": results} |
| 366 | if args.githash: |
| 367 | output["gitHash"] = args.githash |
| 368 | if args.keys: |
| 369 | keys = {} |
| 370 | for i in range(len(args.keys) / 2): |
| 371 | keys[args.keys[i * 2]] = args.keys[i * 2 + 1] |
| 372 | output["key"] = keys |
| 373 | f.write(json.dumps(output, indent=4)) |
Yuqian Li | 980379d | 2017-09-29 11:20:01 -0400 | [diff] [blame] | 374 | print ("\033[36mJSON results available in %s\033[0m" % f.name) |
| 375 | |
| 376 | with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out: |
| 377 | out.write(("bench, significant?, raw regresion, " + |
| 378 | "%(A)s quantile (ns), %(B)s quantile (ns), " + |
| 379 | "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b}) |
| 380 | for bench in suspects + timesA.keys(): |
| 381 | if (bench not in timesA or bench not in timesB): |
| 382 | continue |
| 383 | ta = timesA[bench] |
| 384 | tb = timesB[bench] |
| 385 | out.write( |
| 386 | "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) + |
| 387 | ' '.join(map(str, get_lower_upper(ta))) + ", " + |
| 388 | ' '.join(map(str, get_lower_upper(tb))) + ", " + |
| 389 | ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb)))) |
| 390 | ) |
| 391 | print (("\033[36m" + |
| 392 | "Compared %d benches. " + |
| 393 | "%d of them seem to be significantly differrent." + |
| 394 | "\033[0m") % |
| 395 | (len([x for x in timesA if x in timesB]), len(suspects))) |
| 396 | print ("\033[36mPlease see detailed bench results in %s\033[0m" % |
| 397 | out.name) |
| 398 | |
| 399 | |
| 400 | if __name__ == "__main__": |
| 401 | try: |
| 402 | test() |
| 403 | except Exception as e: |
| 404 | print e |
| 405 | print HELP |
Yuqian Li | 58b90f7 | 2018-04-27 17:52:56 -0400 | [diff] [blame] | 406 | traceback.print_exc() |
| 407 | raise e |