#!/usr/bin/env python3
#
# Copyright 2018, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

17"""
18Perform statistical analysis on measurements produced by app_startup_runner.py
19
20Install:
21$> sudo apt-get install python3-scipy
22
23Usage:
24$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
25$> ./analyze_metrics.py --help
26"""

import argparse
import csv
import itertools
import os
import subprocess
import sys
import tempfile
from typing import Any, List, Dict, Iterable, TextIO, Tuple

from scipy import stats as sc
import numpy as np


# These CSV columns are considered labels. Everything after them in the same row is a metric.
_LABEL_COLUMNS = ['packages', 'readaheads', 'compiler_filters']
# The metric series with the 'cold' readahead is the baseline.
# All others (warm, jit, etc.) are the potential improvements.

# FIXME: this should probably be a command-line option.
_BASELINE = ('readaheads', 'cold')
# Rows with this (label, value) pair are ignored in some of the statistics calculations.
_IGNORE_PAIR = ('readaheads', 'warm')
_PLOT_SUBKEY = 'readaheads'
_PLOT_GROUPKEY = 'packages'
_PLOT_DATA_INDEX = 0
_DELTA = 50
_DELTA2 = 100
_PVALUE_THRESHOLD = 0.10
_debug = False  # See -d/--debug flag.
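
# Illustrative sketch of the expected input layout (the package, readahead, and metric
# names below are made up; real files come from app_startup_runner.py):
#
#   packages,readaheads,compiler_filters,total_time_ms
#   com.example.app,cold,speed-profile,812
#   com.example.app,cold,speed-profile,798
#   com.example.app,warm,speed-profile,423
#
# The first len(_LABEL_COLUMNS) columns are labels; every remaining column is a metric.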

def parse_options(argv: List[str] = None):
  """Parse command line arguments and return an argparse Namespace object."""
  parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_startup_runner.py.")
  parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')

  parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
  parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Output CSV file for per-sample data')
  parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV file for comparisons against the baseline')
  parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV file for comparisons against the baseline (significant only)')
  parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
  parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')

  return parser.parse_args(argv)

def _debug_print(*args, **kwargs):
  """Print the args to sys.stderr if the --debug/-d flag was passed in."""
  global _debug
  if _debug:
    print(*args, **kwargs, file=sys.stderr)

def _expand_gen_repr(args):
  """Recursively expand any iterables in args into lists so that they print usefully."""
  new_args_list = []
  for i in args:
    # Detect iterable objects that do not have their own override of __str__.
    if hasattr(i, '__iter__'):
      to_str = getattr(i, '__str__')
      if to_str.__objclass__ == object:
        # The repr for a generator is just type+address; expand it out instead.
        new_args_list.append([_expand_gen_repr([j])[0] for j in i])
        continue
    # Normal case: use the built-in to-string.
    new_args_list.append(i)
  return new_args_list

def _debug_print_gen(*args, **kwargs):
  """Like _debug_print, but turns any iterable args into lists first."""
  if not _debug:
    return

  new_args_list = _expand_gen_repr(args)
  _debug_print(*new_args_list, **kwargs)

def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
  _debug_print("read_headers for file: ", input_file.name)
  csv_reader = csv.reader(input_file)

  label_num_columns = len(_LABEL_COLUMNS)

  try:
    header = next(csv_reader)
  except StopIteration:
    header = None
  _debug_print('header', header)

  if not header:
    return (None, None)

  labels = header[0:label_num_columns]
  data = header[label_num_columns:]

  return (labels, data)

def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
  _debug_print("read_labels_and_data for file: ", input_file.name)
  csv_reader = csv.reader(input_file)

  # Skip the header because it doesn't contain any data.
  # To get the header, see the read_headers function.
  try:
    header = next(csv_reader)
  except StopIteration:
    header = None

  label_num_columns = len(_LABEL_COLUMNS)

  for row in csv_reader:
    if len(row) > 0 and row[0][0] == ';':
      _debug_print("skip comment line", row)
      continue

    labels = row[0:label_num_columns]
    data = [int(i) for i in row[label_num_columns:]]

#    _debug_print("labels:", labels)
#    _debug_print("data:", data)

    yield (labels, data)

def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
  prev_labels = None
  data_2d = []

  for label_list, data_list in it:
    if prev_labels != label_list:
      if prev_labels:
#        _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
        yield (prev_labels, data_2d)
      data_2d = []

    data_2d.append(data_list)
    prev_labels = label_list

  if prev_labels:
#    _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
    yield (prev_labels, data_2d)
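
# Sketch of how the grouping behaves (values are made up): consecutive rows that share
# the same labels are merged into one 2-d list of samples, e.g.
#
#   (['com.example.app', 'cold', 'speed'], [800]),
#   (['com.example.app', 'cold', 'speed'], [790]),
#   (['com.example.app', 'warm', 'speed'], [400])
#
#   -> (['com.example.app', 'cold', 'speed'], [[800], [790]]),
#      (['com.example.app', 'warm', 'speed'], [[400]])
#
# Like itertools.groupby, only *adjacent* rows are merged, so the input CSV is assumed
# to already be sorted/grouped by its label columns.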

def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
  for label_list, data_2d in it:
    yield (label_list, np.asarray(data_2d, dtype=int))

def iterate_columns(np_data_2d):
  for col in range(np_data_2d.shape[1]):
    col_as_array = np_data_2d[:, col]
    yield col_as_array
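
# For example, iterate_columns(np.asarray([[1, 2], [3, 4]])) yields array([1, 3]) and
# then array([2, 4]) -- one 1-d array per metric column.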

def confidence_interval(np_data_2d, percent=0.95):
  """
  Given some data [[a,b,c],[d,e,f],...]

  We assume each column (e.g. [a,d]) holds repeated samples of the same metric,
  and each row (e.g. [a,b,c]) is one sample across all of the metrics.

  We then calculate the CI for each metric individually, returning it as a list of tuples.
  """
  arr = []
  for col_2d in iterate_columns(np_data_2d):
    mean = col_2d.mean()
    sigma = col_2d.std()

    ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
    arr.append(ci)

  # TODO: This seems to be returning NaN when all the samples have the same exact value
  # (e.g. stddev=0, which can trivially happen when sample count = 1).

  return arr
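
# Note: sc.norm.interval above is the usual normal-approximation confidence interval,
#   mean +/- z * sigma / sqrt(n),  with z = sc.norm.ppf(1 - (1 - percent) / 2)
# (z is roughly 1.96 for percent=0.95).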

def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
  print(label_header)

  with open(output_samples, "w") as output_file:

    csv_writer = csv.writer(output_file)
    csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])

    for label_list, np_data_2d in it:
      print("**********************")
      print(label_list)
      print()
      print("      ", data_header)
      # aggregate computation column-wise
      print("Mean: ", np_data_2d.mean(axis=0))
      print("Std:  ", np_data_2d.std(axis=0))
      print("CI95%:", confidence_interval(np_data_2d))
      print("SEM:  ", stats_standard_error_one(np_data_2d, axis=0))

      #ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
      sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
      mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]

      ci = (mean - sem, mean + sem)

      csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])

def from_file_group_by_labels(input_file):
  (label_header, data_header) = read_headers(input_file)
  label_data_iter = read_labels_and_data(input_file)
  grouped_iter = group_metrics_by_label(label_data_iter)
  grouped_numpy_iter = data_to_numpy(grouped_iter)

  return grouped_numpy_iter, label_header, data_header

def list_without_index(lst, index):
  return lst[:index] + lst[index+1:]
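
# e.g. list_without_index(['pkg', 'cold', 'speed'], 1) -> ['pkg', 'speed']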

def group_by_without_baseline_key(grouped_numpy_iter, label_header):
  """
  Data is considered comparable if the only difference is the baseline key
  (i.e. the readahead is different but the package, compilation filter, etc., are the same).

  Returns an iterator, grouped by the non-baseline labels, of iterators of
  (label_list, data_2d) tuples.
  """
  baseline_index = label_header.index(_BASELINE[0])

  def get_label_without_baseline(tpl):
    label_list, _ = tpl
    return list_without_index(label_list, baseline_index)

  # [['pkgname', 'compfilter', 'warm'], [data]]
  # [['pkgname', 'compfilter', 'cold'], [data2]]
  # [['pkgname2', 'compfilter', 'warm'], [data3]]
  #
  # ->
  # ( [['pkgname', 'compfilter', 'warm'], [data]]     # ignore baseline label change.
  #   [['pkgname', 'compfilter', 'cold'], [data2]] ), # split here because the pkgname changed.
  # ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
  for group_info, it in itertools.groupby(grouped_numpy_iter, key=get_label_without_baseline):
    yield it

  # TODO: replace this messy manual iteration/grouping with pandas

def iterate_comparable_metrics(without_baseline_iter, label_header):
  baseline_index = label_header.index(_BASELINE[0])
  baseline_value = _BASELINE[1]

  _debug_print("iterate comparables")

  def is_baseline_fun(tp):
    ll, dat = tp
    return ll[baseline_index] == baseline_value

  # Iterate over the groups in which everything but the baseline key is the same.
  for it in without_baseline_iter:
    it1, it2 = itertools.tee(it)

    # find all the baseline data.
    baseline_filter_it = filter(is_baseline_fun, it1)

    # find non-baseline data.
    nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)

    yield itertools.product(baseline_filter_it, nonbaseline_filter_it)
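
# Each yielded item is itself an iterator of (baseline, other) pairs -- the cross product
# of the baseline rows with every non-baseline row in the group, e.g. (labels and names
# are illustrative only):
#
#   ((['pkg', 'cold', 'speed'], cold_data), (['pkg', 'warm', 'speed'], warm_data)),
#   ((['pkg', 'cold', 'speed'], cold_data), (['pkg', 'fadvise', 'speed'], fadvise_data))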

def stats_standard_error_one(a, axis):
  a_std = a.std(axis=axis, ddof=0)
  a_len = a.shape[axis]

  return a_std / np.sqrt(a_len)

def stats_standard_error(a, b, axis):
  a_std = a.std(axis=axis, ddof=0)
  b_std = b.std(axis=axis, ddof=0)

  a_len = a.shape[axis]
  b_len = b.shape[axis]

  temp1 = a_std * a_std / a_len
  temp2 = b_std * b_std / b_len

  return np.sqrt(temp1 + temp2)
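
# This is the unpooled (Welch-style) standard error of the difference of two sample means:
#   SE = sqrt(s_a^2 / n_a + s_b^2 / n_b)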

def stats_tvalue(a, b, axis, delta=0):
  a_mean = a.mean(axis=axis)
  b_mean = b.mean(axis=axis)

  return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)

def stats_pvalue(a, b, axis, delta, left: bool = False):
  """
  Single-tailed 2-sample t-test.

  Returns the p-value of the right-tailed test for the alternative hypothesis
  mean(a) - mean(b) >= delta (a small p-value is evidence that the difference exceeds delta).
  :param a: numpy 2d array
  :param b: numpy 2d array
  :param axis: which axis to do the calculations across
  :param delta: test value of the mean difference
  :param left: if true, test mean(a) - mean(b) <= delta instead of >= delta
  :return: p-value
  """
  # Implement our own p-value calculation because the built-in t-test (t, p values)
  # only supports delta=0, i.e. m1-m2 ? 0,
  # whereas we are interested in m1-m2 ? delta.
  t_value = stats_tvalue(a, b, axis, delta)

  # 2-sample degrees of freedom: the sum of the two sample sizes minus 2.
  dof = a.shape[axis] + b.shape[axis] - 2

  if left:
    # left-tailed test, e.g. m1-m2 <= delta
    return sc.t.cdf(t_value, dof)
  else:
    # right-tailed test, e.g. m1-m2 >= delta
    return sc.t.sf(t_value, dof)
  # A left+right tailed test is a 2-tailed t-test and can be done using ttest_ind for delta=0.
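
# Minimal usage sketch (all numbers are made up): test whether the baseline 'a' is at
# least 50ms slower on average than the comparison 'b':
#
#   a = np.array([[800], [810], [795]])  # e.g. cold-readahead samples, one metric column
#   b = np.array([[700], [690], [705]])  # e.g. samples for some improvement
#   p = stats_pvalue(a, b, axis=0, delta=50)
#   # p is small here, which suggests mean(a) - mean(b) >= 50 (per column).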

def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
  baseline_value = _BASELINE[1]
  baseline_index = label_header.index(_BASELINE[0])

  old_baseline_label_list = None
  delta = _DELTA
  filter_value = _IGNORE_PAIR[1]
  filter_index = label_header.index(_IGNORE_PAIR[0])

  pvalue_threshold = _PVALUE_THRESHOLD
  ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0

  with open(output_comparable, "w") as output_file:

    csv_writer = csv.writer(output_file)
    csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)])

    print("------------------------------------------------------------------")
    print("Comparison against the baseline %s = %s" %(_BASELINE[0], baseline_value))
    print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta))
    print()

    global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}

    for nested_it in comparable_metrics_iter:
      print("************************")

      better_than_delta = []
      better_than_delta_p95 = []

      saw_baseline_once = False

      for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
        _debug_print("baseline_label_list:", baseline_label_list)
        _debug_print("baseline_np_data_2d:", baseline_np_data_2d)
        _debug_print("rest_label_list:", rest_label_list)
        _debug_print("rest_np_data_2d:", rest_np_data_2d)

        mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
        # 2-sample 2-tailed t-test with delta=0,
        # e.g. "Is it true that the two sample means are usually different?"
        t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)

        # 2-sample 1-tailed t-test with delta=50,
        # e.g. "Is it true that the baseline mean is usually better by at least 50ms?"
        t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
        p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)

        t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
        p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)

        print("%s vs %s" %(rest_label_list, baseline_value))
        print("      ", data_header)
        print("Mean Difference: ", mean_diff)
        print("T-test (2-tailed) != 0: t=%s, p=%s" %(t_statistic, t_pvalue))
        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2))
        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b))

        def write_out_values(label_list, *args):
          csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])

        sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
        if not saw_baseline_once:
          saw_baseline_once = True
          base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
          write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
        write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)

        # now do the global statistics aggregation

        if rest_label_list[filter_index] == filter_value:
          continue

        # Compare only the primary plotted metric; mean_diff and p2 are per-column arrays.
        if mean_diff[_PLOT_DATA_INDEX] > delta:
          better_than_delta.append((mean_diff, p2, rest_label_list))

        if p2[_PLOT_DATA_INDEX] <= pvalue_threshold:
          better_than_delta_p95.append((mean_diff, rest_label_list))

      if better_than_delta:
        global_stats['better_than_delta'].append(better_than_delta)
      if better_than_delta_p95:
        global_stats['better_than_delta_p95'].append(better_than_delta_p95)

    print("------------------------")
    print("Global statistics:")
    print("//// Rows with %s=%s are ignored here." % _IGNORE_PAIR)
    print("- # of results with mean diff better than delta(%d) = %d" %(delta, len(global_stats['better_than_delta'])))
    print("  > (meandiff, pvalue, labels)")
    for i in global_stats['better_than_delta']:
      print("  > %s" %i)
    print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
    print("  > (meandiff, labels)")
    for i in global_stats['better_than_delta_p95']:
      print("  > %s" %i)

def main():
  global _debug
  global _DELTA
  global _PVALUE_THRESHOLD

  opts = parse_options()
  _debug = opts.debug
  _debug_print("parsed options: ", opts)

  _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
  _DELTA = opts.delta_threshold or _DELTA

  for file_name in opts.input_files:
    with open(file_name, 'r') as input_file:
      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
      print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)

    with open(file_name, 'r') as input_file:
      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
      without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
      #_debug_print_gen(without_baseline_iter)

      comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
      print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)

  return 0


if __name__ == '__main__':
  sys.exit(main())