Blame - startop/scripts/app_startup/analyze_metrics.py - platform/frameworks/base

blob: d74d6f68d8232a8fc976202cd6e8ab1141ce7fa7 [file] [log] [blame]

Igor Murashkin	25f394d	2018-09-11 16:37:18 -0700	[diff] [blame]	1	#!/usr/bin/env python3
				2	#
				3	# Copyright 2018, The Android Open Source Project
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the "License");
				6	# you may not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# http://www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an "AS IS" BASIS,
				13	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16
				17	"""
				18	Perform statistical analysis on measurements produced by app_startup_runner.py
				19
				20	Install:
				21	$> sudo apt-get install python3-scipy
				22
				23	Usage:
				24	$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
				25	$> ./analyze_metrics.py --help
				26	"""
				27
				28	import argparse
				29	import csv
				30	import itertools
				31	import os
				32	import subprocess
				33	import sys
				34	import tempfile
				35	from typing import Any, List, Dict, Iterable, TextIO, Tuple
				36
				37	from scipy import stats as sc
				38	import numpy as np
				39
				40
				41	# These CSV columns are considered labels. Everything after them in the same row are metrics.
				42	_LABEL_COLUMNS=['packages', 'readaheads', 'compiler_filters']
				43	# The metric series with the 'cold' readahead is the baseline.
				44	# All others (warm, jit, etc) are the potential improvements.
				45
				46	#fixme: this should probably be an option
				47	_BASELINE=('readaheads', 'cold')
				48	# ignore this for some statistic calculations
				49	_IGNORE_PAIR=('readaheads', 'warm')
				50	_PLOT_SUBKEY='readaheads'
				51	_PLOT_GROUPKEY='packages'
				52	_PLOT_DATA_INDEX = 0
				53	_DELTA=50
				54	_DELTA2=100
				55	_PVALUE_THRESHOLD=0.10
				56	_debug = False # See -d/--debug flag.
				57
				58	def parse_options(argv: List[str] = None):
				59	"""Parse command line arguments and return an argparse Namespace object."""
				60	parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_start_runner.py.")
				61	parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')
				62
				63	parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
				64	parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Store CSV for per-sample data')
				65	parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV for comparable against baseline')
				66	parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV for comparable against baseline (significant only)')
				67	parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
				68	parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')
				69
				70	return parser.parse_args(argv)
				71
				72	def _debug_print(args, *kwargs):
				73	"""Print the args to sys.stderr if the --debug/-d flag was passed in."""
				74	global _debug
				75	if _debug:
				76	print(args, *kwargs, file=sys.stderr)
				77
				78	def _expand_gen_repr(args):
				79	new_args_list = []
				80	for i in args:
				81	# detect iterable objects that do not have their own override of __str__
				82	if hasattr(i, '__iter__'):
				83	to_str = getattr(i, '__str__')
				84	if to_str.__objclass__ == object:
				85	# the repr for a generator is just type+address, expand it out instead.
				86	new_args_list.append([_expand_gen_repr([j])[0] for j in i])
				87	continue
				88	# normal case: uses the built-in to-string
				89	new_args_list.append(i)
				90	return new_args_list
				91
				92	def _debug_print_gen(args, *kwargs):
				93	"""Like _debug_print but will turn any iterable args into a list."""
				94	if not _debug:
				95	return
				96
				97	new_args_list = _expand_gen_repr(args)
				98	_debug_print(new_args_list, *kwargs)
				99
				100	def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
				101	_debug_print("read_headers for file: ", input_file.name)
				102	csv_reader = csv.reader(input_file)
				103
				104	label_num_columns = len(_LABEL_COLUMNS)
				105
				106	try:
				107	header = next(csv_reader)
				108	except StopIteration:
				109	header = None
				110	_debug_print('header', header)
				111
				112	if not header:
				113	return (None, None)
				114
				115	labels = header[0:label_num_columns]
				116	data = header[label_num_columns:]
				117
				118	return (labels, data)
				119
				120	def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
				121	_debug_print("print_analysis for file: ", input_file.name)
				122	csv_reader = csv.reader(input_file)
				123
				124	# Skip the header because it doesn't contain any data.
				125	# To get the header see read_headers function.
				126	try:
				127	header = next(csv_reader)
				128	except StopIteration:
				129	header = None
				130
				131	label_num_columns = len(_LABEL_COLUMNS)
				132
				133	for row in csv_reader:
				134	if len(row) > 0 and row[0][0] == ';':
				135	_debug_print("skip comment line", row)
				136	continue
				137
				138	labels = row[0:label_num_columns]
				139	data = [int(i) for i in row[label_num_columns:]]
				140
				141	# _debug_print("labels:", labels)
				142	# _debug_print("data:", data)
				143
				144	yield (labels, data)
				145
				146	def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
				147	prev_labels = None
				148	data_2d = []
				149
				150	for label_list, data_list in it:
				151	if prev_labels != label_list:
				152	if prev_labels:
				153	# _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
				154	yield (prev_labels, data_2d)
				155	data_2d = []
				156
				157	data_2d.append(data_list)
				158	prev_labels = label_list
				159
				160	if prev_labels:
				161	# _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
				162	yield (prev_labels, data_2d)
				163
				164	def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
				165	for label_list, data_2d in it:
				166	yield (label_list, np.asarray(data_2d, dtype=int))
				167
				168	def iterate_columns(np_data_2d):
				169	for col in range(np_data_2d.shape[1]):
				170	col_as_array = np_data_2d[:, col]
				171	yield col_as_array
				172
				173	def confidence_interval(np_data_2d, percent=0.95):
				174	"""
				175	Given some data [[a,b,c],[d,e,f,]...]
				176
				177	We assume the same metric is in the column (e.g. [a,d])
				178	and that data in the rows (e.g. [b,e]) are separate metric values.
				179
				180	We then calculate the CI for each metric individually returning it as a list of tuples.
				181	"""
				182	arr = []
				183	for col_2d in iterate_columns(np_data_2d):
				184	mean = col_2d.mean()
				185	sigma = col_2d.std()
				186
				187	ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
				188	arr.append(ci)
				189
				190	# TODO: This seems to be returning NaN when all the samples have the same exact value
				191	# (e.g. stddev=0, which can trivially happen when sample count = 1).
				192
				193	return arr
				194
				195	def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
				196	print(label_header)
				197
				198	with open(output_samples, "w") as output_file:
				199
				200	csv_writer = csv.writer(output_file)
				201	csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])
				202
				203	for label_list, np_data_2d in it:
				204	print("**********************")
				205	print(label_list)
				206	print()
				207	print(" ", data_header)
				208	# aggregate computation column-wise
				209	print("Mean: ", np_data_2d.mean(axis=0))
				210	print("Std: ", np_data_2d.std(axis=0))
				211	print("CI95%:", confidence_interval(np_data_2d))
				212	print("SEM: ", stats_standard_error_one(np_data_2d, axis=0))
				213
				214	#ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
				215	sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
				216	mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]
				217
				218	ci = (mean - sem, mean + sem)
				219
				220	csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])
				221
				222	def from_file_group_by_labels(input_file):
				223	(label_header, data_header) = read_headers(input_file)
				224	label_data_iter = read_labels_and_data(input_file)
				225	grouped_iter = group_metrics_by_label(label_data_iter)
				226	grouped_numpy_iter = data_to_numpy(grouped_iter)
				227
				228	return grouped_numpy_iter, label_header, data_header
				229
				230	def list_without_index(list, index):
				231	return list[:index] + list[index+1:]
				232
				233	def group_by_without_baseline_key(grouped_numpy_iter, label_header):
				234	"""
				235	Data is considered comparable if the only difference is the baseline key
				236	(i.e. the readahead is different but the package, compilation filter, etc, are the same).
				237
				238	Returns iterator that's grouped by the non-baseline labels to an iterator of
				239	(label_list, data_2d).
				240	"""
				241	baseline_index = label_header.index(_BASELINE[0])
				242
				243	def get_label_without_baseline(tpl):
				244	label_list, _ = tpl
				245	return list_without_index(label_list, baseline_index)
				246	# [['pkgname', 'compfilter', 'warm'], [data]]
				247	# [['pkgname', 'compfilter', 'cold'], [data2]]
				248	# [['pkgname2', 'compfilter', 'warm'], [data3]]
				249	#
				250	# ->
				251	# ( [['pkgname', 'compfilter', 'warm'], [data]] # ignore baseline label change.
				252	# [['pkgname', 'compfilter', 'cold'], [data2]] ), # split here because the pkgname changed.
				253	# ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
				254	for group_info, it in itertools.groupby(grouped_numpy_iter, key = get_label_without_baseline):
				255	yield it
				256
				257	# TODO: replace this messy manual iteration/grouping with pandas
				258
				259	def iterate_comparable_metrics(without_baseline_iter, label_header):
				260	baseline_index = label_header.index(_BASELINE[0])
				261	baseline_value = _BASELINE[1]
				262
				263	_debug_print("iterate comparables")
				264
				265	def is_baseline_fun(tp):
				266	ll, dat = tp
				267	return ll[baseline_index] == baseline_value
				268
				269	# iterating here when everything but the baseline key is the same.
				270	for it in without_baseline_iter:
				271	it1, it2 = itertools.tee(it)
				272
				273	# find all the baseline data.
				274	baseline_filter_it = filter(is_baseline_fun, it1)
				275
				276	# find non-baseline data.
				277	nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)
				278
				279	yield itertools.product(baseline_filter_it, nonbaseline_filter_it)
				280
				281	def stats_standard_error_one(a, axis):
				282	a_std = a.std(axis=axis, ddof=0)
				283	a_len = a.shape[axis]
				284
				285	return a_std / np.sqrt(a_len)
				286
				287	def stats_standard_error(a, b, axis):
				288	a_std = a.std(axis=axis, ddof=0)
				289	b_std = b.std(axis=axis, ddof=0)
				290
				291	a_len = a.shape[axis]
				292	b_len = b.shape[axis]
				293
				294	temp1 = a_std*a_std/a_len
				295	temp2 = b_std*b_std/b_len
				296
				297	return np.sqrt(temp1 + temp2)
				298
				299	def stats_tvalue(a, b, axis, delta = 0):
				300	a_mean = a.mean(axis=axis)
				301	b_mean = b.mean(axis=axis)
				302
				303	return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)
				304
				305	def stats_pvalue(a, b, axis, delta, left:bool = False):
				306	"""
				307	Single-tailed 2-sample t-test.
				308
				309	Returns p-value for the null hypothesis: mean(a) - mean(b) >= delta.
				310	:param a: numpy 2d array
				311	:param b: numpy 2d array
				312	:param axis: which axis to do the calculations across
				313	:param delta: test value of mean differences
				314	:param left: if true then use <= delta instead of >= delta
				315	:return: p-value
				316	"""
				317	# implement our own pvalue calculation because the built-in t-test (t,p values)
				318	# only offer delta=0 , e.g. m1-m1 ? 0
				319	# we are however interested in m1-m2 ? delta
				320	t_value = stats_tvalue(a, b, axis, delta)
				321
				322	# 2-sample degrees of freedom is using the array sizes - 2.
				323	dof = a.shape[axis] + b.shape[axis] - 2
				324
				325	if left:
				326	# left tailed test. e.g. m1-m2 <= delta
				327	return sc.t.cdf(t_value, dof)
				328	else:
				329	# right tailed test. e.g. m1-m2 >= delta
				330	return sc.t.sf(t_value, dof)
				331	# a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0
				332
				333	def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
				334	baseline_value = _BASELINE[1]
				335	baseline_index = label_header.index(_BASELINE[0])
				336
				337	old_baseline_label_list = None
				338	delta = _DELTA
				339	filter_value = _IGNORE_PAIR[1]
				340	filter_index = label_header.index(_IGNORE_PAIR[0])
				341
				342	pvalue_threshold = _PVALUE_THRESHOLD
				343	ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0
				344
				345	with open(output_comparable, "w") as output_file:
				346
				347	csv_writer = csv.writer(output_file)
				348	csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)])
				349
				350	print("------------------------------------------------------------------")
				351	print("Comparison against the baseline %s = %s" %(_BASELINE, baseline_value))
				352	print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta))
				353	print()
				354
				355	global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}
				356
				357	for nested_it in comparable_metrics_iter:
				358	print("************************")
				359
				360	better_than_delta = []
				361	better_than_delta_p95 = []
				362
				363	saw_baseline_once = False
				364
				365	for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
				366	_debug_print("baseline_label_list:", baseline_label_list)
				367	_debug_print("baseline_np_data_2d:", baseline_np_data_2d)
				368	_debug_print("rest_label_list:", rest_label_list)
				369	_debug_print("rest_np_data_2d:", rest_np_data_2d)
				370
				371	mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
				372	# 2-sample 2-tailed t-test with delta=0
				373	# e.g. "Is it true that usually the two sample means are different?"
				374	t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)
				375
				376	# 2-sample 1-tailed t-test with delta=50
				377	# e.g. "Is it true that usually the sample means better than 50ms?"
				378	t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
				379	p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
				380
				381	t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
				382	p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
				383
				384	print("%s vs %s" %(rest_label_list, baseline_value))
				385	print(" ", data_header)
				386	print("Mean Difference: ", mean_diff)
				387	print("T-test (2-tailed) != 0: t=%s, p=%s" %(t_statistic, t_pvalue))
				388	print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2))
				389	print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b))
				390
				391	def write_out_values(label_list, *args):
				392	csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])
				393
				394	sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
				395	if saw_baseline_once == False:
				396	saw_baseline_once = True
				397	base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
				398	write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
				399	write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)
				400
				401	# now do the global statistics aggregation
				402
				403	if rest_label_list[filter_index] == filter_value:
				404	continue
				405
				406	if mean_diff > delta:
				407	better_than_delta.append((mean_diff, p2, rest_label_list))
				408
				409	if p2 <= pvalue_threshold:
				410	better_than_delta_p95.append((mean_diff, rest_label_list))
				411
				412	if better_than_delta:
				413	global_stats['better_than_delta'].append(better_than_delta)
				414	if better_than_delta_p95:
				415	global_stats['better_than_delta_p95'].append(better_than_delta_p95)
				416
				417	print("------------------------")
				418	print("Global statistics:")
				419	print("//// Rows with %s=%s are ignored here." %_IGNORE_PAIR)
				420	print("- # of results with mean diff better than delta(%d) = %d" %(delta, len(global_stats['better_than_delta'])))
				421	print(" > (meandiff, pvalue, labels)")
				422	for i in global_stats['better_than_delta']:
				423	print(" > %s" %i)
				424	print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
				425	print(" > (meandiff, labels)")
				426	for i in global_stats['better_than_delta_p95']:
				427	print(" > %s" %i)
				428
				429	def main():
				430	global _debug
				431	global _DELTA
				432	global _PVALUE_THRESHOLD
				433
				434	opts = parse_options()
				435	_debug = opts.debug
				436	_debug_print("parsed options: ", opts)
				437
				438	_PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
				439
				440	for file_name in opts.input_files:
				441	with open(file_name, 'r') as input_file:
				442	(grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
				443	print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)
				444
				445	with open(file_name, 'r') as input_file:
				446	(grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
				447	without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
				448	#_debug_print_gen(without_baseline_iter)
				449
				450	comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
				451	print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)
				452
				453	return 0
				454
				455
				456	if __name__ == '__main__':
				457	sys.exit(main())