Blame - tools/perf/statistics-for-json.R - fp2-dev/platform/external/v8

blob: fde2cd75db128e9593fdf0d9c82c3e021b3db97b [file] [log] [blame]

# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Do statistical tests on benchmark results
# This script requires the libraries rjson, R.utils, ggplot2 and data.table
# Install them prior to running

# To use the script, first get some benchmark results, for example via
# tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
# --outdir=out/x64.release-on --outdir-no-patch=out/x64.release-off
# --json-test-results=results-on.json
# --json-test-results-no-patch=results-off.json
# then run this script
# Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
# to produce graphs (and get stdio output of statistical tests).
#
# Arguments (all three are required):
#   1. JSON results of the patched build
#   2. JSON results of the unpatched build
#   3. directory that receives the generated SVG files


suppressMessages(library("rjson")) # for fromJSON
suppressMessages(library("R.utils")) # for printf
suppressMessages(library("ggplot2")) # for plotting
suppressMessages(library("data.table")) # less broken than data.frame

# Significance level used for every test below. The value also appears
# verbatim in the legend title of the summary plot.
kAlpha <- 0.05

# shapiro.test() rejects samples with fewer than 3 values (and t.test()
# needs at least 2), so benchmarks with smaller samples are skipped.
kMinSamples <- 3L

# Runs a Shapiro-Wilk normality test on `scores` and prints a one-line
# verdict prefixed with `label` ("Patched" / "Unpatched").
#
# A small p-value is evidence against the hypothesis that the sample was
# drawn from a normal distribution. See [wikipedia:Shapiro-Wilk-Test].
report_normality <- function(label, scores) {
  norm <- shapiro.test(scores)
  verdict <- if (norm$p.value < kAlpha) "not normally" else "normally"
  printf(" %s scores look %s distributed (W=%.4f, p=%.4f)\n",
         label, verdict, norm$statistic, norm$p.value)
  invisible(norm)
}

# Writes a 50-bin histogram of `scores` to the file `path`, using `title`
# as the x-axis label.
save_histogram <- function(scores, title, path) {
  hist <- ggplot(data = data.frame(x = scores), aes(x)) +
    theme_bw() +
    geom_histogram(bins = 50) +
    ylab("Points") +
    xlab(title)
  ggsave(filename = path, plot = hist, width = 7, height = 7)
}

args <- commandArgs(TRUE)
if (length(args) != 3) {
  printf(paste("usage: Rscript %%this_script patched-results.json",
               "unpatched-results.json output-directory\n"))
} else {
  patch <- fromJSON(file = args[1])
  nopatch <- fromJSON(file = args[2])
  outputPath <- args[3]

  # ggsave() does not create missing directories in non-interactive runs.
  if (!dir.exists(outputPath)) {
    dir.create(outputPath, recursive = TRUE)
  }

  # One entry per analysed benchmark; entries of skipped benchmarks stay
  # NULL and are dropped by rbindlist() below.
  rows <- vector("list", length(patch$traces))

  # NOTE: traces of the two files are paired by position, i.e. both files
  # are expected to list the same benchmarks in the same order.
  for (i in seq_along(patch$traces)) {
    testName <- patch$traces[[i]]$graphs[[2]]
    printf("%s\n", testName)

    nopatch_res <- as.integer(nopatch$traces[[i]]$results)
    patch_res <- as.integer(patch$traces[[i]]$results)

    if (length(patch_res) < kMinSamples ||
        length(nopatch_res) < kMinSamples) {
      printf(paste(" Skipped: need at least %d results per run",
                   "(patched: %d, unpatched: %d)\n"),
             kMinSamples, length(patch_res), length(nopatch_res))
      next
    }

    report_normality("Patched", patch_res)
    report_normality("Unpatched", nopatch_res)

    save_histogram(patch_res, testName,
                   file.path(outputPath, paste0(testName, ".svg")))
    save_histogram(nopatch_res, testName,
                   file.path(outputPath, paste0(testName, "-before.svg")))

    # The Wilcoxon rank-sum test. With conf.int = TRUE the estimate is the
    # location shift of patched relative to unpatched scores, so a positive
    # value means the patched build scored higher.
    mww <- wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact = TRUE)
    printf(paste(" Wilcoxon U-test W=%.4f, p=%.4f,",
                 "confidence interval [%.1f, %.1f],",
                 "est. effect size %.1f \n"),
           mww$statistic, mww$p.value,
           mww$conf.int[1], mww$conf.int[2], mww$estimate)
    rows[[i]] <- list(L = mww$conf.int[1],           # lower bound
                      R = mww$conf.int[2],           # upper bound
                      E = unname(mww$estimate),
                      p.value = unname(mww$p.value),
                      yL = testName,
                      p.value.sig = mww$p.value < kAlpha)

    # t-test (Welch, unpaired)
    welch <- t.test(patch_res, nopatch_res, paired = FALSE)
    printf(paste(" Welch t-test t=%.4f, df = %.2f, p=%.4f,",
                 "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
           welch$statistic, welch$parameter, welch$p.value,
           welch$conf.int[1], welch$conf.int[2],
           welch$estimate[1] - welch$estimate[2])
  }

  df <- rbindlist(rows)

  if (nrow(df) == 0) {
    printf("No benchmark had enough results; skipping the summary plot\n")
  } else {
    # Map significance to a labelled factor so that colours and legend
    # labels stay correct even when only one of the two outcomes occurs.
    df[, significance := factor(p.value.sig,
                                levels = c(FALSE, TRUE),
                                labels = c("not significant", "significant"))]

    # Benchmarks are plotted in order of increasing estimated effect size.
    df2 <- cbind(x = seq_len(nrow(df)), df[order(E), ])

    # The axis title reports the sample count of the last benchmark read.
    speedup <- ggplot(df2, aes(x = x, y = E, colour = significance)) +
      geom_errorbar(aes(ymin = L, ymax = R), colour = "black") +
      geom_point(size = 4) +
      scale_x_discrete(limits = df2$yL,
                       name = paste("Benchmark, n=", length(patch_res))) +
      theme_bw() +
      geom_hline(yintercept = 0) +
      ylab("Est. Effect Size in Points") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
      scale_colour_manual(name = "Statistical Significance (MWW, p < 0.05)",
                          values = c("not significant" = "red",
                                     "significant" = "green")) +
      theme(legend.justification = c(0, 1), legend.position = c(0, 1))
    print(speedup)
    ggsave(filename = file.path(outputPath, "speedup-estimates.svg"),
           plot = speedup, width = 7, height = 7)
  }
}