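Add baseline comparisons for latency and accuracy.

For each benchmark, the TFLite_CPU result is used as the baseline. Results
from other backends are reported together with their latency and accuracy
differences relative to that baseline, marked as better, worse or same.
Results for known models (mobilenet_v1 quant, mobilenet_v1 float, tts) are
merged into a single results table per model group.

Test: build_and_run_benchmark.sh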
Bug: 115720411
Change-Id: I567d7b44710b15cffe2f2ecfabe7d8bf3c4f882d
Merged-In: I567d7b44710b15cffe2f2ecfabe7d8bf3c4f882d
(cherry picked from commit 5720de2b5a3e74a4fb048e8682b7febe2b35506d)
diff --git a/results/generate_result.py b/results/generate_result.py
index 772fcbd..eb298b6 100755
--- a/results/generate_result.py
+++ b/results/generate_result.py
@@ -27,12 +27,14 @@
import collections
import csv
import os
+import re
class ScoreException(Exception):
"""Generator base exception type. """
pass
+
BenchmarkResult = collections.namedtuple(
'BenchmarkResult',
['name', 'backend_type', 'iterations', 'total_time_sec', 'max_single_error',
@@ -40,6 +42,17 @@
'time_freq_start_sec', 'time_freq_step_sec', 'time_freq_sec'])
+ResultsWithBaseline = collections.namedtuple(
+ 'ResultsWithBaseline',
+ ['baseline', 'other'])
+
+
+BASELINE_BACKEND = 'TFLite_CPU'
+KNOWN_GROUPS = [(re.compile('mobilenet_v1.*quant.*'), 'mobilenet_v1_quant'),
+ (re.compile('mobilenet_v1.*'), 'mobilenet_v1_float'),
+ (re.compile('tts.*'), 'tts')]
+
+
def parse_csv_input(input_filename):
"""Parse input CSV file, returns: (benchmarkInfo, list of BenchmarkResult)."""
with open(input_filename, 'r') as csvfile:
@@ -71,14 +84,31 @@
for result in results:
groupings[result.name].append(result)
- # Sort by backend type inside groups
- for name in groupings:
- groupings[name] = sorted(groupings[name], key=lambda x: x.backend_type)
+ # Find baseline for each group, make ResultsWithBaseline for each name
+ groupings_baseline = {}
+ for name, results in groupings.items():
+ baseline = next(filter(lambda x: x.backend_type == BASELINE_BACKEND,
+ results))
+ other = sorted(filter(lambda x: x is not baseline, results),
+ key=lambda x: x.backend_type)
+ groupings_baseline[name] = ResultsWithBaseline(
+ baseline=baseline,
+ other=other)
+
+ # Merge ResultsWithBaseline for known groups
+ known_groupings_baseline = collections.defaultdict(list)
+ for name, result_with_bl in sorted(groupings_baseline.items()):
+ group_name = name
+ for known_group in KNOWN_GROUPS:
+ if known_group[0].match(result_with_bl.baseline.name):
+ group_name = known_group[1]
+ break
+ known_groupings_baseline[group_name].append(result_with_bl)
# Turn into a list sorted by name
groupings_list = []
- for name in sorted(groupings.keys()):
- groupings_list.append(groupings[name])
+ for name, results_wbl in sorted(known_groupings_baseline.items()):
+ groupings_list.append(results_wbl)
return groupings_list
@@ -106,27 +136,64 @@
evaluator_keys[1] == 'max_log_f0_error')
-def generate_accuracy_headers(entries_group):
+def generate_accuracy_headers(result):
"""Accuracy-related headers for result table."""
- if is_topk_evaluator(entries_group[0].evaluator_keys):
+ if is_topk_evaluator(result.evaluator_keys):
return ACCURACY_HEADERS_TOPK_TEMPLATE
- elif is_melceplogf0_evaluator(entries_group[0].evaluator_keys):
+ elif is_melceplogf0_evaluator(result.evaluator_keys):
return ACCURACY_HEADERS_MELCEPLOGF0_TEMPLATE
- elif entries_group[0].evaluator_keys:
+ elif result.evaluator_keys:
return ACCURACY_HEADERS_BASIC_TEMPLATE
- raise ScoreException('Unknown accuracy headers for: ' + str(entries_group[0]))
+ raise ScoreException('Unknown accuracy headers for: ' + str(result))
-def generate_accuracy_values(result):
+def get_diff_span(value, same_delta, positive_is_better):
+ if abs(value) < same_delta:
+ return 'same'
+ if positive_is_better and value > 0 or not positive_is_better and value < 0:
+ return 'better'
+ return 'worse'
+
+
+def generate_accuracy_values(baseline, result):
"""Accuracy-related data for result table."""
if is_topk_evaluator(result.evaluator_keys):
- return ACCURACY_VALUES_TOPK_TEMPLATE.format(
- top1=float(result.evaluator_values[0]) * 100.0,
- top2=float(result.evaluator_values[1]) * 100.0,
- top3=float(result.evaluator_values[2]) * 100.0,
- top4=float(result.evaluator_values[3]) * 100.0,
- top5=float(result.evaluator_values[4]) * 100.0)
+ val = [float(x) * 100.0 for x in result.evaluator_values]
+ if result is baseline:
+ topk = [TOPK_BASELINE_TEMPLATE.format(val=x) for x in val]
+ return ACCURACY_VALUES_TOPK_TEMPLATE.format(
+ top1=topk[0], top2=topk[1], top3=topk[2], top4=topk[3],
+ top5=topk[4]
+ )
+ else:
+ base = [float(x) * 100.0 for x in baseline.evaluator_values]
+ diff = [a - b for a, b in zip(val, base)]
+ topk = [TOPK_DIFF_TEMPLATE.format(
+ val=v, diff=d,span=get_diff_span(d, 1.0, positive_is_better=True))
+ for v, d in zip(val, diff)]
+ return ACCURACY_VALUES_TOPK_TEMPLATE.format(
+ top1=topk[0], top2=topk[1], top3=topk[2], top4=topk[3],
+ top5=topk[4]
+ )
elif is_melceplogf0_evaluator(result.evaluator_keys):
+ val = [float(x) for x in result.evaluator_values + [result.max_single_error]]
+ if result is baseline:
+ return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
+ max_log_f0=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[0]),
+ max_mel_cep_distortion=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[1]),
+ max_single_error=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[2]),
+ )
+ else:
+ base = [float(x) for x in baseline.evaluator_values + [baseline.max_single_error]]
+ diff = [a - b for a, b in zip(val, base)]
+ v = [MELCEPLOGF0_DIFF_TEMPLATE.format(
+ val=v, diff=d, span=get_diff_span(d, 1.0, positive_is_better=False))
+ for v, d in zip(val, diff)]
+ return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
+ max_log_f0=v[0],
+ max_mel_cep_distortion=v[1],
+ max_single_error=v[2],
+ )
return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
max_log_f0=float(result.evaluator_values[0]),
max_mel_cep_distortion=float(result.evaluator_values[1]),
@@ -140,7 +207,44 @@
def getchartjs_source():
- return open(os.path.dirname(os.path.abspath(__file__)) + "/" + CHART_JS_FILE).read()
+ return open(os.path.dirname(os.path.abspath(__file__)) + '/' +
+ CHART_JS_FILE).read()
+
+
+def generate_avg_ms(baseline, result):
+ """Generate average latency value."""
+ if result is None:
+ result = baseline
+
+ result_avg_ms = (result.total_time_sec / result.iterations)*1000.0
+ if result is baseline:
+ return LATENCY_BASELINE_TEMPLATE.format(val=result_avg_ms)
+ baseline_avg_ms = (baseline.total_time_sec / baseline.iterations)*1000.0
+ diff = (result_avg_ms/baseline_avg_ms - 1.0) * 100.0
+ diff_val = result_avg_ms - baseline_avg_ms
+ return LATENCY_DIFF_TEMPLATE.format(
+ val=result_avg_ms,
+ diff=diff,
+ diff_val=diff_val,
+ span=get_diff_span(diff, same_delta=1.0, positive_is_better=False))
+
+
+def generate_result_entry(baseline, result):
+ if result is None:
+ result = baseline
+
+ return RESULT_ENTRY_TEMPLATE.format(
+ row_class='baseline' if result is baseline else 'normal',
+ name=result.name,
+ backend=result.backend_type,
+ i=id(result),
+ iterations=result.iterations,
+ testset_size=result.testset_size,
+ accuracy_values=generate_accuracy_values(baseline, result),
+ avg_ms=generate_avg_ms(baseline, result),
+ freq_data=get_frequency_graph(result.time_freq_start_sec,
+ result.time_freq_step_sec,
+ result.time_freq_sec))
def generate_result(benchmark_info, data):
@@ -153,20 +257,17 @@
),
results_list=''.join((
RESULT_GROUP_TEMPLATE.format(
- accuracy_headers=generate_accuracy_headers(entries_group),
+ accuracy_headers=generate_accuracy_headers(
+ entries_group[0].baseline),
results=''.join((
- RESULT_ENTRY_TEMPLATE.format(
- name=result.name,
- backend=result.backend_type,
- i=id(result),
- iterations=result.iterations,
- testset_size=result.testset_size,
- accuracy_values=generate_accuracy_values(result),
- avg_ms=(result.total_time_sec / result.iterations)*1000.0,
- freq_data=get_frequency_graph(result.time_freq_start_sec,
- result.time_freq_step_sec,
- result.time_freq_sec)
- ) for i, result in enumerate(entries_group))
+ RESULT_ENTRY_WITH_BASELINE_TEMPLATE.format(
+ baseline=generate_result_entry(
+ result_and_bl.baseline, None),
+ other=''.join(
+ generate_result_entry(
+ result_and_bl.baseline, x)
+ for x in result_and_bl.other)
+ ) for i, result_and_bl in enumerate(entries_group))
)
) for entries_group in group_results(data))
))
@@ -188,10 +289,10 @@
# Templates below
MAIN_TEMPLATE = """<!doctype html>
-<html lang="en-US">
+<html lang='en-US'>
<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
+ <meta http-equiv='Content-Type' content='text/html; charset=utf-8'>
+ <script src='https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js'></script>
<script>{jsdeps}</script>
<title>MLTS results</title>
<style>
@@ -203,6 +304,19 @@
border: 1px solid #ddd;
padding: 6px;
}}
+ .results tbody {{
+ border-top: 8px solid #666;
+ border-bottom: 8px solid #666;
+ }}
+ span.better {{
+ color: #070;
+ }}
+ span.worse {{
+ color: #700;
+ }}
+ span.same {{
+ color: #000;
+ }}
.results tr:nth-child(even) {{background-color: #eee;}}
.results tr:hover {{background-color: #ddd;}}
.results th {{
@@ -220,13 +334,13 @@
</body>
</html>"""
-DEVICE_INFO_TEMPLATE = """<div id="device_info">
+DEVICE_INFO_TEMPLATE = """<div id='device_info'>
Benchmark for {device_info}, started at {benchmark_time}
</div>"""
RESULT_GROUP_TEMPLATE = """<div>
-<table class="results">
+<table class='results'>
<tr>
<th>Name</th>
<th>Backend</th>
@@ -241,19 +355,25 @@
</div>"""
+RESULT_ENTRY_WITH_BASELINE_TEMPLATE = """
+ <tbody>
+ {baseline}
+ {other}
+ </tbody>
+"""
+
RESULT_ENTRY_TEMPLATE = """
- <tr>
+ <tr class={row_class}>
<td>{name}</td>
<td>{backend}</td>
<td>{iterations:d}</td>
<td>{testset_size:d}</td>
- <td>{avg_ms:.2f}ms</td>
+ <td>{avg_ms}</td>
{accuracy_values}
- <td class="container" style="width: 300px;">
- <canvas id="latency_chart{i}" class="latency_chart"></canvas>
- </td>
- </tr>
- <script>
+ <td class='container' style='width: 200px;'>
+ <canvas id='latency_chart{i}' class='latency_chart'></canvas>
+ </td>
+ <script>
$(function() {{
var freqData = {{
labels: {freq_data[0]},
@@ -273,7 +393,7 @@
options: {{
responsive: true,
title: {{
- display: true,
+ display: false,
text: 'Latency frequency'
}},
legend: {{
@@ -286,7 +406,7 @@
}}],
yAxes: [{{
scaleLabel: {{
- display: true,
+ display: false,
labelString: 'Iterations Count'
}}
}}]
@@ -294,7 +414,12 @@
}}
}});
}});
- </script>"""
+ </script>
+ </tr>"""
+
+LATENCY_BASELINE_TEMPLATE = """{val:.2f}ms"""
+LATENCY_DIFF_TEMPLATE = """{val:.2f}ms <span class='{span}'>
+({diff_val:.2f}ms, {diff:.1f}%)</span>"""
ACCURACY_HEADERS_TOPK_TEMPLATE = """
@@ -304,14 +429,16 @@
<th>Top 4</th>
<th>Top 5</th>
"""
-
ACCURACY_VALUES_TOPK_TEMPLATE = """
-<td>{top1:.3f}%</td>
-<td>{top2:.3f}%</td>
-<td>{top3:.3f}%</td>
-<td>{top4:.3f}%</td>
-<td>{top5:.3f}%</td>
+<td>{top1}</td>
+<td>{top2}</td>
+<td>{top3}</td>
+<td>{top4}</td>
+<td>{top5}</td>
"""
+TOPK_BASELINE_TEMPLATE = """{val:.3f}%"""
+TOPK_DIFF_TEMPLATE = """{val:.3f}% <span class='{span}'>({diff:.1f}%)</span>"""
+
ACCURACY_HEADERS_MELCEPLOGF0_TEMPLATE = """
<th>Max log(F0) error</th>
@@ -320,11 +447,14 @@
"""
ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE = """
-<td>{max_log_f0:.2E}</td>
-<td>{max_mel_cep_distortion:.2E}</td>
-<td>{max_single_error:.2E}</td>
+<td>{max_log_f0}</td>
+<td>{max_mel_cep_distortion}</td>
+<td>{max_single_error}</td>
"""
+MELCEPLOGF0_BASELINE_TEMPLATE = """{val:.2E}"""
+MELCEPLOGF0_DIFF_TEMPLATE = """{val:.2E} <span class='{span}'>({diff:.1f}%)</span>"""
+
ACCURACY_HEADERS_BASIC_TEMPLATE = """
<th>Max single scalar error</th>
@@ -335,7 +465,6 @@
<td>{max_single_error:.2f}</td>
"""
-
CHART_JS_FILE = "Chart.bundle.min.js"
if __name__ == '__main__':