Add baseline comparisons for latency and accuracy.

Test: build_and_run_benchmark.sh
Bug: 115720411
Change-Id: I567d7b44710b15cffe2f2ecfabe7d8bf3c4f882d
Merged-In: I567d7b44710b15cffe2f2ecfabe7d8bf3c4f882d
(cherry picked from commit 5720de2b5a3e74a4fb048e8682b7febe2b35506d)
diff --git a/results/generate_result.py b/results/generate_result.py
index 772fcbd..eb298b6 100755
--- a/results/generate_result.py
+++ b/results/generate_result.py
@@ -27,12 +27,14 @@
 import collections
 import csv
 import os
+import re
 
 
 class ScoreException(Exception):
   """Generator base exception type. """
   pass
 
+
 BenchmarkResult = collections.namedtuple(
     'BenchmarkResult',
     ['name', 'backend_type', 'iterations', 'total_time_sec', 'max_single_error',
@@ -40,6 +42,17 @@
      'time_freq_start_sec', 'time_freq_step_sec', 'time_freq_sec'])
 
 
+ResultsWithBaseline = collections.namedtuple(
+    'ResultsWithBaseline',
+    ['baseline', 'other'])
+
+
+BASELINE_BACKEND = 'TFLite_CPU'
+KNOWN_GROUPS = [(re.compile('mobilenet_v1.*quant.*'), 'mobilenet_v1_quant'),
+                (re.compile('mobilenet_v1.*'), 'mobilenet_v1_float'),
+                (re.compile('tts.*'), 'tts')]
+
+
 def parse_csv_input(input_filename):
   """Parse input CSV file, returns: (benchmarkInfo, list of BenchmarkResult)."""
   with open(input_filename, 'r') as csvfile:
@@ -71,14 +84,31 @@
   for result in results:
     groupings[result.name].append(result)
 
-  # Sort by backend type inside groups
-  for name in groupings:
-    groupings[name] = sorted(groupings[name], key=lambda x: x.backend_type)
+  # Find baseline for each group, make ResultsWithBaseline for each name
+  groupings_baseline = {}
+  for name, results in groupings.items():
+    baseline = next(filter(lambda x: x.backend_type == BASELINE_BACKEND,
+                           results))
+    other = sorted(filter(lambda x: x is not baseline, results),
+                   key=lambda x: x.backend_type)
+    groupings_baseline[name] = ResultsWithBaseline(
+        baseline=baseline,
+        other=other)
+
+  # Merge ResultsWithBaseline for known groups
+  known_groupings_baseline = collections.defaultdict(list)
+  for name, result_with_bl in sorted(groupings_baseline.items()):
+    group_name = name
+    for known_group in KNOWN_GROUPS:
+      if known_group[0].match(result_with_bl.baseline.name):
+        group_name = known_group[1]
+        break
+    known_groupings_baseline[group_name].append(result_with_bl)
 
   # Turn into a list sorted by name
   groupings_list = []
-  for name in sorted(groupings.keys()):
-    groupings_list.append(groupings[name])
+  for name, results_wbl in sorted(known_groupings_baseline.items()):
+    groupings_list.append(results_wbl)
   return groupings_list
 
 
@@ -106,27 +136,64 @@
           evaluator_keys[1] == 'max_log_f0_error')
 
 
-def generate_accuracy_headers(entries_group):
+def generate_accuracy_headers(result):
   """Accuracy-related headers for result table."""
-  if is_topk_evaluator(entries_group[0].evaluator_keys):
+  if is_topk_evaluator(result.evaluator_keys):
     return ACCURACY_HEADERS_TOPK_TEMPLATE
-  elif is_melceplogf0_evaluator(entries_group[0].evaluator_keys):
+  elif is_melceplogf0_evaluator(result.evaluator_keys):
     return ACCURACY_HEADERS_MELCEPLOGF0_TEMPLATE
-  elif entries_group[0].evaluator_keys:
+  elif result.evaluator_keys:
     return ACCURACY_HEADERS_BASIC_TEMPLATE
-  raise ScoreException('Unknown accuracy headers for: ' + str(entries_group[0]))
+  raise ScoreException('Unknown accuracy headers for: ' + str(result))
 
 
-def generate_accuracy_values(result):
+def get_diff_span(value, same_delta, positive_is_better):
+  if abs(value) < same_delta:
+    return 'same'
+  if positive_is_better and value > 0 or not positive_is_better and value < 0:
+    return 'better'
+  return 'worse'
+
+
+def generate_accuracy_values(baseline, result):
   """Accuracy-related data for result table."""
   if is_topk_evaluator(result.evaluator_keys):
-    return ACCURACY_VALUES_TOPK_TEMPLATE.format(
-        top1=float(result.evaluator_values[0]) * 100.0,
-        top2=float(result.evaluator_values[1]) * 100.0,
-        top3=float(result.evaluator_values[2]) * 100.0,
-        top4=float(result.evaluator_values[3]) * 100.0,
-        top5=float(result.evaluator_values[4]) * 100.0)
+    val = [float(x) * 100.0 for x in result.evaluator_values]
+    if result is baseline:
+      topk = [TOPK_BASELINE_TEMPLATE.format(val=x) for x in val]
+      return ACCURACY_VALUES_TOPK_TEMPLATE.format(
+          top1=topk[0], top2=topk[1], top3=topk[2], top4=topk[3],
+          top5=topk[4]
+      )
+    else:
+      base = [float(x) * 100.0 for x in baseline.evaluator_values]
+      diff = [a - b for a, b in zip(val, base)]
+      topk = [TOPK_DIFF_TEMPLATE.format(
+          val=v, diff=d, span=get_diff_span(d, 1.0, positive_is_better=True))
+              for v, d in zip(val, diff)]
+      return ACCURACY_VALUES_TOPK_TEMPLATE.format(
+          top1=topk[0], top2=topk[1], top3=topk[2], top4=topk[3],
+          top5=topk[4]
+      )
   elif is_melceplogf0_evaluator(result.evaluator_keys):
+    val = [float(x) for x in result.evaluator_values + [result.max_single_error]]
+    if result is baseline:
+      return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
+        max_log_f0=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[0]),
+        max_mel_cep_distortion=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[1]),
+        max_single_error=MELCEPLOGF0_BASELINE_TEMPLATE.format(val=val[2]),
+      )
+    else:
+      base = [float(x) for x in baseline.evaluator_values + [baseline.max_single_error]]
+      diff = [a - b for a, b in zip(val, base)]
+      v = [MELCEPLOGF0_DIFF_TEMPLATE.format(
+          val=v, diff=d, span=get_diff_span(d, 1.0, positive_is_better=False))
+              for v, d in zip(val, diff)]
+      return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
+        max_log_f0=v[0],
+        max_mel_cep_distortion=v[1],
+        max_single_error=v[2],
+      )
     return ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE.format(
         max_log_f0=float(result.evaluator_values[0]),
         max_mel_cep_distortion=float(result.evaluator_values[1]),
@@ -140,7 +207,44 @@
 
 
 def getchartjs_source():
-  return open(os.path.dirname(os.path.abspath(__file__)) + "/" + CHART_JS_FILE).read()
+  return open(os.path.dirname(os.path.abspath(__file__)) + '/' +
+              CHART_JS_FILE).read()
+
+
+def generate_avg_ms(baseline, result):
+  """Generate average latency value."""
+  if result is None:
+    result = baseline
+
+  result_avg_ms = (result.total_time_sec / result.iterations)*1000.0
+  if result is baseline:
+    return LATENCY_BASELINE_TEMPLATE.format(val=result_avg_ms)
+  baseline_avg_ms = (baseline.total_time_sec / baseline.iterations)*1000.0
+  diff = (result_avg_ms/baseline_avg_ms - 1.0) * 100.0
+  diff_val = result_avg_ms - baseline_avg_ms
+  return LATENCY_DIFF_TEMPLATE.format(
+      val=result_avg_ms,
+      diff=diff,
+      diff_val=diff_val,
+      span=get_diff_span(diff, same_delta=1.0, positive_is_better=False))
+
+
+def generate_result_entry(baseline, result):
+  if result is None:
+    result = baseline
+
+  return RESULT_ENTRY_TEMPLATE.format(
+      row_class='baseline' if result is baseline else 'normal',
+      name=result.name,
+      backend=result.backend_type,
+      i=id(result),
+      iterations=result.iterations,
+      testset_size=result.testset_size,
+      accuracy_values=generate_accuracy_values(baseline, result),
+      avg_ms=generate_avg_ms(baseline, result),
+      freq_data=get_frequency_graph(result.time_freq_start_sec,
+                                    result.time_freq_step_sec,
+                                    result.time_freq_sec))
 
 
 def generate_result(benchmark_info, data):
@@ -153,20 +257,17 @@
           ),
       results_list=''.join((
           RESULT_GROUP_TEMPLATE.format(
-              accuracy_headers=generate_accuracy_headers(entries_group),
+              accuracy_headers=generate_accuracy_headers(
+                  entries_group[0].baseline),
               results=''.join((
-                  RESULT_ENTRY_TEMPLATE.format(
-                      name=result.name,
-                      backend=result.backend_type,
-                      i=id(result),
-                      iterations=result.iterations,
-                      testset_size=result.testset_size,
-                      accuracy_values=generate_accuracy_values(result),
-                      avg_ms=(result.total_time_sec / result.iterations)*1000.0,
-                      freq_data=get_frequency_graph(result.time_freq_start_sec,
-                                                    result.time_freq_step_sec,
-                                                    result.time_freq_sec)
-                  ) for i, result in enumerate(entries_group))
+                  RESULT_ENTRY_WITH_BASELINE_TEMPLATE.format(
+                      baseline=generate_result_entry(
+                          result_and_bl.baseline, None),
+                      other=''.join(
+                          generate_result_entry(
+                              result_and_bl.baseline, x)
+                          for x in result_and_bl.other)
+                  ) for i, result_and_bl in enumerate(entries_group))
                              )
           ) for entries_group in group_results(data))
                           ))
@@ -188,10 +289,10 @@
 # Templates below
 
 MAIN_TEMPLATE = """<!doctype html>
-<html lang="en-US">
+<html lang='en-US'>
 <head>
-  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
+  <meta http-equiv='Content-Type' content='text/html; charset=utf-8'>
+  <script src='https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js'></script>
   <script>{jsdeps}</script>
   <title>MLTS results</title>
   <style>
@@ -203,6 +304,19 @@
       border: 1px solid #ddd;
       padding: 6px;
     }}
+    .results tbody {{
+      border-top: 8px solid #666;
+      border-bottom: 8px solid #666;
+    }}
+    span.better {{
+      color: #070;
+    }}
+    span.worse {{
+      color: #700;
+    }}
+    span.same {{
+      color: #000;
+    }}
     .results tr:nth-child(even) {{background-color: #eee;}}
     .results tr:hover {{background-color: #ddd;}}
     .results th {{
@@ -220,13 +334,13 @@
 </body>
 </html>"""
 
-DEVICE_INFO_TEMPLATE = """<div id="device_info">
+DEVICE_INFO_TEMPLATE = """<div id='device_info'>
 Benchmark for {device_info}, started at {benchmark_time}
 </div>"""
 
 
 RESULT_GROUP_TEMPLATE = """<div>
-<table class="results">
+<table class='results'>
  <tr>
    <th>Name</th>
    <th>Backend</th>
@@ -241,19 +355,25 @@
 </div>"""
 
 
+RESULT_ENTRY_WITH_BASELINE_TEMPLATE = """
+ <tbody>
+ {baseline}
+ {other}
+ </tbody>
+"""
+
 RESULT_ENTRY_TEMPLATE = """
-  <tr>
+  <tr class={row_class}>
    <td>{name}</td>
    <td>{backend}</td>
    <td>{iterations:d}</td>
    <td>{testset_size:d}</td>
-   <td>{avg_ms:.2f}ms</td>
+   <td>{avg_ms}</td>
    {accuracy_values}
-   <td class="container" style="width: 300px;">
-    <canvas id="latency_chart{i}" class="latency_chart"></canvas>
-  </td>
- </tr>
- <script>
+   <td class='container' style='width: 200px;'>
+    <canvas id='latency_chart{i}' class='latency_chart'></canvas>
+   </td>
+  <script>
    $(function() {{
        var freqData = {{
          labels: {freq_data[0]},
@@ -273,7 +393,7 @@
           options: {{
            responsive: true,
            title: {{
-             display: true,
+             display: false,
              text: 'Latency frequency'
            }},
            legend: {{
@@ -286,7 +406,7 @@
             }}],
             yAxes: [{{
               scaleLabel: {{
-                display: true,
+                display: false,
                 labelString: 'Iterations Count'
               }}
             }}]
@@ -294,7 +414,12 @@
          }}
        }});
      }});
-  </script>"""
+    </script>
+  </tr>"""
+
+LATENCY_BASELINE_TEMPLATE = """{val:.2f}ms"""
+LATENCY_DIFF_TEMPLATE = """{val:.2f}ms <span class='{span}'>
+({diff_val:.2f}ms, {diff:.1f}%)</span>"""
 
 
 ACCURACY_HEADERS_TOPK_TEMPLATE = """
@@ -304,14 +429,16 @@
 <th>Top 4</th>
 <th>Top 5</th>
 """
-
 ACCURACY_VALUES_TOPK_TEMPLATE = """
-<td>{top1:.3f}%</td>
-<td>{top2:.3f}%</td>
-<td>{top3:.3f}%</td>
-<td>{top4:.3f}%</td>
-<td>{top5:.3f}%</td>
+<td>{top1}</td>
+<td>{top2}</td>
+<td>{top3}</td>
+<td>{top4}</td>
+<td>{top5}</td>
 """
+TOPK_BASELINE_TEMPLATE = """{val:.3f}%"""
+TOPK_DIFF_TEMPLATE = """{val:.3f}% <span class='{span}'>({diff:.1f}%)</span>"""
+
 
 ACCURACY_HEADERS_MELCEPLOGF0_TEMPLATE = """
 <th>Max log(F0) error</th>
@@ -320,11 +447,14 @@
 """
 
 ACCURACY_VALUES_MELCEPLOGF0_TEMPLATE = """
-<td>{max_log_f0:.2E}</td>
-<td>{max_mel_cep_distortion:.2E}</td>
-<td>{max_single_error:.2E}</td>
+<td>{max_log_f0}</td>
+<td>{max_mel_cep_distortion}</td>
+<td>{max_single_error}</td>
 """
 
+MELCEPLOGF0_BASELINE_TEMPLATE = """{val:.2E}"""
+MELCEPLOGF0_DIFF_TEMPLATE = """{val:.2E} <span class='{span}'>({diff:.1f}%)</span>"""
+
 
 ACCURACY_HEADERS_BASIC_TEMPLATE = """
 <th>Max single scalar error</th>
@@ -335,7 +465,6 @@
 <td>{max_single_error:.2f}</td>
 """
 
-
 CHART_JS_FILE = "Chart.bundle.min.js"
 
 if __name__ == '__main__':