Gold support in PDFium

Extends the PDFium tests to collect images and meta data to be uploaded
to Gold. This feature is triggered by adding the --gold_* flags.

It extends pdfium_test to output the MD5 hash of the underlying pixel
buffer for each page it renders.
That output is then processed by test_runner.py to generate the
gold meta data.

This behavior is modeled after the 'dm' tool in skia. See
https://skia.googlesource.com/skia/+/master/dm/DM.cpp#1090

This should not cause any change in the current behavior of the tests;
it will only be triggered once we update the buildbot recipe.

BUG=skia:5973

Review-Url: https://codereview.chromium.org/2578893004
diff --git a/testing/tools/common.py b/testing/tools/common.py
index 1e1d257..a0cc946 100755
--- a/testing/tools/common.py
+++ b/testing/tools/common.py
@@ -25,6 +25,24 @@
   except subprocess.CalledProcessError as e:
     return e
 
+# RunCommandExtractHashedFiles returns a tuple: (raised_exception, hashed_files)
+# It runs the given command. If it fails it returns the exception and None.
+# If it succeeds it returns None and the list of processed files extracted
+# from the output of the command. It expects lines in this format:
+#    MD5:<path_to_image_file>:<md5_hash_in_hex>
+# The returned hashed_files is a list of [file_path, MD5-hash] pairs.
+def RunCommandExtractHashedFiles(cmd):
+  prefix = "MD5:"
+  try:
+    output = subprocess.check_output(cmd, universal_newlines=True)
+  except subprocess.CalledProcessError as e:
+    return e, None
+  # Slice the prefix off: lstrip("MD5:") strips a character set, not a prefix,
+  # and would also eat leading 'M', 'D', '5' or ':' characters of the path.
+  lines = [line.strip() for line in output.split('\n')]
+  return None, [[x.strip() for x in line[len(prefix):].rsplit(":", 1)]
+                for line in lines if line.startswith(prefix)]
+
 # Adjust Dr. Memory wrapper to have separate log directory for each test
 # for better error reporting.
 def DrMemoryWrapper(wrapper, pdf_name):
diff --git a/testing/tools/gold.py b/testing/tools/gold.py
new file mode 100644
index 0000000..fda63b6
--- /dev/null
+++ b/testing/tools/gold.py
@@ -0,0 +1,126 @@
+# Copyright 2015 The PDFium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+
+import json
+import os
+import shlex
+import shutil
+
+# This module collects and writes output in a format expected by the
+# Gold baseline tool. Based on meta data provided explicitly and by
+# adding a series of test results it can be used to produce
+# a JSON file that is uploaded to Google Storage and ingested by Gold.
+#
+# The output will look similar to this:
+#
+# {
+#    "build_number" : "2",
+#    "gitHash" : "a4a338179013b029d6dd55e737b5bd648a9fb68c",
+#    "key" : {
+#       "arch" : "arm64",
+#       "compiler" : "Clang"
+#    },
+#    "results" : [
+#       {
+#          "key" : {
+#             "config" : "vk",
+#             "name" : "yuv_nv12_to_rgb_effect",
+#             "source_type" : "gm"
+#          },
+#          "md5" : "7db34da246868d50ab9ddd776ce6d779",
+#          "options" : {
+#             "ext" : "png",
+#             "gamma_correct" : "no"
+#          }
+#       },
+#       {
+#          "key" : {
+#             "config" : "vk",
+#             "name" : "yuv_to_rgb_effect",
+#             "source_type" : "gm"
+#          },
+#          "md5" : "0b955f387740c66eb23bf0e253c80d64",
+#          "options" : {
+#             "ext" : "png",
+#             "gamma_correct" : "no"
+#          }
+#       }
+#    ]
+# }
+#
+class GoldResults(object):
+  """Accumulates test results and writes them out as a Gold dm.json file."""
+
+  def __init__(self, source_type, outputDir, propertiesStr, keyStr):
+    """
+    source_type   corpus name stored as 'source_type' in every result key.
+    outputDir     directory that receives the copied images and dm.json.
+    propertiesStr space separated key/value pairs that become the top
+                  level fields of the output JSON file.
+    keyStr        space separated key/value pairs that become the 'key'
+                  field of the output JSON file.
+    """
+    self._source_type = source_type
+    self._outputDir = outputDir
+    self._results = []
+    self._properties = self._parseKeyValuePairs(propertiesStr)
+    # The 'key' dictionary is nested inside the top level properties.
+    self._properties["key"] = self._parseKeyValuePairs(keyStr)
+
+  def _parseKeyValuePairs(self, kvStr):
+    """Turns 'k1 v1 k2 v2' (shell-style quoting allowed) into a dict."""
+    tokens = shlex.split(kvStr)
+    if len(tokens) % 2 != 0:
+      raise ValueError("Uneven number of key/value pairs. Got %s" % kvStr)
+    # Even positions are keys, odd positions are their values.
+    return dict(zip(tokens[0::2], tokens[1::2]))
+
+  def AddTestResult(self, testName, md5Hash, outputImagePath):
+    """Copies the image to <outputDir>/<md5Hash>.<ext> and records it."""
+    _, dottedExt = os.path.splitext(outputImagePath)
+    imgExt = dottedExt.lstrip(".")
+    if not imgExt:
+      raise ValueError("File %s does not have an extension" % outputImagePath)
+    destination = os.path.join(self._outputDir, md5Hash + "." + imgExt)
+    shutil.copy2(outputImagePath, destination)
+
+    # Record the result; the image is identified by its hash and extension.
+    resultKey = {"name": testName, "source_type": self._source_type}
+    resultOptions = {"ext": imgExt, "gamma_correct": "no"}
+    self._results.append({
+      "key": resultKey,
+      "md5": md5Hash,
+      "options": resultOptions,
+    })
+
+  def WriteResults(self):
+    """Writes all properties and collected results to <outputDir>/dm.json."""
+    # The results list is attached under the 'results' top level field.
+    self._properties["results"] = self._results
+    jsonPath = os.path.join(self._outputDir, "dm.json")
+    with open(jsonPath, 'wb') as outfile:
+      json.dump(self._properties, outfile, indent=1)
+      outfile.write("\n")
+
+# Produce example output for manual testing.
+if __name__ == "__main__":
+  samples = [("test-1", "hash-1", "image1.png"),
+             ("test-2", "hash-2", "image2.png"),
+             ("test-3", "hash-3", "image3.png")]
+  # Create a test directory with one empty 'image' file per sample.
+  testDir = "./testdirectory"
+  if not os.path.exists(testDir):
+    os.makedirs(testDir)
+  for _, _, imageName in samples:
+    open(os.path.join(testDir, imageName), 'wb').close()
+
+  # Create an instance, add one result per sample and write dm.json.
+  propStr = ('build_number 2 "builder name" Builder-Name '
+             'gitHash a4a338179013b029d6dd55e737b5bd648a9fb68c')
+  keyStr = "arch arm64 compiler Clang configuration Debug"
+  gr = GoldResults("pdfium", testDir, propStr, keyStr)
+  for testName, md5Hash, imageName in samples:
+    gr.AddTestResult(testName, md5Hash, os.path.join(testDir, imageName))
+  gr.WriteResults()
diff --git a/testing/tools/test_runner.py b/testing/tools/test_runner.py
index 5c37706..fad7a9c 100644
--- a/testing/tools/test_runner.py
+++ b/testing/tools/test_runner.py
@@ -14,6 +14,7 @@
 import sys
 
 import common
+import gold
 import pngdiffer
 import suppressor
 
@@ -39,6 +40,10 @@
   def __init__(self, dirname):
     self.test_dir = dirname
 
+  # GenerateAndTest returns a tuple (success, outputfiles) where
+  # success is a boolean indicating whether the test passed the comparison
+  # tests and outputfiles is a list of pairs:
+  #          (path_to_image, md5_hash_of_pixelbuffer)
   def GenerateAndTest(self, input_filename, source_dir):
     input_root, _ = os.path.splitext(input_filename)
     expected_txt_path = os.path.join(source_dir, input_root + '_expected.txt')
@@ -59,23 +64,23 @@
 
     if raised_exception != None:
       print "FAILURE: " + input_filename + "; " + str(raised_exception)
-      return False
+      return False, []
 
+    results = []
     if os.path.exists(expected_txt_path):
       raised_exception = self.TestText(input_root, expected_txt_path, pdf_path)
     else:
-      raised_exception = self.TestPixel(input_root, pdf_path)
+      raised_exception, results = self.TestPixel(input_root, pdf_path)
 
     if raised_exception != None:
       print "FAILURE: " + input_filename + "; " + str(raised_exception)
-      return False
+      return False, results
 
     if len(actual_images):
       if self.image_differ.HasDifferences(input_filename, source_dir,
                                           self.working_dir):
-        return False
-
-    return True
+        return False, results
+    return True, results
 
   def Generate(self, source_dir, input_filename, input_root, pdf_path):
     original_path = os.path.join(source_dir, input_filename)
@@ -113,12 +118,20 @@
 
   def TestPixel(self, input_root, pdf_path):
     cmd_to_run = common.DrMemoryWrapper(self.drmem_wrapper, input_root)
-    cmd_to_run.extend([self.pdfium_test_path, '--send-events', '--png',
-                       pdf_path])
-    return common.RunCommand(cmd_to_run)
-
+    # Request MD5 hashes from pdfium_test only when collecting Gold results.
+    md5_flag = ['--md5'] if self.gold_results else []
+    cmd_to_run += [self.pdfium_test_path, '--send-events', '--png'] + md5_flag
+    cmd_to_run.append(pdf_path)
+    return common.RunCommandExtractHashedFiles(cmd_to_run)
 
   def HandleResult(self, input_filename, input_path, result):
+    # |result| is the (success, image_paths) tuple from GenerateAndTest. A tuple
+    # is always truthy, so keep only the flag. image_paths may be None on error.
+    result, image_paths = result
+    for img_path, md5_hash in (self.gold_results and image_paths) or []:
+      test_name = os.path.splitext(os.path.basename(img_path))[0]
+      self.gold_results.AddTestResult(test_name, md5_hash, img_path)
+
     if self.test_suppressor.IsResultSuppressed(input_filename):
       if result:
         self.surprises.append(input_path)
@@ -129,13 +142,29 @@
 
   def Run(self):
     parser = optparse.OptionParser()
+
     parser.add_option('--build-dir', default=os.path.join('out', 'Debug'),
                       help='relative path from the base source directory')
+
     parser.add_option('-j', default=multiprocessing.cpu_count(),
                       dest='num_workers', type='int',
                       help='run NUM_WORKERS jobs in parallel')
+
     parser.add_option('--wrapper', default='', dest="wrapper",
                       help='wrapper for running test under Dr. Memory')
+
+    parser.add_option('--gold_properties', default='', dest="gold_properties",
+                      help='Key value pairs that are written to the top level of the JSON file that is ingested by Gold.')
+
+    parser.add_option('--gold_key', default='', dest="gold_key",
+                      help='Key value pairs that are added to the "key" field of the JSON file that is ingested by Gold.')
+
+    parser.add_option('--gold_output_dir', default='', dest="gold_output_dir",
+                      help='Path of where to write the JSON output to be uploaded to Gold.')
+
+    parser.add_option('--ignore_errors', action="store_true", dest="ignore_errors",
+                      help='Prevents the return value from being non-zero when image comparison fails.')
+
     options, args = parser.parse_args()
 
     finder = common.DirectoryFinder(options.build_dir)
@@ -191,6 +220,14 @@
     self.failures = []
     self.surprises = []
 
+    # Collect Gold results if an output directory was named.
+    self.gold_results = None
+    if options.gold_output_dir:
+      self.gold_results = gold.GoldResults("pdfium",
+                                           options.gold_output_dir,
+                                           options.gold_properties,
+                                           options.gold_key)
+
     if options.num_workers > 1 and len(test_cases) > 1:
       try:
         pool = multiprocessing.Pool(options.num_workers)
@@ -215,6 +252,9 @@
         self.HandleResult(input_filename,
                           os.path.join(input_file_dir, input_filename), result)
 
+    if self.gold_results:
+      self.gold_results.WriteResults()
+
     if self.surprises:
       self.surprises.sort()
       print '\n\nUnexpected Successes:'
@@ -226,6 +266,6 @@
       print '\n\nSummary of Failures:'
       for failure in self.failures:
         print failure
-      return 1
-
+      if not options.ignore_errors:
+        return 1
     return 0