skpbench: add option for gpu timing

Adds a gpu timing option with a GL implementation.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2388433003

Committed: https://skia.googlesource.com/skia/+/c06720d06faab3b01eba1b8693e0ac791f06dc96
Review-Url: https://codereview.chromium.org/2388433003
diff --git a/tools/gpu/FenceSync.h b/tools/gpu/FenceSync.h
index 8f2bbe2..b430f5d 100644
--- a/tools/gpu/FenceSync.h
+++ b/tools/gpu/FenceSync.h
@@ -13,7 +13,7 @@
 namespace sk_gpu_test {
 
 using PlatformFence = uint64_t;
-static constexpr PlatformFence kInvalidPlatformFence = 0;
+static constexpr PlatformFence kInvalidFence = 0;
 
 /*
  * This class provides an interface to interact with fence syncs. A fence sync is an object that the
@@ -29,6 +29,6 @@
     virtual ~FenceSync() {}
 };
 
-}
+}  // namespace sk_gpu_test
 
 #endif
diff --git a/tools/gpu/GpuTimer.h b/tools/gpu/GpuTimer.h
new file mode 100644
index 0000000..7678421
--- /dev/null
+++ b/tools/gpu/GpuTimer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GpuTimer_DEFINED
+#define GpuTimer_DEFINED
+
+#include "SkTypes.h"
+#include "SkExchange.h"
+#include <chrono>
+
+namespace sk_gpu_test {
+
+using PlatformTimerQuery = uint64_t;
+static constexpr PlatformTimerQuery kInvalidTimerQuery = 0;
+
+/**
+ * Platform-independent interface for timing operations on the GPU.
+ */
+class GpuTimer {
+public:
+    GpuTimer(bool disjointSupport)
+        : fDisjointSupport(disjointSupport)
+        , fActiveTimer(kInvalidTimerQuery) {
+    }
+    virtual ~GpuTimer() { SkASSERT(!fActiveTimer); }
+
+    /**
+     * Returns whether this timer can detect disjoint GPU operations while timing. If false, a
+     * query that completes with QueryStatus::kAccurate is less trustworthy.
+     */
+    bool disjointSupport() const { return fDisjointSupport; }
+
+    /**
+     * Inserts a "start timing" command in the GPU command stream.
+     */
+    void queueStart() {
+        SkASSERT(!fActiveTimer);
+        fActiveTimer = this->onQueueTimerStart();
+    }
+
+    /**
+     * Inserts a "stop timing" command in the GPU command stream.
+     *
+     * @return a query object that can retrieve the time elapsed once the timer has completed.
+     */
+    PlatformTimerQuery SK_WARN_UNUSED_RESULT queueStop() {
+        SkASSERT(fActiveTimer);
+        this->onQueueTimerStop(fActiveTimer);
+        return skstd::exchange(fActiveTimer, kInvalidTimerQuery);
+    }
+
+    enum class QueryStatus {
+        kInvalid,  //!< the timer query is invalid.
+        kPending,  //!< the timer is still running on the GPU.
+        kDisjoint, //!< the query is complete, but dubious due to disjoint GPU operations.
+        kAccurate  //!< the query is complete and reliable.
+    };
+
+    virtual QueryStatus checkQueryStatus(PlatformTimerQuery) = 0;
+    virtual std::chrono::nanoseconds getTimeElapsed(PlatformTimerQuery) = 0;
+    virtual void deleteQuery(PlatformTimerQuery) = 0;
+
+private:
+    virtual PlatformTimerQuery onQueueTimerStart() const = 0;
+    virtual void onQueueTimerStop(PlatformTimerQuery) const = 0;
+
+    bool const           fDisjointSupport;
+    PlatformTimerQuery   fActiveTimer;
+};
+
+}  // namespace sk_gpu_test
+
+#endif
diff --git a/tools/gpu/TestContext.cpp b/tools/gpu/TestContext.cpp
index 8a78b90..90aba43 100644
--- a/tools/gpu/TestContext.cpp
+++ b/tools/gpu/TestContext.cpp
@@ -8,8 +8,13 @@
 
 #include "TestContext.h"
 
+#include "GpuTimer.h"
+
 namespace sk_gpu_test {
-TestContext::TestContext() : fFenceSync(nullptr), fCurrentFenceIdx(0) {
+TestContext::TestContext()
+    : fFenceSync(nullptr)
+    , fGpuTimer(nullptr)
+    , fCurrentFenceIdx(0) {
     memset(fFrameFences, 0, sizeof(fFrameFences));
 }
 
@@ -21,6 +26,7 @@
     }
 #endif
     SkASSERT(!fFenceSync);
+    SkASSERT(!fGpuTimer);
 }
 
 void TestContext::makeCurrent() const { this->onPlatformMakeCurrent(); }
@@ -60,9 +66,9 @@
                 fFrameFences[i] = 0;
             }
         }
-        delete fFenceSync;
-        fFenceSync = nullptr;
+        fFenceSync.reset();
     }
+    fGpuTimer.reset();
 }
 
 }
diff --git a/tools/gpu/TestContext.h b/tools/gpu/TestContext.h
index d01cb02..8722a33 100644
--- a/tools/gpu/TestContext.h
+++ b/tools/gpu/TestContext.h
@@ -14,6 +14,9 @@
 #include "../private/SkTemplates.h"
 
 namespace sk_gpu_test {
+
+class GpuTimer;
+
 /**
  * An offscreen 3D context. This class is intended for Skia's internal testing needs and not
  * for general use.
@@ -27,6 +30,9 @@
     bool fenceSyncSupport() const { return fFenceSync != nullptr; }
     FenceSync* fenceSync() { SkASSERT(fFenceSync); return fFenceSync; }
 
+    bool gpuTimingSupport() const { return fGpuTimer != nullptr; }
+    GpuTimer* gpuTimer() const { SkASSERT(fGpuTimer); return fGpuTimer; }
+
     bool getMaxGpuFrameLag(int *maxFrameLag) const {
         if (!fFenceSync) {
             return false;
@@ -75,7 +81,8 @@
     virtual void finish() = 0;
 
 protected:
-    FenceSync* fFenceSync;
+    SkAutoTDelete<FenceSync>   fFenceSync;
+    SkAutoTDelete<GpuTimer>    fGpuTimer;
 
     TestContext();
 
diff --git a/tools/gpu/gl/GLTestContext.cpp b/tools/gpu/gl/GLTestContext.cpp
index 1b077d5..20a9908 100644
--- a/tools/gpu/gl/GLTestContext.cpp
+++ b/tools/gpu/gl/GLTestContext.cpp
@@ -6,6 +6,8 @@
  */
 
 #include "GLTestContext.h"
+
+#include "GpuTimer.h"
 #include "gl/GrGLUtil.h"
 
 namespace {
@@ -78,6 +80,135 @@
     fGLDeleteSync(glsync);
 }
 
+class GLGpuTimer : public sk_gpu_test::GpuTimer {
+public:
+    static GLGpuTimer* CreateIfSupported(const sk_gpu_test::GLTestContext*);
+
+    QueryStatus checkQueryStatus(sk_gpu_test::PlatformTimerQuery) override;
+    std::chrono::nanoseconds getTimeElapsed(sk_gpu_test::PlatformTimerQuery) override;
+    void deleteQuery(sk_gpu_test::PlatformTimerQuery) override;
+
+private:
+    GLGpuTimer(bool disjointSupport, const sk_gpu_test::GLTestContext*, const char* ext = "");
+
+    bool validate() const;
+
+    sk_gpu_test::PlatformTimerQuery onQueueTimerStart() const override;
+    void onQueueTimerStop(sk_gpu_test::PlatformTimerQuery) const override;
+
+    static constexpr GrGLenum GL_QUERY_RESULT            = 0x8866;
+    static constexpr GrGLenum GL_QUERY_RESULT_AVAILABLE  = 0x8867;
+    static constexpr GrGLenum GL_TIME_ELAPSED            = 0x88bf;
+    static constexpr GrGLenum GL_GPU_DISJOINT            = 0x8fbb;
+
+    typedef void (GR_GL_FUNCTION_TYPE* GLGetIntegervProc) (GrGLenum, GrGLint*);
+    typedef void (GR_GL_FUNCTION_TYPE* GLGenQueriesProc) (GrGLsizei, GrGLuint*);
+    typedef void (GR_GL_FUNCTION_TYPE* GLDeleteQueriesProc) (GrGLsizei, const GrGLuint*);
+    typedef void (GR_GL_FUNCTION_TYPE* GLBeginQueryProc) (GrGLenum, GrGLuint);
+    typedef void (GR_GL_FUNCTION_TYPE* GLEndQueryProc) (GrGLenum);
+    typedef void (GR_GL_FUNCTION_TYPE* GLGetQueryObjectuivProc) (GrGLuint, GrGLenum, GrGLuint*);
+    typedef void (GR_GL_FUNCTION_TYPE* GLGetQueryObjectui64vProc) (GrGLuint, GrGLenum, GrGLuint64*);
+
+    GLGetIntegervProc           fGLGetIntegerv;
+    GLGenQueriesProc            fGLGenQueries;
+    GLDeleteQueriesProc         fGLDeleteQueries;
+    GLBeginQueryProc            fGLBeginQuery;
+    GLEndQueryProc              fGLEndQuery;
+    GLGetQueryObjectuivProc     fGLGetQueryObjectuiv;
+    GLGetQueryObjectui64vProc   fGLGetQueryObjectui64v;
+
+
+    typedef sk_gpu_test::GpuTimer INHERITED;
+};
+
+GLGpuTimer* GLGpuTimer::CreateIfSupported(const sk_gpu_test::GLTestContext* ctx) {
+    SkAutoTDelete<GLGpuTimer> ret;
+    const GrGLInterface* gl = ctx->gl();
+    if (gl->fExtensions.has("GL_EXT_disjoint_timer_query")) {
+        ret.reset(new GLGpuTimer(true, ctx, "EXT"));
+    } else if (kGL_GrGLStandard == gl->fStandard &&
+               (GrGLGetVersion(gl) > GR_GL_VER(3,3) || gl->fExtensions.has("GL_ARB_timer_query"))) {
+        ret.reset(new GLGpuTimer(false, ctx));
+    } else if (gl->fExtensions.has("GL_EXT_timer_query")) {
+        ret.reset(new GLGpuTimer(false, ctx, "EXT"));
+    }
+    return ret && ret->validate() ? ret.release() : nullptr;
+}
+
+GLGpuTimer::GLGpuTimer(bool disjointSupport, const sk_gpu_test::GLTestContext* ctx, const char* ext)
+    : INHERITED(disjointSupport) {
+    ctx->getGLProcAddress(&fGLGetIntegerv, "glGetIntegerv");
+    ctx->getGLProcAddress(&fGLGenQueries, "glGenQueries", ext);
+    ctx->getGLProcAddress(&fGLDeleteQueries, "glDeleteQueries", ext);
+    ctx->getGLProcAddress(&fGLBeginQuery, "glBeginQuery", ext);
+    ctx->getGLProcAddress(&fGLEndQuery, "glEndQuery", ext);
+    ctx->getGLProcAddress(&fGLGetQueryObjectuiv, "glGetQueryObjectuiv", ext);
+    ctx->getGLProcAddress(&fGLGetQueryObjectui64v, "glGetQueryObjectui64v", ext);
+}
+
+bool GLGpuTimer::validate() const {
+    return fGLGetIntegerv && fGLGenQueries && fGLDeleteQueries && fGLBeginQuery && fGLEndQuery &&
+           fGLGetQueryObjectuiv && fGLGetQueryObjectui64v;
+}
+
+sk_gpu_test::PlatformTimerQuery GLGpuTimer::onQueueTimerStart() const {
+    GrGLuint queryID;
+    fGLGenQueries(1, &queryID);
+    if (!queryID) {
+        return sk_gpu_test::kInvalidTimerQuery;
+    }
+    if (this->disjointSupport()) {
+        // Clear the disjoint flag.
+        GrGLint disjoint;
+        fGLGetIntegerv(GL_GPU_DISJOINT, &disjoint);
+    }
+    fGLBeginQuery(GL_TIME_ELAPSED, queryID);
+    return static_cast<sk_gpu_test::PlatformTimerQuery>(queryID);
+}
+
+void GLGpuTimer::onQueueTimerStop(sk_gpu_test::PlatformTimerQuery platformTimer) const {
+    if (sk_gpu_test::kInvalidTimerQuery == platformTimer) {
+        return;
+    }
+    fGLEndQuery(GL_TIME_ELAPSED);
+}
+
+sk_gpu_test::GpuTimer::QueryStatus
+GLGpuTimer::checkQueryStatus(sk_gpu_test::PlatformTimerQuery platformTimer) {
+    const GrGLuint queryID = static_cast<GrGLuint>(platformTimer);
+    if (!queryID) {
+        return QueryStatus::kInvalid;
+    }
+    GrGLuint available = 0;
+    fGLGetQueryObjectuiv(queryID, GL_QUERY_RESULT_AVAILABLE, &available);
+    if (!available) {
+        return QueryStatus::kPending;
+    }
+    if (this->disjointSupport()) {
+        GrGLint disjoint = 1;
+        fGLGetIntegerv(GL_GPU_DISJOINT, &disjoint);
+        if (disjoint) {
+            return QueryStatus::kDisjoint;
+        }
+    }
+    return QueryStatus::kAccurate;
+}
+
+std::chrono::nanoseconds GLGpuTimer::getTimeElapsed(sk_gpu_test::PlatformTimerQuery platformTimer) {
+    SkASSERT(this->checkQueryStatus(platformTimer) >= QueryStatus::kDisjoint);
+    const GrGLuint queryID = static_cast<GrGLuint>(platformTimer);
+    GrGLuint64 nanoseconds;
+    fGLGetQueryObjectui64v(queryID, GL_QUERY_RESULT, &nanoseconds);
+    return std::chrono::nanoseconds(nanoseconds);
+}
+
+void GLGpuTimer::deleteQuery(sk_gpu_test::PlatformTimerQuery platformTimer) {
+    const GrGLuint queryID = static_cast<GrGLuint>(platformTimer);
+    fGLDeleteQueries(1, &queryID);
+}
+
+GR_STATIC_ASSERT(sizeof(GrGLuint) <= sizeof(sk_gpu_test::PlatformTimerQuery));
+
 }  // anonymous namespace
 
 namespace sk_gpu_test {
@@ -92,6 +223,7 @@
     SkASSERT(!fGL.get());
     fGL.reset(gl);
     fFenceSync = fenceSync ? fenceSync : GLFenceSync::CreateIfSupported(this);
+    fGpuTimer = GLGpuTimer::CreateIfSupported(this);
 }
 
 void GLTestContext::teardown() {
diff --git a/tools/skpbench/_benchresult.py b/tools/skpbench/_benchresult.py
index 94c1105..666878b 100644
--- a/tools/skpbench/_benchresult.py
+++ b/tools/skpbench/_benchresult.py
@@ -25,6 +25,8 @@
                        '(?P<samples>\d+)'
                        '(?P<sample_ms_pad> +)'
                        '(?P<sample_ms>\d+)'
+                       '(?P<clock_pad> +)'
+                       '(?P<clock>[cg]pu)'
                        '(?P<metric_pad> +)'
                        '(?P<metric>ms|fps)'
                        '(?P<config_pad> +)'
@@ -45,6 +47,7 @@
     self.stddev = float(match.group('stddev')[:-1]) # Drop '%' sign.
     self.samples = int(match.group('samples'))
     self.sample_ms = int(match.group('sample_ms'))
+    self.clock = match.group('clock')
     self.metric = match.group('metric')
     self.config = match.group('config')
     self.bench = match.group('bench')
@@ -59,7 +62,7 @@
     else:
       values = list()
       for name in ['accum', 'median', 'max', 'min', 'stddev',
-                   'samples', 'sample_ms', 'metric', 'config']:
+                   'samples', 'sample_ms', 'clock', 'metric', 'config']:
         values.append(self.get_string(name + '_pad'))
         values.append(self.get_string(name))
       values.append(config_suffix)
diff --git a/tools/skpbench/parseskpbench.py b/tools/skpbench/parseskpbench.py
index 5fe146e..800c1ca 100755
--- a/tools/skpbench/parseskpbench.py
+++ b/tools/skpbench/parseskpbench.py
@@ -8,8 +8,8 @@
 from __future__ import print_function
 from _benchresult import BenchResult
 from argparse import ArgumentParser
+from collections import defaultdict, namedtuple
 from datetime import datetime
-import collections
 import operator
 import os
 import sys
@@ -27,7 +27,7 @@
 (1) Install the "Office Editing for Docs, Sheets & Slides" Chrome extension:
     https://chrome.google.com/webstore/detail/office-editing-for-docs-s/gbkeegbaiigmenfmjfclcdgdpimamgkj
 
-(2) Designate Chrome os-wide as the default application for opening .csv files.
+(2) Update your global OS file associations to use Chrome for .csv files.
 
 (3) Run parseskpbench.py with the --open flag.
 
@@ -49,75 +49,92 @@
 
 FLAGS = __argparse.parse_args()
 
+RESULT_QUALIFIERS = ('sample_ms', 'clock', 'metric')
+
+class FullConfig(namedtuple('fullconfig', ('config',) + RESULT_QUALIFIERS)):
+  def qualified_name(self, qualifiers=RESULT_QUALIFIERS):
+    return get_qualified_name(self.config.replace(',', ' '),
+                              {x:getattr(self, x) for x in qualifiers})
+
+def get_qualified_name(name, qualifiers):
+  if not qualifiers:
+    return name
+  else:
+    args = ('%s=%s' % (k,v) for k,v in qualifiers.iteritems())
+    return '%s (%s)' % (name, ' '.join(args))
 
 class Parser:
   def __init__(self):
-    self.configs = list() # use list to preserve the order configs appear in.
-    self.rows = collections.defaultdict(dict)
-    self.cols = collections.defaultdict(dict)
-    self.metric = None
-    self.sample_ms = None
+    self.sheet_qualifiers = {x:None for x in RESULT_QUALIFIERS}
+    self.config_qualifiers = set()
+    self.fullconfigs = list() # use list to preserve the order.
+    self.rows = defaultdict(dict)
+    self.cols = defaultdict(dict)
 
   def parse_file(self, infile):
     for line in infile:
       match = BenchResult.match(line)
       if not match:
         continue
-      if self.metric is None:
-        self.metric = match.metric
-      elif match.metric != self.metric:
-        raise ValueError("results have mismatched metrics (%s and %s)" %
-                         (self.metric, match.metric))
-      if self.sample_ms is None:
-        self.sample_ms = match.sample_ms
-      elif not FLAGS.force and match.sample_ms != self.sample_ms:
-        raise ValueError("results have mismatched sampling times. "
-                         "(use --force to ignore)")
-      if not match.config in self.configs:
-        self.configs.append(match.config)
-      self.rows[match.bench][match.config] = match.get_string(FLAGS.result)
-      self.cols[match.config][match.bench] = getattr(match, FLAGS.result)
+
+      fullconfig = FullConfig(*(match.get_string(x)
+                                for x in FullConfig._fields))
+      if not fullconfig in self.fullconfigs:
+        self.fullconfigs.append(fullconfig)
+
+      for qualifier, value in self.sheet_qualifiers.items():
+        if value is None:
+          self.sheet_qualifiers[qualifier] = match.get_string(qualifier)
+        elif value != match.get_string(qualifier):
+          del self.sheet_qualifiers[qualifier]
+          self.config_qualifiers.add(qualifier)
+
+      self.rows[match.bench][fullconfig] = match.get_string(FLAGS.result)
+      self.cols[fullconfig][match.bench] = getattr(match, FLAGS.result)
 
   def print_csv(self, outfile=sys.stdout):
-    print('%s_%s' % (FLAGS.result, self.metric), file=outfile)
+    # Write the title.
+    print(get_qualified_name(FLAGS.result, self.sheet_qualifiers), file=outfile)
 
     # Write the header.
     outfile.write('bench,')
-    for config in self.configs:
-      outfile.write('%s,' % config)
+    for fullconfig in self.fullconfigs:
+      outfile.write('%s,' % fullconfig.qualified_name(self.config_qualifiers))
     outfile.write('\n')
 
     # Write the rows.
-    for bench, row in self.rows.items():
+    for bench, row in self.rows.iteritems():
       outfile.write('%s,' % bench)
-      for config in self.configs:
-        if config in row:
-          outfile.write('%s,' % row[config])
+      for fullconfig in self.fullconfigs:
+        if fullconfig in row:
+          outfile.write('%s,' % row[fullconfig])
         elif FLAGS.force:
-          outfile.write(',')
+          outfile.write('NULL,')
         else:
           raise ValueError("%s: missing value for %s. (use --force to ignore)" %
-                           (bench, config))
+                           (bench,
+                            fullconfig.qualified_name(self.config_qualifiers)))
       outfile.write('\n')
 
     # Add simple, literal averages.
     if len(self.rows) > 1:
       outfile.write('\n')
-      self.__print_computed_row('MEAN',
+      self._print_computed_row('MEAN',
         lambda col: reduce(operator.add, col.values()) / len(col),
         outfile=outfile)
-      self.__print_computed_row('GEOMEAN',
+      self._print_computed_row('GEOMEAN',
         lambda col: reduce(operator.mul, col.values()) ** (1.0 / len(col)),
         outfile=outfile)
 
-  def __print_computed_row(self, name, func, outfile=sys.stdout):
+  def _print_computed_row(self, name, func, outfile=sys.stdout):
     outfile.write('%s,' % name)
-    for config in self.configs:
-      assert(len(self.cols[config]) == len(self.rows))
-      outfile.write('%.4g,' % func(self.cols[config]))
+    for fullconfig in self.fullconfigs:
+      if len(self.cols[fullconfig]) != len(self.rows):
+        outfile.write('NULL,')
+        continue
+      outfile.write('%.4g,' % func(self.cols[fullconfig]))
     outfile.write('\n')
 
-
 def main():
   parser = Parser()
 
diff --git a/tools/skpbench/skpbench.cpp b/tools/skpbench/skpbench.cpp
index adb6af0..6d0381a 100644
--- a/tools/skpbench/skpbench.cpp
+++ b/tools/skpbench/skpbench.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "GpuTimer.h"
 #include "GrContextFactory.h"
 #include "SkCanvas.h"
 #include "SkOSFile.h"
@@ -33,12 +34,9 @@
  * Currently, only GPU configs are supported.
  */
 
-using sk_gpu_test::PlatformFence;
-using sk_gpu_test::kInvalidPlatformFence;
-using sk_gpu_test::FenceSync;
-
 DEFINE_int32(duration, 5000, "number of milliseconds to run the benchmark");
 DEFINE_int32(sampleMs, 50, "minimum duration of a sample");
+DEFINE_bool(gpuClock, false, "time on the gpu clock (gpu work only)");
 DEFINE_bool(fps, false, "use fps instead of ms");
 DEFINE_string(skp, "", "path to a single .skp file to benchmark");
 DEFINE_string(png, "", "if set, save a .png proof to disk at this file location");
@@ -46,13 +44,13 @@
 DEFINE_bool(suppressHeader, false, "don't print a header row before the results");
 
 static const char* header =
-    "   accum    median       max       min   stddev  samples  sample_ms  metric  config    bench";
+"   accum    median       max       min   stddev  samples  sample_ms  clock  metric  config    bench";
 
 static const char* resultFormat =
-    "%8.4g  %8.4g  %8.4g  %8.4g  %6.3g%%  %7li  %9i  %-6s  %-9s %s";
+"%8.4g  %8.4g  %8.4g  %8.4g  %6.3g%%  %7li  %9i  %-5s  %-6s  %-9s %s";
 
 struct Sample {
-    using clock = std::chrono::high_resolution_clock;
+    using duration = std::chrono::nanoseconds;
 
     Sample() : fFrames(0), fDuration(0) {}
     double seconds() const { return std::chrono::duration<double>(fDuration).count(); }
@@ -60,13 +58,13 @@
     double value() const { return FLAGS_fps ? fFrames / this->seconds() : this->ms() / fFrames; }
     static const char* metric() { return FLAGS_fps ? "fps" : "ms"; }
 
-    int fFrames;
-    clock::duration fDuration;
+    int        fFrames;
+    duration   fDuration;
 };
 
 class GpuSync {
 public:
-    GpuSync(const FenceSync* fenceSync);
+    GpuSync(const sk_gpu_test::FenceSync* fenceSync);
     ~GpuSync();
 
     void syncToPreviousFrame();
@@ -74,8 +72,8 @@
 private:
     void updateFence();
 
-    const FenceSync* const   fFenceSync;
-    PlatformFence            fFence;
+    const sk_gpu_test::FenceSync* const   fFenceSync;
+    sk_gpu_test::PlatformFence            fFence;
 };
 
 enum class ExitErr {
@@ -92,10 +90,10 @@
 static SkString join(const SkCommandLineFlags::StringArray&);
 static void exitf(ExitErr, const char* format, ...);
 
-static void run_benchmark(const FenceSync* fenceSync, SkCanvas* canvas, const SkPicture* skp,
-                          std::vector<Sample>* samples) {
-    using clock = Sample::clock;
-    const clock::duration sampleDuration = std::chrono::milliseconds(FLAGS_sampleMs);
+static void run_benchmark(const sk_gpu_test::FenceSync* fenceSync, SkCanvas* canvas,
+                          const SkPicture* skp, std::vector<Sample>* samples) {
+    using clock = std::chrono::high_resolution_clock;
+    const Sample::duration sampleDuration = std::chrono::milliseconds(FLAGS_sampleMs);
     const clock::duration benchDuration = std::chrono::milliseconds(FLAGS_duration);
 
     draw_skp_and_flush(canvas, skp);
@@ -123,6 +121,66 @@
     } while (now < endTime || 0 == samples->size() % 2);
 }
 
+static void run_gpu_time_benchmark(sk_gpu_test::GpuTimer* gpuTimer,
+                                   const sk_gpu_test::FenceSync* fenceSync, SkCanvas* canvas,
+                                   const SkPicture* skp, std::vector<Sample>* samples) {
+    using sk_gpu_test::PlatformTimerQuery;
+    using clock = std::chrono::steady_clock;
+    const clock::duration sampleDuration = std::chrono::milliseconds(FLAGS_sampleMs);
+    const clock::duration benchDuration = std::chrono::milliseconds(FLAGS_duration);
+
+    if (!gpuTimer->disjointSupport()) {
+        fprintf(stderr, "WARNING: GPU timer cannot detect disjoint operations; "
+                        "results may be unreliable\n");
+    }
+
+    draw_skp_and_flush(canvas, skp);
+    GpuSync gpuSync(fenceSync);
+
+    gpuTimer->queueStart();
+    draw_skp_and_flush(canvas, skp);
+    PlatformTimerQuery previousTime = gpuTimer->queueStop();
+    gpuSync.syncToPreviousFrame();
+
+    clock::time_point now = clock::now();
+    const clock::time_point endTime = now + benchDuration;
+
+    do {
+        const clock::time_point sampleEndTime = now + sampleDuration;
+        samples->emplace_back();
+        Sample& sample = samples->back();
+
+        do {
+            gpuTimer->queueStart();
+            draw_skp_and_flush(canvas, skp);
+            PlatformTimerQuery time = gpuTimer->queueStop();
+            gpuSync.syncToPreviousFrame();
+
+            switch (gpuTimer->checkQueryStatus(previousTime)) {
+                using QueryStatus = sk_gpu_test::GpuTimer::QueryStatus;
+                case QueryStatus::kInvalid:
+                    exitf(ExitErr::kUnavailable, "GPU timer failed");
+                case QueryStatus::kPending:
+                    exitf(ExitErr::kUnavailable, "timer query still not ready after fence sync");
+                case QueryStatus::kDisjoint:
+                    if (FLAGS_verbosity >= 4) {
+                        fprintf(stderr, "discarding timer query due to disjoint operations.\n");
+                    }
+                    break;
+                case QueryStatus::kAccurate:
+                    sample.fDuration += gpuTimer->getTimeElapsed(previousTime);
+                    ++sample.fFrames;
+                    break;
+            }
+            gpuTimer->deleteQuery(previousTime);
+            previousTime = time;
+            now = clock::now();
+        } while (now < sampleEndTime || 0 == sample.fFrames);
+    } while (now < endTime || 0 == samples->size() % 2);
+
+    gpuTimer->deleteQuery(previousTime);
+}
+
 void print_result(const std::vector<Sample>& samples, const char* config, const char* bench)  {
     if (0 == (samples.size() % 2)) {
         exitf(ExitErr::kSoftware, "attempted to gather stats on even number of samples");
@@ -149,7 +207,8 @@
     const double stddev = 100/*%*/ * sqrt(variance) / accumValue;
 
     printf(resultFormat, accumValue, values[values.size() / 2], values.back(), values.front(),
-           stddev, values.size(), FLAGS_sampleMs, Sample::metric(), config, bench);
+           stddev, values.size(), FLAGS_sampleMs, FLAGS_gpuClock ? "gpu" : "cpu", Sample::metric(),
+           config, bench);
     printf("\n");
     fflush(stdout);
 }
@@ -247,7 +306,15 @@
     // Run the benchmark.
     SkCanvas* canvas = surface->getCanvas();
     canvas->translate(-skp->cullRect().x(), -skp->cullRect().y());
-    run_benchmark(testCtx->fenceSync(), canvas, skp.get(), &samples);
+    if (!FLAGS_gpuClock) {
+        run_benchmark(testCtx->fenceSync(), canvas, skp.get(), &samples);
+    } else {
+        if (!testCtx->gpuTimingSupport()) {
+            exitf(ExitErr::kUnavailable, "GPU does not support timing");
+        }
+        run_gpu_time_benchmark(testCtx->gpuTimer(), testCtx->fenceSync(), canvas, skp.get(),
+                               &samples);
+    }
     print_result(samples, config->getTag().c_str(), SkOSPath::Basename(skpfile).c_str());
 
     // Save a proof (if one was requested).
@@ -300,7 +367,7 @@
     exit((int)err);
 }
 
-GpuSync::GpuSync(const FenceSync* fenceSync)
+GpuSync::GpuSync(const sk_gpu_test::FenceSync* fenceSync)
     : fFenceSync(fenceSync) {
     this->updateFence();
 }
@@ -310,7 +377,7 @@
 }
 
 void GpuSync::syncToPreviousFrame() {
-    if (kInvalidPlatformFence == fFence) {
+    if (sk_gpu_test::kInvalidFence == fFence) {
         exitf(ExitErr::kSoftware, "attempted to sync with invalid fence");
     }
     if (!fFenceSync->waitFence(fFence)) {
@@ -322,7 +389,7 @@
 
 void GpuSync::updateFence() {
     fFence = fFenceSync->insertFence();
-    if (kInvalidPlatformFence == fFence) {
+    if (sk_gpu_test::kInvalidFence == fFence) {
         exitf(ExitErr::kUnavailable, "failed to insert fence");
     }
 }
diff --git a/tools/skpbench/skpbench.py b/tools/skpbench/skpbench.py
index 83aaf84..6bf3975 100755
--- a/tools/skpbench/skpbench.py
+++ b/tools/skpbench/skpbench.py
@@ -32,7 +32,8 @@
 __argparse.add_argument('--adb',
     action='store_true', help="execute skpbench over adb")
 __argparse.add_argument('-s', '--device-serial',
-    help="if using adb, id of the specific device to target")
+    help="if using adb, ID of the specific device to target "
+         "(only required if more than 1 device is attached)")
 __argparse.add_argument('-p', '--path',
     help="directory to execute ./skpbench from")
 __argparse.add_argument('-m', '--max-stddev',
@@ -47,7 +48,10 @@
 __argparse.add_argument('-d', '--duration',
     type=int, help="number of milliseconds to run each benchmark")
 __argparse.add_argument('-l', '--sample-ms',
-    type=int, help="minimum duration of a sample")
+    type=int, help="duration of a sample (minimum)")
+__argparse.add_argument('--gpu',
+    action='store_true',
+    help="perform timing on the gpu clock instead of cpu (gpu work only)")
 __argparse.add_argument('--fps',
     action='store_true', help="use fps instead of ms")
 __argparse.add_argument('-c', '--config',
@@ -93,6 +97,8 @@
     ARGV.extend(['--duration', str(FLAGS.duration)])
   if FLAGS.sample_ms:
     ARGV.extend(['--sampleMs', str(FLAGS.sample_ms)])
+  if FLAGS.gpu:
+    ARGV.extend(['--gpuClock', 'true'])
   if FLAGS.fps:
     ARGV.extend(['--fps', 'true'])
   if FLAGS.path:
@@ -188,7 +194,7 @@
 
   def terminate(self):
     if self._proc:
-      self._proc.kill()
+      self._proc.terminate()
       self._monitor.join()
       self._proc.wait()
       self._proc = None