Add MultiPictureDraw to nanobench

I would like some guard against performance regressions on our side before turning layer hoisting on in Chromium.

TBR=bsalomon@google.com

Committed: https://skia.googlesource.com/skia/+/0ddad31012dabfc1267effc8071d37f7d606efbe

Review URL: https://codereview.chromium.org/731973005
diff --git a/bench/Benchmark.cpp b/bench/Benchmark.cpp
index 88e385d..6040f91 100644
--- a/bench/Benchmark.cpp
+++ b/bench/Benchmark.cpp
@@ -37,6 +37,14 @@
     this->onPreDraw();
 }
 
+void Benchmark::perCanvasPreDraw(SkCanvas* canvas) {
+    this->onPerCanvasPreDraw(canvas);  // delegate to the virtual onPerCanvasPreDraw() hook
+}
+
+void Benchmark::perCanvasPostDraw(SkCanvas* canvas) {
+    this->onPerCanvasPostDraw(canvas);  // delegate to the virtual onPerCanvasPostDraw() hook
+}
+
 void Benchmark::draw(const int loops, SkCanvas* canvas) {
     SkAutoCanvasRestore ar(canvas, true/*save now*/);
     this->onDraw(loops, canvas);
diff --git a/bench/Benchmark.h b/bench/Benchmark.h
index 508d1da..c30167e 100644
--- a/bench/Benchmark.h
+++ b/bench/Benchmark.h
@@ -70,6 +70,11 @@
     // before the initial draw.
     void preDraw();
 
+    // Called once before and once after a series of draw calls to a single canvas.
+    // The setup/teardown work done in these calls is not timed.
+    void perCanvasPreDraw(SkCanvas*);
+    void perCanvasPostDraw(SkCanvas*);
+
     // Bench framework can tune loops to be large enough for stable timing.
     void draw(const int loops, SkCanvas*);
 
@@ -101,6 +106,8 @@
     virtual const char* onGetName() = 0;
     virtual const char* onGetUniqueName() { return this->onGetName(); }
     virtual void onPreDraw() {}
+    virtual void onPerCanvasPreDraw(SkCanvas*) {}
+    virtual void onPerCanvasPostDraw(SkCanvas*) {}
     // Each bench should do its main work in a loop like this:
     //   for (int i = 0; i < loops; i++) { <work here> }
     virtual void onDraw(const int loops, SkCanvas*) = 0;
diff --git a/bench/SKPBench.cpp b/bench/SKPBench.cpp
index 5844d8a..fd8df42 100644
--- a/bench/SKPBench.cpp
+++ b/bench/SKPBench.cpp
@@ -7,15 +7,28 @@
 
 #include "SKPBench.h"
 #include "SkCommandLineFlags.h"
+#include "SkMultiPictureDraw.h"
+#include "SkSurface.h"
 
 DEFINE_int32(benchTile, 256, "Tile dimension used for SKP playback.");
 
-SKPBench::SKPBench(const char* name, const SkPicture* pic, const SkIRect& clip, SkScalar scale)
+SKPBench::SKPBench(const char* name, const SkPicture* pic, const SkIRect& clip, SkScalar scale,
+                   bool useMultiPictureDraw)
     : fPic(SkRef(pic))
     , fClip(clip)
     , fScale(scale)
-    , fName(name) {
+    , fName(name)
+    , fUseMultiPictureDraw(useMultiPictureDraw) {
     fUniqueName.printf("%s_%.2g", name, scale);  // Scale makes this unqiue for skiaperf.com traces.
+    if (useMultiPictureDraw) {
+        fUniqueName.append("_mpd");  // gives MultiPictureDraw runs a distinct unique name
+    }
+}
+
+SKPBench::~SKPBench() {
+    for (int i = 0; i < fSurfaces.count(); ++i) {
+        fSurfaces[i]->unref();  // drop the ref held on each tile surface still in the array
+    }
 }
 
 const char* SKPBench::onGetName() {
@@ -26,6 +39,49 @@
     return fUniqueName.c_str();
 }
 
+void SKPBench::onPerCanvasPreDraw(SkCanvas* canvas) {
+    if (!fUseMultiPictureDraw) {
+        return;  // tile surfaces are only built for the MultiPictureDraw path
+    }
+
+    SkIRect bounds;
+    SkAssertResult(canvas->getClipDeviceBounds(&bounds));
+
+    int xTiles = SkScalarCeilToInt(bounds.width()  / SkIntToScalar(FLAGS_benchTile));
+    int yTiles = SkScalarCeilToInt(bounds.height() / SkIntToScalar(FLAGS_benchTile));
+
+    fSurfaces.setReserve(xTiles * yTiles);   // room for one surface ...
+    fTileRects.setReserve(xTiles * yTiles);  // ... and one device-space rect per tile
+
+    SkImageInfo ii = canvas->imageInfo().makeWH(FLAGS_benchTile, FLAGS_benchTile);
+
+    for (int y = bounds.fTop; y < bounds.fBottom; y += FLAGS_benchTile) {
+        for (int x = bounds.fLeft; x < bounds.fRight; x += FLAGS_benchTile) {
+            *fTileRects.append() = SkIRect::MakeXYWH(x, y, FLAGS_benchTile, FLAGS_benchTile);
+            *fSurfaces.push() = canvas->newSurface(ii);  // NOTE(review): used below unchecked
+            fSurfaces.top()->getCanvas()->setMatrix(canvas->getTotalMatrix());
+            fSurfaces.top()->getCanvas()->scale(fScale, fScale);  // same scale onDraw would use
+        }
+    }
+}
+
+void SKPBench::onPerCanvasPostDraw(SkCanvas* canvas) {
+    if (!fUseMultiPictureDraw) {
+        return;  // no tile surfaces were created, so there is nothing to composite or free
+    }
+
+    // Draw the last set of tiles into the master canvas in case we're saving the images.
+    for (int i = 0; i < fTileRects.count(); ++i) {
+        SkAutoTUnref<SkImage> image(fSurfaces[i]->newImageSnapshot());  // owns the snapshot's ref
+        canvas->drawImage(image,
+                          SkIntToScalar(fTileRects[i].fLeft), SkIntToScalar(fTileRects[i].fTop));
+        SkSafeSetNull(fSurfaces[i]);
+    }
+
+    fSurfaces.rewind();   // both arrays are refilled by the next onPerCanvasPreDraw()
+    fTileRects.rewind();
+}
+
 bool SKPBench::isSuitableFor(Backend backend) {
     return backend != kNonRendering_Backend;
 }
@@ -35,21 +91,41 @@
 }
 
 void SKPBench::onDraw(const int loops, SkCanvas* canvas) {
-    SkIRect bounds;
-    SkAssertResult(canvas->getClipDeviceBounds(&bounds));
+    if (fUseMultiPictureDraw) {
+        for (int i = 0; i < loops; i++) {
+            SkMultiPictureDraw mpd;  // rebuilt each loop: queue every tile, then draw them all
 
-    SkAutoCanvasRestore overall(canvas, true/*save now*/);
-    canvas->scale(fScale, fScale);
+            for (int j = 0; j < fTileRects.count(); ++j) {
+                SkMatrix trans;  // offset is divided by fScale: tile canvases are pre-scaled
+                trans.setTranslate(-fTileRects[j].fLeft/fScale,
+                                   -fTileRects[j].fTop/fScale);
+                mpd.add(fSurfaces[j]->getCanvas(), fPic, &trans);
+            }
 
-    for (int i = 0; i < loops; i++) {
-        for (int y = bounds.fTop; y < bounds.fBottom; y += FLAGS_benchTile) {
-            for (int x = bounds.fLeft; x < bounds.fRight; x += FLAGS_benchTile) {
-                SkAutoCanvasRestore perTile(canvas, true/*save now*/);
-                canvas->clipRect(SkRect::Make(
-                            SkIRect::MakeXYWH(x, y, FLAGS_benchTile, FLAGS_benchTile)));
-                fPic->playback(canvas);
+            mpd.draw();
+
+            for (int j = 0; j < fTileRects.count(); ++j) {
+                fSurfaces[j]->getCanvas()->flush();
             }
         }
-        canvas->flush();
+    } else {  // non-MPD path: clip the single canvas to each tile and play the picture back
+        SkIRect bounds;
+        SkAssertResult(canvas->getClipDeviceBounds(&bounds));
+
+        SkAutoCanvasRestore overall(canvas, true/*save now*/);
+        canvas->scale(fScale, fScale);
+
+        for (int i = 0; i < loops; i++) {
+            for (int y = bounds.fTop; y < bounds.fBottom; y += FLAGS_benchTile) {
+                for (int x = bounds.fLeft; x < bounds.fRight; x += FLAGS_benchTile) {
+                    SkAutoCanvasRestore perTile(canvas, true/*save now*/);
+                    canvas->clipRect(SkRect::Make(
+                                SkIRect::MakeXYWH(x, y, FLAGS_benchTile, FLAGS_benchTile)));
+                    fPic->playback(canvas);
+                }
+            }
+
+            canvas->flush();
+        }
     }
 }
diff --git a/bench/SKPBench.h b/bench/SKPBench.h
index 0b0e008..36d7dfd 100644
--- a/bench/SKPBench.h
+++ b/bench/SKPBench.h
@@ -17,14 +17,18 @@
  */
 class SKPBench : public Benchmark {
 public:
-    SKPBench(const char* name, const SkPicture*, const SkIRect& devClip, SkScalar scale);
+    SKPBench(const char* name, const SkPicture*, const SkIRect& devClip, SkScalar scale,
+             bool useMultiPictureDraw);
+    ~SKPBench() SK_OVERRIDE;
 
 protected:
-    virtual const char* onGetName() SK_OVERRIDE;
-    virtual const char* onGetUniqueName() SK_OVERRIDE;
-    virtual bool isSuitableFor(Backend backend) SK_OVERRIDE;
-    virtual void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE;
-    virtual SkIPoint onGetSize() SK_OVERRIDE;
+    const char* onGetName() SK_OVERRIDE;
+    const char* onGetUniqueName() SK_OVERRIDE;
+    void onPerCanvasPreDraw(SkCanvas*) SK_OVERRIDE;
+    void onPerCanvasPostDraw(SkCanvas*) SK_OVERRIDE;
+    bool isSuitableFor(Backend backend) SK_OVERRIDE;
+    void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE;
+    SkIPoint onGetSize() SK_OVERRIDE;
 
 private:
     SkAutoTUnref<const SkPicture> fPic;
@@ -33,6 +37,10 @@
     SkString fName;
     SkString fUniqueName;
 
+    const bool fUseMultiPictureDraw;
+    SkTDArray<SkSurface*> fSurfaces;   // for MultiPictureDraw
+    SkTDArray<SkIRect> fTileRects;     // for MultiPictureDraw
+
     typedef Benchmark INHERITED;
 };
 
diff --git a/bench/nanobench.cpp b/bench/nanobench.cpp
index 1f5ae60..b7603bf 100644
--- a/bench/nanobench.cpp
+++ b/bench/nanobench.cpp
@@ -26,6 +26,7 @@
 #include "SkPictureRecorder.h"
 #include "SkString.h"
 #include "SkSurface.h"
+#include "SkTaskGroup.h"
 
 #if SK_SUPPORT_GPU
     #include "gl/GrGLDefines.h"
@@ -70,6 +71,7 @@
 DEFINE_string(clip, "0,0,1000,1000", "Clip for SKPs.");
 DEFINE_string(scales, "1.0", "Space-separated scales for SKPs.");
 DEFINE_bool(bbh, true, "Build a BBH for SKPs?");
+DEFINE_bool(mpd, true, "Use MultiPictureDraw for the SKPs?");
 DEFINE_int32(flushEvery, 10, "Flush --outResultsFile every Nth run.");
 
 static SkString humanize(double ms) {
@@ -215,14 +217,16 @@
 }
 
 #if SK_SUPPORT_GPU
+static void setup_gl(SkGLContext* gl) {
+    gl->makeCurrent();  // done first so the Finish() below is issued against this context
+    // Make sure we're done with whatever came before.
+    SK_GL(*gl, Finish());
+}
+
 static int gpu_bench(SkGLContext* gl,
                      Benchmark* bench,
                      SkCanvas* canvas,
                      double* samples) {
-    gl->makeCurrent();
-    // Make sure we're done with whatever came before.
-    SK_GL(*gl, Finish());
-
     // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
     int loops = FLAGS_loops;
     if (kAutoTuneLoops == loops) {
@@ -437,7 +441,8 @@
                       , fGMs(skiagm::GMRegistry::Head())
                       , fCurrentRecording(0)
                       , fCurrentScale(0)
-                      , fCurrentSKP(0) {
+                      , fCurrentSKP(0)
+                      , fCurrentUseMPD(0) {
         for (int i = 0; i < FLAGS_skps.count(); i++) {
             if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
                 fSKPs.push_back() = FLAGS_skps[i];
@@ -462,6 +467,11 @@
                 exit(1);
             }
         }
+
+        fUseMPDs.push_back() = false;
+        if (FLAGS_mpd) {
+            fUseMPDs.push_back() = true;
+        }
     }
 
     static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
@@ -520,25 +530,33 @@
         // Then once each for each scale as SKPBenches (playback).
         while (fCurrentScale < fScales.count()) {
             while (fCurrentSKP < fSKPs.count()) {
-                const SkString& path = fSKPs[fCurrentSKP++];
+                const SkString& path = fSKPs[fCurrentSKP];
                 SkAutoTUnref<SkPicture> pic;
                 if (!ReadPicture(path.c_str(), &pic)) {
+                    fCurrentSKP++;
                     continue;
                 }
-                if (FLAGS_bbh) {
-                    // The SKP we read off disk doesn't have a BBH.  Re-record so it grows one.
-                    SkRTreeFactory factory;
-                    SkPictureRecorder recorder;
-                    pic->playback(recorder.beginRecording(pic->cullRect().width(),
-                                                          pic->cullRect().height(),
-                                                          &factory));
-                    pic.reset(recorder.endRecording());
+
+                while (fCurrentUseMPD < fUseMPDs.count()) {
+                    if (FLAGS_bbh) {
+                        // The SKP we read off disk doesn't have a BBH.  Re-record so it grows one.
+                        SkRTreeFactory factory;
+                        SkPictureRecorder recorder;
+                        static const int kFlags = SkPictureRecorder::kComputeSaveLayerInfo_RecordFlag;
+                        pic->playback(recorder.beginRecording(pic->cullRect().width(),
+                                                              pic->cullRect().height(),
+                                                              &factory, kFlags));
+                        pic.reset(recorder.endRecording());
+                    }
+                    SkString name = SkOSPath::Basename(path.c_str());
+                    fSourceType = "skp";
+                    fBenchType = "playback";
+                    return SkNEW_ARGS(SKPBench,
+                            (name.c_str(), pic.get(), fClip,
+                             fScales[fCurrentScale], fUseMPDs[fCurrentUseMPD++]));
                 }
-                SkString name = SkOSPath::Basename(path.c_str());
-                fSourceType = "skp";
-                fBenchType  = "playback";
-                return SkNEW_ARGS(SKPBench,
-                        (name.c_str(), pic.get(), fClip, fScales[fCurrentScale]));
+                fCurrentUseMPD = 0;
+                fCurrentSKP++;
             }
             fCurrentSKP = 0;
             fCurrentScale++;
@@ -555,6 +573,10 @@
                     SkStringPrintf("%d %d %d %d", fClip.fLeft, fClip.fTop,
                                                   fClip.fRight, fClip.fBottom).c_str());
             log->configOption("scale", SkStringPrintf("%.2g", fScales[fCurrentScale]).c_str());
+            if (fCurrentUseMPD > 0) {
+                SkASSERT(1 == fCurrentUseMPD || 2 == fCurrentUseMPD);
+                log->configOption("multi_picture_draw", fUseMPDs[fCurrentUseMPD-1] ? "true" : "false");
+            }
         }
     }
 
@@ -564,18 +586,22 @@
     SkIRect            fClip;
     SkTArray<SkScalar> fScales;
     SkTArray<SkString> fSKPs;
+    SkTArray<bool>     fUseMPDs;
 
     const char* fSourceType;  // What we're benching: bench, GM, SKP, ...
     const char* fBenchType;   // How we bench it: micro, recording, playback, ...
     int fCurrentRecording;
     int fCurrentScale;
     int fCurrentSKP;
+    int fCurrentUseMPD;
 };
 
 int nanobench_main();
 int nanobench_main() {
     SetupCrashHandler();
     SkAutoGraphics ag;
+    // Multithreading is disabled pending resolution of skia:3149
+    //SkTaskGroup::Enabler enabled;
 
 #if SK_SUPPORT_GPU
     GrContext::Options grContextOpts;
@@ -659,6 +685,14 @@
             SkCanvas* canvas = targets[j]->surface.get() ? targets[j]->surface->getCanvas() : NULL;
             const char* config = targets[j]->config.name;
 
+#if SK_SUPPORT_GPU
+            if (Benchmark::kGPU_Backend == targets[j]->config.backend) {
+                setup_gl(targets[j]->gl);
+            }
+#endif
+
+            bench->perCanvasPreDraw(canvas);
+
             const int loops =
 #if SK_SUPPORT_GPU
                 Benchmark::kGPU_Backend == targets[j]->config.backend
@@ -667,6 +701,8 @@
 #endif
                  cpu_bench(       overhead, bench.get(), canvas, samples.get());
 
+            bench->perCanvasPostDraw(canvas);
+
             if (canvas && !FLAGS_writePath.isEmpty() && FLAGS_writePath[0]) {
                 SkString pngFilename = SkOSPath::Join(FLAGS_writePath[0], config);
                 pngFilename = SkOSPath::Join(pngFilename.c_str(), bench->getUniqueName());
diff --git a/include/core/SkColorTable.h b/include/core/SkColorTable.h
index 00ef854..cfd5110 100644
--- a/include/core/SkColorTable.h
+++ b/include/core/SkColorTable.h
@@ -65,7 +65,7 @@
     */
     void unlock16BitCache() {
         SkASSERT(f16BitCacheLockCount > 0);
-        SkDEBUGCODE(f16BitCacheLockCount -= 1);
+        SkDEBUGCODE(sk_atomic_dec(&f16BitCacheLockCount);)  // debug-only count, atomic decrement
     }
 
     explicit SkColorTable(SkReadBuffer&);
diff --git a/src/core/SkColorTable.cpp b/src/core/SkColorTable.cpp
index f2eab61..0d033ec 100644
--- a/src/core/SkColorTable.cpp
+++ b/src/core/SkColorTable.cpp
@@ -70,7 +70,7 @@
         build_16bitcache(f16BitCache, fColors, fCount);
     }
 
-    SkDEBUGCODE(f16BitCacheLockCount += 1);
+    SkDEBUGCODE(sk_atomic_inc(&f16BitCacheLockCount));
     return f16BitCache;
 }