Threaded generation of software paths

Re-land of: https://skia-review.googlesource.com/36560

All information needed by the thread is captured by the prepare
callback object. The lambda captures a pointer to that object and does
the mask render. Once it's done, it signals the semaphore (also owned
by the callback). The callback defers the semaphore wait even longer
(into the ASAP upload), so the odds of waiting for the thread are
REALLY low.

Also did a bunch of cleanup along the way, and put in some trace markers
so we can monitor how well this is working.

Traces of a GM that includes GPU and SW path rendering (path-reverse):

Original:
    https://screenshot.googleplex.com/f5BG3901tQg.png
Threaded, with wait in the callback (notice pre flush callback blocking):
    https://screenshot.googleplex.com/htOSZFE2s04.png
Current version, with wait deferred to ASAP upload function:
    https://screenshot.googleplex.com/GHjD0U3C34q.png

Bug: skia:
Change-Id: Idb92f385590749f41328a9aec65b2a93f4775079
Reviewed-on: https://skia-review.googlesource.com/40775
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/dm/DMSrcSink.cpp b/dm/DMSrcSink.cpp
index a18bb50..e14e1f9 100644
--- a/dm/DMSrcSink.cpp
+++ b/dm/DMSrcSink.cpp
@@ -20,6 +20,7 @@
 #include "SkDebugCanvas.h"
 #include "SkDeferredCanvas.h"
 #include "SkDocument.h"
+#include "SkExecutor.h"
 #include "SkImageGenerator.h"
 #include "SkImageGeneratorCG.h"
 #include "SkImageGeneratorWIC.h"
@@ -61,6 +62,7 @@
 DEFINE_bool(multiPage, false, "For document-type backends, render the source"
             " into multiple pages");
 DEFINE_bool(RAW_threading, true, "Allow RAW decodes to run on multiple threads?");
+DECLARE_int32(gpuThreads);
 
 using sk_gpu_test::GrContextFactory;
 
@@ -1315,8 +1317,13 @@
 
 DEFINE_bool(drawOpClip, false, "Clip each GrDrawOp to its device bounds for testing.");
 
-Error GPUSink::draw(const Src& src, SkBitmap* dst, SkWStream*, SkString* log) const {
-    GrContextOptions grOptions = fBaseContextOptions;
+Error GPUSink::draw(const Src& src, SkBitmap* dst, SkWStream* dstStream, SkString* log) const {
+    return this->onDraw(src, dst, dstStream, log, fBaseContextOptions);  // sink's base options
+}
+
+Error GPUSink::onDraw(const Src& src, SkBitmap* dst, SkWStream*, SkString* log,
+                      const GrContextOptions& baseOptions) const {
+    GrContextOptions grOptions = baseOptions;
 
     src.modifyGrContextOptions(&grOptions);
 
@@ -1368,6 +1375,58 @@
 
 /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
 
+GPUThreadTestingSink::GPUThreadTestingSink(GrContextFactory::ContextType ct,
+                                           GrContextFactory::ContextOverrides overrides,
+                                           int samples,
+                                           bool diText,
+                                           SkColorType colorType,
+                                           SkAlphaType alphaType,
+                                           sk_sp<SkColorSpace> colorSpace,
+                                           bool threaded,
+                                           const GrContextOptions& grCtxOptions)
+        : INHERITED(ct, overrides, samples, diText, colorType, alphaType, std::move(colorSpace),
+                    threaded, grCtxOptions)  // every argument is forwarded to the base class
+        , fExecutor(SkExecutor::MakeThreadPool(FLAGS_gpuThreads)) {  // built from gpuThreads flag
+    SkASSERT(fExecutor);  // draw() hands this pool to the context options, so it must exist
+}
+
+Error GPUThreadTestingSink::draw(const Src& src, SkBitmap* dst, SkWStream* wStream,
+                                 SkString* log) const {
+    // Draw twice: once with fExecutor's worker threads and once with no executor, then verify the
+    // two results match byte-for-byte. GPU path renderers are disabled (kNone) for both passes so
+    // that paths are forced through the software path renderer, stress-testing its threaded code.
+    GrContextOptions contextOptions = this->baseContextOptions();
+    contextOptions.fGpuPathRenderers = GrContextOptions::GpuPathRenderers::kNone;
+
+    contextOptions.fExecutor = fExecutor.get();  // pass 1: with the thread pool
+    Error err = this->onDraw(src, dst, wStream, log, contextOptions);
+    if (!err.isEmpty() || !dst) {  // pass 1 failed, or there is no bitmap to compare: stop here
+        return err;
+    }
+
+    SkBitmap reference;
+    SkString refLog;
+    SkDynamicMemoryWStream refStream;  // refLog and refStream are not read after pass 2
+    contextOptions.fExecutor = nullptr;  // pass 2: no executor
+    Error refErr = this->onDraw(src, &reference, &refStream, &refLog, contextOptions);
+    if (!refErr.isEmpty()) {
+        return refErr;
+    }
+
+    // Both passes draw the same Src, and dimensions depend only on the Src, so sizes must agree.
+    SkASSERT(reference.getSize() == dst->getSize());
+    if (reference.getSize() != dst->getSize()) {
+        return "Dimensions don't match reference";
+    }
+    // All SkBitmaps in DM are tight, so a single memcmp over getSize() bytes compares them.
+    if (0 != memcmp(reference.getPixels(), dst->getPixels(), reference.getSize())) {
+        return "Pixels don't match reference";
+    }
+    return "";  // empty Error: threaded and unthreaded output are identical
+}
+
+/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+
 static Error draw_skdocument(const Src& src, SkDocument* doc, SkWStream* dst) {
     if (src.size().isEmpty()) {
         return "Source has empty dimensions";
@@ -1558,7 +1617,7 @@
         if (reference.getSize() != bitmap->getSize()) {
             return "Dimensions don't match reference";
         }
-        // All SkBitmaps in DM are pre-locked and tight, so this comparison is easy.
+        // All SkBitmaps in DM are tight, so this comparison is easy.
         if (0 != memcmp(reference.getPixels(), bitmap->getPixels(), reference.getSize())) {
             return "Pixels don't match reference";
         }