Reland "Update skpbench to let the gpu thread run alongside the recording threads"

This reverts commit 2403a1001b4b62f16519725568156e25a822e666.

Reason for revert: Fixed bot failure

Original change's description:
> Revert "Update skpbench to let the gpu thread run alongside the recording threads"
>
> This reverts commit ca06ce48f174883ead3782383cd11d4c131ab4ce.
>
> Reason for revert: It is dying on a bot with the error:
> skpbench.py: error: unrecognized arguments: --ddlNumRecordingThreads c:\b\s\w\ir\skp
>
> Original change's description:
> > Update skpbench to let the gpu thread run alongside the recording threads
> >
> > Bug: skia:10176
> > Change-Id: Ibadd52b86abfd2802b9b3f31c86ab573554601db
> > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/289883
> > Commit-Queue: Adlai Holler <adlai@google.com>
> > Reviewed-by: Robert Phillips <robertphillips@google.com>
>
> TBR=robertphillips@google.com,adlai@google.com
>
> Change-Id: Iec316410563f0fe1d06f44bcfa58f6790c547f98
> No-Presubmit: true
> No-Tree-Checks: true
> No-Try: true
> Bug: skia:10176
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/290117
> Reviewed-by: Robert Phillips <robertphillips@google.com>
> Commit-Queue: Robert Phillips <robertphillips@google.com>

TBR=robertphillips@google.com,adlai@google.com


Bug: skia:10176
Change-Id: I1b57c91ed60b093440191274d1acb72d9e0c3b37
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/290443
Reviewed-by: Robert Phillips <robertphillips@google.com>
Commit-Queue: Adlai Holler <adlai@google.com>
diff --git a/infra/bots/recipes/skpbench.expected/Perf-Win10-Clang-Golo-GPU-QuadroP400-x86_64-Release-All-Vulkan_Skpbench_DDLTotal_9x9.json b/infra/bots/recipes/skpbench.expected/Perf-Win10-Clang-Golo-GPU-QuadroP400-x86_64-Release-All-Vulkan_Skpbench_DDLTotal_9x9.json
index c7aa4e7..9c30754 100644
--- a/infra/bots/recipes/skpbench.expected/Perf-Win10-Clang-Golo-GPU-QuadroP400-x86_64-Release-All-Vulkan_Skpbench_DDLTotal_9x9.json
+++ b/infra/bots/recipes/skpbench.expected/Perf-Win10-Clang-Golo-GPU-QuadroP400-x86_64-Release-All-Vulkan_Skpbench_DDLTotal_9x9.json
@@ -79,7 +79,7 @@
       "--ddl",
       "--gpuThreads",
       "0",
-      "--ddlNumAdditionalThreads",
+      "--ddlNumRecordingThreads",
       "9",
       "--ddlTilingWidthHeight",
       "3",
diff --git a/infra/bots/recipes/skpbench.py b/infra/bots/recipes/skpbench.py
index 80681ae..7a79e0f 100644
--- a/infra/bots/recipes/skpbench.py
+++ b/infra/bots/recipes/skpbench.py
@@ -67,7 +67,7 @@
     skpbench_args += ['--gpuThreads', '0']
   if '9x9' in api.vars.builder_name:
     skpbench_args += [
-        '--ddlNumAdditionalThreads', 9,
+        '--ddlNumRecordingThreads', 9,
         '--ddlTilingWidthHeight', 3]
   if 'Android' in api.vars.builder_name:
     skpbench_args += [
diff --git a/tools/skpbench/skpbench.cpp b/tools/skpbench/skpbench.cpp
index db48940..05ad72e 100644
--- a/tools/skpbench/skpbench.cpp
+++ b/tools/skpbench/skpbench.cpp
@@ -60,8 +60,7 @@
  */
 
 static DEFINE_bool(ddl, false, "record the skp into DDLs before rendering");
-static DEFINE_int(ddlNumAdditionalThreads, 0,
-                    "number of DDL recording threads in addition to main one");
+static DEFINE_int(ddlNumRecordingThreads, 0, "number of DDL recording threads (0=num_cores)");
 static DEFINE_int(ddlTilingWidthHeight, 0, "number of tiles along one edge when in DDL mode");
 
 static DEFINE_bool(comparableDDL, false, "render in a way that is comparable to 'comparableSKP'");
@@ -201,7 +200,8 @@
     std::vector<SkDocumentPage> fFrames;
 };
 
-static void ddl_sample(GrContext* context, DDLTileHelper* tiles, GpuSync& gpuSync, Sample* sample,
+static void ddl_sample(GrContext* context, DDLTileHelper* tiles, GpuSync& gpuSync,
+                       Sample* sample, SkTaskGroup* recordingTaskGroup, SkTaskGroup* gpuTaskGroup,
                        std::chrono::high_resolution_clock::time_point* startStopTime) {
     using clock = std::chrono::high_resolution_clock;
 
@@ -220,12 +220,18 @@
         // through a DDL.
         tiles->drawAllTilesDirectly(context);
     } else {
-        // TODO: Here we create all the DDLs, wait, and then draw them all. This should be updated
-        // to use the GPUDDLSink method of having a separate GPU thread.
-        tiles->createDDLsInParallel();
-        tiles->precompileAndDrawAllTiles(context);
+        tiles->kickOffThreadedWork(recordingTaskGroup, gpuTaskGroup, context);
+        recordingTaskGroup->wait();
     }
-    flush_with_sync(context, gpuSync);
+
+    if (gpuTaskGroup) {
+        gpuTaskGroup->add([&]{
+            flush_with_sync(context, gpuSync);
+        });
+        gpuTaskGroup->wait();
+    } else {
+        flush_with_sync(context, gpuSync);
+    }
 
     *startStopTime = clock::now();
 
@@ -235,8 +241,9 @@
     }
 }
 
-static void run_ddl_benchmark(GrContext* context, sk_sp<SkSurface> dstSurface,
-                              SkPicture* inputPicture, std::vector<Sample>* samples) {
+static void run_ddl_benchmark(sk_gpu_test::TestContext* testContext, GrContext *context,
+                              sk_sp<SkSurface> dstSurface, SkPicture* inputPicture,
+                              std::vector<Sample>* samples) {
     using clock = std::chrono::high_resolution_clock;
     const Sample::duration sampleDuration = std::chrono::milliseconds(FLAGS_sampleMs);
     const clock::duration benchDuration = std::chrono::milliseconds(FLAGS_duration);
@@ -262,12 +269,26 @@
 
     tiles.createSKPPerTile(compressedPictureData.get(), promiseImageHelper);
 
-    SkTaskGroup::Enabler enabled(FLAGS_ddlNumAdditionalThreads);
+    // In comparable modes, there is no GPU thread. The following pointers are all null.
+    // Otherwise, we transfer testContext onto the GPU thread until after the bench.
+    std::unique_ptr<SkExecutor> gpuThread;
+    std::unique_ptr<SkTaskGroup> gpuTaskGroup;
+    std::unique_ptr<SkExecutor> recordingThreadPool;
+    std::unique_ptr<SkTaskGroup> recordingTaskGroup;
+    if (!FLAGS_comparableDDL && !FLAGS_comparableSKP) {
+        gpuThread = SkExecutor::MakeFIFOThreadPool(1, false);
+        gpuTaskGroup = std::make_unique<SkTaskGroup>(*gpuThread);
+        recordingThreadPool = SkExecutor::MakeFIFOThreadPool(FLAGS_ddlNumRecordingThreads, false);
+        recordingTaskGroup = std::make_unique<SkTaskGroup>(*recordingThreadPool);
+        testContext->makeNotCurrent();
+        gpuTaskGroup->add([=]{ testContext->makeCurrent(); });
+    }
 
     clock::time_point startStopTime = clock::now();
 
     GpuSync gpuSync;
-    ddl_sample(context, &tiles, gpuSync, nullptr, &startStopTime);
+    ddl_sample(context, &tiles, gpuSync, nullptr, recordingTaskGroup.get(),
+               gpuTaskGroup.get(), &startStopTime);
 
     clock::duration cumulativeDuration = std::chrono::milliseconds(0);
 
@@ -277,12 +298,22 @@
 
         do {
             tiles.resetAllTiles();
-            ddl_sample(context, &tiles, gpuSync, &sample, &startStopTime);
+            ddl_sample(context, &tiles, gpuSync, &sample, recordingTaskGroup.get(),
+                       gpuTaskGroup.get(), &startStopTime);
         } while (sample.fDuration < sampleDuration);
 
         cumulativeDuration += sample.fDuration;
     } while (cumulativeDuration < benchDuration || 0 == samples->size() % 2);
 
+    // Move the context back to this thread now that we're done benching.
+    if (gpuTaskGroup) {
+        gpuTaskGroup->add([=]{
+            testContext->makeNotCurrent();
+        });
+        gpuTaskGroup->wait();
+        testContext->makeCurrent();
+    }
+
     if (!FLAGS_png.isEmpty()) {
         // The user wants to see the final result
         dstSurface->draw(tiles.composeDDL());
@@ -580,7 +611,7 @@
     }
     if (!FLAGS_gpuClock) {
         if (FLAGS_ddl) {
-            run_ddl_benchmark(ctx, surface, skp.get(), &samples);
+            run_ddl_benchmark(testCtx, ctx, surface, skp.get(), &samples);
         } else if (!mskp) {
             auto s = std::make_unique<StaticSkp>(skp);
             run_benchmark(ctx, surface.get(), s.get(), &samples);
diff --git a/tools/skpbench/skpbench.py b/tools/skpbench/skpbench.py
index 8c53e9a..8c14818 100755
--- a/tools/skpbench/skpbench.py
+++ b/tools/skpbench/skpbench.py
@@ -73,9 +73,9 @@
   help="optional file to append results into")
 __argparse.add_argument('--ddl',
   action='store_true', help="record the skp into DDLs before rendering")
-__argparse.add_argument('--ddlNumAdditionalThreads',
+__argparse.add_argument('--ddlNumRecordingThreads',
   type=int, default=0,
-  help="number of DDL recording threads in addition to main one")
+  help="number of DDL recording threads (0=num_cores)")
 __argparse.add_argument('--ddlTilingWidthHeight',
   type=int, default=0, help="number of tiles along one edge when in DDL mode")
 __argparse.add_argument('--gpuThreads',
@@ -144,9 +144,9 @@
   # DDL parameters
   if FLAGS.ddl:
     ARGV.extend(['--ddl', 'true'])
-  if FLAGS.ddlNumAdditionalThreads:
-    ARGV.extend(['--ddlNumAdditionalThreads',
-                 str(FLAGS.ddlNumAdditionalThreads)])
+  if FLAGS.ddlNumRecordingThreads:
+    ARGV.extend(['--ddlNumRecordingThreads',
+                 str(FLAGS.ddlNumRecordingThreads)])
   if FLAGS.ddlTilingWidthHeight:
     ARGV.extend(['--ddlTilingWidthHeight', str(FLAGS.ddlTilingWidthHeight)])