SkThreadPool ~~> SkTaskGroup

SkTaskGroup is like SkThreadPool except the threads stay in one global pool. Each SkTaskGroup itself is tiny (4 bytes) and its wait() method applies only to tasks add()ed to that instance, not the whole thread pool.

This means we don't need to bring up new thread pools when tests themselves want to use multithreading (e.g. pathops, quilt). We just create a new SkTaskGroup and wait for that to complete. This should be more efficient, and allow us to expand where we use threads to really latency-sensitive places. E.g. we can probably now use these in nanobench for CPU .skp rendering.

Now that all threads are sharing the same pool, I think we can remove most of the custom mechanism pathops tests use to control threading. They'll just ride on the global pool with all other tests now.

This (temporarily?) removes the GPU multithreading feature from DM, which we don't use.

On my desktop, DM runs a little faster (57s -> 55s) in Debug, and a lot faster in Release (36s -> 24s). The bots show speedups of similar proportions, cutting more than a minute off the N4/Release and Win7/Debug runtimes.

BUG=skia:
Committed: https://skia.googlesource.com/skia/+/9c7207b5dc71dc5a96a2eb107d401133333d5b6f
R=caryclark@google.com, bsalomon@google.com, bungeman@google.com, mtklein@google.com, reed@google.com
Author: mtklein@chromium.org
Review URL: https://codereview.chromium.org/531653002

commit: 406654be7a930b484159f5bca107d3b11d8a9ede [log] [tgz]
author: mtklein <mtklein@chromium.org> Wed Sep 03 15:34:37 2014 -0700
committer: Commit bot <commit-bot@chromium.org> Wed Sep 03 15:34:37 2014 -0700
tree: 11d0b7ba5e6757e55c735743e649e61696f81571
parent: f8b4563e15d7567020b46d52ea43a9d4433c556a [diff]
diff --git a/dm/DM.cpp b/dm/DM.cpp
index 09f0ce8..b330445 100644
--- a/dm/DM.cpp
+++ b/dm/DM.cpp

@@ -8,6 +8,7 @@
 #include "SkGraphics.h"
 #include "SkPicture.h"
 #include "SkString.h"
+#include "SkTaskGroup.h"
 #include "Test.h"
 #include "gm.h"
 #include "sk_tool_utils.h"
@@ -41,7 +42,6 @@
 static const char kGpuAPINameGL[] = "gl";
 static const char kGpuAPINameGLES[] = "gles";
 
-DEFINE_int32(gpuThreads, 1, "Threads for GPU work.");
 DEFINE_string2(expectations, r, "",
                "If a directory, compare generated images against images under this path. "
                "If a file, compare generated images against JSON expectations at this path."
@@ -192,6 +192,7 @@
 int dm_main() {
     SetupCrashHandler();
     SkAutoGraphics ag;
+    SkTaskGroup::Enabler enabled(FLAGS_threads);
 
     if (FLAGS_dryRun) {
         FLAGS_verbose = true;
@@ -233,7 +234,8 @@
     SkDebugf("%d GMs x %d configs, %d tests, %d pictures\n",
              gms.count(), configs.count(), tests.count(), skps.count());
     DM::Reporter reporter;
-    DM::TaskRunner tasks(FLAGS_threads, FLAGS_gpuThreads);
+
+    DM::TaskRunner tasks;
     kick_off_tests(tests, &reporter, &tasks);
     kick_off_gms(gms, configs, gpuAPI, *expectations, &reporter, &tasks);
     kick_off_skps(skps, &reporter, &tasks);

diff --git a/dm/DMQuiltTask.cpp b/dm/DMQuiltTask.cpp
index 6961f09..a824de0 100644
--- a/dm/DMQuiltTask.cpp
+++ b/dm/DMQuiltTask.cpp

@@ -5,7 +5,7 @@
 #include "SkBBHFactory.h"
 #include "SkCommandLineFlags.h"
 #include "SkPicture.h"
-#include "SkThreadPool.h"
+#include "SkTaskGroup.h"
 
 DEFINE_bool(quilt, true, "If true, draw GM via a picture into a quilt of small tiles and compare.");
 DEFINE_int32(quiltTile, 256, "Dimension of (square) quilt tile.");
@@ -96,11 +96,11 @@
         canvas.flush();
     } else {
         // Draw tiles in parallel into the same bitmap, simulating aggressive impl-side painting.
-        SkThreadPool pool(SkThreadPool::kThreadPerCore);
+        SkTaskGroup tg;
         for (int y = 0; y < tiles_needed(full.height(), FLAGS_quiltTile); y++) {
             for (int x = 0; x < tiles_needed(full.width(), FLAGS_quiltTile); x++) {
                 // Deletes itself when done.
-                pool.add(new Tile(x, y, *recorded, &full));
+                tg.add(new Tile(x, y, *recorded, &full));
             }
         }
     }

diff --git a/dm/DMTask.cpp b/dm/DMTask.cpp
index 05eda4e..5ceb53b 100644
--- a/dm/DMTask.cpp
+++ b/dm/DMTask.cpp

@@ -38,8 +38,8 @@
     fReporter->printStatus(this->name(), SkTime::GetMSecs() - fStart);
 }
 
-void Task::spawnChildNext(CpuTask* task) {
-    fTaskRunner->addNext(task);
+void Task::reallySpawnChild(CpuTask* task) {
+    fTaskRunner->add(task);
 }
 
 CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
@@ -56,32 +56,32 @@
 
 void CpuTask::spawnChild(CpuTask* task) {
     // Run children serially on this (CPU) thread.  This tends to save RAM and is usually no slower.
-    // Calling spawnChildNext() is nearly equivalent, but it'd pointlessly contend on the
-    // threadpool; spawnChildNext() is most useful when you want to change threadpools.
+    // Calling reallySpawnChild() is nearly equivalent, but it'd pointlessly contend on the
+    // threadpool; reallySpawnChild() is most useful when you want to change threadpools.
     task->run();
 }
 
 GpuTask::GpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
 
-void GpuTask::run(GrContextFactory& factory) {
+void GpuTask::run(GrContextFactory* factory) {
     if (FLAGS_gpu && !this->shouldSkip()) {
         this->start();
-        if (!FLAGS_dryRun) this->draw(&factory);
+        if (!FLAGS_dryRun) this->draw(factory);
         this->finish();
         if (FLAGS_abandonGpuContext) {
-            factory.abandonContexts();
+            factory->abandonContexts();
         }
         if (FLAGS_resetGpuContext || FLAGS_abandonGpuContext) {
-            factory.destroyContexts();
+            factory->destroyContexts();
         }
     }
     SkDELETE(this);
 }
 
 void GpuTask::spawnChild(CpuTask* task) {
-    // Really spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
+    // Spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
     // It goes on the front of the queue to minimize the time we must hold reference bitmaps in RAM.
-    this->spawnChildNext(task);
+    this->reallySpawnChild(task);
 }
 
 }  // namespace DM

diff --git a/dm/DMTask.h b/dm/DMTask.h
index 32bb948..3f41b49 100644
--- a/dm/DMTask.h
+++ b/dm/DMTask.h

@@ -1,9 +1,10 @@
 #ifndef DMTask_DEFINED
 #define DMTask_DEFINED
 
-#include "DMReporter.h"
 #include "DMGpuSupport.h"
+#include "DMReporter.h"
 #include "SkRunnable.h"
+#include "SkTaskGroup.h"
 #include "SkTime.h"
 
 // DM will run() these tasks on one of two threadpools.
@@ -36,7 +37,7 @@
     void fail(const char* msg = NULL);
     void finish();
 
-    void spawnChildNext(CpuTask* task);  // For now we don't allow GPU child tasks.
+    void reallySpawnChild(CpuTask* task);  // For now we don't allow GPU child tasks.
 
 private:
     Reporter* fReporter;      // Unowned.
@@ -57,12 +58,12 @@
     void spawnChild(CpuTask* task);
 };
 
-class GpuTask : public Task, public SkTRunnable<GrContextFactory> {
+class GpuTask : public Task {
  public:
     GpuTask(Reporter* reporter, TaskRunner* taskRunner);
     virtual ~GpuTask() {}
 
-    void run(GrContextFactory&) SK_OVERRIDE;
+    void run(GrContextFactory*);
     virtual void draw(GrContextFactory*) = 0;
 
     void spawnChild(CpuTask* task);

diff --git a/dm/DMTaskRunner.cpp b/dm/DMTaskRunner.cpp
index 8a0bc83..92381a7 100644
--- a/dm/DMTaskRunner.cpp
+++ b/dm/DMTaskRunner.cpp

@@ -3,19 +3,15 @@
 
 namespace DM {
 
-TaskRunner::TaskRunner(int cpuThreads, int gpuThreads) : fCpu(cpuThreads), fGpu(gpuThreads) {}
-
-void TaskRunner::add(CpuTask* task) { fCpu.add(task); }
-void TaskRunner::addNext(CpuTask* task) { fCpu.addNext(task); }
-void TaskRunner::add(GpuTask* task) { fGpu.add(task); }
+void TaskRunner::add(CpuTask* task) { fCpuWork.add(task);  }
+void TaskRunner::add(GpuTask* task) { fGpuWork.push(task); }
 
 void TaskRunner::wait() {
-    // These wait calls block until each threadpool is done.  We don't allow
-    // spawning new child GPU tasks, so we can wait for that first knowing
-    // we'll never try to add to it later.  Same can't be said of the CPU pool:
-    // both CPU and GPU tasks can spawn off new CPU work, so we wait for that last.
-    fGpu.wait();
-    fCpu.wait();
+    GrContextFactory factory;
+    for (int i = 0; i < fGpuWork.count(); i++) {
+        fGpuWork[i]->run(&factory);
+    }
+    fCpuWork.wait();
 }
 
 }  // namespace DM

diff --git a/dm/DMTaskRunner.h b/dm/DMTaskRunner.h
index dd1440e..3d4e491 100644
--- a/dm/DMTaskRunner.h
+++ b/dm/DMTaskRunner.h

@@ -2,12 +2,10 @@
 #define DMTaskRunner_DEFINED
 
 #include "DMGpuSupport.h"
-#include "SkThreadPool.h"
+#include "SkTDArray.h"
+#include "SkTaskGroup.h"
 #include "SkTypes.h"
 
-// TaskRunner runs Tasks on one of two threadpools depending on the need for a GrContextFactory.
-// It's typically a good idea to run fewer GPU threads than CPU threads (go nuts with those).
-
 namespace DM {
 
 class CpuTask;
@@ -15,16 +13,15 @@
 
 class TaskRunner : SkNoncopyable {
 public:
-    explicit TaskRunner(int cpuThreads, int gpuThreads);
+    TaskRunner() {}
 
     void add(CpuTask* task);
-    void addNext(CpuTask* task);
     void add(GpuTask* task);
     void wait();
 
 private:
-    SkTThreadPool<void> fCpu;
-    SkTThreadPool<GrContextFactory> fGpu;
+    SkTaskGroup fCpuWork;
+    SkTDArray<GpuTask*> fGpuWork;
 };
 
 }  // namespace DM

diff --git a/dm/DMTestTask.cpp b/dm/DMTestTask.cpp
index 9e7f41e..ad0c3fb 100644
--- a/dm/DMTestTask.cpp
+++ b/dm/DMTestTask.cpp

@@ -3,17 +3,11 @@
 #include "SkCommandLineFlags.h"
 #include "SkCommonFlags.h"
 
-// When PathOps threaded tests get going, they're briefly a big consumer of lots of RAM.
-// We disable the internal threading there by default on 32-bit builds.
-static const bool is32Bit = sizeof(void*) == 4;
-
-DEFINE_bool2(pathOpsExtended,     x, false, "Run extended pathOps tests.");
-DEFINE_bool2(pathOpsSingleThread, z, is32Bit, "Disallow pathOps tests from using threads.");
+DEFINE_bool2(pathOpsExtended, x, false, "Run extended pathOps tests.");
 
 namespace DM {
 
 bool TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
-bool TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
 bool TestReporter::verbose()           const { return FLAGS_veryVerbose; }
 
 static SkString test_name(const char* name) {

diff --git a/dm/DMTestTask.h b/dm/DMTestTask.h
index a65f096..ceb0e12 100644
--- a/dm/DMTestTask.h
+++ b/dm/DMTestTask.h

@@ -19,7 +19,6 @@
 
 private:
   virtual bool allowExtendedTest() const SK_OVERRIDE;
-  virtual bool allowThreaded()     const SK_OVERRIDE;
   virtual bool verbose()           const SK_OVERRIDE;
 
   virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
commit	406654be7a930b484159f5bca107d3b11d8a9ede	[log] [tgz]
author	mtklein <mtklein@chromium.org>	Wed Sep 03 15:34:37 2014 -0700
committer	Commit bot <commit-bot@chromium.org>	Wed Sep 03 15:34:37 2014 -0700
tree	11d0b7ba5e6757e55c735743e649e61696f81571
parent	f8b4563e15d7567020b46d52ea43a9d4433c556a [diff]