DM: make GPU tasks multithreaded again.  Big refactor.

The main meat of things is in SkThreadPool.  We can now give SkThreadPool a
type for each thread to create and destroy on its local stack.  In effect,
it's thread-local storage (TLS) without going through SkTLS.

I've split the DM tasks into CpuTasks that run on threads with no TLS, and
GpuTasks that run on threads with a thread-local GrContextFactory.

The old CpuTask and GpuTask have been renamed to CpuGMTask and GpuGMTask.

Upshot: default run of out/Debug/dm goes from ~45 seconds to ~20 seconds.

BUG=skia:
R=bsalomon@google.com, mtklein@google.com, reed@google.com

Author: mtklein@chromium.org

Review URL: https://codereview.chromium.org/179233005

git-svn-id: http://skia.googlecode.com/svn/trunk@13632 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/dm/DM.cpp b/dm/DM.cpp
index bf37d20..9c5c253 100644
--- a/dm/DM.cpp
+++ b/dm/DM.cpp
@@ -12,8 +12,8 @@
 #include "gm.h"
 
 #include "DMBenchTask.h"
-#include "DMCpuTask.h"
-#include "DMGpuTask.h"
+#include "DMCpuGMTask.h"
+#include "DMGpuGMTask.h"
 #include "DMReporter.h"
 #include "DMTask.h"
 #include "DMTaskRunner.h"
@@ -28,6 +28,7 @@
 using skiatest::TestRegistry;
 
 DEFINE_int32(threads, -1, "Threads for CPU work. Default NUM_CPUS.");
+DEFINE_int32(gpuThreads, 1, "Threads for GPU work.");
 DEFINE_string2(expectations, r, "",
                "If a directory, compare generated images against images under this path. "
                "If a file, compare generated images against JSON expectations at this path.");
@@ -86,15 +87,15 @@
     }
     for (int i = 0; i < gms.count(); i++) {
         for (int j = 0; j < configs.count(); j++) {
-            START("565",      CpuTask, kRGB_565_SkColorType);
-            START("8888",     CpuTask, kPMColor_SkColorType);
-            START("gpu",      GpuTask, native, 0);
-            START("msaa4",    GpuTask, native, 4);
-            START("msaa16",   GpuTask, native, 16);
-            START("gpunull",  GpuTask, null,   0);
-            START("gpudebug", GpuTask, debug,  0);
-            START("angle",    GpuTask, angle,  0);
-            START("mesa",     GpuTask, mesa,   0);
+            START("565",      CpuGMTask, kRGB_565_SkColorType);
+            START("8888",     CpuGMTask, kPMColor_SkColorType);
+            START("gpu",      GpuGMTask, native, 0);
+            START("msaa4",    GpuGMTask, native, 4);
+            START("msaa16",   GpuGMTask, native, 16);
+            START("gpunull",  GpuGMTask, null,   0);
+            START("gpudebug", GpuGMTask, debug,  0);
+            START("angle",    GpuGMTask, angle,  0);
+            START("mesa",     GpuGMTask, mesa,   0);
         }
     }
 #undef START
@@ -129,7 +130,12 @@
                            DM::Reporter* reporter,
                            DM::TaskRunner* tasks) {
     for (int i = 0; i < tests.count(); i++) {
-        tasks->add(SkNEW_ARGS(DM::TestTask, (reporter, tasks, tests[i])));
+        SkAutoTDelete<Test> test(tests[i](NULL));
+        if (test->isGPUTest()) {
+            tasks->add(SkNEW_ARGS(DM::GpuTestTask, (reporter, tasks, tests[i])));
+        } else {
+            tasks->add(SkNEW_ARGS(DM::CpuTestTask, (reporter, tasks, tests[i])));
+        }
     }
 }
 
@@ -201,7 +207,7 @@
     SkDebugf("(%d GMs, %d benches) x %d configs, %d tests\n",
              gms.count(), benches.count(), configs.count(), tests.count());
     DM::Reporter reporter;
-    DM::TaskRunner tasks(FLAGS_threads);
+    DM::TaskRunner tasks(FLAGS_threads, FLAGS_gpuThreads);
     kick_off_gms(gms, configs, *expectations, &reporter, &tasks);
     kick_off_benches(benches, configs, &reporter, &tasks);
     kick_off_tests(tests, &reporter, &tasks);
diff --git a/dm/DMBenchTask.cpp b/dm/DMBenchTask.cpp
index 4e251de..30561a4 100644
--- a/dm/DMBenchTask.cpp
+++ b/dm/DMBenchTask.cpp
@@ -14,7 +14,7 @@
                                              Reporter* reporter,
                                              TaskRunner* tasks,
                                              BenchRegistry::Factory factory)
-    : Task(reporter, tasks)
+    : CpuTask(reporter, tasks)
     , fBench(factory(NULL))
     , fName(bench_name(fBench->getName(), config)) {}
 
@@ -23,7 +23,7 @@
                            TaskRunner* tasks,
                            BenchRegistry::Factory factory,
                            SkColorType colorType)
-    : Task(reporter, tasks)
+    : CpuTask(reporter, tasks)
     , fBench(factory(NULL))
     , fName(bench_name(fBench->getName(), config))
     , fColorType(colorType) {}
@@ -34,7 +34,7 @@
                            BenchRegistry::Factory factory,
                            GrContextFactory::GLContextType contextType,
                            int sampleCount)
-    : Task(reporter, tasks)
+    : GpuTask(reporter, tasks)
     , fBench(factory(NULL))
     , fName(bench_name(fBench->getName(), config))
     , fContextType(contextType)
@@ -70,13 +70,13 @@
     draw_raster(fBench.get(), fColorType);
 }
 
-void GpuBenchTask::draw() {
+void GpuBenchTask::draw(GrContextFactory* grFactory) {
     SkImageInfo info = SkImageInfo::Make(fBench->getSize().x(),
                                          fBench->getSize().y(),
                                          kPMColor_SkColorType,
                                          kPremul_SkAlphaType);
     SkAutoTUnref<SkSurface> surface(SkSurface::NewRenderTarget(
-            this->getGrContextFactory()->get(fContextType), info, fSampleCount));
+            grFactory->get(fContextType), info, fSampleCount));
 
     fBench->preDraw();
     fBench->draw(1, surface->getCanvas());
diff --git a/dm/DMBenchTask.h b/dm/DMBenchTask.h
index 1e9bc99..ac7030b 100644
--- a/dm/DMBenchTask.h
+++ b/dm/DMBenchTask.h
@@ -12,12 +12,11 @@
 
 namespace DM {
 
-class NonRenderingBenchTask : public Task {
+class NonRenderingBenchTask : public CpuTask {
 public:
     NonRenderingBenchTask(const char* config, Reporter*, TaskRunner*, BenchRegistry::Factory);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
@@ -26,12 +25,11 @@
     const SkString fName;
 };
 
-class CpuBenchTask : public Task {
+class CpuBenchTask : public CpuTask {
 public:
     CpuBenchTask(const char* config, Reporter*, TaskRunner*, BenchRegistry::Factory, SkColorType);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
@@ -41,7 +39,7 @@
     const SkColorType fColorType;
 };
 
-class GpuBenchTask : public Task {
+class GpuBenchTask : public GpuTask {
 public:
     GpuBenchTask(const char* config,
                  Reporter*,
@@ -50,8 +48,7 @@
                  GrContextFactory::GLContextType,
                  int sampleCount);
 
-    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return true; }
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMCpuTask.cpp b/dm/DMCpuGMTask.cpp
similarity index 77%
rename from dm/DMCpuTask.cpp
rename to dm/DMCpuGMTask.cpp
index acbe8d2..6ab0014 100644
--- a/dm/DMCpuTask.cpp
+++ b/dm/DMCpuGMTask.cpp
@@ -1,4 +1,4 @@
-#include "DMCpuTask.h"
+#include "DMCpuGMTask.h"
 #include "DMExpectationsTask.h"
 #include "DMPipeTask.h"
 #include "DMReplayTask.h"
@@ -9,13 +9,13 @@
 
 namespace DM {
 
-CpuTask::CpuTask(const char* config,
-                 Reporter* reporter,
-                 TaskRunner* taskRunner,
-                 const Expectations& expectations,
-                 skiagm::GMRegistry::Factory gmFactory,
-                 SkColorType colorType)
-    : Task(reporter, taskRunner)
+CpuGMTask::CpuGMTask(const char* config,
+                     Reporter* reporter,
+                     TaskRunner* taskRunner,
+                     const Expectations& expectations,
+                     skiagm::GMRegistry::Factory gmFactory,
+                     SkColorType colorType)
+    : CpuTask(reporter, taskRunner)
     , fGMFactory(gmFactory)
     , fGM(fGMFactory(NULL))
     , fName(UnderJoin(fGM->getName(), config))
@@ -23,7 +23,7 @@
     , fColorType(colorType)
     {}
 
-void CpuTask::draw() {
+void CpuGMTask::draw() {
     SkBitmap bitmap;
     SetupBitmap(fColorType, fGM.get(), &bitmap);
 
@@ -47,7 +47,7 @@
 #undef SPAWN
 }
 
-bool CpuTask::shouldSkip() const {
+bool CpuGMTask::shouldSkip() const {
     if (kRGB_565_SkColorType == fColorType && (fGM->getFlags() & skiagm::GM::kSkip565_Flag)) {
         return true;
     }
diff --git a/dm/DMCpuTask.h b/dm/DMCpuGMTask.h
similarity index 67%
rename from dm/DMCpuTask.h
rename to dm/DMCpuGMTask.h
index 0ae112f..7712da8 100644
--- a/dm/DMCpuTask.h
+++ b/dm/DMCpuGMTask.h
@@ -1,5 +1,5 @@
-#ifndef DMCpuTask_DEFINED
-#define DMCpuTask_DEFINED
+#ifndef DMCpuGMTask_DEFINED
+#define DMCpuGMTask_DEFINED
 
 #include "DMExpectations.h"
 #include "DMReporter.h"
@@ -15,17 +15,16 @@
 
 namespace DM {
 
-class CpuTask : public Task {
+class CpuGMTask : public CpuTask {
 public:
-    CpuTask(const char* config,
-            Reporter*,
-            TaskRunner*,
-            const Expectations&,
-            skiagm::GMRegistry::Factory,
-            SkColorType);
+    CpuGMTask(const char* config,
+              Reporter*,
+              TaskRunner*,
+              const Expectations&,
+              skiagm::GMRegistry::Factory,
+              SkColorType);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
@@ -39,4 +38,4 @@
 
 }  // namespace DM
 
-#endif // DMCpuTask_DEFINED
+#endif // DMCpuGMTask_DEFINED
diff --git a/dm/DMExpectationsTask.cpp b/dm/DMExpectationsTask.cpp
index cb92486..e29257a 100644
--- a/dm/DMExpectationsTask.cpp
+++ b/dm/DMExpectationsTask.cpp
@@ -6,7 +6,7 @@
 ExpectationsTask::ExpectationsTask(const Task& parent,
                                    const Expectations& expectations,
                                    SkBitmap bitmap)
-    : Task(parent)
+    : CpuTask(parent)
     , fName(parent.name())  // Masquerade as parent so failures are attributed to it.
     , fExpectations(expectations)
     , fBitmap(bitmap)
diff --git a/dm/DMExpectationsTask.h b/dm/DMExpectationsTask.h
index cf76fc8..7000de4 100644
--- a/dm/DMExpectationsTask.h
+++ b/dm/DMExpectationsTask.h
@@ -10,12 +10,11 @@
 
 // ExpectationsTask compares an SkBitmap against some Expectations.
 // Moving this off the GPU threadpool is a nice (~30%) runtime win.
-class ExpectationsTask : public Task {
+class ExpectationsTask : public CpuTask {
 public:
     ExpectationsTask(const Task& parent, const Expectations&, SkBitmap);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE { return false; }
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMGpuTask.cpp b/dm/DMGpuGMTask.cpp
similarity index 68%
rename from dm/DMGpuTask.cpp
rename to dm/DMGpuGMTask.cpp
index c285d88..cffa229 100644
--- a/dm/DMGpuTask.cpp
+++ b/dm/DMGpuGMTask.cpp
@@ -1,4 +1,4 @@
-#include "DMGpuTask.h"
+#include "DMGpuGMTask.h"
 
 #include "DMExpectationsTask.h"
 #include "DMUtil.h"
@@ -9,14 +9,14 @@
 
 namespace DM {
 
-GpuTask::GpuTask(const char* config,
-                 Reporter* reporter,
-                 TaskRunner* taskRunner,
-                 const Expectations& expectations,
-                 skiagm::GMRegistry::Factory gmFactory,
-                 GrContextFactory::GLContextType contextType,
-                 int sampleCount)
-    : Task(reporter, taskRunner)
+GpuGMTask::GpuGMTask(const char* config,
+                     Reporter* reporter,
+                     TaskRunner* taskRunner,
+                     const Expectations& expectations,
+                     skiagm::GMRegistry::Factory gmFactory,
+                     GrContextFactory::GLContextType contextType,
+                     int sampleCount)
+    : GpuTask(reporter, taskRunner)
     , fGM(gmFactory(NULL))
     , fName(UnderJoin(fGM->getName(), config))
     , fExpectations(expectations)
@@ -24,13 +24,13 @@
     , fSampleCount(sampleCount)
     {}
 
-void GpuTask::draw() {
+void GpuGMTask::draw(GrContextFactory* grFactory) {
     SkImageInfo info = SkImageInfo::Make(SkScalarCeilToInt(fGM->width()),
                                          SkScalarCeilToInt(fGM->height()),
                                          kPMColor_SkColorType,
                                          kPremul_SkAlphaType);
     SkAutoTUnref<SkSurface> surface(SkSurface::NewRenderTarget(
-            this->getGrContextFactory()->get(fContextType), info, fSampleCount));
+            grFactory->get(fContextType), info, fSampleCount));
     SkCanvas* canvas = surface->getCanvas();
 
     canvas->concat(fGM->getInitialTransform());
@@ -49,7 +49,7 @@
     this->spawnChild(SkNEW_ARGS(WriteTask, (*this, bitmap)));
 }
 
-bool GpuTask::shouldSkip() const {
+bool GpuGMTask::shouldSkip() const {
     return SkToBool(fGM->getFlags() & skiagm::GM::kSkipGPU_Flag);
 }
 
diff --git a/dm/DMGpuGMTask.h b/dm/DMGpuGMTask.h
new file mode 100644
index 0000000..141994e
--- /dev/null
+++ b/dm/DMGpuGMTask.h
@@ -0,0 +1,42 @@
+#ifndef DMGpuGMTask_DEFINED
+#define DMGpuGMTask_DEFINED
+
+#include "DMExpectations.h"
+#include "DMReporter.h"
+#include "DMTask.h"
+#include "DMTaskRunner.h"
+#include "GrContextFactory.h"
+#include "SkBitmap.h"
+#include "SkString.h"
+#include "SkTemplates.h"
+#include "gm.h"
+
+// This is the main entry point for drawing GMs with the GPU.
+
+namespace DM {
+
+class GpuGMTask : public GpuTask {
+public:
+    GpuGMTask(const char* config,
+              Reporter*,
+              TaskRunner*,
+              const Expectations&,
+              skiagm::GMRegistry::Factory,
+              GrContextFactory::GLContextType,
+              int sampleCount);
+
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
+    virtual bool shouldSkip() const SK_OVERRIDE;
+    virtual SkString name() const SK_OVERRIDE { return fName; }
+
+private:
+    SkAutoTDelete<skiagm::GM> fGM;
+    const SkString fName;
+    const Expectations& fExpectations;
+    const GrContextFactory::GLContextType fContextType;
+    const int fSampleCount;
+};
+
+}  // namespace DM
+
+#endif  // DMGpuGMTask_DEFINED
diff --git a/dm/DMGpuTask.h b/dm/DMGpuTask.h
deleted file mode 100644
index f74950c..0000000
--- a/dm/DMGpuTask.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef DMGpuTask_DEFINED
-#define DMGpuTask_DEFINED
-
-#include "DMExpectations.h"
-#include "DMReporter.h"
-#include "DMTask.h"
-#include "DMTaskRunner.h"
-#include "GrContextFactory.h"
-#include "SkBitmap.h"
-#include "SkString.h"
-#include "SkTemplates.h"
-#include "gm.h"
-
-// This is the main entry point for drawing GMs with the GPU.
-
-namespace DM {
-
-class GpuTask : public Task {
-public:
-    GpuTask(const char* config,
-            Reporter*,
-            TaskRunner*,
-            const Expectations&,
-            skiagm::GMRegistry::Factory,
-            GrContextFactory::GLContextType,
-            int sampleCount);
-
-    virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return true; }
-    virtual bool shouldSkip() const SK_OVERRIDE;
-    virtual SkString name() const SK_OVERRIDE { return fName; }
-
-private:
-    SkAutoTDelete<skiagm::GM> fGM;
-    const SkString fName;
-    const Expectations& fExpectations;
-    const GrContextFactory::GLContextType fContextType;
-    const int fSampleCount;
-};
-
-}  // namespace DM
-
-#endif  // DMGpuTask_DEFINED
diff --git a/dm/DMPipeTask.cpp b/dm/DMPipeTask.cpp
index 163f1e6..513594e 100644
--- a/dm/DMPipeTask.cpp
+++ b/dm/DMPipeTask.cpp
@@ -38,7 +38,7 @@
                    SkBitmap reference,
                    bool crossProcess,
                    bool sharedAddressSpace)
-    : Task(parent)
+    : CpuTask(parent)
     , fFlags(get_flags(crossProcess, sharedAddressSpace))
     , fName(UnderJoin(parent.name().c_str(), get_name(fFlags)))
     , fGM(gm)
diff --git a/dm/DMPipeTask.h b/dm/DMPipeTask.h
index 23bbfef..c251d08 100644
--- a/dm/DMPipeTask.h
+++ b/dm/DMPipeTask.h
@@ -11,7 +11,7 @@
 
 namespace DM {
 
-class PipeTask : public Task {
+class PipeTask : public CpuTask {
 
 public:
     PipeTask(const Task& parent,        // PipeTask must be a child task.  Pass its parent here.
@@ -21,7 +21,6 @@
              bool sharedAddressSpace);  // If cross process, should it assume shared address space?
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMReplayTask.cpp b/dm/DMReplayTask.cpp
index 3d01035..c915be2 100644
--- a/dm/DMReplayTask.cpp
+++ b/dm/DMReplayTask.cpp
@@ -14,7 +14,7 @@
                        skiagm::GM* gm,
                        SkBitmap reference,
                        bool useRTree)
-    : Task(parent)
+    : CpuTask(parent)
     , fName(UnderJoin(parent.name().c_str(), useRTree ? "rtree" : "replay"))
     , fGM(gm)
     , fReference(reference)
diff --git a/dm/DMReplayTask.h b/dm/DMReplayTask.h
index 1245009..78bef0d 100644
--- a/dm/DMReplayTask.h
+++ b/dm/DMReplayTask.h
@@ -11,7 +11,7 @@
 
 namespace DM {
 
-class ReplayTask : public Task {
+class ReplayTask : public CpuTask {
 
 public:
     ReplayTask(const Task& parent,  // ReplayTask must be a child task.  Pass its parent here.
@@ -20,7 +20,6 @@
                bool useRTree);      // Record with an RTree?
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMSerializeTask.cpp b/dm/DMSerializeTask.cpp
index 7e45d49..4f55de5 100644
--- a/dm/DMSerializeTask.cpp
+++ b/dm/DMSerializeTask.cpp
@@ -13,7 +13,7 @@
 SerializeTask::SerializeTask(const Task& parent,
                              skiagm::GM* gm,
                              SkBitmap reference)
-    : Task(parent)
+    : CpuTask(parent)
     , fName(UnderJoin(parent.name().c_str(), "serialize"))
     , fGM(gm)
     , fReference(reference)
diff --git a/dm/DMSerializeTask.h b/dm/DMSerializeTask.h
index 30a4303..1f8b836 100644
--- a/dm/DMSerializeTask.h
+++ b/dm/DMSerializeTask.h
@@ -11,7 +11,7 @@
 
 namespace DM {
 
-class SerializeTask : public Task {
+class SerializeTask : public CpuTask {
 
 public:
     SerializeTask(const Task& parent,
@@ -19,7 +19,6 @@
                   SkBitmap reference);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMTask.cpp b/dm/DMTask.cpp
index d26971c..1c4cc25 100644
--- a/dm/DMTask.cpp
+++ b/dm/DMTask.cpp
@@ -1,43 +1,22 @@
 #include "DMTask.h"
-
 #include "DMTaskRunner.h"
-#include "DMUtil.h"
-#include "SkBitmap.h"
-#include "SkCommandLineFlags.h"
 
 namespace DM {
 
 Task::Task(Reporter* reporter, TaskRunner* taskRunner)
-    : fReporter(reporter), fTaskRunner(taskRunner), fDepth(0) {
+    : fReporter(reporter)
+    , fTaskRunner(taskRunner)
+    , fDepth(0) {
     fReporter->start();
 }
 
 Task::Task(const Task& parent)
-    : INHERITED(parent)
-    , fReporter(parent.fReporter)
+    : fReporter(parent.fReporter)
     , fTaskRunner(parent.fTaskRunner)
-    , fDepth(parent.depth()+1) {
+    , fDepth(parent.depth() + 1) {
     fReporter->start();
 }
 
-Task::~Task() {}
-
-void Task::run() {
-    if (!this->shouldSkip()) {
-        this->draw();
-    }
-    fReporter->finish(this->name());
-    delete this;
-}
-
-void Task::spawnChild(Task* task) {
-    if (!task->usesGpu()) {
-        fTaskRunner->add(task);
-    } else {
-        SkDEBUGFAIL("Sorry, we can't spawn GPU tasks. :(  See comment in TaskRunner::wait().");
-    }
-}
-
 void Task::fail(const char* msg) {
     SkString failure(this->name());
     if (msg) {
@@ -46,8 +25,35 @@
     fReporter->fail(failure);
 }
 
-GrContextFactory* Task::getGrContextFactory() const {
-    return fTaskRunner->getGrContextFactory();
+void Task::finish() {
+    fReporter->finish(this->name());
 }
 
+void Task::spawnChild(CpuTask* task) {
+    fTaskRunner->add(task);
+}
+
+CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
+CpuTask::CpuTask(const Task& parent) : Task(parent) {}
+
+void CpuTask::run() {
+    if (!this->shouldSkip()) {
+        this->draw();
+    }
+    this->finish();
+    SkDELETE(this);
+}
+
+GpuTask::GpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
+
+void GpuTask::run(GrContextFactory& factory) {
+    if (!this->shouldSkip()) {
+        this->draw(&factory);
+    }
+    this->finish();
+    SkDELETE(this);
+}
+
+
+
 }  // namespace DM
diff --git a/dm/DMTask.h b/dm/DMTask.h
index e8598df..cad8234 100644
--- a/dm/DMTask.h
+++ b/dm/DMTask.h
@@ -4,28 +4,21 @@
 #include "DMReporter.h"
 #include "GrContextFactory.h"
 #include "SkRunnable.h"
-#include "SkThreadPool.h"
 
-// DM will run() these tasks on one of two threadpools, depending on the result
-// of usesGpu().  The subclasses can call fail() to mark this task as failed,
-// or make any number of spawnChild() calls to kick off dependent tasks.
+// DM will run() these tasks on one of two threadpools.
+// Subclasses can call fail() to mark this task as failed, or make any number of spawnChild() calls
+// to kick off dependent tasks.
 //
-// Task deletes itself when run.
+// Tasks delete themselves when run.
 
 namespace DM {
 
 class TaskRunner;
 
-class Task : public SkRunnable {
+class CpuTask;
+
+class Task {
 public:
-    Task(Reporter* reporter, TaskRunner* taskRunner);
-    Task(const Task& parent);
-    virtual ~Task();
-
-    void run() SK_OVERRIDE;
-
-    virtual void draw() = 0;
-    virtual bool usesGpu() const = 0;
     virtual bool shouldSkip() const = 0;
     virtual SkString name() const = 0;
 
@@ -34,19 +27,37 @@
     int depth() const { return fDepth; }
 
 protected:
-    void spawnChild(Task* task);
-    void fail(const char* msg = NULL);
+    Task(Reporter* reporter, TaskRunner* taskRunner);
+    Task(const Task& parent);
+    virtual ~Task() {}
 
-    // This can only be safely called from a GPU task's draw() method.
-    GrContextFactory* getGrContextFactory() const;
+    void fail(const char* msg = NULL);
+    void finish();
+    void spawnChild(CpuTask* task);  // For now we don't allow GPU child tasks.
 
 private:
-    // Both unowned.
-    Reporter* fReporter;
-    TaskRunner* fTaskRunner;
+    Reporter* fReporter;      // Unowned.
+    TaskRunner* fTaskRunner;  // Unowned.
     int fDepth;
+};
 
-    typedef SkRunnable INHERITED;
+class CpuTask : public Task, public SkRunnable {
+public:
+    CpuTask(Reporter* reporter, TaskRunner* taskRunner);
+    CpuTask(const Task& parent);
+    virtual ~CpuTask() {}
+
+    void run() SK_OVERRIDE;
+    virtual void draw() = 0;
+};
+
+class GpuTask : public Task, public SkTRunnable<GrContextFactory> {
+ public:
+    GpuTask(Reporter* reporter, TaskRunner* taskRunner);
+    virtual ~GpuTask() {}
+
+    void run(GrContextFactory&) SK_OVERRIDE;
+    virtual void draw(GrContextFactory*) = 0;
 };
 
 }  // namespace DM
diff --git a/dm/DMTaskRunner.cpp b/dm/DMTaskRunner.cpp
index bd53ce6..e0bd977 100644
--- a/dm/DMTaskRunner.cpp
+++ b/dm/DMTaskRunner.cpp
@@ -3,48 +3,19 @@
 
 namespace DM {
 
+TaskRunner::TaskRunner(int cpuThreads, int gpuThreads) : fCpu(cpuThreads), fGpu(gpuThreads) {}
 
-TaskRunner::TaskRunner(int cputhreads)
-    : fMain(cputhreads)
-    , fGpu(1) {
-    // Enqueue a task on the GPU thread to create a GrContextFactory.
-    struct Create : public SkRunnable {
-        Create(GrContextFactory** ptr) : fPtr(ptr) {}
-        void run() SK_OVERRIDE {
-            *fPtr = SkNEW(GrContextFactory);
-            delete this;
-        }
-        GrContextFactory** fPtr;
-    };
-    fGpu.add(SkNEW_ARGS(Create, (&fGrContextFactory)));
-}
+void TaskRunner::add(CpuTask* task) { fCpu.add(task); }
 
-void TaskRunner::add(Task* task) {
-    if (task->usesGpu()) {
-        fGpu.add(task);
-    } else {
-        fMain.add(task);
-    }
-}
+void TaskRunner::add(GpuTask* task) { fGpu.add(task); }
 
 void TaskRunner::wait() {
-    // Enqueue a task on the GPU thread to destroy the GrContextFactory.
-    struct Delete : public SkRunnable {
-        Delete(GrContextFactory* ptr) : fPtr(ptr) {}
-        void run() SK_OVERRIDE {
-            delete fPtr;
-            delete this;
-        }
-        GrContextFactory* fPtr;
-    };
-    fGpu.add(SkNEW_ARGS(Delete, (fGrContextFactory)));
-
-    // These wait calls block until the threadpool is done.  We don't allow
-    // children to spawn new GPU tasks so we can wait for that first knowing
-    // we'll never try to add to it later.  Same can't be said of fMain: fGpu
-    // and fMain can both add tasks to fMain, so we have to wait for that last.
+    // These wait calls block until each threadpool is done.  We don't allow
+    // spawning new child GPU tasks, so we can wait for that first knowing
+    // we'll never try to add to it later.  Same can't be said of the CPU pool:
+    // both CPU and GPU tasks can spawn off new CPU work, so we wait for that last.
     fGpu.wait();
-    fMain.wait();
+    fCpu.wait();
 }
 
 }  // namespace DM
diff --git a/dm/DMTaskRunner.h b/dm/DMTaskRunner.h
index 8af1b63..c7b4058 100644
--- a/dm/DMTaskRunner.h
+++ b/dm/DMTaskRunner.h
@@ -5,26 +5,25 @@
 #include "SkThreadPool.h"
 #include "SkTypes.h"
 
-// TaskRunner runs Tasks on one of two threadpools depending on the Task's usesGpu() method.  This
-// lets us drive the GPU from a single thread while parallelizing CPU-bound work.
+// TaskRunner runs Tasks on one of two threadpools depending on the need for a GrContextFactory.
+// It's typically a good idea to run fewer GPU threads than CPU threads (go nuts with those).
 
 namespace DM {
 
-class Task;
+class CpuTask;
+class GpuTask;
 
 class TaskRunner : SkNoncopyable {
 public:
-    explicit TaskRunner(int cputhreads);
+    explicit TaskRunner(int cpuThreads, int gpuThreads);
 
-    void add(Task* task);
+    void add(CpuTask* task);
+    void add(GpuTask* task);
     void wait();
 
-    // This can only be safely called from a GPU task's draw() method.
-    GrContextFactory* getGrContextFactory() const { return fGrContextFactory; }
-
 private:
-    SkThreadPool fMain, fGpu;
-    GrContextFactory* fGrContextFactory;  // Created and destroyed on fGpu threadpool.
+    SkTThreadPool<void> fCpu;
+    SkTThreadPool<GrContextFactory> fGpu;
 };
 
 }  // namespace DM
diff --git a/dm/DMTestTask.cpp b/dm/DMTestTask.cpp
index 32a698c..6c3fced 100644
--- a/dm/DMTestTask.cpp
+++ b/dm/DMTestTask.cpp
@@ -8,23 +8,32 @@
 
 namespace DM {
 
+bool TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
+bool TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
+bool TestReporter::verbose()           const { return FLAGS_pathOpsVerbose; }
+
 static SkString test_name(const char* name) {
     SkString result("test ");
     result.append(name);
     return result;
 }
 
-TestTask::TestTask(Reporter* reporter,
-                   TaskRunner* taskRunner,
-                   skiatest::TestRegistry::Factory factory)
-    : Task(reporter, taskRunner)
+CpuTestTask::CpuTestTask(Reporter* reporter,
+                         TaskRunner* taskRunner,
+                         skiatest::TestRegistry::Factory factory)
+    : CpuTask(reporter, taskRunner)
     , fTest(factory(NULL))
     , fName(test_name(fTest->getName())) {}
 
-void TestTask::draw() {
-    if (this->usesGpu()) {
-        fTest->setGrContextFactory(this->getGrContextFactory());
-    }
+GpuTestTask::GpuTestTask(Reporter* reporter,
+                         TaskRunner* taskRunner,
+                         skiatest::TestRegistry::Factory factory)
+    : GpuTask(reporter, taskRunner)
+    , fTest(factory(NULL))
+    , fName(test_name(fTest->getName())) {}
+
+
+void CpuTestTask::draw() {
     fTest->setReporter(&fTestReporter);
     fTest->run();
     if (!fTest->passed()) {
@@ -32,8 +41,13 @@
     }
 }
 
-bool TestTask::TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
-bool TestTask::TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
-bool TestTask::TestReporter::verbose()           const { return FLAGS_pathOpsVerbose; }
+void GpuTestTask::draw(GrContextFactory* grFactory) {
+    fTest->setGrContextFactory(grFactory);
+    fTest->setReporter(&fTestReporter);
+    fTest->run();
+    if (!fTest->passed()) {
+        this->fail(fTestReporter.failure());
+    }
+}
 
 }  // namespace DM
diff --git a/dm/DMTestTask.h b/dm/DMTestTask.h
index 49a8e77..87f5920 100644
--- a/dm/DMTestTask.h
+++ b/dm/DMTestTask.h
@@ -11,34 +11,47 @@
 // Runs a unit test.
 namespace DM {
 
-class TestTask : public Task {
+class TestReporter : public skiatest::Reporter {
 public:
-    TestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);
+  TestReporter() {}
+
+  const char* failure() const { return fFailure.c_str(); }
+
+private:
+  virtual bool allowExtendedTest() const SK_OVERRIDE;
+  virtual bool allowThreaded()     const SK_OVERRIDE;
+  virtual bool verbose()           const SK_OVERRIDE;
+
+  virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
+      fFailure = desc;
+  }
+
+  SkString fFailure;
+};
+
+class CpuTestTask : public CpuTask {
+public:
+    CpuTestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return fTest->isGPUTest(); }
     virtual bool shouldSkip() const SK_OVERRIDE { return false; }
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
 private:
-    class TestReporter : public skiatest::Reporter {
-    public:
-      TestReporter() {}
+    TestReporter fTestReporter;
+    SkAutoTDelete<skiatest::Test> fTest;
+    const SkString fName;
+};
 
-      const char* failure() const { return fFailure.c_str(); }
+class GpuTestTask : public GpuTask {
+public:
+    GpuTestTask(Reporter*, TaskRunner*, skiatest::TestRegistry::Factory);
 
-    private:
-      virtual bool allowExtendedTest() const SK_OVERRIDE;
-      virtual bool allowThreaded()     const SK_OVERRIDE;
-      virtual bool verbose()           const SK_OVERRIDE;
+    virtual void draw(GrContextFactory*) SK_OVERRIDE;
+    virtual bool shouldSkip() const SK_OVERRIDE { return false; }
+    virtual SkString name() const SK_OVERRIDE { return fName; }
 
-      virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
-          fFailure = desc;
-      }
-
-      SkString fFailure;
-    };
-
+private:
     TestReporter fTestReporter;
     SkAutoTDelete<skiatest::Test> fTest;
     const SkString fName;
diff --git a/dm/DMTileGridTask.cpp b/dm/DMTileGridTask.cpp
index beffbb0..f9cac07 100644
--- a/dm/DMTileGridTask.cpp
+++ b/dm/DMTileGridTask.cpp
@@ -12,7 +12,7 @@
 namespace DM {
 
 TileGridTask::TileGridTask(const Task& parent, skiagm::GM* gm, SkBitmap reference, SkISize tileSize)
-    : Task(parent)
+    : CpuTask(parent)
     , fName(UnderJoin(parent.name().c_str(), "tilegrid"))
     , fGM(gm)
     , fReference(reference)
diff --git a/dm/DMTileGridTask.h b/dm/DMTileGridTask.h
index 4a522b9..911a1c5 100644
--- a/dm/DMTileGridTask.h
+++ b/dm/DMTileGridTask.h
@@ -11,7 +11,7 @@
 
 namespace DM {
 
-class TileGridTask : public Task {
+class TileGridTask : public CpuTask {
 
 public:
     TileGridTask(const Task& parent,  // TileGridTask must be a child task.  Pass its parent here.
@@ -20,7 +20,6 @@
                  SkISize tileSize);   // Tile size to use.
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE { return fName; }
 
diff --git a/dm/DMWriteTask.cpp b/dm/DMWriteTask.cpp
index 5adb1d0..e30cbdb 100644
--- a/dm/DMWriteTask.cpp
+++ b/dm/DMWriteTask.cpp
@@ -26,7 +26,7 @@
     return consumed;
 }
 
-WriteTask::WriteTask(const Task& parent, SkBitmap bitmap) : Task(parent), fBitmap(bitmap) {
+WriteTask::WriteTask(const Task& parent, SkBitmap bitmap) : CpuTask(parent), fBitmap(bitmap) {
     const int suffixes = parent.depth() + 1;
     const SkString& name = parent.name();
     const int totalSuffixLength = split_suffixes(suffixes, name.c_str(), &fSuffixes);
diff --git a/dm/DMWriteTask.h b/dm/DMWriteTask.h
index 49a5c74..839abd7 100644
--- a/dm/DMWriteTask.h
+++ b/dm/DMWriteTask.h
@@ -12,14 +12,13 @@
 
 namespace DM {
 
-class WriteTask : public Task {
+class WriteTask : public CpuTask {
 
 public:
     WriteTask(const Task& parent,  // WriteTask must be a child Task.  Pass its parent here.
               SkBitmap bitmap);    // Bitmap to write.
 
     virtual void draw() SK_OVERRIDE;
-    virtual bool usesGpu() const SK_OVERRIDE { return false; }
     virtual bool shouldSkip() const SK_OVERRIDE;
     virtual SkString name() const SK_OVERRIDE;