SkThreadPool ~~> SkTaskGroup

SkTaskGroup is like SkThreadPool except the threads stay in
one global pool.  Each SkTaskGroup itself is tiny (4 bytes)
and its wait() method applies only to tasks add()ed to that
instance, not the whole thread pool.

This means we don't need to bring up new thread pools when
tests themselves want to use multithreading (e.g. pathops,
quilt).  We just create a new SkTaskGroup and wait for that
to complete.  This should be more efficient, and allow us
to extend our use of threads to really latency-sensitive
places.  E.g. we can probably now use these in nanobench
for CPU .skp rendering.

Now that all threads are sharing the same pool, I think we
can remove most of the custom mechanism pathops tests use
to control threading.  They'll just ride on the global pool
with all other tests now.

This (temporarily?) removes the GPU multithreading feature
from DM, which we don't use.

On my desktop, DM runs a little faster (57s -> 55s) in
Debug, and a lot faster in Release (36s -> 24s).  The bots
show speedups of similar proportions, cutting more than a
minute off the N4/Release and Win7/Debug runtimes.




Review URL:
diff --git a/src/utils/SkRunnable.h b/src/utils/SkRunnable.h
index 5acf4db..7a93b60 100644
--- a/src/utils/SkRunnable.h
+++ b/src/utils/SkRunnable.h
@@ -8,18 +8,9 @@
 #ifndef SkRunnable_DEFINED
 #define SkRunnable_DEFINED
-template <typename T>
-struct SkTRunnable {
-    virtual ~SkTRunnable() {};
-    virtual void run(T&) = 0;
-template <>
-struct SkTRunnable<void> {
-    virtual ~SkTRunnable() {};
+struct SkRunnable {
+    virtual ~SkRunnable() {};
     virtual void run() = 0;
-typedef SkTRunnable<void> SkRunnable;
diff --git a/src/utils/SkTaskGroup.cpp b/src/utils/SkTaskGroup.cpp
new file mode 100644
index 0000000..e7de6be
--- /dev/null
+++ b/src/utils/SkTaskGroup.cpp
@@ -0,0 +1,143 @@
+#include "SkTaskGroup.h"
+#include "SkCondVar.h"
+#include "SkTDArray.h"
+#include "SkThread.h"
+#include "SkThreadUtils.h"
+#if defined(SK_BUILD_FOR_WIN32)
+    static inline int num_cores() {
+        SYSTEM_INFO sysinfo;
+        GetSystemInfo(&sysinfo);
+        return sysinfo.dwNumberOfProcessors;
+    }
+    #include <unistd.h>
+    static inline int num_cores() {
+        return (int) sysconf(_SC_NPROCESSORS_ONLN);
+    }
+namespace {
+class ThreadPool : SkNoncopyable {
+    static void Add(SkRunnable* task, int32_t* pending) {
+        SkASSERT(gGlobal);  // If this fails, see SkTaskGroup::Enabler.
+        gGlobal->add(task, pending);
+    }
+    static void Wait(int32_t* pending) {
+        SkASSERT(gGlobal);  // If this fails, see SkTaskGroup::Enabler.
+        while (sk_acquire_load(pending) > 0) {  // Pairs with sk_atomic_dec here or in Loop.
+            // Lend a hand until our SkTaskGroup of interest is done.
+            Work work;
+            {
+                AutoLock lock(&gGlobal->fReady);
+                if (gGlobal->fWork.isEmpty()) {
+                    // Someone has picked up all the work (including ours).  How nice of them!
+                    // (They may still be working on it, so we can't assert *pending == 0 here.)
+                    continue;
+                }
+                gGlobal->fWork.pop(&work);
+            }
+            // This Work isn't necessarily part of our SkTaskGroup of interest, but that's fine.
+            // We threads gotta stick together.  We're always making forward progress.
+            work.task->run();
+            sk_atomic_dec(work.pending);  // Release pairs with the sk_acquire_load() just above.
+        }
+    }
+    struct AutoLock {
+        AutoLock(SkCondVar* c) : fC(c) { fC->lock(); }
+        ~AutoLock() { fC->unlock(); }
+    private:
+        SkCondVar* fC;
+    };
+    struct Work {
+        SkRunnable* task;  // A task to ->run(),
+        int32_t* pending;  // then sk_atomic_dec(pending) afterwards.
+    };
+    // Creates the pool's worker threads.  Passing 0 for `threads` means "use the
+    // core count reported by num_cores()".
+    explicit ThreadPool(int threads) : fDraining(false) {
+        if (threads == 0) {
+            threads = num_cores();
+        }
+        for (int i = 0; i < threads; i++) {
+            // Every worker runs ThreadPool::Loop with this pool as its argument.
+            fThreads.push(SkNEW_ARGS(SkThread, (&ThreadPool::Loop, this)));
+            // Start the thread we just pushed (it is the last element of fThreads).
+            fThreads.top()->start();
+        }
+    }
+    ~ThreadPool() {
+        SkASSERT(fWork.isEmpty());  // All SkTaskGroups should be destroyed by now.
+        {
+            AutoLock lock(&fReady);
+            fDraining = true;
+            fReady.broadcast();
+        }
+        for (int i = 0; i < fThreads.count(); i++) {
+            fThreads[i]->join();
+        }
+        SkASSERT(fWork.isEmpty());  // Can't hurt to double check.
+        fThreads.deleteAll();
+    }
+    void add(SkRunnable* task, int32_t* pending) {
+        Work work = { task, pending };
+        sk_atomic_inc(pending);  // No barrier needed.
+        {
+            AutoLock lock(&fReady);
+            fWork.push(work);
+            fReady.signal();
+        }
+    }
+    static void Loop(void* arg) {
+        ThreadPool* pool = (ThreadPool*)arg;
+        Work work;
+        while (true) {
+            {
+                AutoLock lock(&pool->fReady);
+                while (pool->fWork.isEmpty()) {
+                    if (pool->fDraining) {
+                        return;
+                    }
+                    pool->fReady.wait();
+                }
+                pool->fWork.pop(&work);
+            }
+            work.task->run();
+            sk_atomic_dec(work.pending);  // Release pairs with sk_acquire_load() in Wait().
+        }
+    }
+    SkTDArray<Work>      fWork;
+    SkTDArray<SkThread*> fThreads;
+    SkCondVar            fReady;
+    bool                 fDraining;
+    static ThreadPool* gGlobal;
+    friend struct SkTaskGroup::Enabler;
+ThreadPool* ThreadPool::gGlobal = NULL;
+}  // namespace
+SkTaskGroup::Enabler::Enabler(int threads) {
+    SkASSERT(ThreadPool::gGlobal == NULL);
+    ThreadPool::gGlobal = SkNEW_ARGS(ThreadPool, (threads));
+SkTaskGroup::Enabler::~Enabler() {
+    SkASSERT(ThreadPool::gGlobal != NULL);
+    SkDELETE(ThreadPool::gGlobal);
+SkTaskGroup::SkTaskGroup() : fPending(0) {}
+void SkTaskGroup::add(SkRunnable* task) { ThreadPool::Add(task, &fPending); }
+void SkTaskGroup::wait()                { ThreadPool::Wait(&fPending); }
diff --git a/src/utils/SkTaskGroup.h b/src/utils/SkTaskGroup.h
new file mode 100644
index 0000000..d9e6f59
--- /dev/null
+++ b/src/utils/SkTaskGroup.h
@@ -0,0 +1,36 @@
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef SkTaskGroup_DEFINED
+#define SkTaskGroup_DEFINED
+#include "SkTypes.h"
+#include "SkRunnable.h"
+class SkTaskGroup : SkNoncopyable {
+    // Create one of these in main() to enable SkTaskGroups globally.
+    struct Enabler : SkNoncopyable {
+        explicit Enabler(int threads = 0);  // Default is system-reported core count.
+        ~Enabler();
+    };
+    SkTaskGroup();
+    ~SkTaskGroup() { this->wait(); }
+    // Add a task to this SkTaskGroup.  It will likely run() on another thread.
+    void add(SkRunnable*);
+    // Block until all Tasks previously add()ed to this SkTaskGroup have run().
+    // You may safely reuse this SkTaskGroup after wait() returns.
+    void wait();
+    /*atomic*/ int32_t fPending;
diff --git a/src/utils/SkThreadPool.h b/src/utils/SkThreadPool.h
deleted file mode 100644
index c99c5c4..0000000
--- a/src/utils/SkThreadPool.h
+++ /dev/null
@@ -1,221 +0,0 @@
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-#ifndef SkThreadPool_DEFINED
-#define SkThreadPool_DEFINED
-#include "SkCondVar.h"
-#include "SkRunnable.h"
-#include "SkTDArray.h"
-#include "SkTInternalLList.h"
-#include "SkThreadUtils.h"
-#include "SkTypes.h"
-#if defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
-#    include <unistd.h>
-// Returns the number of cores on this machine.
-static inline int num_cores() {
-#if defined(SK_BUILD_FOR_WIN32)
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    return sysinfo.dwNumberOfProcessors;
-#elif defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
-    return (int) sysconf(_SC_NPROCESSORS_ONLN);
-    return 1;
-template <typename T>
-class SkTThreadPool {
-    /**
-     * Create a threadpool with count threads, or one thread per core if kThreadPerCore.
-     */
-    static const int kThreadPerCore = -1;
-    explicit SkTThreadPool(int count);
-    ~SkTThreadPool();
-    /**
-     * Queues up an SkRunnable to run when a thread is available, or synchronously if count is 0.
-     * Does not take ownership. NULL is a safe no-op.  If T is not void, the runnable will be passed
-     * a reference to a T on the thread's local stack.
-     */
-    void add(SkTRunnable<T>*);
-    /**
-     * Same as add, but adds the runnable as the very next to run rather than enqueueing it.
-     */
-    void addNext(SkTRunnable<T>*);
-    /**
-     * Block until all added SkRunnables have completed.  Once called, calling add() is undefined.
-     */
-    void wait();
- private:
-    struct LinkedRunnable {
-        SkTRunnable<T>* fRunnable;  // Unowned.
-    };
-    enum State {
-        kRunning_State,  // Normal case.  We've been constructed and no one has called wait().
-        kWaiting_State,  // wait has been called, but there still might be work to do or being done.
-        kHalting_State,  // There's no work to do and no thread is busy.  All threads can shut down.
-    };
-    void addSomewhere(SkTRunnable<T>* r,
-                      void (SkTInternalLList<LinkedRunnable>::*)(LinkedRunnable*));
-    SkTInternalLList<LinkedRunnable> fQueue;
-    SkCondVar                        fReady;
-    SkTDArray<SkThread*>             fThreads;
-    State                            fState;
-    int                              fBusyThreads;
-    static void Loop(void*);  // Static because we pass in this.
-template <typename T>
-SkTThreadPool<T>::SkTThreadPool(int count) : fState(kRunning_State), fBusyThreads(0) {
-    if (count < 0) {
-        count = num_cores();
-    }
-    // Create count threads, all running SkTThreadPool::Loop.
-    for (int i = 0; i < count; i++) {
-        SkThread* thread = SkNEW_ARGS(SkThread, (&SkTThreadPool::Loop, this));
-        *fThreads.append() = thread;
-        thread->start();
-    }
-template <typename T>
-SkTThreadPool<T>::~SkTThreadPool() {
-    if (kRunning_State == fState) {
-        this->wait();
-    }
-namespace SkThreadPoolPrivate {
-template <typename T>
-struct ThreadLocal {
-    void run(SkTRunnable<T>* r) { r->run(data); }
-    T data;
-template <>
-struct ThreadLocal<void> {
-    void run(SkTRunnable<void>* r) { r->run(); }
-}  // namespace SkThreadPoolPrivate
-template <typename T>
-void SkTThreadPool<T>::addSomewhere(SkTRunnable<T>* r,
-                                    void (SkTInternalLList<LinkedRunnable>::* f)(LinkedRunnable*)) {
-    if (r == NULL) {
-        return;
-    }
-    if (fThreads.isEmpty()) {
-        SkThreadPoolPrivate::ThreadLocal<T> threadLocal;
-        return;
-    }
-    LinkedRunnable* linkedRunnable = SkNEW(LinkedRunnable);
-    linkedRunnable->fRunnable = r;
-    fReady.lock();
-    SkASSERT(fState != kHalting_State);  // Shouldn't be able to add work when we're halting.
-    (fQueue.*f)(linkedRunnable);
-    fReady.signal();
-    fReady.unlock();
-template <typename T>
-void SkTThreadPool<T>::add(SkTRunnable<T>* r) {
-    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToTail);
-template <typename T>
-void SkTThreadPool<T>::addNext(SkTRunnable<T>* r) {
-    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToHead);
-template <typename T>
-void SkTThreadPool<T>::wait() {
-    fReady.lock();
-    fState = kWaiting_State;
-    fReady.broadcast();
-    fReady.unlock();
-    // Wait for all threads to stop.
-    for (int i = 0; i < fThreads.count(); i++) {
-        fThreads[i]->join();
-        SkDELETE(fThreads[i]);
-    }
-    SkASSERT(fQueue.isEmpty());
-template <typename T>
-/*static*/ void SkTThreadPool<T>::Loop(void* arg) {
-    // The SkTThreadPool passes itself as arg to each thread as they're created.
-    SkTThreadPool<T>* pool = static_cast<SkTThreadPool<T>*>(arg);
-    SkThreadPoolPrivate::ThreadLocal<T> threadLocal;
-    while (true) {
-        // We have to be holding the lock to read the queue and to call wait.
-        pool->fReady.lock();
-        while(pool->fQueue.isEmpty()) {
-            // Does the client want to stop and are all the threads ready to stop?
-            // If so, we move into the halting state, and whack all the threads so they notice.
-            if (kWaiting_State == pool->fState && pool->fBusyThreads == 0) {
-                pool->fState = kHalting_State;
-                pool->fReady.broadcast();
-            }
-            // Any time we find ourselves in the halting state, it's quitting time.
-            if (kHalting_State == pool->fState) {
-                pool->fReady.unlock();
-                return;
-            }
-            // wait yields the lock while waiting, but will have it again when awoken.
-            pool->fReady.wait();
-        }
-        // We've got the lock back here, no matter if we ran wait or not.
-        // The queue is not empty, so we have something to run.  Claim it.
-        LinkedRunnable* r = pool->fQueue.head();
-        pool->fQueue.remove(r);
-        // Having claimed our SkRunnable, we now give up the lock while we run it.
-        // Otherwise, we'd only ever do work on one thread at a time, which rather
-        // defeats the point of this code.
-        pool->fBusyThreads++;
-        pool->fReady.unlock();
-        // OK, now really do the work.
-        SkDELETE(r);
-        // Let everyone know we're not busy.
-        pool->fReady.lock();
-        pool->fBusyThreads--;
-        pool->fReady.unlock();
-    }
-    SkASSERT(false); // Unreachable.  The only exit happens when pool->fState is kHalting_State.
-typedef SkTThreadPool<void> SkThreadPool;