DM: Push GPU-parent child tasks to the front of the queue.

Like yesterday's change to run CPU-parent child tasks serially in-thread, this
reduces peak memory usage by improving the temporal locality of the bitmaps we
create.

E.g. Let's say we start with tasks A B C and D
    Queue: [ A B C D ]
Running A creates A' and A", which depend on a bitmap created by A.
    Queue: [ B C D A' A" * ]
That bitmap now needs to sit around in RAM while B C and D run pointlessly, and
it can only be destroyed at *.  If instead we push dependent child tasks to the
front of the queue, the queue and the bitmap's lifetime look like this:
    Queue: [ A' A" * B C D ]

In practice the problem is much, much worse than in this example, because the
queue is often several thousand tasks long.  Hundreds of megs of bitmaps can
pile up pointlessly for tens of seconds.

To make this work, we add addNext() to SkThreadPool and its cousin DMTaskRunner.
I also took the opportunity to swap head and tail in the threadpool
implementation so it matches the comments and intuition better: we always pop
the head, add() puts work at the tail, and addNext() puts it at the head.
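
In terms of use, the new call reads roughly like this (a hedged sketch; the
runnable names are made up for illustration and this is not code from the
change itself):

    SkThreadPool pool(4);
    pool.add(parentRunnable);      // tail of the queue: normal FIFO order
    pool.addNext(childRunnable);   // head of the queue: runs as soon as a worker is free
    pool.wait();                   // block until everything added has completed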


Before
  Debug:   49s, 1403352k peak
  Release: 16s, 2064008k peak

After
  Debug:   49s, 1234788k peak
  Release: 15s, 1903424k peak

BUG=skia:2478
R=bsalomon@google.com, borenet@google.com, mtklein@google.com

Author: mtklein@chromium.org

Review URL: https://codereview.chromium.org/263803003

git-svn-id: http://skia.googlecode.com/svn/trunk@14506 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/dm/DMTask.cpp b/dm/DMTask.cpp
index 83df849..7725780 100644
--- a/dm/DMTask.cpp
+++ b/dm/DMTask.cpp
@@ -37,8 +37,8 @@
     fReporter->finish(this->name(), SkTime::GetMSecs() - fStart);
 }
 
-void Task::reallySpawnChild(CpuTask* task) {
-    fTaskRunner->add(task);
+void Task::spawnChildNext(CpuTask* task) {
+    fTaskRunner->addNext(task);
 }
 
 CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
@@ -55,6 +55,8 @@
 
 void CpuTask::spawnChild(CpuTask* task) {
     // Run children serially on this (CPU) thread.  This tends to save RAM and is usually no slower.
+    // Calling spawnChildNext() is nearly equivalent, but it'd pointlessly contend on the
+    // threadpool; spawnChildNext() is most useful when you want to change threadpools.
     task->run();
 }
 
@@ -71,7 +73,8 @@
 
 void GpuTask::spawnChild(CpuTask* task) {
     // Really spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
-    this->reallySpawnChild(task);
+    // It goes on the front of the queue to minimize the time we must hold reference bitmaps in RAM.
+    this->spawnChildNext(task);
 }
 
 }  // namespace DM
diff --git a/dm/DMTask.h b/dm/DMTask.h
index 02d8a22..96cf810 100644
--- a/dm/DMTask.h
+++ b/dm/DMTask.h
@@ -36,7 +36,7 @@
     void fail(const char* msg = NULL);
     void finish();
 
-    void reallySpawnChild(CpuTask* task);  // For now we don't allow GPU child tasks.
+    void spawnChildNext(CpuTask* task);  // For now we don't allow GPU child tasks.
 
 private:
     Reporter* fReporter;      // Unowned.
diff --git a/dm/DMTaskRunner.cpp b/dm/DMTaskRunner.cpp
index e0bd977..8a0bc83 100644
--- a/dm/DMTaskRunner.cpp
+++ b/dm/DMTaskRunner.cpp
@@ -6,7 +6,7 @@
 TaskRunner::TaskRunner(int cpuThreads, int gpuThreads) : fCpu(cpuThreads), fGpu(gpuThreads) {}
 
 void TaskRunner::add(CpuTask* task) { fCpu.add(task); }
-
+void TaskRunner::addNext(CpuTask* task) { fCpu.addNext(task); }
 void TaskRunner::add(GpuTask* task) { fGpu.add(task); }
 
 void TaskRunner::wait() {
diff --git a/dm/DMTaskRunner.h b/dm/DMTaskRunner.h
index b20113f..dd1440e 100644
--- a/dm/DMTaskRunner.h
+++ b/dm/DMTaskRunner.h
@@ -18,6 +18,7 @@
     explicit TaskRunner(int cpuThreads, int gpuThreads);
 
     void add(CpuTask* task);
+    void addNext(CpuTask* task);
     void add(GpuTask* task);
     void wait();
 
diff --git a/include/utils/SkThreadPool.h b/include/utils/SkThreadPool.h
index a75bed8..295b1b4 100644
--- a/include/utils/SkThreadPool.h
+++ b/include/utils/SkThreadPool.h
@@ -50,6 +50,11 @@
     void add(SkTRunnable<T>*);
 
     /**
+     * Same as add, but adds the runnable as the very next to run rather than enqueueing it.
+     */
+    void addNext(SkTRunnable<T>*);
+
+    /**
      * Block until all added SkRunnables have completed.  Once called, calling add() is undefined.
      */
     void wait();
@@ -66,6 +71,9 @@
         kHalting_State,  // There's no work to do and no thread is busy.  All threads can shut down.
     };
 
+    void addSomewhere(SkTRunnable<T>* r,
+                      void (SkTInternalLList<LinkedRunnable>::*)(LinkedRunnable*));
+
     SkTInternalLList<LinkedRunnable> fQueue;
     SkCondVar                        fReady;
     SkTDArray<SkThread*>             fThreads;
@@ -111,7 +119,8 @@
 }  // namespace SkThreadPoolPrivate
 
 template <typename T>
-void SkTThreadPool<T>::add(SkTRunnable<T>* r) {
+void SkTThreadPool<T>::addSomewhere(SkTRunnable<T>* r,
+                                    void (SkTInternalLList<LinkedRunnable>::* f)(LinkedRunnable*)) {
     if (r == NULL) {
         return;
     }
@@ -126,11 +135,21 @@
     linkedRunnable->fRunnable = r;
     fReady.lock();
     SkASSERT(fState != kHalting_State);  // Shouldn't be able to add work when we're halting.
-    fQueue.addToHead(linkedRunnable);
+    (fQueue.*f)(linkedRunnable);
     fReady.signal();
     fReady.unlock();
 }
 
+template <typename T>
+void SkTThreadPool<T>::add(SkTRunnable<T>* r) {
+    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToTail);
+}
+
+template <typename T>
+void SkTThreadPool<T>::addNext(SkTRunnable<T>* r) {
+    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToHead);
+}
+
 
 template <typename T>
 void SkTThreadPool<T>::wait() {
@@ -174,7 +193,7 @@
         // We've got the lock back here, no matter if we ran wait or not.
 
         // The queue is not empty, so we have something to run.  Claim it.
-        LinkedRunnable* r = pool->fQueue.tail();
+        LinkedRunnable* r = pool->fQueue.head();
 
         pool->fQueue.remove(r);