Add init-once to threaded backend

To keep this CL simple, DAA is not yet enabled for init-once. Init-once
is currently enabled only for drawPath, where it simply generates the
device path during the init-once phase.

Bug: skia:
Change-Id: Ie9a9ef9fc453acbdeb48b06b93d578c626961e3f
Reviewed-on: https://skia-review.googlesource.com/87784
Commit-Queue: Yuqian Li <liyuqian@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/BUILD.gn b/BUILD.gn
index ea71c8a..d45f692 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -1490,13 +1490,7 @@
     test_lib("samples") {
       public_include_dirs = [ "samplecode" ]
       include_dirs = [ "experimental" ]
-      sources = samples_sources + [
-                  # Relocating these files here, so that clients don't try to build them while they're
-                  # still in active development. Clang's thread safety analysis gets tripped up by
-                  # conditional locks.
-                  "src/core/SkThreadedBMPDevice.cpp",
-                  "src/core/SkThreadedBMPDevice.h",
-                ]
+      sources = samples_sources
       deps = [
         ":experimental_sksg",
         ":experimental_svg_model",
diff --git a/gn/core.gni b/gn/core.gni
index 81218b8..72e12c2 100644
--- a/gn/core.gni
+++ b/gn/core.gni
@@ -313,6 +313,10 @@
   "$_src/core/SkTextToPathIter.h",
   "$_src/core/SkTime.cpp",
   "$_src/core/SkTDPQueue.h",
+
+  # Clang's false-positive thread-safety warnings have been resolved in SkTaskGroup2D.cpp
+  "$_src/core/SkThreadedBMPDevice.cpp",
+  "$_src/core/SkThreadedBMPDevice.h",
   "$_src/core/SkThreadID.cpp",
   "$_src/core/SkTLList.h",
   "$_src/core/SkTLS.cpp",
diff --git a/src/core/SkBlitter.cpp b/src/core/SkBlitter.cpp
index f4de215..7a3e904 100644
--- a/src/core/SkBlitter.cpp
+++ b/src/core/SkBlitter.cpp
@@ -80,11 +80,13 @@
 }
 
 void SkBlitter::blitCoverageDeltas(SkCoverageDeltaList* deltas, const SkIRect& clip,
-                                   bool isEvenOdd, bool isInverse, bool isConvex) {
-    int         runSize = clip.width() + 1; // +1 so we can set runs[clip.width()] = 0
-    void*       storage = this->allocBlitMemory(runSize * (sizeof(int16_t) + sizeof(SkAlpha)));
-    int16_t*    runs    = reinterpret_cast<int16_t*>(storage);
-    SkAlpha*    alphas  = reinterpret_cast<SkAlpha*>(runs + runSize);
+                                   bool isEvenOdd, bool isInverse, bool isConvex,
+                                   SkArenaAlloc* alloc) {
+    // We cannot use blitter to allocate the storage because the same blitter might be used across
+    // many threads.
+    int      runSize    = clip.width() + 1; // +1 so we can set runs[clip.width()] = 0
+    int16_t* runs       = alloc->makeArrayDefault<int16_t>(runSize);
+    SkAlpha* alphas     = alloc->makeArrayDefault<SkAlpha>(runSize);
     runs[clip.width()]  = 0; // we must set the last run to 0 so blitAntiH can stop there
 
     bool canUseMask = !deltas->forceRLE() &&
diff --git a/src/core/SkBlitter.h b/src/core/SkBlitter.h
index c280ac3..357eb46 100644
--- a/src/core/SkBlitter.h
+++ b/src/core/SkBlitter.h
@@ -36,7 +36,8 @@
     // For example, one may avoid some virtual blitAntiH calls by directly calling
     // SkBlitRow::Color32.
     virtual void blitCoverageDeltas(SkCoverageDeltaList* deltas, const SkIRect& clip,
-                                    bool isEvenOdd, bool isInverse, bool isConvex);
+                                    bool isEvenOdd, bool isInverse, bool isConvex,
+                                    SkArenaAlloc* alloc);
 
     /// Blit a horizontal run of one or more pixels.
     virtual void blitH(int x, int y, int width) = 0;
diff --git a/src/core/SkDraw.cpp b/src/core/SkDraw.cpp
index d2d2307..478617f 100644
--- a/src/core/SkDraw.cpp
+++ b/src/core/SkDraw.cpp
@@ -34,6 +34,7 @@
 #include "SkStrokeRec.h"
 #include "SkTemplates.h"
 #include "SkTextMapStateProc.h"
+#include "SkThreadedBMPDevice.h"
 #include "SkTLazy.h"
 #include "SkUtils.h"
 
@@ -951,12 +952,18 @@
 }
 
 void SkDraw::drawDevPath(const SkPath& devPath, const SkPaint& paint, bool drawCoverage,
-                         SkBlitter* customBlitter, bool doFill) const {
+                         SkBlitter* customBlitter, bool doFill, SkInitOnceData* iData) const {
     SkBlitter* blitter = nullptr;
     SkAutoBlitterChoose blitterStorage;
+    SkAutoBlitterChoose* blitterStoragePtr = &blitterStorage;
+    if (iData) {
+        // we're in the threaded init-once phase; the blitter has to be allocated in the thread
+        // allocator so it will remain valid later during the draw phase.
+        blitterStoragePtr = iData->fAlloc->make<SkAutoBlitterChoose>();
+    }
     if (nullptr == customBlitter) {
-        blitterStorage.choose(fDst, *fMatrix, paint, drawCoverage);
-        blitter = blitterStorage.get();
+        blitterStoragePtr->choose(fDst, *fMatrix, paint, drawCoverage);
+        blitter = blitterStoragePtr->get();
     } else {
         blitter = customBlitter;
     }
@@ -1009,12 +1016,31 @@
             }
         }
     }
-    proc(devPath, *fRC, blitter);
+
+    if (iData == nullptr) {
+        proc(devPath, *fRC, blitter); // proceed directly if we're not in threaded init-once
+    } else if (true || !doFill || !paint.isAntiAlias()) {
+        // TODO remove true in the if statement above so we can proceed to DAA.
+
+        // We're in threaded init-once but we can't use DAA. Hence we'll stop here and hand all the
+        // remaining work to draw phase. This is a simple example of how to add init-once to
+        // existing drawXXX commands: simply send in SkInitOnceData, do as much init work as
+        // possible, and finally wrap the remaining work into iData->fElement->fDrawFn.
+        iData->fElement->setDrawFn([proc, devPath, blitter](SkArenaAlloc* alloc,
+                const SkThreadedBMPDevice::DrawState& ds, const SkIRect& tileBounds) {
+            SkThreadedBMPDevice::TileDraw tileDraw(ds, tileBounds);
+            proc(devPath, *tileDraw.fRC, blitter);
+        });
+    } else {
+        // We can use DAA to do scan conversion in the init-once phase.
+        // TODO To be implemented
+    }
 }
 
 void SkDraw::drawPath(const SkPath& origSrcPath, const SkPaint& origPaint,
                       const SkMatrix* prePathMatrix, bool pathIsMutable,
-                      bool drawCoverage, SkBlitter* customBlitter) const {
+                      bool drawCoverage, SkBlitter* customBlitter,
+                      SkInitOnceData* iData) const {
     SkDEBUGCODE(this->validate();)
 
     // nothing to draw
@@ -1024,17 +1050,21 @@
 
     SkPath*         pathPtr = (SkPath*)&origSrcPath;
     bool            doFill = true;
-    SkPath          tmpPath;
+    SkPath          tmpPathStorage;
+    SkPath*         tmpPath = &tmpPathStorage;
     SkMatrix        tmpMatrix;
     const SkMatrix* matrix = fMatrix;
-    tmpPath.setIsVolatile(true);
+    if (iData) {
+        tmpPath = iData->fAlloc->make<SkPath>();
+    }
+    tmpPath->setIsVolatile(true);
 
     if (prePathMatrix) {
         if (origPaint.getPathEffect() || origPaint.getStyle() != SkPaint::kFill_Style) {
             SkPath* result = pathPtr;
 
             if (!pathIsMutable) {
-                result = &tmpPath;
+                result = tmpPath;
                 pathIsMutable = true;
             }
             pathPtr->transform(*prePathMatrix, result);
@@ -1079,18 +1109,18 @@
         if (this->computeConservativeLocalClipBounds(&cullRect)) {
             cullRectPtr = &cullRect;
         }
-        doFill = paint->getFillPath(*pathPtr, &tmpPath, cullRectPtr,
+        doFill = paint->getFillPath(*pathPtr, tmpPath, cullRectPtr,
                                     ComputeResScaleForStroking(*fMatrix));
-        pathPtr = &tmpPath;
+        pathPtr = tmpPath;
     }
 
     // avoid possibly allocating a new path in transform if we can
-    SkPath* devPathPtr = pathIsMutable ? pathPtr : &tmpPath;
+    SkPath* devPathPtr = pathIsMutable ? pathPtr : tmpPath;
 
     // transform the path into device space
     pathPtr->transform(*matrix, devPathPtr);
 
-    this->drawDevPath(*devPathPtr, *paint, drawCoverage, customBlitter, doFill);
+    this->drawDevPath(*devPathPtr, *paint, drawCoverage, customBlitter, doFill, iData);
 }
 
 void SkDraw::drawBitmapAsMask(const SkBitmap& bitmap, const SkPaint& paint) const {
diff --git a/src/core/SkDraw.h b/src/core/SkDraw.h
index e4b2285..8598475 100644
--- a/src/core/SkDraw.h
+++ b/src/core/SkDraw.h
@@ -29,6 +29,7 @@
 struct SkDrawProcs;
 struct SkRect;
 class SkRRect;
+struct SkInitOnceData;
 
 class SkDraw {
 public:
@@ -127,11 +128,11 @@
 
     void    drawPath(const SkPath&, const SkPaint&, const SkMatrix* preMatrix,
                      bool pathIsMutable, bool drawCoverage,
-                     SkBlitter* customBlitter = nullptr) const;
+                     SkBlitter* customBlitter = nullptr, SkInitOnceData* iData = nullptr) const;
 
     void drawLine(const SkPoint[2], const SkPaint&) const;
     void drawDevPath(const SkPath& devPath, const SkPaint& paint, bool drawCoverage,
-                     SkBlitter* customBlitter, bool doFill) const;
+                     SkBlitter* customBlitter, bool doFill, SkInitOnceData* iData = nullptr) const;
     /**
      *  Return the current clip bounds, in local coordinates, with slop to account
      *  for antialiasing or hairlines (i.e. device-bounds outset by 1, and then
@@ -156,6 +157,8 @@
 #else
     void validate() const {}
 #endif
+
+    friend class SkThreadedBMPDevice; // to access private method drawPath
 };
 
 #endif
diff --git a/src/core/SkScan_DAAPath.cpp b/src/core/SkScan_DAAPath.cpp
index 948acce..dca2bd2 100644
--- a/src/core/SkScan_DAAPath.cpp
+++ b/src/core/SkScan_DAAPath.cpp
@@ -352,6 +352,6 @@
     } else {
         SkCoverageDeltaList deltaList(&alloc, clippedIR.fTop, clippedIR.fBottom, forceRLE);
         gen_alpha_deltas(path, clipBounds, deltaList, blitter, skipRect, containedInClip);
-        blitter->blitCoverageDeltas(&deltaList, clipBounds, isEvenOdd, isInverse, isConvex);
+        blitter->blitCoverageDeltas(&deltaList, clipBounds, isEvenOdd, isInverse, isConvex, &alloc);
     }
 }
diff --git a/src/core/SkTaskGroup2D.cpp b/src/core/SkTaskGroup2D.cpp
index fe8a5a8..4060527 100644
--- a/src/core/SkTaskGroup2D.cpp
+++ b/src/core/SkTaskGroup2D.cpp
@@ -24,39 +24,35 @@
 }
 
 void SkSpinningTaskGroup2D::work(int threadId) {
-    int& nextColumn = fRowData[threadId].fNextColumn;
+    int workCol = 0;
+    int initCol = 0;
 
     while (true) {
-        SkASSERT(nextColumn <= fWidth);
-        if (this->isFinishing() && nextColumn >= fWidth) {
+        SkASSERT(workCol <= fWidth);
+        if (this->isFinishing() && workCol >= fWidth) {
             return;
         }
 
-        if (nextColumn < fWidth) {
-            fWork(threadId, nextColumn);
-            nextColumn++;
+        // Note that row = threadId
+        if (workCol < fWidth && fKernel->work2D(threadId, workCol, threadId)) {
+            workCol++;
+        } else {
+            // Initialize something if we can't work
+            this->initAnUninitializedColumn(initCol, threadId);
         }
     }
 }
 
-SkFlexibleTaskGroup2D::SkFlexibleTaskGroup2D(Work2D&& w, int h, SkExecutor* x, int t)
-        : SkTaskGroup2D(std::move(w), h, x, t), fRowData(h), fThreadData(t) {
-    for (int i = 0; i < t; ++i) {
-        fThreadData[i].fRowIndex = i;
-    }
-}
-
-
 void SkFlexibleTaskGroup2D::work(int threadId) {
-    int failCnt = 0;
-    int& rowIndex = fThreadData[threadId].fRowIndex;
+    int row = threadId;
+    int initCol = 0;
+    int numRowsCompleted = 0;
+    std::vector<bool> completedRows(fHeight, false);
 
-    // This loop looks for work to do as long as
-    // either 1. isFinishing is false
-    // or     2. isFinishing is true but some rows still have unfinished tasks
-    while (true) {
-        RowData& rowData = fRowData[rowIndex];
-        bool processed = false;
+    // Only keep fHeight - numRowsCompleted number of threads looping. When rows are about to
+    // complete, this strategy keeps the contention low.
+    while (threadId >= numRowsCompleted) {
+        RowData& rowData = fRowData[row];
 
         // The Android roller somehow gets a false-positive compile warning/error about the try-lock
         // and unlock process. Hence we disable -Wthread-safety-analysis to bypass it.
@@ -65,15 +61,16 @@
 #pragma clang diagnostic ignored "-Wthread-safety-analysis"
 #endif
         if (rowData.fMutex.try_lock()) {
-            if (rowData.fNextColumn < fWidth) {
-                fWork(rowIndex, rowData.fNextColumn);
+            while (rowData.fNextColumn < fWidth &&
+                    fKernel->work2D(row, rowData.fNextColumn, threadId)) {
                 rowData.fNextColumn++;
-                processed = true;
-            } else {
-                // isFinishing can never go from true to false. Once it's true, we count how many
-                // times that a row is out of work. If that count reaches fHeight, then we're out of
-                // work for the whole group.
-                failCnt += this->isFinishing();
+            }
+            // isFinishing can never go from true to false. Once it's true, we count how many rows
+            // are completed (out of work). If that count reaches fHeight, then we're out of work
+            // for the whole group and we can stop.
+            if (rowData.fNextColumn == fWidth && this->isFinishing()) {
+                numRowsCompleted += (completedRows[row] == false);
+                completedRows[row] = true; // so we won't count this row twice
             }
             rowData.fMutex.unlock();
         }
@@ -81,11 +78,9 @@
 #pragma clang diagnostic pop
 #endif
 
-        if (!processed) {
-            if (failCnt >= fHeight) {
-                return;
-            }
-            rowIndex = (rowIndex + 1) % fHeight;
-        }
+        // By reaching here, we're either unable to acquire the row, or out of work, or blocked by
+        // initialization
+        row = (row + 1) % fHeight; // Move to the next row
+        this->initAnUninitializedColumn(initCol, threadId); // Initialize something
     }
 }
diff --git a/src/core/SkTaskGroup2D.h b/src/core/SkTaskGroup2D.h
index b55b96a..851db61 100644
--- a/src/core/SkTaskGroup2D.h
+++ b/src/core/SkTaskGroup2D.h
@@ -13,22 +13,43 @@
 #include <mutex>
 #include <vector>
 
-// A 2D grid (height rows x width columns) of tasks.
-//
-// The task on row i and column j is abstracted as Work2D(i, j). We guarantee that the task on the
-// same row will be executed in order (i.e., Work2D(1, 1) is guaranteed to finish before calling
-// Work2D(1, 2)). Tasks in different rows can happen in any order.
+// The interface for doing work on a 2D grid with possible initialization on columns.
+class SkWorkKernel2D {
+public:
+    // Return false iff the column needs initialization and such initialization is not finished yet.
+    virtual bool work2D(int row, int column, int thread) = 0;
+
+    // Return false if no initialization is done for this column (e.g., it's already initialized; or
+    // maybe some other thread is initializing the column).
+    virtual bool initColumn(int column, int thread) = 0;
+
+    virtual ~SkWorkKernel2D() {}
+};
+
+// A 2D grid (height rows x width columns) of tasks to be executed on a given executor with
+// threadCnt number of threads.
 //
 // The height (number of rows) is fixed. The width (number of columns) may be dynamically expanded.
 //
-// The tasks will eventually be executed on the executor with threadCnt number of hardware threads.
+// The task on row i and column j is abstracted as work2D(i, j, t). Parameter t is the thread id and
+// it shouldn't affect the work to be done. It's only used to allow some variables that are not
+// thread safe and should be used exclusively by one thread (e.g., thread allocators). We guarantee
+// that the task on the same row will be executed in order (i.e., work2D(1, 1, t) is guaranteed to
+// finish before calling work2D(1, 2, t)). Tasks in different rows can happen in any order.
+//
+// There are also width number of init calls, one per column. work2D(i, j, t) may return false if
+// column j requires initialization but it's not initialized yet. In that case, a thread t needs to
+// call initColumn(j, t) once to unblock all rows that depend on the initialization of column j.
+// (Again, t shouldn't affect the init work to be done; it's just for some non-thread-safe
+// variables). The init calls have no order requirement so we can call them in any order.
+//
+// Multiple threads may try to init the same column j at the same time. InitFn is expected to handle
+// this gracefully (e.g., let only one thread do the init and return immediately for other threads).
 class SkTaskGroup2D {
 public:
-    using Work2D = std::function<void(int, int)>;
-
-    SkTaskGroup2D(Work2D&& work, int height, SkExecutor* executor, int threadCnt)
-            : fWork(work), fHeight(height), fThreadCnt(threadCnt), fIsFinishing(false), fWidth(0)
-            , fThreadsGroup(new SkTaskGroup(*executor)) {}
+    SkTaskGroup2D(SkWorkKernel2D* kernel, int height, SkExecutor* executor, int threadCnt)
+            : fKernel(kernel), fHeight(height), fThreadCnt(threadCnt), fIsFinishing(false)
+            , fWidth(0), fThreadsGroup(new SkTaskGroup(*executor)) {}
 
     virtual ~SkTaskGroup2D() {}
 
@@ -47,9 +68,19 @@
     // Finish all tasks on the threadId and then return.
     virtual void work(int threadId) = 0;
 
-    Work2D      fWork; // fWork(i, j) is the task to be done on row i and column j
-    const int   fHeight;
-    const int   fThreadCnt;
+    // Initialize a column that needs to be initialized. The parameter initCol is not thread safe
+    // and should only be exclusively accessed by the working thread which will modify it to the
+    // column that may need to be initialized next.
+    void initAnUninitializedColumn(int& initCol, int threadId) {
+        bool didSomeInit = false;
+        while (initCol < fWidth && !didSomeInit) {
+            didSomeInit = fKernel->initColumn(initCol++, threadId);
+        }
+    }
+
+    SkWorkKernel2D*     fKernel;
+    const int           fHeight;
+    const int           fThreadCnt;
 
     std::atomic<bool>   fIsFinishing;
     std::atomic<int>    fWidth;
@@ -60,28 +91,19 @@
 // A simple spinning task group that assumes height equals threadCnt.
 class SkSpinningTaskGroup2D final : public SkTaskGroup2D {
 public:
-    SkSpinningTaskGroup2D(Work2D&& w, int h, SkExecutor* x, int t)
-            : SkTaskGroup2D(std::move(w), h, x, t), fRowData(h) {
+    SkSpinningTaskGroup2D(SkWorkKernel2D* kernel, int h, SkExecutor* x, int t)
+            : SkTaskGroup2D(kernel, h, x, t) {
         SkASSERT(h == t); // height must be equal to threadCnt
     }
 
 protected:
     void work(int threadId) override;
-
-private:
-    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
-    struct alignas(MAX_CACHE_LINE) RowData {
-        RowData() : fNextColumn(0) {}
-
-        int fNextColumn; // next column index to be executed
-    };
-
-    std::vector<RowData>  fRowData;
 };
 
 class SkFlexibleTaskGroup2D final : public SkTaskGroup2D {
 public:
-    SkFlexibleTaskGroup2D(Work2D&&, int, SkExecutor*, int);
+    SkFlexibleTaskGroup2D(SkWorkKernel2D* kernel, int h, SkExecutor* x, int t)
+            : SkTaskGroup2D(kernel, h, x, t), fRowData(h) {}
 
 protected:
     void work(int threadId) override;
@@ -91,18 +113,11 @@
     struct alignas(MAX_CACHE_LINE) RowData {
         RowData() : fNextColumn(0) {}
 
-        int         fNextColumn; // next column index to be executed
+        int         fNextColumn; // next column index to work
         std::mutex  fMutex;      // the mutex for the thread to acquire
     };
 
-    struct alignas(MAX_CACHE_LINE) ThreadData {
-        ThreadData() : fRowIndex(0) {}
-
-        int fRowIndex; // the row that the current thread is working on
-    };
-
     std::vector<RowData>    fRowData;
-    std::vector<ThreadData> fThreadData;
 };
 
 #endif//SkTaskGroup2D_DEFINED
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp
index a61e9b2..ad3814c 100644
--- a/src/core/SkThreadedBMPDevice.cpp
+++ b/src/core/SkThreadedBMPDevice.cpp
@@ -12,23 +12,30 @@
 #include "SkTaskGroup.h"
 #include "SkVertices.h"
 
+// Calling initColumn(j, k) would initialize the j-th element on k-th thread. It returns false if
+// it's already initialized.
+bool SkThreadedBMPDevice::DrawQueue::initColumn(int column, int thread) {
+    return fElements[column].tryInitOnce(&fThreadAllocs[thread]);
+}
+
+// Calling work2D(i, j, k) would draw the j-th element on the i-th tile on the k-th thread. If the
+// element still needs to be initialized, tryDraw will return false without drawing.
+bool SkThreadedBMPDevice::DrawQueue::work2D(int row, int column, int thread) {
+    return fElements[column].tryDraw(fDevice->fTileBounds[row], &fThreadAllocs[thread]);
+}
+
 void SkThreadedBMPDevice::DrawQueue::reset() {
     if (fTasks) {
         fTasks->finish();
     }
 
+    fThreadAllocs.reset(fDevice->fThreadCnt);
     fSize = 0;
 
     // using TaskGroup2D = SkSpinningTaskGroup2D;
     using TaskGroup2D = SkFlexibleTaskGroup2D;
-    auto draw2D = [this](int row, int column){
-        SkThreadedBMPDevice::DrawElement& element = fElements[column];
-        if (!SkIRect::Intersects(fDevice->fTileBounds[row], element.fDrawBounds)) {
-            return;
-        }
-        element.fDrawFn(nullptr, element.fDS, fDevice->fTileBounds[row]);
-    };
-    fTasks.reset(new TaskGroup2D(draw2D, fDevice->fTileCnt, fDevice->fExecutor,
+
+    fTasks.reset(new TaskGroup2D(this, fDevice->fTileCnt, fDevice->fExecutor,
                                  fDevice->fThreadCnt));
     fTasks->start();
 }
@@ -149,9 +156,16 @@
         const SkMatrix* prePathMatrix, bool pathIsMutable) {
     SkRect drawBounds = path.isInverseFillType() ? SkRectPriv::MakeLargest()
                                                  : get_fast_bounds(path.getBounds(), paint);
-    fQueue.push(drawBounds, [=](SkArenaAlloc*, const DrawState& ds, const SkIRect& tileBounds) {
-        TileDraw(ds, tileBounds).drawPath(path, paint, prePathMatrix, false);
-    });
+    if (path.countVerbs() < 100) { // when path is small, init-once has too much overhead
+        fQueue.push(drawBounds, [=](SkArenaAlloc*, const DrawState& ds, const SkIRect& tileBounds) {
+            TileDraw(ds, tileBounds).drawPath(path, paint, prePathMatrix, false);
+        });
+    } else {
+        fQueue.push(drawBounds, [=](SkArenaAlloc* alloc, DrawElement* elem) {
+            SkInitOnceData data = {alloc, elem};
+            elem->getDraw().drawPath(path, paint, prePathMatrix, false, false, nullptr, &data);
+        });
+    }
 }
 
 void SkThreadedBMPDevice::drawBitmap(const SkBitmap& bitmap, SkScalar x, SkScalar y,
diff --git a/src/core/SkThreadedBMPDevice.h b/src/core/SkThreadedBMPDevice.h
index 143657f..a33715e 100644
--- a/src/core/SkThreadedBMPDevice.h
+++ b/src/core/SkThreadedBMPDevice.h
@@ -43,6 +43,7 @@
     void flush() override;
 
 private:
+    // We store DrawState inside DrawElement because initFn and drawFn both want to use it
     struct DrawState {
         SkPixmap fDst;
         SkMatrix fMatrix;
@@ -59,16 +60,59 @@
         private: SkRasterClip fTileRC;
     };
 
-    struct DrawElement {
+    class DrawElement {
+    public:
+        using InitFn = std::function<void(SkArenaAlloc* threadAlloc, DrawElement* element)>;
         using DrawFn = std::function<void(SkArenaAlloc* threadAlloc, const DrawState& ds,
                                           const SkIRect& tileBounds)>;
 
-        DrawFn      fDrawFn;
-        DrawState   fDS;
-        SkIRect     fDrawBounds;
+        DrawElement() {}
+        DrawElement(SkThreadedBMPDevice* device, DrawFn&& drawFn, const SkRect& rawDrawBounds)
+                : fInitialized(true)
+                , fDrawFn(std::move(drawFn))
+                , fDS(device)
+                , fDrawBounds(device->transformDrawBounds(rawDrawBounds)) {}
+        DrawElement(SkThreadedBMPDevice* device, InitFn&& initFn, const SkRect& rawDrawBounds)
+                : fInitialized(false)
+                , fInitFn(std::move(initFn))
+                , fDS(device)
+                , fDrawBounds(device->transformDrawBounds(rawDrawBounds)) {}
+
+        SK_ALWAYS_INLINE bool tryInitOnce(SkArenaAlloc* alloc) {
+            if (fInitialized) {
+                return false;
+            }
+            std::call_once(fNeedInit, [this, alloc]{
+                fInitFn(alloc, this);
+                fInitialized = true;
+            });
+            return true;
+        }
+
+        SK_ALWAYS_INLINE bool tryDraw(const SkIRect& tileBounds, SkArenaAlloc* alloc) {
+            if (!SkIRect::Intersects(tileBounds, fDrawBounds)) {
+                return true;
+            }
+            if (fInitialized) {
+                fDrawFn(alloc, fDS, tileBounds);
+                return true;
+            }
+            return false;
+        }
+
+        SkDraw getDraw() const { return fDS.getDraw(); }
+        void setDrawFn(DrawFn&& fn) { fDrawFn = std::move(fn); }
+
+    private:
+        std::atomic<bool>   fInitialized;
+        std::once_flag      fNeedInit;
+        InitFn              fInitFn;
+        DrawFn              fDrawFn;
+        DrawState           fDS;
+        SkIRect             fDrawBounds;
     };
 
-    class DrawQueue {
+    class DrawQueue : public SkWorkKernel2D {
     public:
         static constexpr int MAX_QUEUE_SIZE = 100000;
 
@@ -79,25 +123,29 @@
         // will start new tasks.
         void finish() { fTasks->finish(); }
 
-        SK_ALWAYS_INLINE void push(const SkRect& rawDrawBounds,
-                                   DrawElement::DrawFn&& drawFn) {
+        // Push a draw command into the queue. If Fn is DrawFn, we're pushing an element without
+        // the need of initialization. If Fn is InitFn, we're pushing an element with init-once
+        // and the InitFn will generate the DrawFn during initialization.
+        template<typename Fn>
+        SK_ALWAYS_INLINE void push(const SkRect& rawDrawBounds, Fn&& fn) {
             if (fSize == MAX_QUEUE_SIZE) {
                 this->reset();
             }
             SkASSERT(fSize < MAX_QUEUE_SIZE);
-
-            DrawElement* element = &fElements[fSize++];
-            element->fDS = DrawState(fDevice);
-            element->fDrawFn = std::move(drawFn);
-            element->fDrawBounds = fDevice->transformDrawBounds(rawDrawBounds);
+            new (&fElements[fSize++]) DrawElement(fDevice, std::move(fn), rawDrawBounds);
             fTasks->addColumn();
         }
 
+        // SkWorkKernel2D
+        bool initColumn(int column, int thread) override;
+        bool work2D(int row, int column, int thread) override;
+
     private:
-        SkThreadedBMPDevice*            fDevice;
-        std::unique_ptr<SkTaskGroup2D>  fTasks;
-        DrawElement                     fElements[MAX_QUEUE_SIZE];
-        int                             fSize;
+        SkThreadedBMPDevice*                fDevice;
+        std::unique_ptr<SkTaskGroup2D>      fTasks;
+        SkTArray<SkSTArenaAlloc<8 << 10>>   fThreadAllocs; // 8k stack size
+        DrawElement                         fElements[MAX_QUEUE_SIZE];
+        int                                 fSize;
     };
 
     SkIRect transformDrawBounds(const SkRect& drawBounds) const;
@@ -117,7 +165,17 @@
 
     DrawQueue fQueue;
 
+    friend struct SkInitOnceData;   // to access DrawElement
+    friend class SkDraw;            // to access DrawState
+
     typedef SkBitmapDevice INHERITED;
 };
 
+// Passed to SkDraw::drawXXX to enable threaded draw with init-once. The goal is to reuse as much
+// code as possible from SkDraw. (See SkDraw::drawPath and SkDraw::drawDevPath for an example.)
+struct SkInitOnceData {
+    SkArenaAlloc* fAlloc;
+    SkThreadedBMPDevice::DrawElement* fElement;
+};
+
 #endif // SkThreadedBMPDevice_DEFINED