Add SkRasterPipeline::compile(), use it in the blitter

I expanded an existing bench to show off the difference:

    SkRasterPipeline_…
    300  …compile 1x  …run 1.14x

Change-Id: I5d63d602cda3f78d2d0891fcc85baf5514632900
Reviewed-on: https://skia-review.googlesource.com/17458
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 2848218..bef80b3 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -66,11 +66,13 @@
 DEF_BENCH( return (new SkRasterPipelineBench< true>); )
 DEF_BENCH( return (new SkRasterPipelineBench<false>); )
 
-class SkRasterPipelineLegacyBench : public Benchmark {
+class SkRasterPipelineCompileVsRunBench : public Benchmark {
 public:
+    explicit SkRasterPipelineCompileVsRunBench(bool compile) : fCompile(compile) {}
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
     const char* onGetName() override {
-        return "SkRasterPipeline_legacy";
+        return fCompile ? "SkRasterPipeline_compile"
+                        : "SkRasterPipeline_run";
     }
 
     void onDraw(int loops, SkCanvas*) override {
@@ -84,12 +86,24 @@
         p.append(SkRasterPipeline::srcover);
         p.append(SkRasterPipeline::store_8888, &dst_ctx);
 
-        while (loops --> 0) {
-            p.run(0,N);
+        if (fCompile) {
+            char buffer[1024];
+            SkArenaAlloc alloc(buffer);
+            auto fn = p.compile(&alloc);
+            while (loops --> 0) {
+                fn(0,N);
+            }
+        } else {
+            while (loops --> 0) {
+                p.run(0,N);
+            }
         }
     }
+private:
+    bool fCompile;
 };
-DEF_BENCH( return (new SkRasterPipelineLegacyBench); )
+DEF_BENCH( return (new SkRasterPipelineCompileVsRunBench(true )); )
+DEF_BENCH( return (new SkRasterPipelineCompileVsRunBench(false)); )
 
 static SkColorSpaceTransferFn gamma(float g) {
     SkColorSpaceTransferFn fn = {0,0,0,0,0,0,0};
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 3b2d617..6287d62 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -8,10 +8,12 @@
 #ifndef SkRasterPipeline_DEFINED
 #define SkRasterPipeline_DEFINED
 
+#include "SkArenaAlloc.h"
 #include "SkImageInfo.h"
 #include "SkNx.h"
 #include "SkTArray.h"
 #include "SkTypes.h"
+#include <functional>
 #include <vector>
 
 /**
@@ -123,6 +125,9 @@
     // Runs the pipeline walking x through [x,x+n).
     void run(size_t x, size_t n) const;
 
+    // Allocates a thunk in alloc that amortizes run() setup cost; don't call it after alloc dies.
+    std::function<void(size_t, size_t)> compile(SkArenaAlloc*) const;
+
     void dump() const;
 
     struct Stage {
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 6397cae..026b92b 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -30,9 +30,10 @@
                              const SkRasterPipeline& shaderPipeline,
                              bool is_opaque, bool is_constant, bool wants_dither);
 
-    SkRasterPipelineBlitter(SkPixmap dst, SkBlendMode blend)
+    SkRasterPipelineBlitter(SkPixmap dst, SkBlendMode blend, SkArenaAlloc* alloc)
         : fDst(dst)
         , fBlend(blend)
+        , fAlloc(alloc)
     {}
 
     void blitH    (int x, int y, int w)                            override;
@@ -51,6 +52,7 @@
 
     SkPixmap         fDst;
     SkBlendMode      fBlend;
+    SkArenaAlloc*    fAlloc;
     SkRasterPipeline fColorPipeline;
 
     // We may be able to specialize blitH() into a memset.
@@ -58,10 +60,10 @@
     uint64_t fMemsetColor      = 0;     // Big enough for largest dst format, F16.
 
     // Built lazily on first use.
-    SkRasterPipeline fBlitH,
-                     fBlitAntiH,
-                     fBlitMaskA8,
-                     fBlitMaskLCD16;
+    std::function<void(size_t, size_t)> fBlitH,
+                                        fBlitAntiH,
+                                        fBlitMaskA8,
+                                        fBlitMaskLCD16;
 
     // These values are pointed to by the blit pipelines above,
     // which allows us to adjust them from call to call.
@@ -131,7 +133,7 @@
                                            bool is_opaque,
                                            bool is_constant,
                                            bool wants_dither) {
-    auto blitter = alloc->make<SkRasterPipelineBlitter>(dst, paint.getBlendMode());
+    auto blitter = alloc->make<SkRasterPipelineBlitter>(dst, paint.getBlendMode(), alloc);
 
     // Our job in this factory is to fill out the blitter's color pipeline.
     // This is the common front of the full blit pipelines, each constructed lazily on first use.
@@ -265,8 +267,8 @@
         }
     }
 
-    auto& p = fBlitH;
-    if (p.empty()) {
+    if (!fBlitH) {
+        SkRasterPipeline p;
         p.extend(fColorPipeline);
         if (fBlend != SkBlendMode::kSrc) {
             this->append_load_d(&p);
@@ -274,13 +276,14 @@
             this->maybe_clamp(&p);
         }
         this->append_store(&p);
+        fBlitH = p.compile(fAlloc);
     }
-    p.run(x,w);
+    fBlitH(x,w);
 }
 
 void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const int16_t runs[]) {
-    auto& p = fBlitAntiH;
-    if (p.empty()) {
+    if (!fBlitAntiH) {
+        SkRasterPipeline p;
         p.extend(fColorPipeline);
         if (fBlend == SkBlendMode::kSrcOver) {
             p.append(SkRasterPipeline::scale_1_float, &fCurrentCoverage);
@@ -293,6 +296,7 @@
         }
         this->maybe_clamp(&p);
         this->append_store(&p);
+        fBlitAntiH = p.compile(fAlloc);
     }
 
     fDstPtr = fDst.writable_addr(0,y);
@@ -303,7 +307,7 @@
             case 0xff: this->blitH(x,y,run); break;
             default:
                 fCurrentCoverage = *aa * (1/255.0f);
-                p.run(x,run);
+                fBlitAntiH(x,run);
         }
         x    += run;
         runs += run;
@@ -317,8 +321,8 @@
         return INHERITED::blitMask(mask, clip);
     }
 
-    if (mask.fFormat == SkMask::kA8_Format && fBlitMaskA8.empty()) {
-        auto& p = fBlitMaskA8;
+    if (mask.fFormat == SkMask::kA8_Format && !fBlitMaskA8) {
+        SkRasterPipeline p;
         p.extend(fColorPipeline);
         if (fBlend == SkBlendMode::kSrcOver) {
             p.append(SkRasterPipeline::scale_u8, &fMaskPtr);
@@ -331,16 +335,18 @@
         }
         this->maybe_clamp(&p);
         this->append_store(&p);
+        fBlitMaskA8 = p.compile(fAlloc);
     }
 
-    if (mask.fFormat == SkMask::kLCD16_Format && fBlitMaskLCD16.empty()) {
-        auto& p = fBlitMaskLCD16;
+    if (mask.fFormat == SkMask::kLCD16_Format && !fBlitMaskLCD16) {
+        SkRasterPipeline p;
         p.extend(fColorPipeline);
         this->append_load_d(&p);
         this->append_blend(&p);
         p.append(SkRasterPipeline::lerp_565, &fMaskPtr);
         this->maybe_clamp(&p);
         this->append_store(&p);
+        fBlitMaskLCD16 = p.compile(fAlloc);
     }
 
     int x = clip.left();
@@ -351,11 +357,11 @@
         switch (mask.fFormat) {
             case SkMask::kA8_Format:
                 fMaskPtr = mask.getAddr8(x,y)-x;
-                fBlitMaskA8.run(x,clip.width());
+                fBlitMaskA8(x,clip.width());
                 break;
             case SkMask::kLCD16_Format:
                 fMaskPtr = mask.getAddrLCD16(x,y)-x;
-                fBlitMaskLCD16.run(x,clip.width());
+                fBlitMaskLCD16(x,clip.width());
                 break;
             default:
                 // TODO
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 1e703e1..0e184b9 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -112,6 +112,7 @@
 };
 // ...and a platform-specific engine chosen on first use based on CPU features.
 static SkJumper_Engine gPlatform = kPortable;
+static SkOnce gChooseEngineOnce;
 
 static SkJumper_Engine choose_engine() {
 #if __has_feature(memory_sanitizer)
@@ -172,29 +173,59 @@
     return kPortable;
 }
 
+static void build_pipeline(const SkRasterPipeline::Stage* stages, int nstages,
+                           const SkJumper_Engine& engine, void** ip) {
+    for (int i = 0; i < nstages; i++) {
+        const auto& st = stages[i];
+        StageFn* fn = engine.stages[st.stage];
+
+        *ip++ = (void*)fn;
+        if (st.ctx) {
+            *ip++ = st.ctx;
+        }
+    }
+    *ip = (void*)engine.just_return;
+}
+
 void SkRasterPipeline::run(size_t x, size_t n) const {
-    static SkOnce once;
-    once([]{ gPlatform = choose_engine(); });
+    gChooseEngineOnce([]{ gPlatform = choose_engine(); });
 
     SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
     const size_t limit = x+n;
 
-    auto build_and_run = [&](const SkJumper_Engine& engine) {
-        if (x + engine.min_stride <= limit) {
-            void** ip = program.get();
-            for (auto&& st : fStages) {
-                StageFn* fn = engine.stages[st.stage];
-                *ip++ = (void*)fn;
-                if (st.ctx) {
-                    *ip++ = st.ctx;
-                }
-            }
-            *ip = (void*)engine.just_return;
+    if (x + gPlatform.min_stride <= limit) {
+        build_pipeline(fStages.data(), SkToInt(fStages.size()), gPlatform, program.get());
+        x = gPlatform.start_pipeline(x, program.get(), &kConstants, limit);
+    }
+    if (x < limit) {
+        build_pipeline(fStages.data(), SkToInt(fStages.size()), kPortable, program.get());
+        kPortable.start_pipeline(x, program.get(), &kConstants, limit);
+    }
+}
 
-            x = engine.start_pipeline(x, program.get(), &kConstants, limit);
+std::function<void(size_t, size_t)> SkRasterPipeline::compile(SkArenaAlloc* alloc) const {
+    gChooseEngineOnce([]{ gPlatform = choose_engine(); });
+
+    void** platform = alloc->makeArray<void*>(2*fStages.size() + 1);
+    build_pipeline(fStages.data(), SkToInt(fStages.size()), gPlatform, platform);
+
+    if (gPlatform.min_stride == 1) {
+        return [=](size_t x, size_t n) {
+            const size_t limit = x+n;
+            gPlatform.start_pipeline(x, platform, &kConstants, limit);
+        };
+    }
+
+    void** portable = alloc->makeArray<void*>(2*fStages.size() + 1);
+    build_pipeline(fStages.data(), SkToInt(fStages.size()), kPortable, portable);
+
+    return [=](size_t x, size_t n) {
+        const size_t limit = x+n;
+        if (x + gPlatform.min_stride <= limit) {
+            x = gPlatform.start_pipeline(x, platform, &kConstants, limit);
+        }
+        if (x < limit) {
+            kPortable.start_pipeline(x, portable, &kConstants, limit);
         }
     };
-
-    build_and_run(gPlatform);
-    build_and_run(kPortable);
 }