Use stack instead of malloc() for most calls to SkRasterPipeline::run().

Also split bench into run/compile variants to measure the effect:
 Before …f16_compile 1x  …f16_run 1.02x  …srgb_compile 1.56x  …srgb_run 1.61x
 After  …f16_run 1x  …f16_compile 1.01x  …srgb_compile 1.58x  …srgb_run 1.59x

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: I8e65fb2acdbb05ccc0b3894f16d7646603c3e74d
Reviewed-on: https://skia-review.googlesource.com/6621
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 16dea8a..376efde 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -22,13 +22,18 @@
 //   - src = srcover(dst, src)
 //   - store src back as srgb/f16
 
-template <bool kF16>
+template <bool kF16, bool kCompiled>
 class SkRasterPipelineBench : public Benchmark {
 public:
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
     const char* onGetName() override {
-        return kF16 ? "SkRasterPipeline_f16"
-                    : "SkRasterPipeline_srgb";
+        switch ((int)kCompiled << 1 | (int)kF16) {  // bit 1: kCompiled, bit 0: kF16
+            case 0: return "SkRasterPipeline_srgb_run";
+            case 1: return "SkRasterPipeline_f16_run";
+            case 2: return "SkRasterPipeline_srgb_compile";
+            case 3: return "SkRasterPipeline_f16_compile";
+        }
+        return "whoops";  // Not reached: cases 0-3 cover every kCompiled/kF16 combination.
     }
 
     void onDraw(int loops, SkCanvas*) override {
@@ -53,12 +58,20 @@
             p.append(SkRasterPipeline::to_srgb);
             p.append(SkRasterPipeline::store_8888, &dst_ctx);
         }
-        auto compiled = p.compile();
 
-        while (loops --> 0) {
-            compiled(0,0, N);
+        if (kCompiled) {
+            auto compiled = p.compile();
+            while (loops --> 0) {
+                compiled(0,0, N);
+            }
+        } else {
+            while (loops --> 0) {
+                p.run(0,0, N);
+            }
         }
     }
 };
-DEF_BENCH( return new SkRasterPipelineBench<true>; )
-DEF_BENCH( return new SkRasterPipelineBench<false>; )
+DEF_BENCH( return (new SkRasterPipelineBench< true,  true>); )  // SkRasterPipeline_f16_compile
+DEF_BENCH( return (new SkRasterPipelineBench<false,  true>); )  // SkRasterPipeline_srgb_compile
+DEF_BENCH( return (new SkRasterPipelineBench< true, false>); )  // SkRasterPipeline_f16_run
+DEF_BENCH( return (new SkRasterPipelineBench<false, false>); )  // SkRasterPipeline_srgb_run
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 3b877ce..9721e2a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -965,9 +965,37 @@
 
 namespace {
 
+    static void build_program(void** program, const SkRasterPipeline::Stage* stages, int nstages) {
+        for (int i = 0; i < nstages; i++) {  // Writes 1 slot per stage, 2 when it has a ctx.
+            *program++ = (void*)enum_to_Fn(stages[i].stage);  // Whatever enum_to_Fn maps the stage to.
+            if (stages[i].ctx) {
+                *program++ = stages[i].ctx;  // Context follows its stage, only when non-null.
+            }
+        }
+        *program++ = (void*)just_return;  // Last slot written is always just_return.
+    }
+
+    static void run_program(void** program, size_t x, size_t y, size_t n) {  // Runs n items from (x,y).
+        float dx[] = { 0,1,2,3,4,5,6,7 };  // Per-lane x offsets; assumes N <= 8 -- TODO confirm.
+        SkNf X = SkNf(x) + SkNf::Load(dx) + 0.5f,  // +0.5f: presumably pixel centers -- confirm.
+             Y = SkNf(y) + 0.5f,
+             _0 = SkNf(0),
+             _1 = SkNf(1);
+
+        auto start = (Fn)load_and_increment(&program);  // Presumably reads slot 0 and advances program.
+        while (n >= N) {  // Full batches of N.
+            start(x*N, program, X,Y,_1,_0, _0,_0,_0,_0);
+            X += (float)N;
+            x += N;
+            n -= N;
+        }
+        if (n) {  // 0 < n < N left: first arg becomes x*N+n vs. x*N above.
+            start(x*N+n, program, X,Y,_1,_0, _0,_0,_0,_0);
+        }
+    }
+
     // Compiled manages its memory manually because it's not safe to use
     // std::vector, SkTDArray, etc without setting us up for big ODR violations.
-
     struct Compiled {
         Compiled(const SkRasterPipeline::Stage* stages, int nstages) {
             int slots = nstages + 1;  // One extra for just_return.
@@ -977,15 +1005,7 @@
                 }
             }
             fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
-
-            void** ip = fProgram;
-            for (int i = 0; i < nstages; i++) {
-                *ip++ = (void*)enum_to_Fn(stages[i].stage);
-                if (stages[i].ctx) {
-                    *ip++ = stages[i].ctx;
-                }
-            }
-            *ip++ = (void*)just_return;
+            build_program(fProgram, stages, nstages);
         }
         ~Compiled() { sk_free(fProgram); }
 
@@ -998,23 +1018,7 @@
         }
 
         void operator()(size_t x, size_t y, size_t n) {
-            float dx[] = { 0,1,2,3,4,5,6,7 };
-            SkNf X = SkNf(x) + SkNf::Load(dx) + 0.5f,
-                 Y = SkNf(y) + 0.5f,
-                _0 = SkNf(0),
-                _1 = SkNf(1);
-
-            void** p = fProgram;
-            auto start = (Fn)load_and_increment(&p);
-            while (n >= N) {
-                start(x*N, p, X,Y,_1,_0, _0,_0,_0,_0);
-                X += (float)N;
-                x += N;
-                n -= N;
-            }
-            if (n) {
-                start(x*N+n, p, X,Y,_1,_0, _0,_0,_0,_0);
-            }
+            run_program(fProgram, x, y, n);
         }
 
         void** fProgram;
@@ -1030,7 +1034,15 @@
 
     SI void run_pipeline(size_t x, size_t y, size_t n,
                          const SkRasterPipeline::Stage* stages, int nstages) {
-        Compiled{stages,nstages}(x,y,n);
+        static const int kStackMax = 256;  // Slots in the on-stack program array.
+        // Worst case is nstages stages with nstages context pointers, and just_return.
+        if (2*nstages+1 <= kStackMax) {
+            void* program[kStackMax];  // Local array; Compiled would sk_malloc_throw instead.
+            build_program(program, stages, nstages);
+            run_program(program, x,y,n);
+        } else {
+            Compiled{stages,nstages}(x,y,n);  // Worst case may not fit; use a temporary Compiled.
+        }
     }
 
 }  // namespace SK_OPTS_NS