SkRasterPipeline in SkArenaAlloc

Bug: skia:6673

Change-Id: Ia2bae4f6a9039a007a10b6b45bcf2f0854bf6e5c
Reviewed-on: https://skia-review.googlesource.com/17794
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index bef80b3..df13018 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -39,7 +39,7 @@
         void*  src_ctx = src;
         void*  dst_ctx = dst;
 
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline::load_8888, &src_ctx);
         p.append_from_srgb(kUnpremul_SkAlphaType);
         p.append(SkRasterPipeline::scale_u8, &mask_ctx);
@@ -79,7 +79,7 @@
         void*  src_ctx = src;
         void*  dst_ctx = dst;
 
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline::load_8888, &dst_ctx);
         p.append(SkRasterPipeline::move_src_dst);
         p.append(SkRasterPipeline::load_8888, &src_ctx);
@@ -87,9 +87,7 @@
         p.append(SkRasterPipeline::store_8888, &dst_ctx);
 
         if (fCompile) {
-            char buffer[1024];
-            SkArenaAlloc alloc(buffer);
-            auto fn = p.compile(&alloc);
+            auto fn = p.compile();
             while (loops --> 0) {
                 fn(0,N);
             }
@@ -124,7 +122,7 @@
 
         SkColorSpaceTransferFn from_2dot2 = gamma(  2.2f),
                                  to_2dot2 = gamma(1/2.2f);
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline::constant_color, &c);
         p.append(SkRasterPipeline::parametric_r, &from_2dot2);
         p.append(SkRasterPipeline::parametric_g, &from_2dot2);
@@ -148,7 +146,7 @@
     }
 
     void onDraw(int loops, SkCanvas*) override {
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline::to_srgb);
 
         while (loops --> 0) {
@@ -157,60 +155,3 @@
     }
 };
 DEF_BENCH( return (new SkRasterPipelineToSRGB); )
-
-class SkRasterPipelineReuseBench : public Benchmark {
-public:
-    enum Mode { None, Some, Full };
-
-    explicit SkRasterPipelineReuseBench(Mode mode) : fMode(mode), fName("SkRasterPipelineReuse") {
-        switch(mode) {
-            case None: fName.append("_none"); break;
-            case Some: fName.append("_some"); break;
-            case Full: fName.append("_full"); break;
-        }
-    }
-    const char* onGetName() override { return fName.c_str(); }
-    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
-
-    void onDraw(int loops, SkCanvas*) override {
-        const int kStages = 20;
-        const auto stage  = SkRasterPipeline::to_srgb;  // Any stage will do.  We won't call it.
-
-        switch(fMode) {
-            case None:
-                while (loops --> 0) {
-                    SkRasterPipeline p;
-                    for (int i = 0; i < kStages; i++) {
-                        p.append(stage);
-                    }
-                }
-                break;
-
-            case Some:
-                while (loops --> 0) {
-                    SkRasterPipeline p(kStages);
-                    for (int i = 0; i < kStages; i++) {
-                        p.append(stage);
-                    }
-                }
-                break;
-
-            case Full:
-                SkRasterPipeline p(kStages);
-                while (loops --> 0) {
-                    p.rewind();
-                    for (int i = 0; i < kStages; i++) {
-                        p.append(stage);
-                    }
-                }
-                break;
-        }
-    }
-
-private:
-    Mode     fMode;
-    SkString fName;
-};
-DEF_BENCH( return (new SkRasterPipelineReuseBench(SkRasterPipelineReuseBench::None)); )
-DEF_BENCH( return (new SkRasterPipelineReuseBench(SkRasterPipelineReuseBench::Some)); )
-DEF_BENCH( return (new SkRasterPipelineReuseBench(SkRasterPipelineReuseBench::Full)); )
diff --git a/dm/DMSrcSink.cpp b/dm/DMSrcSink.cpp
index 643a1a4..12d8a06 100644
--- a/dm/DMSrcSink.cpp
+++ b/dm/DMSrcSink.cpp
@@ -329,7 +329,7 @@
         case kRGBA_F16_SkColorType:
             for (int y = 0; y < bitmap.height(); y++) {
                 void* row = bitmap.getAddr(0, y);
-                SkRasterPipeline p;
+                SkRasterPipeline_<256> p;
                 p.append(SkRasterPipeline::load_f16, &row);
                 p.append(SkRasterPipeline::premul);
                 p.append(SkRasterPipeline::store_f16, &row);
diff --git a/src/codec/SkWebpCodec.cpp b/src/codec/SkWebpCodec.cpp
index b2e1e64..829ebb8 100644
--- a/src/codec/SkWebpCodec.cpp
+++ b/src/codec/SkWebpCodec.cpp
@@ -338,7 +338,7 @@
                        bool needsSrgbToLinear, SkAlphaType at,
                        int width) {
     // Setup conversion from the source and dest, which will be the same.
-    SkRasterPipeline convert_to_linear_premul;
+    SkRasterPipeline_<256> convert_to_linear_premul;
     if (needsSrgbToLinear) {
         convert_to_linear_premul.append_from_srgb(at);
     }
@@ -347,7 +347,7 @@
         convert_to_linear_premul.append(SkRasterPipeline::premul);
     }
 
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     SkRasterPipeline::StockStage load_dst, store_dst;
     pick_memory_stages(dstCT, &load_dst, &store_dst);
 
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index 195de48..0749aa4 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -1155,7 +1155,7 @@
 bool SkColorSpaceXform_XYZ<kCSM>
 ::applyPipeline(ColorFormat dstColorFormat, void* dst, ColorFormat srcColorFormat,
                 const void* src, int len, SkAlphaType alphaType) const {
-    SkRasterPipeline pipeline;
+    SkRasterPipeline_<256> pipeline;
 
     LoadTablesContext loadTables;
     switch (srcColorFormat) {
diff --git a/src/core/SkColorSpaceXform_A2B.cpp b/src/core/SkColorSpaceXform_A2B.cpp
index 4b8ae42..1cbf411 100644
--- a/src/core/SkColorSpaceXform_A2B.cpp
+++ b/src/core/SkColorSpaceXform_A2B.cpp
@@ -20,7 +20,7 @@
 
 bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorFormat srcFormat,
                                     const void* src, int count, SkAlphaType alphaType) const {
-    SkRasterPipeline pipeline;
+    SkRasterPipeline_<256> pipeline;
     switch (srcFormat) {
         case kBGRA_8888_ColorFormat:
             pipeline.append(SkRasterPipeline::load_8888, &src);
@@ -98,7 +98,8 @@
 
 SkColorSpaceXform_A2B::SkColorSpaceXform_A2B(SkColorSpace_A2B* srcSpace,
                                              SkColorSpace_XYZ* dstSpace)
-    : fLinearDstGamma(kLinear_SkGammaNamed == dstSpace->gammaNamed()) {
+    : fElementsPipeline(&fAlloc)
+    , fLinearDstGamma(kLinear_SkGammaNamed == dstSpace->gammaNamed()) {
 #if (SkCSXformPrintfDefined)
     static const char* debugGammaNamed[4] = {
         "Linear", "SRGB", "2.2", "NonStandard"
diff --git a/src/core/SkConvertPixels.cpp b/src/core/SkConvertPixels.cpp
index cc2b953..d557763 100644
--- a/src/core/SkConvertPixels.cpp
+++ b/src/core/SkConvertPixels.cpp
@@ -277,7 +277,7 @@
 static void convert_with_pipeline(const SkImageInfo& dstInfo, void* dstRow, size_t dstRB,
                                   const SkImageInfo& srcInfo, const void* srcRow, size_t srcRB,
                                   bool isColorAware, SkTransferFunctionBehavior behavior) {
-    SkRasterPipeline pipeline;
+    SkRasterPipeline_<256> pipeline;
     switch (srcInfo.colorType()) {
         case kRGBA_8888_SkColorType:
             pipeline.append(SkRasterPipeline::load_8888, &srcRow);
diff --git a/src/core/SkDraw_vertices.cpp b/src/core/SkDraw_vertices.cpp
index ff33d5d..398d02c 100644
--- a/src/core/SkDraw_vertices.cpp
+++ b/src/core/SkDraw_vertices.cpp
@@ -455,7 +455,7 @@
         char             arenaStorage[4096];
         SkArenaAlloc     alloc(arenaStorage, sizeof(storage));
         Matrix43         matrix43;
-        SkRasterPipeline shaderPipeline;
+        SkRasterPipeline shaderPipeline(&alloc);
 
         // Convert the SkColors into float colors. The conversion depends on some conditions:
         // - If the pixmap has a dst colorspace, we have to be "color-correct".
diff --git a/src/core/SkPM4fPriv.h b/src/core/SkPM4fPriv.h
index 795ff4c..26f1557 100644
--- a/src/core/SkPM4fPriv.h
+++ b/src/core/SkPM4fPriv.h
@@ -152,7 +152,7 @@
 
         float scratch_matrix_3x4[12];
 
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline::constant_color, color4f_ptr);
         append_gamut_transform(&p, scratch_matrix_3x4, src, dst, kUnpremul_SkAlphaType);
         p.append(SkRasterPipeline::store_f32, &color4f_ptr);
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
index ab40444..0f4c1f4 100644
--- a/src/core/SkRasterPipeline.cpp
+++ b/src/core/SkRasterPipeline.cpp
@@ -7,29 +7,39 @@
 
 #include "SkRasterPipeline.h"
 
-SkRasterPipeline::SkRasterPipeline(int size_hint) {
-    fStages.reserve(size_hint);
+SkRasterPipeline::SkRasterPipeline(SkArenaAlloc* alloc) : fAlloc(alloc) {
+    this->reset();  // Only records alloc here; nothing is allocated until a stage is appended.
 }
-
-void SkRasterPipeline::rewind() {
-    fStages.clear();
+void SkRasterPipeline::reset() {
+    fStages      = nullptr;  // Forget every appended stage, so empty() is true again.
+    fSlotsNeeded = 1;  // We always need one extra slot for just_return().
 }
 
 void SkRasterPipeline::append(StockStage stage, void* ctx) {
     SkASSERT(stage != from_srgb);
-    fStages.push_back({stage, ctx});
+    this->unchecked_append(stage, ctx);  // append_from_srgb() calls this directly, past the assert.
+}
+void SkRasterPipeline::unchecked_append(StockStage stage, void* ctx) {
+    fStages = fAlloc->make<StageList>( StageList{fStages, stage, ctx} );  // New list head.
+    fSlotsNeeded += ctx ? 2 : 1;  // One slot for the stage, one more if it has a context.
 }
 
 void SkRasterPipeline::extend(const SkRasterPipeline& src) {
-    fStages.insert(fStages.end(),
-                   src.fStages.begin(), src.fStages.end());
+    this->extend(src.fStages);
+}
+void SkRasterPipeline::extend(const StageList* stages) {
+    if (!stages) {  // Walked past the first stage (or the source pipeline was empty).
+        return;
+    }
+    this->extend(stages->prev);  // List is newest-first; recurse so we append oldest-first.
+    this->unchecked_append(stages->stage, stages->ctx);
 }
 
 void SkRasterPipeline::dump() const {
-    SkDebugf("SkRasterPipeline, %d stages\n", SkToInt(fStages.size()));
-    for (auto&& st : fStages) {
+    SkDebugf("SkRasterPipeline, (in reverse)\n");
+    for (auto st = fStages; st; st = st->prev) {
         const char* name = "";
-        switch (st.stage) {
+        switch (st->stage) {
         #define M(x) case x: name = #x; break;
             SK_RASTER_PIPELINE_STAGES(M)
         #undef M
@@ -48,9 +58,7 @@
 // This is an annoying problem with no known good solution.  So apply the clamp hammer.
 
 void SkRasterPipeline::append_from_srgb(SkAlphaType at) {
-    //this->append(from_srgb);
-    fStages.push_back({from_srgb, nullptr});
-
+    this->unchecked_append(from_srgb, nullptr);
     if (at == kPremul_SkAlphaType) {
         this->append(SkRasterPipeline::clamp_a);
     }
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 3725033..117ecf3 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -16,6 +16,8 @@
 #include <functional>
 #include <vector>
 
+struct SkJumper_Engine;
+
 /**
  * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
  *
@@ -110,7 +112,15 @@
 
 class SkRasterPipeline {
 public:
-    SkRasterPipeline(int size_hint=0);
+    explicit SkRasterPipeline(SkArenaAlloc*);
+
+    SkRasterPipeline(const SkRasterPipeline&) = delete;
+    SkRasterPipeline(SkRasterPipeline&&)      = default;
+
+    SkRasterPipeline& operator=(const SkRasterPipeline&) = delete;
+    SkRasterPipeline& operator=(SkRasterPipeline&&)      = default;
+
+    void reset();
 
     enum StockStage {
     #define M(stage) stage,
@@ -127,26 +137,43 @@
     void run(size_t x, size_t n) const;
 
-    // Allocates a thunk which amortizes run() setup cost in alloc.
+    // Returns a thunk which amortizes run() setup cost, allocating from this pipeline's arena.
-    std::function<void(size_t, size_t)> compile(SkArenaAlloc*) const;
+    std::function<void(size_t, size_t)> compile() const;
 
     void dump() const;
 
-    struct Stage {
-        StockStage stage;
-        void*        ctx;
-    };
-
     // Conversion from sRGB can be subtly tricky when premultiplication is involved.
     // Use these helpers to keep things sane.
     void append_from_srgb(SkAlphaType);
 
-    bool empty() const { return fStages.empty(); }
-
-    // Cheaply reset all state so that empty() returns true.
-    void rewind();
+    bool empty() const { return fStages == nullptr; }
 
 private:
-    std::vector<Stage> fStages;
+    struct StageList {
+        StageList* prev;   // Stage appended just before this one; nullptr for the first stage.
+        StockStage stage;
+        void*      ctx;    // Optional per-stage context; may be nullptr.
+    };
+
+    static void BuildPipeline(const StageList*, const SkJumper_Engine&, void**);
+    void unchecked_append(StockStage, void*);
+    void extend(const StageList*);
+
+    SkArenaAlloc* fAlloc;
+    StageList*    fStages;
+    int           fSlotsNeeded;
 };
 
+template <size_t bytes>  // bytes: size of the inline buffer handed to the built-in arena.
+class SkRasterPipeline_ : public SkRasterPipeline {  // SkRasterPipeline bundled with its arena.
+public:
+    SkRasterPipeline_()
+        : SkRasterPipeline(&fBuiltinAlloc)  // Base runs first but only stores this pointer.
+        , fBuiltinAlloc(fBuffer) {}
+
+private:
+    char         fBuffer[bytes];
+    SkArenaAlloc fBuiltinAlloc;
+};
+
+
 #endif//SkRasterPipeline_DEFINED
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 82a202b..e5c5325 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -34,6 +34,7 @@
         : fDst(dst)
         , fBlend(blend)
         , fAlloc(alloc)
+        , fColorPipeline(alloc)
     {}
 
     void blitH    (int x, int y, int w)                            override;
@@ -100,8 +101,8 @@
                                            const SkMatrix& ctm) {
     auto paintColor = alloc->make<SkPM4f>(SkPM4f_from_SkColor(paint.getColor(),
                                                               dst.colorSpace()));
+    SkRasterPipeline_<256> shaderPipeline;
     if (auto shader = paint.getShader()) {
-        SkRasterPipeline shaderPipeline;
         if (!shader->appendStages(&shaderPipeline, dst.colorSpace(), alloc, ctm, paint)) {
             // When a shader fails to append stages, it means it has vetoed drawing entirely.
             return alloc->make<SkNullBlitter>();
@@ -118,7 +119,6 @@
         return Create(dst, paint, alloc, shaderPipeline, is_opaque, is_constant, wants_dither);
     }
 
-    SkRasterPipeline shaderPipeline;
     shaderPipeline.append(SkRasterPipeline::constant_color, paintColor);
     bool is_opaque    = paintColor->a() == 1.0f,
          is_constant  = true,
@@ -173,7 +173,7 @@
         auto constantColor = alloc->make<SkPM4f>();
         colorPipeline->append(SkRasterPipeline::store_f32, &constantColor);
         colorPipeline->run(0,1);
-        *colorPipeline = SkRasterPipeline();
+        colorPipeline->reset();
         colorPipeline->append(SkRasterPipeline::constant_color, constantColor);
 
         is_opaque = constantColor->a() == 1.0f;
@@ -189,7 +189,7 @@
     if (is_constant && blitter->fBlend == SkBlendMode::kSrc) {
         // Run our color pipeline all the way through to produce what we'd memset when we can.
         // Not all blits can memset, so we need to keep colorPipeline too.
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.extend(*colorPipeline);
         blitter->fDstPtr = &blitter->fMemsetColor;
         blitter->append_store(&p);
@@ -268,7 +268,7 @@
     }
 
     if (!fBlitH) {
-        SkRasterPipeline p;
+        SkRasterPipeline p(fAlloc);
         p.extend(fColorPipeline);
         if (fBlend != SkBlendMode::kSrc) {
             this->append_load_d(&p);
@@ -276,14 +276,14 @@
             this->maybe_clamp(&p);
         }
         this->append_store(&p);
-        fBlitH = p.compile(fAlloc);
+        fBlitH = p.compile();
     }
     fBlitH(x,w);
 }
 
 void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const int16_t runs[]) {
     if (!fBlitAntiH) {
-        SkRasterPipeline p;
+        SkRasterPipeline p(fAlloc);
         p.extend(fColorPipeline);
         if (fBlend == SkBlendMode::kSrcOver) {
             p.append(SkRasterPipeline::scale_1_float, &fCurrentCoverage);
@@ -296,7 +296,7 @@
         }
         this->maybe_clamp(&p);
         this->append_store(&p);
-        fBlitAntiH = p.compile(fAlloc);
+        fBlitAntiH = p.compile();
     }
 
     fDstPtr = fDst.writable_addr(0,y);
@@ -322,7 +322,7 @@
     }
 
     if (mask.fFormat == SkMask::kA8_Format && !fBlitMaskA8) {
-        SkRasterPipeline p;
+        SkRasterPipeline p(fAlloc);
         p.extend(fColorPipeline);
         if (fBlend == SkBlendMode::kSrcOver) {
             p.append(SkRasterPipeline::scale_u8, &fMaskPtr);
@@ -335,18 +335,18 @@
         }
         this->maybe_clamp(&p);
         this->append_store(&p);
-        fBlitMaskA8 = p.compile(fAlloc);
+        fBlitMaskA8 = p.compile();
     }
 
     if (mask.fFormat == SkMask::kLCD16_Format && !fBlitMaskLCD16) {
-        SkRasterPipeline p;
+        SkRasterPipeline p(fAlloc);
         p.extend(fColorPipeline);
         this->append_load_d(&p);
         this->append_blend(&p);
         p.append(SkRasterPipeline::lerp_565, &fMaskPtr);
         this->maybe_clamp(&p);
         this->append_store(&p);
-        fBlitMaskLCD16 = p.compile(fAlloc);
+        fBlitMaskLCD16 = p.compile();
     }
 
     int x = clip.left();
diff --git a/src/core/SkShader.cpp b/src/core/SkShader.cpp
index 5eead36..c8d5f08 100644
--- a/src/core/SkShader.cpp
+++ b/src/core/SkShader.cpp
@@ -264,7 +264,7 @@
                             const SkMatrix& ctm,
                             const SkPaint& paint,
                             const SkMatrix* localM) const {
-    SkRasterPipeline subclass;
+    SkRasterPipeline_<256> subclass;
     if (this->onAppendStages(&subclass, dstCS, alloc, ctm, paint, localM)) {
         p->extend(subclass);
         return true;
diff --git a/src/effects/gradients/SkGradientShader.cpp b/src/effects/gradients/SkGradientShader.cpp
index c4a1dad..5a4a31c 100644
--- a/src/effects/gradients/SkGradientShader.cpp
+++ b/src/effects/gradients/SkGradientShader.cpp
@@ -417,7 +417,7 @@
         return false;
     }
 
-    SkRasterPipeline subclass;
+    SkRasterPipeline_<256> subclass;
     if (!this->adjustMatrixAndAppendStages(alloc, &matrix, &subclass)) {
         return false;
     }
diff --git a/src/images/SkImageEncoderFns.h b/src/images/SkImageEncoderFns.h
index af75ac9..dfe5001 100644
--- a/src/images/SkImageEncoderFns.h
+++ b/src/images/SkImageEncoderFns.h
@@ -152,7 +152,7 @@
 
 template <bool kIsRGBA>
 static inline void transform_scanline_unpremultiply_sRGB(void* dst, const void* src, int width) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_8888, &src);
     if (!kIsRGBA) {
         p.append(SkRasterPipeline::swap_rb);
@@ -180,7 +180,7 @@
 static inline void transform_scanline_to_premul_linear(char* SK_RESTRICT dst,
                                                        const char* SK_RESTRICT src,
                                                        int width, int, const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_8888, (const void**) &src);
     p.append_from_srgb(kUnpremul_SkAlphaType);
     p.append(SkRasterPipeline::premul);
@@ -254,7 +254,7 @@
  */
 static inline void transform_scanline_F16(char* SK_RESTRICT dst, const char* SK_RESTRICT src,
                                           int width, int, const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, (const void**) &src);
     p.append(SkRasterPipeline::to_srgb);
     p.append(SkRasterPipeline::store_u16_be, (void**) &dst);
@@ -266,7 +266,7 @@
  */
 static inline void transform_scanline_F16_premul(char* SK_RESTRICT dst, const char* SK_RESTRICT src,
                                                  int width, int, const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, (const void**) &src);
     p.append(SkRasterPipeline::unpremul);
     p.append(SkRasterPipeline::to_srgb);
@@ -280,7 +280,7 @@
 static inline void transform_scanline_F16_to_8888(char* SK_RESTRICT dst,
                                                   const char* SK_RESTRICT src, int width, int,
                                                   const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, (const void**) &src);
     p.append(SkRasterPipeline::to_srgb);
     p.append(SkRasterPipeline::store_8888, (void**) &dst);
@@ -293,7 +293,7 @@
 static inline void transform_scanline_F16_premul_to_8888(char* SK_RESTRICT dst,
                                                          const char* SK_RESTRICT src, int width,
                                                          int, const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, (const void**) &src);
     p.append(SkRasterPipeline::unpremul);
     p.append(SkRasterPipeline::to_srgb);
@@ -306,7 +306,7 @@
  */
 static inline void transform_scanline_F16_to_premul_8888(char* SK_RESTRICT dst,
         const char* SK_RESTRICT src, int width, int, const SkPMColor*) {
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, (const void**) &src);
     p.append(SkRasterPipeline::premul);
     p.append(SkRasterPipeline::to_srgb);
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 8daf29e..10455b9 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -173,47 +173,48 @@
     return kPortable;
 }
 
-static void build_pipeline(const SkRasterPipeline::Stage* stages, int nstages,
-                           const SkJumper_Engine& engine, void** ip) {
-    for (int i = 0; i < nstages; i++) {
-        const auto& st = stages[i];
-        StageFn* fn = engine.stages[st.stage];
+void SkRasterPipeline::BuildPipeline(const StageList* st,
+                                     const SkJumper_Engine& engine, void** ip) {
+    // We're building the pipeline backwards, so we start with the final stage just_return.
+    *--ip = (void*)engine.just_return;
 
-        *ip++ = (void*)fn;
-        if (st.ctx) {
-            *ip++ = st.ctx;
+    // Still going backwards, each stage's context pointer then its StageFn.
+    for (; st; st = st->prev) {
+        if (st->ctx) {
+            *--ip = st->ctx;
         }
+        *--ip = (void*)engine.stages[st->stage];
     }
-    *ip = (void*)engine.just_return;
 }
 
 void SkRasterPipeline::run(size_t x, size_t n) const {
-    if (fStages.empty()) {
+    if (this->empty()) {
         return;
     }
     gChooseEngineOnce([]{ gPlatform = choose_engine(); });
 
-    SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
+    // Best to not use fAlloc here... we can't bound how often run() will be called.
+    SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
     const size_t limit = x+n;
 
     if (x + gPlatform.min_stride <= limit) {
-        build_pipeline(fStages.data(), SkToInt(fStages.size()), gPlatform, program.get());
+        BuildPipeline(fStages, gPlatform, program.get() + fSlotsNeeded);
         x = gPlatform.start_pipeline(x, program.get(), &kConstants, limit);
     }
     if (x < limit) {
-        build_pipeline(fStages.data(), SkToInt(fStages.size()), kPortable, program.get());
+        BuildPipeline(fStages, kPortable, program.get() + fSlotsNeeded);
         kPortable.start_pipeline(x, program.get(), &kConstants, limit);
     }
 }
 
-std::function<void(size_t, size_t)> SkRasterPipeline::compile(SkArenaAlloc* alloc) const {
-    if (fStages.empty()) {
+std::function<void(size_t, size_t)> SkRasterPipeline::compile() const {
+    if (this->empty()) {
         return [](size_t, size_t) {};
     }
     gChooseEngineOnce([]{ gPlatform = choose_engine(); });
 
-    void** platform = alloc->makeArray<void*>(2*fStages.size() + 1);
-    build_pipeline(fStages.data(), SkToInt(fStages.size()), gPlatform, platform);
+    void** platform = fAlloc->makeArray<void*>(fSlotsNeeded);
+    BuildPipeline(fStages, gPlatform, platform + fSlotsNeeded);
 
     if (gPlatform.min_stride == 1) {
         return [=](size_t x, size_t n) {
@@ -222,8 +223,8 @@
         };
     }
 
-    void** portable = alloc->makeArray<void*>(2*fStages.size() + 1);
-    build_pipeline(fStages.data(), SkToInt(fStages.size()), kPortable, portable);
+    void** portable = fAlloc->makeArray<void*>(fSlotsNeeded);
+    BuildPipeline(fStages, kPortable, portable + fSlotsNeeded);
 
     return [=](size_t x, size_t n) {
         const size_t limit = x+n;
diff --git a/tests/F16StagesTest.cpp b/tests/F16StagesTest.cpp
index 73072e3..cdf2daf 100644
--- a/tests/F16StagesTest.cpp
+++ b/tests/F16StagesTest.cpp
@@ -22,7 +22,7 @@
     uint16_t* f16 = halfs;
 
     {
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline:: load_f32, &f32);
         p.append(SkRasterPipeline::store_f16, &f16);
         p.run(0,16/4);
@@ -37,7 +37,7 @@
     REPORTER_ASSERT(r, f16[7] == 0x4000);
 
     {
-        SkRasterPipeline p;
+        SkRasterPipeline_<256> p;
         p.append(SkRasterPipeline:: load_f16, &f16);
         p.append(SkRasterPipeline::store_f32, &f32);
         p.run(0,16/4);
diff --git a/tests/ParametricStageTest.cpp b/tests/ParametricStageTest.cpp
index 89b67c0..6edb9d9 100644
--- a/tests/ParametricStageTest.cpp
+++ b/tests/ParametricStageTest.cpp
@@ -19,7 +19,7 @@
     const float* ip = in;
     float*       op = out;
 
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f32, &ip);
     p.append(SkRasterPipeline::parametric_r, &fn);
     p.append(SkRasterPipeline::parametric_g, &fn);
diff --git a/tests/SRGBTest.cpp b/tests/SRGBTest.cpp
index 4d0084a..78855a3 100644
--- a/tests/SRGBTest.cpp
+++ b/tests/SRGBTest.cpp
@@ -47,7 +47,7 @@
 
     auto ptr = (void*)reds;
 
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_8888,  &ptr);
     p.append_from_srgb(kUnpremul_SkAlphaType);
     p.append(SkRasterPipeline::to_srgb);
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index f7c1456..da37aa2 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -20,7 +20,7 @@
     void* load_d_ctx = &red;
     void* store_ctx  = &result;
 
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::load_f16, &load_s_ctx);
     p.append(SkRasterPipeline::move_src_dst);
     p.append(SkRasterPipeline::load_f16, &load_d_ctx);
@@ -38,14 +38,14 @@
 
 DEF_TEST(SkRasterPipeline_empty, r) {
     // No asserts... just a test that this is safe to run.
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.run(0,20);
 }
 
 DEF_TEST(SkRasterPipeline_nonsense, r) {
     // No asserts... just a test that this is safe to run and terminates.
     // srcover() calls st->next(); this makes sure we've always got something there to call.
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline::srcover);
     p.run(0,20);
 }
@@ -66,7 +66,7 @@
     uint32_t*       dst = buf + 36;
 
     // Copy buf[x] to buf[x+36] for x in [15,35).
-    SkRasterPipeline p;
+    SkRasterPipeline_<256> p;
     p.append(SkRasterPipeline:: load_8888, &src);
     p.append(SkRasterPipeline::store_8888, &dst);
     p.run(15, 20);