Make all SkRasterPipeline stages stock stages in SkOpts.

If we want to support VEX-encoded instructions (AVX, F16C, etc.) without a ridiculous slowdown, we need to make sure we're running either all VEX-encoded instructions or all non-VEX-encoded instructions.  That means we cannot mix arbitrary user-defined SkRasterPipeline::Fn (never VEX) with those living in SkOpts (maybe VEX)... it's SkOpts or bust.

This ports the existing user-defined SkRasterPipeline::Fn use cases over to use stock stages from SkOpts.  I rewrote the unit test to use stock stages, and moved the SkXfermode implementations to SkOpts.  The code deleted for SkArithmeticMode_scalar should already be dead.


BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2940
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: I94dbe766b2d65bfec6e544d260f71d721f0f5cb0
Reviewed-on: https://skia-review.googlesource.com/2940
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 4d60079..7784e7f 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -113,7 +113,30 @@
 
         stage_4<SK_OPTS_NS::constant_color, true>,
 
-        stage_4<SK_OPTS_NS::srcover, true>,
+        SK_OPTS_NS::dst,
+        SK_OPTS_NS::dstatop,
+        SK_OPTS_NS::dstin,
+        SK_OPTS_NS::dstout,
+        SK_OPTS_NS::dstover,
+        SK_OPTS_NS::srcatop,
+        SK_OPTS_NS::srcin,
+        SK_OPTS_NS::srcout,
+        SK_OPTS_NS::srcover,
+        SK_OPTS_NS::clear,
+        SK_OPTS_NS::modulate,
+        SK_OPTS_NS::multiply,
+        SK_OPTS_NS::plus_,
+        SK_OPTS_NS::screen,
+        SK_OPTS_NS::xor_,
+        SK_OPTS_NS::colorburn,
+        SK_OPTS_NS::colordodge,
+        SK_OPTS_NS::darken,
+        SK_OPTS_NS::difference,
+        SK_OPTS_NS::exclusion,
+        SK_OPTS_NS::hardlight,
+        SK_OPTS_NS::lighten,
+        SK_OPTS_NS::overlay,
+        SK_OPTS_NS::softlight,
     };
     static_assert(SK_ARRAY_COUNT(stages_4) == SkRasterPipeline::kNumStockStages, "");
 
@@ -138,7 +161,30 @@
 
         stage_1_3<SK_OPTS_NS::constant_color, true>,
 
-        stage_1_3<SK_OPTS_NS::srcover, true>,
+        SK_OPTS_NS::dst,
+        SK_OPTS_NS::dstatop,
+        SK_OPTS_NS::dstin,
+        SK_OPTS_NS::dstout,
+        SK_OPTS_NS::dstover,
+        SK_OPTS_NS::srcatop,
+        SK_OPTS_NS::srcin,
+        SK_OPTS_NS::srcout,
+        SK_OPTS_NS::srcover,
+        SK_OPTS_NS::clear,
+        SK_OPTS_NS::modulate,
+        SK_OPTS_NS::multiply,
+        SK_OPTS_NS::plus_,
+        SK_OPTS_NS::screen,
+        SK_OPTS_NS::xor_,
+        SK_OPTS_NS::colorburn,
+        SK_OPTS_NS::colordodge,
+        SK_OPTS_NS::darken,
+        SK_OPTS_NS::difference,
+        SK_OPTS_NS::exclusion,
+        SK_OPTS_NS::hardlight,
+        SK_OPTS_NS::lighten,
+        SK_OPTS_NS::overlay,
+        SK_OPTS_NS::softlight,
     };
     static_assert(SK_ARRAY_COUNT(stages_1_3) == SkRasterPipeline::kNumStockStages, "");
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 789257e..785a118 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -83,11 +83,6 @@
     void run(size_t x, size_t n);
     void run(size_t n) { this->run(0, n); }
 
-    // body() will only be called with tail=0, indicating it always works on a full 4 pixels.
-    // tail() will only be called with tail=1..3 to handle the jagged end of n%4 pixels.
-    void append(Fn body, Fn tail, const void* ctx = nullptr);
-    void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
-
     enum StockStage {
         store_565,
         store_srgb,
@@ -109,7 +104,30 @@
 
         constant_color,
 
+        dst,
+        dstatop,
+        dstin,
+        dstout,
+        dstover,
+        srcatop,
+        srcin,
+        srcout,
         srcover,
+        clear,
+        modulate,
+        multiply,
+        plus_,
+        screen,
+        xor_,
+        colorburn,
+        colordodge,
+        darken,
+        difference,
+        exclusion,
+        hardlight,
+        lighten,
+        overlay,
+        softlight,
 
         kNumStockStages,
     };
@@ -122,6 +140,8 @@
 private:
     using Stages = SkSTArray<10, Stage, /*MEM_COPY=*/true>;
 
+    void append(Fn body, Fn tail, const void*);
+
     // This no-op default makes fBodyStart and fTailStart unconditionally safe to call,
     // and is always the last stage's fNext as a sort of safety net to make sure even a
     // buggy pipeline can't walk off its own end.
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 2717fab..58c5578 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -1433,117 +1433,36 @@
     SK_DEFINE_FLATTENABLE_REGISTRAR_ENTRY(SkProcCoeffXfermode)
 SK_DEFINE_FLATTENABLE_REGISTRAR_GROUP_END
 
-static Sk4f inv(const Sk4f& x) { return 1.0f - x; }
-
-// Most of these modes apply the same logic kernel to each channel.
-template <Sk4f kernel(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)>
-static void SK_VECTORCALL rgba(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                               Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                               Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    r = kernel(r,a,dr,da);
-    g = kernel(g,a,dg,da);
-    b = kernel(b,a,db,da);
-    a = kernel(a,a,da,da);
-    st->next(x,tail, r,g,b,a, dr,dg,db,da);
-}
-
-#define KERNEL(name) static Sk4f name(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)
-KERNEL(clear)    { return 0.0f; }
-KERNEL(dst)      { return d; }
-KERNEL(dstover)  { return d + inv(da)*s; }
-
-KERNEL(srcin)    { return s * da; }
-KERNEL(srcout)   { return s * inv(da); }
-KERNEL(srcatop)  { return s*da + d*inv(sa); }
-KERNEL(dstin)    { return srcin  (d,da,s,sa); }
-KERNEL(dstout)   { return srcout (d,da,s,sa); }
-KERNEL(dstatop)  { return srcatop(d,da,s,sa); }
-
-KERNEL(modulate) { return s*d; }
-KERNEL(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
-KERNEL(plus_)     { return s + d; }
-KERNEL(screen)   { return s + d - s*d; }
-KERNEL(xor_)     { return s*inv(da) + d*inv(sa); }
-
-// Most of the rest apply the same logic to each color channel, and srcover's logic to alpha.
-// (darken and lighten can actually go either way, but they're a little faster this way.)
-template <Sk4f kernel(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)>
-static void SK_VECTORCALL rgb_srcover(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    r = kernel(r,a,dr,da);
-    g = kernel(g,a,dg,da);
-    b = kernel(b,a,db,da);
-    a = a + da*inv(a);
-    st->next(x,tail, r,g,b,a, dr,dg,db,da);
-}
-
-KERNEL(colorburn) {
-    return (d == da  ).thenElse(d + s*inv(da),
-           (s == 0.0f).thenElse(s + d*inv(sa),
-                                sa*(da - Sk4f::Min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
-}
-KERNEL(colordodge) {
-    return (d == 0.0f).thenElse(d + s*inv(da),
-           (s == sa  ).thenElse(s + d*inv(sa),
-                                sa*Sk4f::Min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
-}
-KERNEL(darken)     { return s + d - Sk4f::Max(s*da, d*sa); }
-KERNEL(difference) { return s + d - 2.0f*Sk4f::Min(s*da,d*sa); }
-KERNEL(exclusion)  { return s + d - 2.0f*s*d; }
-KERNEL(hardlight) {
-    return s*inv(da) + d*inv(sa)
-         + (2.0f*s <= sa).thenElse(2.0f*s*d, sa*da - 2.0f*(da-d)*(sa-s));
-}
-KERNEL(lighten) { return s + d - Sk4f::Min(s*da, d*sa); }
-KERNEL(overlay) { return hardlight(d,da,s,sa); }
-KERNEL(softlight) {
-    Sk4f m  = (da > 0.0f).thenElse(d / da, 0.0f),
-         s2 = 2.0f*s,
-         m4 = 4.0f*m;
-
-    // The logic forks three ways:
-    //    1. dark src?
-    //    2. light src, dark dst?
-    //    3. light src, light dst?
-    Sk4f darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
-         darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
-         liteDst = m.rsqrt().invert() - m,            // Used in case 3.
-         liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst);  // 2 or 3?
-    return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc);  // 1 or (2 or 3)?
-}
-#undef KERNEL
 
 bool SkProcCoeffXfermode::onAppendStages(SkRasterPipeline* p) const {
     switch (fMode) {
-        case kSrcOver_Mode: SkASSERT(false); return false;  // Well how did we get here?
+        case kSrc_Mode:    /*This stage is a no-op.*/             return true;
+        case kDst_Mode:     p->append(SkRasterPipeline::dst);     return true;
+        case kSrcATop_Mode: p->append(SkRasterPipeline::srcatop); return true;
+        case kDstATop_Mode: p->append(SkRasterPipeline::dstatop); return true;
+        case kSrcIn_Mode:   p->append(SkRasterPipeline::srcin);   return true;
+        case kDstIn_Mode:   p->append(SkRasterPipeline::dstin);   return true;
+        case kSrcOut_Mode:  p->append(SkRasterPipeline::srcout);  return true;
+        case kDstOut_Mode:  p->append(SkRasterPipeline::dstout);  return true;
+        case kSrcOver_Mode: p->append(SkRasterPipeline::srcover); return true;
+        case kDstOver_Mode: p->append(SkRasterPipeline::dstover); return true;
 
-        case kSrc_Mode:    /*This stage is a no-op.*/ return true;
-        case kDst_Mode:     p->append(rgba<dst>);     return true;
-        case kSrcATop_Mode: p->append(rgba<srcatop>); return true;
-        case kDstATop_Mode: p->append(rgba<dstatop>); return true;
-        case kSrcIn_Mode:   p->append(rgba<srcin>);   return true;
-        case kDstIn_Mode:   p->append(rgba<dstin>);   return true;
-        case kSrcOut_Mode:  p->append(rgba<srcout>);  return true;
-        case kDstOut_Mode:  p->append(rgba<dstout>);  return true;
-        case kDstOver_Mode: p->append(rgba<dstover>); return true;
+        case kClear_Mode:    p->append(SkRasterPipeline::clear);    return true;
+        case kModulate_Mode: p->append(SkRasterPipeline::modulate); return true;
+        case kMultiply_Mode: p->append(SkRasterPipeline::multiply); return true;
+        case kPlus_Mode:     p->append(SkRasterPipeline::plus_);    return true;
+        case kScreen_Mode:   p->append(SkRasterPipeline::screen);   return true;
+        case kXor_Mode:      p->append(SkRasterPipeline::xor_);     return true;
 
-        case kClear_Mode:    p->append(rgba<clear>);    return true;
-        case kModulate_Mode: p->append(rgba<modulate>); return true;
-        case kMultiply_Mode: p->append(rgba<multiply>); return true;
-        case kPlus_Mode:     p->append(rgba<plus_>);    return true;
-        case kScreen_Mode:   p->append(rgba<screen>);   return true;
-        case kXor_Mode:      p->append(rgba<xor_>);     return true;
-
-        case kColorBurn_Mode:  p->append(rgb_srcover<colorburn>);  return true;
-        case kColorDodge_Mode: p->append(rgb_srcover<colordodge>); return true;
-        case kDarken_Mode:     p->append(rgb_srcover<darken>);     return true;
-        case kDifference_Mode: p->append(rgb_srcover<difference>); return true;
-        case kExclusion_Mode:  p->append(rgb_srcover<exclusion>);  return true;
-        case kHardLight_Mode:  p->append(rgb_srcover<hardlight>);  return true;
-        case kLighten_Mode:    p->append(rgb_srcover<lighten>);    return true;
-        case kOverlay_Mode:    p->append(rgb_srcover<overlay>);    return true;
-        case kSoftLight_Mode:  p->append(rgb_srcover<softlight>);  return true;
+        case kColorBurn_Mode:  p->append(SkRasterPipeline::colorburn);  return true;
+        case kColorDodge_Mode: p->append(SkRasterPipeline::colordodge); return true;
+        case kDarken_Mode:     p->append(SkRasterPipeline::darken);     return true;
+        case kDifference_Mode: p->append(SkRasterPipeline::difference); return true;
+        case kExclusion_Mode:  p->append(SkRasterPipeline::exclusion);  return true;
+        case kHardLight_Mode:  p->append(SkRasterPipeline::hardlight);  return true;
+        case kLighten_Mode:    p->append(SkRasterPipeline::lighten);    return true;
+        case kOverlay_Mode:    p->append(SkRasterPipeline::overlay);    return true;
+        case kSoftLight_Mode:  p->append(SkRasterPipeline::softlight);  return true;
 
         // TODO
         case kColor_Mode:       return false;
diff --git a/src/effects/SkArithmeticMode.cpp b/src/effects/SkArithmeticMode.cpp
index e142533..81f29bb 100644
--- a/src/effects/SkArithmeticMode.cpp
+++ b/src/effects/SkArithmeticMode.cpp
@@ -30,11 +30,6 @@
 
     void xfer32(SkPMColor[], const SkPMColor[], int count, const SkAlpha[]) const override;
 
-    bool onAppendStages(SkRasterPipeline* p) const override {
-        p->append(&Stage, this);
-        return true;
-    }
-
     SK_TO_STRING_OVERRIDE()
     SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkArithmeticMode_scalar)
 
@@ -45,10 +40,6 @@
 #endif
 
 private:
-    static void SK_VECTORCALL Stage(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                    Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da);
-
     void flatten(SkWriteBuffer& buffer) const override {
         buffer.writeScalar(fK[0]);
         buffer.writeScalar(fK[1]);
@@ -74,27 +65,6 @@
     return SkArithmeticMode::Make(k1, k2, k3, k4, enforcePMColor);
 }
 
-void SK_VECTORCALL SkArithmeticMode_scalar::Stage(SkRasterPipeline::Stage* st,
-                                                  size_t x, size_t tail,
-                                                  Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto self = st->ctx<const SkArithmeticMode_scalar*>();
-
-    const Sk4f k1 = self->fK[0],
-               k2 = self->fK[1],
-               k3 = self->fK[2],
-               k4 = self->fK[3];
-
-    r = k1*r*dr + k2*r + k3*dr + k4;
-    g = k1*g*dg + k2*g + k3*dg + k4;
-    b = k1*b*db + k2*b + k3*db + k4;
-    a = k1*a*da + k2*a + k3*da + k4;
-
-    // A later stage (clamp_01_premul) will pin and fEnforcePMColor for us.
-
-    st->next(x,tail, r,g,b,a, dr,dg,db,da);
-}
-
 void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
                                  int count, const SkAlpha aaCoverage[]) const {
     const Sk4f k1 = fK[0] * (1/255.0f),
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 0509b91..3a37834 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -45,7 +45,36 @@
 
         STAGE(constant_color, true);
 
-        STAGE(srcover, true);
+    #undef STAGE
+
+    #define STAGE(stage) \
+        stages_4  [SkRasterPipeline::stage] = SK_OPTS_NS::stage; \
+        stages_1_3[SkRasterPipeline::stage] = SK_OPTS_NS::stage
+
+        STAGE(dst);
+        STAGE(dstatop);
+        STAGE(dstin);
+        STAGE(dstout);
+        STAGE(dstover);
+        STAGE(srcatop);
+        STAGE(srcin);
+        STAGE(srcout);
+        STAGE(srcover);
+        STAGE(clear);
+        STAGE(modulate);
+        STAGE(multiply);
+        STAGE(plus_);
+        STAGE(screen);
+        STAGE(xor_);
+        STAGE(colorburn);
+        STAGE(colordodge);
+        STAGE(darken);
+        STAGE(difference);
+        STAGE(exclusion);
+        STAGE(hardlight);
+        STAGE(lighten);
+        STAGE(overlay);
+        STAGE(softlight);
     #undef STAGE
 
     }
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index a23940e..fdb15b4 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -48,6 +48,38 @@
     }
 }
 
+// Many xfermodes apply the same logic to each channel.
+#define RGBA_XFERMODE_Sk4f(name)                                                       \
+    static SK_ALWAYS_INLINE Sk4f name##_kernel(const Sk4f& s, const Sk4f& sa,          \
+                                               const Sk4f& d, const Sk4f& da);         \
+    static void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
+                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,                 \
+                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {               \
+        r = name##_kernel(r,a,dr,da);                                                  \
+        g = name##_kernel(g,a,dg,da);                                                  \
+        b = name##_kernel(b,a,db,da);                                                  \
+        a = name##_kernel(a,a,da,da);                                                  \
+        st->next(x,tail, r,g,b,a, dr,dg,db,da);                                        \
+    }                                                                                  \
+    static SK_ALWAYS_INLINE Sk4f name##_kernel(const Sk4f& s, const Sk4f& sa,          \
+                                               const Sk4f& d, const Sk4f& da)
+
+// Most of the rest apply the same logic to color channels and use srcover's alpha logic.
+#define RGB_XFERMODE_Sk4f(name)                                                        \
+    static SK_ALWAYS_INLINE Sk4f name##_kernel(const Sk4f& s, const Sk4f& sa,          \
+                                               const Sk4f& d, const Sk4f& da);         \
+    static void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
+                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,                 \
+                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {               \
+        r = name##_kernel(r,a,dr,da);                                                  \
+        g = name##_kernel(g,a,dg,da);                                                  \
+        b = name##_kernel(b,a,db,da);                                                  \
+        a = a + (da * (1.0f-a));                                                       \
+        st->next(x,tail, r,g,b,a, dr,dg,db,da);                                        \
+    }                                                                                  \
+    static SK_ALWAYS_INLINE Sk4f name##_kernel(const Sk4f& s, const Sk4f& sa,          \
+                                               const Sk4f& d, const Sk4f& da)
+
 namespace SK_OPTS_NS {
 
     // Clamp colors into [0,1] premul (e.g. just before storing back to memory).
@@ -63,6 +95,8 @@
         b = Sk4f::Min(b, a);
     }
 
+    static Sk4f inv(const Sk4f& x) { return 1.0f - x; }
+
     static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
         return from + (to-from)*cov;
     }
@@ -109,14 +143,6 @@
         a = color->a();
     }
 
-    // The default transfer mode is srcover, s' = s + d*(1-sa).
-    KERNEL_Sk4f(srcover) {
-        r += dr*(1.0f - a);
-        g += dg*(1.0f - a);
-        b += db*(1.0f - a);
-        a += da*(1.0f - a);
-    }
-
     // s' = d(1-c) + sc, for a constant c.
     KERNEL_Sk4f(lerp_constant_float) {
         Sk4f c = *(const float*)ctx;
@@ -270,6 +296,63 @@
                          |       Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
     }
 
+    RGBA_XFERMODE_Sk4f(clear)    { return 0.0f; }
+  //RGBA_XFERMODE_Sk4f(src)      { return s; }   // This would be a no-op stage, so we just omit it.
+    RGBA_XFERMODE_Sk4f(dst)      { return d; }
+
+    RGBA_XFERMODE_Sk4f(srcatop)  { return s*da + d*inv(sa); }
+    RGBA_XFERMODE_Sk4f(srcin)    { return s * da; }
+    RGBA_XFERMODE_Sk4f(srcout)   { return s * inv(da); }
+    RGBA_XFERMODE_Sk4f(srcover)  { return s + inv(sa)*d; }
+    RGBA_XFERMODE_Sk4f(dstatop)  { return srcatop_kernel(d,da,s,sa); }
+    RGBA_XFERMODE_Sk4f(dstin)    { return srcin_kernel  (d,da,s,sa); }
+    RGBA_XFERMODE_Sk4f(dstout)   { return srcout_kernel (d,da,s,sa); }
+    RGBA_XFERMODE_Sk4f(dstover)  { return srcover_kernel(d,da,s,sa); }
+
+    RGBA_XFERMODE_Sk4f(modulate) { return s*d; }
+    RGBA_XFERMODE_Sk4f(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
+    RGBA_XFERMODE_Sk4f(plus_)    { return s + d; }
+    RGBA_XFERMODE_Sk4f(screen)   { return s + d - s*d; }
+    RGBA_XFERMODE_Sk4f(xor_)     { return s*inv(da) + d*inv(sa); }
+
+    RGB_XFERMODE_Sk4f(colorburn) {
+        return (d == da  ).thenElse(d + s*inv(da),
+               (s == 0.0f).thenElse(s + d*inv(sa),
+                                    sa*(da - Sk4f::Min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
+    }
+    RGB_XFERMODE_Sk4f(colordodge) {
+        return (d == 0.0f).thenElse(d + s*inv(da),
+               (s == sa  ).thenElse(s + d*inv(sa),
+                                    sa*Sk4f::Min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
+    }
+    RGB_XFERMODE_Sk4f(darken)     { return s + d - Sk4f::Max(s*da, d*sa); }
+    RGB_XFERMODE_Sk4f(difference) { return s + d - 2.0f*Sk4f::Min(s*da,d*sa); }
+    RGB_XFERMODE_Sk4f(exclusion)  { return s + d - 2.0f*s*d; }
+    RGB_XFERMODE_Sk4f(hardlight) {
+        return s*inv(da) + d*inv(sa)
+             + (2.0f*s <= sa).thenElse(2.0f*s*d, sa*da - 2.0f*(da-d)*(sa-s));
+    }
+    RGB_XFERMODE_Sk4f(lighten) { return s + d - Sk4f::Min(s*da, d*sa); }
+    RGB_XFERMODE_Sk4f(overlay) { return hardlight_kernel(d,da,s,sa); }
+    RGB_XFERMODE_Sk4f(softlight) {
+        Sk4f m  = (da > 0.0f).thenElse(d / da, 0.0f),
+             s2 = 2.0f*s,
+             m4 = 4.0f*m;
+
+        // The logic forks three ways:
+        //    1. dark src?
+        //    2. light src, dark dst?
+        //    3. light src, light dst?
+        Sk4f darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
+             darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
+             liteDst = m.rsqrt().invert() - m,            // Used in case 3.
+             liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst);  // 2 or 3?
+        return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc);  // 1 or (2 or 3)?
+    }
 }
 
+#undef KERNEL_Sk4f
+#undef RGBA_XFERMODE_Sk4f
+#undef RGB_XFERMODE_Sk4f
+
 #endif//SkRasterPipeline_opts_DEFINED
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index ccc728e..282b234 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -6,69 +6,31 @@
  */
 
 #include "Test.h"
+#include "SkHalf.h"
 #include "SkRasterPipeline.h"
 
-static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                               Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                               Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const float*>() + x;
-    switch(tail&3) {
-        case 0: a = Sk4f{ptr[3]};
-        case 3: b = Sk4f{ptr[2]};
-        case 2: g = Sk4f{ptr[1]};
-        case 1: r = Sk4f{ptr[0]};
-    }
-    st->next(x,tail, r,g,b,a, dr,dg,db,da);
-}
-
-static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                 Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                 Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    r *= r;
-    g *= g;
-    b *= b;
-    a *= a;
-    st->next(x,tail, r,g,b,a, dr,dg,db,da);
-}
-
-static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x, size_t tail,
-                                Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
-                                Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<float*>() + x;
-    switch (tail&3) {
-        case 0: ptr[3] = a[0];
-        case 3: ptr[2] = b[0];
-        case 2: ptr[1] = g[0];
-        case 1: ptr[0] = r[0];
-    }
-}
-
 DEF_TEST(SkRasterPipeline, r) {
-    // We'll build up and run a simple pipeline that exercises the salient
-    // mechanics of SkRasterPipeline:
-    //    - context pointers                           (load,store)
-    //    - stages sensitive to the number of pixels   (load,store)
-    //    - stages insensitive to the number of pixels (square)
-    //    - stages that chain to the next stage        (load,square)
-    //    - stages that terminate the pipeline         (store)
-    //
-    // This pipeline loads up some values, squares them, then writes them back to memory.
+    // Build and run a simple pipeline to exercise SkRasterPipeline,
+    // drawing 50% transparent blue over opaque red in half-floats.
 
-    const float src_vals[] = { 1,2,3,4,5 };
-    float       dst_vals[] = { 0,0,0,0,0 };
+    Sk4h red  = SkFloatToHalf_finite_ftz({ 1.0f, 0.0f, 0.0f, 1.0f }),
+         blue = SkFloatToHalf_finite_ftz({ 0.0f, 0.0f, 0.5f, 0.5f }),
+         result;
 
     SkRasterPipeline p;
-    p.append(load, src_vals);
-    p.append(square);
-    p.append(store, dst_vals);
+    p.append(SkRasterPipeline::load_s_f16, &blue);
+    p.append(SkRasterPipeline::load_d_f16, &red);
+    p.append(SkRasterPipeline::srcover);
+    p.append(SkRasterPipeline::store_f16, &result);
+    p.run(1);
 
-    p.run(5);
+    Sk4f f = SkHalfToFloat_finite_ftz(result);
 
-    REPORTER_ASSERT(r, dst_vals[0] ==  1);
-    REPORTER_ASSERT(r, dst_vals[1] ==  4);
-    REPORTER_ASSERT(r, dst_vals[2] ==  9);
-    REPORTER_ASSERT(r, dst_vals[3] == 16);
-    REPORTER_ASSERT(r, dst_vals[4] == 25);
+    // We should see half-intensity magenta.
+    REPORTER_ASSERT(r, f[0] == 0.5f);
+    REPORTER_ASSERT(r, f[1] == 0.0f);
+    REPORTER_ASSERT(r, f[2] == 0.5f);
+    REPORTER_ASSERT(r, f[3] == 1.0f);
 }
 
 DEF_TEST(SkRasterPipeline_empty, r) {
@@ -79,8 +41,8 @@
 
 DEF_TEST(SkRasterPipeline_nonsense, r) {
     // No asserts... just a test that this is safe to run and terminates.
-    // square() always calls st->next(); this makes sure we've always got something there to call.
+    // srcover() calls st->next(); this makes sure we've always got something there to call.
     SkRasterPipeline p;
-    p.append(square);
+    p.append(SkRasterPipeline::srcover);
     p.run(20);
 }