SkRasterPipeline refactor

  - Give body and tail functions separate types.  This frees a register in body functions, which is especially important on Windows.  (See the sketch below.)

  - Fill out default, SSE4.1, and HSW versions of all functions.  This means we no longer have to mess around with SkNf_abi: all functions now come from the same compilation unit, where SkNf is a single, consistent type.

  - Move Stage::next() into SkRasterPipeline_opts.h as a static inline function.

  - Remove Stage::ctx() entirely; it was just a cast of fCtx, so stages now read fCtx directly.

This is a step toward building the entire pipeline in src/opts, removing the need for all the stages to be functions living in SkOpts.
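
For reference, a condensed sketch of the new shape (mirroring the SkRasterPipeline_opts.h hunks below, and assuming the SkNf/Stage definitions from that header; not a literal copy):

    // Body stages no longer take a tail count; tail stages still do.
    using Body = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t,
                                       SkNf,SkNf,SkNf,SkNf, SkNf,SkNf,SkNf,SkNf);
    using Tail = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t, size_t,
                                       SkNf,SkNf,SkNf,SkNf, SkNf,SkNf,SkNf,SkNf);

    // next() becomes a free static inline helper instead of a Stage method.
    template <typename Fn, typename... Args>
    static inline void next(SkRasterPipeline::Stage* st, Args&&... args) {
        // Stages are contiguous in an array; the next stage is the next element.
        ((Fn)st->fNext)(st+1, std::forward<Args>(args)...);
    }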

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3680
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot

Change-Id: I7de78ffebc15b9bad4eda187c9f50369cd7e5e42
Reviewed-on: https://skia-review.googlesource.com/3680
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 627f0e3..1638948 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -144,8 +144,8 @@
     static_assert(SK_ARRAY_COUNT(body) == SkRasterPipeline::kNumStockStages, "");
 
     SkOpts::VoidFn tail[] = {
-        (SkOpts::VoidFn)SK_OPTS_NS::just_return,
-        (SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst,
+        (SkOpts::VoidFn)SK_OPTS_NS::just_return_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst_tail,
 
         (SkOpts::VoidFn)SK_OPTS_NS::store_565_tail,
         (SkOpts::VoidFn)SK_OPTS_NS::store_srgb_tail,
@@ -163,34 +163,34 @@
 
         (SkOpts::VoidFn)SK_OPTS_NS::lerp_u8_tail,
         (SkOpts::VoidFn)SK_OPTS_NS::lerp_565_tail,
-        (SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float,
+        (SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float_tail,
 
-        (SkOpts::VoidFn)SK_OPTS_NS::constant_color,
+        (SkOpts::VoidFn)SK_OPTS_NS::constant_color_tail,
 
-        (SkOpts::VoidFn)SK_OPTS_NS::dst,
-        (SkOpts::VoidFn)SK_OPTS_NS::dstatop,
-        (SkOpts::VoidFn)SK_OPTS_NS::dstin,
-        (SkOpts::VoidFn)SK_OPTS_NS::dstout,
-        (SkOpts::VoidFn)SK_OPTS_NS::dstover,
-        (SkOpts::VoidFn)SK_OPTS_NS::srcatop,
-        (SkOpts::VoidFn)SK_OPTS_NS::srcin,
-        (SkOpts::VoidFn)SK_OPTS_NS::srcout,
-        (SkOpts::VoidFn)SK_OPTS_NS::srcover,
-        (SkOpts::VoidFn)SK_OPTS_NS::clear,
-        (SkOpts::VoidFn)SK_OPTS_NS::modulate,
-        (SkOpts::VoidFn)SK_OPTS_NS::multiply,
-        (SkOpts::VoidFn)SK_OPTS_NS::plus_,
-        (SkOpts::VoidFn)SK_OPTS_NS::screen,
-        (SkOpts::VoidFn)SK_OPTS_NS::xor_,
-        (SkOpts::VoidFn)SK_OPTS_NS::colorburn,
-        (SkOpts::VoidFn)SK_OPTS_NS::colordodge,
-        (SkOpts::VoidFn)SK_OPTS_NS::darken,
-        (SkOpts::VoidFn)SK_OPTS_NS::difference,
-        (SkOpts::VoidFn)SK_OPTS_NS::exclusion,
-        (SkOpts::VoidFn)SK_OPTS_NS::hardlight,
-        (SkOpts::VoidFn)SK_OPTS_NS::lighten,
-        (SkOpts::VoidFn)SK_OPTS_NS::overlay,
-        (SkOpts::VoidFn)SK_OPTS_NS::softlight,
+        (SkOpts::VoidFn)SK_OPTS_NS::dst_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::dstatop_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::dstin_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::dstout_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::dstover_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::srcatop_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::srcin_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::srcout_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::srcover_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::clear_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::modulate_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::multiply_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::plus__tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::screen_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::xor__tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::colorburn_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::colordodge_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::darken_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::difference_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::exclusion_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::hardlight_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::lighten_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::overlay_tail,
+        (SkOpts::VoidFn)SK_OPTS_NS::softlight_tail,
     };
     static_assert(SK_ARRAY_COUNT(tail) == SkRasterPipeline::kNumStockStages, "");
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index c6b85ad..525a8db 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -55,33 +55,13 @@
 
 class SkRasterPipeline {
 public:
-    struct Stage;
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
-    using V = SkNx_abi<8,float>;
-#else
-    using V = SkNx_abi<4,float>;
-#endif
-    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, V,V,V,V,
-                                                             V,V,V,V);
-
     struct Stage {
-        template <typename T>
-        T ctx() { return static_cast<T>(fCtx); }
-
-        void SK_VECTORCALL next(size_t x, size_t tail, V v0, V v1, V v2, V v3,
-                                                       V v4, V v5, V v6, V v7) {
-            // Stages are logically a pipeline, and physically are contiguous in an array.
-            // To get to the next stage, we just increment our pointer to the next array element.
-            ((Fn)fNext)(this+1, x,tail, v0,v1,v2,v3, v4,v5,v6,v7);
-        }
-
         // It makes next() a good bit cheaper if we hold the next function to call here,
         // rather than logically simpler choice of the function implementing this stage.
         void (*fNext)();
         void* fCtx;
     };
 
-
     SkRasterPipeline();
 
     // Run the pipeline constructed with append(), walking x through [x,x+n),
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index ecbb3e7..3c5d19d 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -35,11 +35,6 @@
 
         STAGE(lerp_u8);
         STAGE(lerp_565);
-    #undef STAGE
-
-    #define STAGE(stage)                                                   \
-        body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
-        tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
 
         STAGE(just_return);
         STAGE(swap_src_dst);
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index d55978c..4e277aa 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -21,7 +21,7 @@
         srcover_srgb_srgb    = sse41::srcover_srgb_srgb;
         blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
 
-    #define STAGE(stage)                                                       \
+    #define STAGE(stage)                                                        \
         body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage;      \
         tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage##_tail
 
@@ -41,40 +41,34 @@
 
         STAGE(lerp_u8);
         STAGE(lerp_565);
-    #undef STAGE
 
-    #define STAGE(stage)                                                  \
-        body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
-        tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
+        STAGE(just_return);
+        STAGE(swap_src_dst);
+        STAGE(lerp_constant_float);
+        STAGE(constant_color);
 
-      // The commented-out stages don't actually benefit from SSE 4.1.
-      // To cut down on code bloat we skip them here, using the identical SSE2 defaults.
-
-      //STAGE(lerp_constant_float);
-      //STAGE(constant_color);
-
-      //STAGE(dst);
-      //STAGE(dstatop);
-      //STAGE(dstin);
-      //STAGE(dstout);
-      //STAGE(dstover);
-      //STAGE(srcatop);
-      //STAGE(srcin);
-      //STAGE(srcout);
-      //STAGE(srcover);
-      //STAGE(clear);
-      //STAGE(modulate);
-      //STAGE(multiply);
-      //STAGE(plus_);
-      //STAGE(screen);
-      //STAGE(xor_);
+        STAGE(dst);
+        STAGE(dstatop);
+        STAGE(dstin);
+        STAGE(dstout);
+        STAGE(dstover);
+        STAGE(srcatop);
+        STAGE(srcin);
+        STAGE(srcout);
+        STAGE(srcover);
+        STAGE(clear);
+        STAGE(modulate);
+        STAGE(multiply);
+        STAGE(plus_);
+        STAGE(screen);
+        STAGE(xor_);
         STAGE(colorburn);
         STAGE(colordodge);
-      //STAGE(darken);
-      //STAGE(difference);
-      //STAGE(exclusion);
+        STAGE(darken);
+        STAGE(difference);
+        STAGE(exclusion);
         STAGE(hardlight);
-      //STAGE(lighten);
+        STAGE(lighten);
         STAGE(overlay);
         STAGE(softlight);
     #undef STAGE
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 3266554..7e884c7 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -12,36 +12,53 @@
 #include "SkPM4f.h"
 #include "SkRasterPipeline.h"
 #include "SkSRGB.h"
+#include <utility>
 
-using SkNf_abi = SkRasterPipeline::V;
-static constexpr auto N = sizeof(SkNf_abi) / sizeof(float);
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+    static constexpr int N = 8;
+#else
+    static constexpr int N = 4;
+#endif
+
 using SkNf = SkNx<N, float>;
 using SkNi = SkNx<N, int>;
 using SkNh = SkNx<N, uint16_t>;
 
+using Body = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t,
+                                   SkNf,SkNf,SkNf,SkNf,
+                                   SkNf,SkNf,SkNf,SkNf);
+using Tail = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t, size_t,
+                                   SkNf,SkNf,SkNf,SkNf,
+                                   SkNf,SkNf,SkNf,SkNf);
+
 #define SI static inline
 
+template <typename Fn, typename... Args>
+SI void next(SkRasterPipeline::Stage* st, Args&&... args) {
+    // Stages are logically a pipeline, and physically are contiguous in an array.
+    // To get to the next stage, we just increment our pointer to the next array element.
+    ((Fn)st->fNext)(st+1, std::forward<Args>(args)...);
+}
+
 #define STAGE(name, kCallNext)                                                              \
     template <bool kIsTail>                                                                 \
     static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail,            \
                                                SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,      \
                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);     \
-    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,          \
-                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,          \
-                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {        \
-        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                      \
-        name##_kernel<false>(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);                  \
+    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x,                       \
+                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                          \
+                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                        \
+        name##_kernel<false>(st->fCtx, x,0, r,g,b,a, dr,dg,db,da);                          \
         if (kCallNext) {                                                                    \
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                         \
+            next<Body>(st, x, r,g,b,a, dr,dg,db,da);                                        \
         }                                                                                   \
     }                                                                                       \
     SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,   \
-                                      SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,   \
-                                      SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
-        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                      \
-        name##_kernel<true>(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);                \
+                                      SkNf  r, SkNf  g, SkNf  b, SkNf  a,                   \
+                                      SkNf dr, SkNf dg, SkNf db, SkNf da) {                 \
+        name##_kernel<true>(st->fCtx, x,tail, r,g,b,a, dr,dg,db,da);                        \
         if (kCallNext) {                                                                    \
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                         \
+            next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da);                                   \
         }                                                                                   \
     }                                                                                       \
     template <bool kIsTail>                                                                 \
@@ -54,15 +71,23 @@
 #define RGBA_XFERMODE(name)                                                                \
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da);             \
-    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,         \
-                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,         \
-                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {       \
-        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                     \
+    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x,                      \
+                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                         \
+                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                       \
         r = name##_kernel(r,a,dr,da);                                                      \
         g = name##_kernel(g,a,dg,da);                                                      \
         b = name##_kernel(b,a,db,da);                                                      \
         a = name##_kernel(a,a,da,da);                                                      \
-        st->next(x,tail, r,g,b,a, dr,dg,db,da);                                            \
+        next<Body>(st, x, r,g,b,a, dr,dg,db,da);                                           \
+    }                                                                                      \
+    SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,  \
+                                      SkNf  r, SkNf  g, SkNf  b, SkNf  a,                  \
+                                      SkNf dr, SkNf dg, SkNf db, SkNf da) {                \
+        r = name##_kernel(r,a,dr,da);                                                      \
+        g = name##_kernel(g,a,dg,da);                                                      \
+        b = name##_kernel(b,a,db,da);                                                      \
+        a = name##_kernel(a,a,da,da);                                                      \
+        next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da);                                      \
     }                                                                                      \
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da)
@@ -71,15 +96,23 @@
 #define RGB_XFERMODE(name)                                                                 \
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da);             \
-    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,         \
-                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,         \
-                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {       \
-        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                     \
+    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x,                      \
+                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                         \
+                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                       \
         r = name##_kernel(r,a,dr,da);                                                      \
         g = name##_kernel(g,a,dg,da);                                                      \
         b = name##_kernel(b,a,db,da);                                                      \
         a = a + (da * (1.0f-a));                                                           \
-        st->next(x,tail, r,g,b,a, dr,dg,db,da);                                            \
+        next<Body>(st, x, r,g,b,a, dr,dg,db,da);                                           \
+    }                                                                                      \
+    SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,  \
+                                      SkNf  r, SkNf  g, SkNf  b, SkNf  a,                  \
+                                      SkNf dr, SkNf dg, SkNf db, SkNf da) {                \
+        r = name##_kernel(r,a,dr,da);                                                      \
+        g = name##_kernel(g,a,dg,da);                                                      \
+        b = name##_kernel(b,a,db,da);                                                      \
+        a = a + (da * (1.0f-a));                                                           \
+        next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da);                                      \
     }                                                                                      \
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da)
@@ -90,11 +123,11 @@
     SI void run_pipeline(size_t x, size_t n,
                          void (*vBodyStart)(), SkRasterPipeline::Stage* body,
                          void (*vTailStart)(), SkRasterPipeline::Stage* tail) {
-        auto bodyStart = (SkRasterPipeline::Fn)vBodyStart,
-             tailStart = (SkRasterPipeline::Fn)vTailStart;
-        SkNf v{0};  // TODO: uninitialized would be a bit faster, but some compilers are whiny.
+        auto bodyStart = (Body)vBodyStart;
+        auto tailStart = (Tail)vTailStart;
+        SkNf v;  // Fastest to start uninitialized.
         while (n >= N) {
-            bodyStart(body, x,0, v,v,v,v, v,v,v,v);
+            bodyStart(body, x, v,v,v,v, v,v,v,v);
             x += N;
             n -= N;
         }