SkNx_abi for passing Sk4f as function arguments, etc.

CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3422

Change-Id: Idc0a192faa7ff843aef023229186580c69baf1f7
Reviewed-on: https://skia-review.googlesource.com/3422
Reviewed-by: Mike Klein <mtklein@chromium.org>
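
The SkNx_abi types added below are data-only mirrors of the SkNx vectors, so
they can live in structs and in function signatures that are shared across
compilation units, with SkNx doing the real math in between.  A minimal
sketch of the intended use (scale_rgba() is a hypothetical example, not part
of this patch):

    // shared header: only the data-only type appears in the signature
    SkNx_abi<4,float> scale_rgba(SkNx_abi<4,float> color, float scale);

    // some .cpp, possibly built with different vector flags
    SkNx_abi<4,float> scale_rgba(SkNx_abi<4,float> color, float scale) {
        Sk4f c = color;            // SkNx(const SkNx_abi&) converting constructor
        return c * Sk4f(scale);    // returns via operator SkNx_abi<4,float>()
    }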
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index afba75a..6d9af9f 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -16,6 +16,11 @@
 #include <math.h>
 #include <type_traits>
 
+// These _abi types are data-only, and so can be used to store SkNx in structs or
+// pass them as function parameters or return values, even across compilation units.
+template <int N, typename T> struct SkNx_abi      { SkNx_abi<N/2,T> lo, hi; };
+template <       typename T> struct SkNx_abi<1,T> {              T     val; };
+
 namespace {
 
 #define SI static inline
@@ -42,6 +47,9 @@
         static_assert(N==16, "");
     }
 
+    SkNx(const SkNx_abi<N,T>& a) : fLo(a.lo), fHi(a.hi) {}
+    operator SkNx_abi<N,T>() const { return { (SkNx_abi<N/2,T>)fLo, (SkNx_abi<N/2,T>)fHi }; }
+
     T operator[](int k) const {
         SkASSERT(0 <= k && k < N);
         return k < N/2 ? fLo[k] : fHi[k-N/2];
@@ -129,6 +137,9 @@
     SkNx() = default;
     SkNx(T v) : fVal(v) {}
 
+    SkNx(const SkNx_abi<1,T>& a) : fVal(a.val) {}
+    operator SkNx_abi<1,T>() const { return { fVal }; }
+
     // Android complains against unused parameters, so we guard it
     T operator[](int SkDEBUGCODE(k)) const {
         SkASSERT(k == 0);
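
The generic SkNx_abi<N,T> recurses into lo/hi halves down to a single T (the
SSE and NEON headers below specialize the 4-float case onto __m128 and
float32x4_t), so on every path it stays plain old data with the same size as
the SkNx it mirrors.  A compile-time sanity check, sketched here rather than
taken from the patch:

    static_assert(std::is_trivially_copyable<SkNx_abi<4,float>>::value, "");
    static_assert(sizeof(SkNx_abi<4,float>) == 4*sizeof(float), "");
    static_assert(sizeof(SkNx_abi<8,float>) == 8*sizeof(float), "");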
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 05947fc..c6b85ad 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -57,9 +57,9 @@
 public:
     struct Stage;
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
-    using V = Sk8f;
+    using V = SkNx_abi<8,float>;
 #else
-    using V = Sk4f;
+    using V = SkNx_abi<4,float>;
 #endif
     using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, V,V,V,V,
                                                              V,V,V,V);
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 534bb0e..abdebe2 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -12,6 +12,8 @@
 
 #define SKNX_IS_FAST
 
+template <> struct SkNx_abi<4,float> { float32x4_t vec; };
+
 namespace {
 
 // ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it:
@@ -111,6 +113,9 @@
     SkNx(float val) : fVec(vdupq_n_f32(val)) {}
     SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
 
+    SkNx(const SkNx_abi<4,float>& a) : fVec(a.vec) {}
+    operator SkNx_abi<4,float>() const { return { fVec }; }
+
     static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
     void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 28d2cde..25fb0b7 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -15,6 +15,8 @@
 
 #define SKNX_IS_FAST
 
+template <> struct SkNx_abi<4,float> { __m128 vec; };
+
 namespace {
 
 template <>
@@ -71,6 +73,9 @@
     SkNx(float val)           : fVec( _mm_set1_ps(val) ) {}
     SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
 
+    SkNx(const SkNx_abi<4,float>& a) : fVec(a.vec) {}
+    operator SkNx_abi<4,float>() const { return { fVec }; }
+
     static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
     void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }
 
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 52b02d0..8562388 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -13,35 +13,38 @@
 #include "SkRasterPipeline.h"
 #include "SkSRGB.h"
 
-using SkNf = SkRasterPipeline::V;
-static constexpr auto N = sizeof(SkNf) / sizeof(float);
+using SkNf_abi = SkRasterPipeline::V;
+static constexpr auto N = sizeof(SkNf_abi) / sizeof(float);
+using SkNf = SkNx<N, float>;
 using SkNi = SkNx<N, int>;
 using SkNh = SkNx<N, uint16_t>;
 
 #define SI static inline
 
-#define STAGE(name, kCallNext)                                                             \
-    static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail,           \
-                                               SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,     \
-                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);    \
-    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,         \
-                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                         \
-                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                       \
-        name##_kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);                        \
-        if (kCallNext) {                                                                   \
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                        \
-        }                                                                                  \
-    }                                                                                      \
-    SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,  \
-                                      SkNf  r, SkNf  g, SkNf  b, SkNf  a,                  \
-                                      SkNf dr, SkNf dg, SkNf db, SkNf da) {                \
-        name##_kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);                     \
-        if (kCallNext) {                                                                   \
-            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                        \
-        }                                                                                  \
-    }                                                                                      \
-    static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail,           \
-                                               SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,     \
+#define STAGE(name, kCallNext)                                                              \
+    static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail,            \
+                                               SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,      \
+                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);     \
+    SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,          \
+                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,          \
+                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {        \
+        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                      \
+        name##_kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);                         \
+        if (kCallNext) {                                                                    \
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                         \
+        }                                                                                   \
+    }                                                                                       \
+    SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,   \
+                                      SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,   \
+                                      SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
+        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                      \
+        name##_kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);                      \
+        if (kCallNext) {                                                                    \
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);                                         \
+        }                                                                                   \
+    }                                                                                       \
+    static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail,            \
+                                               SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,      \
                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
 
 
@@ -50,8 +53,9 @@
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da);             \
     SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,         \
-                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                         \
-                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                       \
+                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,         \
+                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {       \
+        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                     \
         r = name##_kernel(r,a,dr,da);                                                      \
         g = name##_kernel(g,a,dg,da);                                                      \
         b = name##_kernel(b,a,db,da);                                                      \
@@ -66,8 +70,9 @@
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,              \
                                                const SkNf& d, const SkNf& da);             \
     SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail,         \
-                               SkNf  r, SkNf  g, SkNf  b, SkNf  a,                         \
-                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                       \
+                               SkNf_abi  R, SkNf_abi  G, SkNf_abi  B, SkNf_abi  A,         \
+                               SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) {       \
+        SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA;                                     \
         r = name##_kernel(r,a,dr,da);                                                      \
         g = name##_kernel(g,a,dg,da);                                                      \
         b = name##_kernel(b,a,db,da);                                                      \
@@ -85,7 +90,7 @@
                          void (*vTailStart)(), SkRasterPipeline::Stage* tail) {
         auto bodyStart = (SkRasterPipeline::Fn)vBodyStart,
              tailStart = (SkRasterPipeline::Fn)vTailStart;
-        SkNf v;  // Fastest to start uninitialized.
+        SkNf v{0};  // TODO: uninitialized would be a bit faster, but some compilers are whiny.
         while (n >= N) {
             bodyStart(body, x,0, v,v,v,v, v,v,v,v);
             x += N;
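
Stage bodies themselves are untouched by all of this: the SkNf_abi round-trip
happens entirely inside the macros, and a stage is still written against SkNf
references.  A sketch of what a stage definition looks like (scale_by_constant
and its body are hypothetical, not taken from this patch):

    STAGE(scale_by_constant, true) {
        SkNf scale = *(const float*)ctx;   // ctx carries the constant; broadcast to every lane
        r = r * scale;
        g = g * scale;
        b = b * scale;
        a = a * scale;
    }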