much better idea

Instead of trying to generate snippets of code that can work both when
splicing and when interpreting, let's just make one snippet for each.
And then, let's add one more portable single-float version!

Why build one when you can have three at thrice the price?

This doesn't make the new strategies active yet, but it does start moving
the code in build_stages.py and SkSplicer_stages.cpp in that direction.
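
Rough sketch of the JUMPER/portable calling convention, since nothing in
this CL drives it yet: stages chain themselves by walking an interlaced
array of context and Stage pointers, each stage tail-calling into the next.
The stage names and the simplified Stage signature below are made up purely
for illustration:

    #include <cstddef>
    #include <cstdio>

    // Simplified Stage signature; the real one also threads K* and 8 color values.
    using Stage = void(size_t x, void** program, float v);

    // Same idea as load_and_inc() in SkSplicer_stages.cpp: pop the next pointer.
    static void* load_and_inc(void**& program) { return *program++; }

    // A stage that uses its ctx pointer (here, a float to add), then tail-calls on.
    static void add_k(size_t x, void** program, float v) {
        float delta = *(const float*)load_and_inc(program);  // this stage's ctx
        auto next   = (Stage*)load_and_inc(program);          // next stage
        next(x, program, v + delta);
    }

    // A terminal stage: pops its (unused) ctx slot and ends the chain.
    static void print_k(size_t x, void** program, float v) {
        (void)load_and_inc(program);
        printf("x=%zu v=%g\n", x, v);
    }

    int main() {
        float delta = 2.0f;
        void* program[] = {
            (void*)add_k,   (void*)&delta,   // stage 0 and its context
            (void*)print_k, nullptr,         // stage 1 (context unused)
        };
        void** p = program;
        auto start = (Stage*)load_and_inc(p);  // hop to the first stage...
        start(0, p, 1.0f);                     // ...which prints "x=0 v=3"
    }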

Change-Id: I2197fa2a11e233f24da67e54a4f291135652b087
Reviewed-on: https://skia-review.googlesource.com/8453
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index 7f9a5b3..19daee3 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -8,14 +8,31 @@
 #include "SkSplicer_shared.h"
 #include <string.h>
 
-#if !defined(__clang__)
-    #error This file is not like the rest of Skia.  It must be compiled with clang.
-#endif
-
 // It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
 using K = const SkSplicer_constants;
 
-#if defined(__aarch64__)
+#if !defined(SPLICER) && !defined(JUMPER)
+    // This path should lead to portable code that can be compiled directly into Skia.
+    // (All other paths are compiled offline by Clang into SkSplicer_generated.h.)
+    #include <math.h>
+
+    using F   = float;
+    using I32 =  int32_t;
+    using U32 = uint32_t;
+    using U8  = uint8_t;
+
+    static F   fma(F f, F m, F a)  { return f*m+a; }
+    static F   min(F a, F b)       { return fminf(a,b); }
+    static F   max(F a, F b)       { return fmaxf(a,b); }
+    static F   rcp  (F v)          { return 1.0f / v; }
+    static F   rsqrt(F v)          { return 1.0f / sqrtf(v); }
+    static U32 round(F v, F scale) { return (uint32_t)(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return c ? t : e; }
+
+    static F gather(const float* p, U32 ix) { return p[ix]; }
+
+#elif defined(__aarch64__)
     #include <arm_neon.h>
 
     // Since we know we're using Clang, we can use its vector extensions.
@@ -35,6 +52,7 @@
     static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
 
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+
 #elif defined(__ARM_NEON__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -57,6 +75,7 @@
     static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
 
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
+
 #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
     #include <immintrin.h>
 
@@ -76,6 +95,7 @@
     static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
 
     static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+
 #elif defined(__SSE2__)
     #include <immintrin.h>
 
@@ -102,8 +122,16 @@
     static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 #endif
 
-static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
-static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
+// We need to be careful with casts.
+// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
+// These named casts and bit_cast() are always what they seem to be.
+#if !defined(SPLICER) && !defined(JUMPER)
+    static F   cast  (U32 v) { return (F)v; }
+    static U32 expand(U8  v) { return (U32)v; }
+#else
+    static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
+    static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
+#endif
 
 template <typename T, typename P>
 static T unaligned_load(const P* p) {
@@ -112,55 +140,115 @@
     return v;
 }
 
-using F4 = float __attribute__((ext_vector_type(4)));
+template <typename Dst, typename Src>
+static Dst bit_cast(const Src& src) {
+    static_assert(sizeof(Dst) == sizeof(Src), "");
+    return unaligned_load<Dst>(&src);
+}
+
+// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
+#if !defined(SPLICER) && !defined(JUMPER)
+    struct F4 {
+        float vals[4];
+        float operator[](int i) const { return vals[i]; }
+    };
+#else
+    using F4 = float __attribute__((ext_vector_type(4)));
+#endif
 
 // We'll be compiling this file to an object file, then extracting parts of it into
 // SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
 #define C extern "C"
 
-// Stages all fit a common interface that allows SkSplicer to splice them together.
-using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
+#if defined(SPLICER)
+    // Splicer Stages all fit a common interface that allows SkSplicer to splice them together.
+    // (This is just for reference... nothing uses this type when we're in Splicer mode.)
+    using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
 
-// Stage's arguments act as the working set of registers within the final spliced function.
-// Here's a little primer on the x86-64/aarch64 ABIs:
-//   x:         rdi/x0          x and limit work to drive the loop, see loop_start in SkSplicer.cpp.
-//   limit:     rsi/x1
-//   ctx:       rdx/x2          Look for set_ctx in SkSplicer.cpp to see how this works.
-//   k:         rcx/x3
-//   vectors:   ymm0-ymm7/v0-v7
+    // Stage's arguments act as the working set of registers within the final spliced function.
+    // Here's a little primer on the x86-64/aarch64 ABIs:
+    //   x:         rdi/x0   x and limit work to drive the loop, see loop_start in SkSplicer.cpp.
+    //   limit:     rsi/x1
+    //   ctx:       rdx/x2   Look for set_ctx in SkSplicer.cpp to see how this works.
+    //   k:         rcx/x3
+    //   vectors:   ymm0-ymm7/v0-v7
 
+    // done() is the key to this entire splicing strategy.
+    //
+    // It matches the signature of Stage, so all the registers are kept live.
+    // Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
+    // which marks the point where we can splice one Stage onto the next.
+    //
+    // The lovely bit is that we don't have to define done(), just declare it.
+    C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
 
-// done() is the key to this entire splicing strategy.
-//
-// It matches the signature of Stage, so all the registers are kept live.
-// Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
-// which marks the point where we can splice one Stage onto the next.
-//
-// The lovely bit is that we don't have to define done(), just declare it.
-C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
+    // This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
+    // It's just a convenience to make a valid, spliceable Stage, nothing magic.
+    #define STAGE(name)                                                           \
+        static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+        C void name(size_t x, size_t limit, void* ctx, K* k,                      \
+                    F r, F g, F b, F a, F dr, F dg, F db, F da) {                 \
+            name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
+            done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
+        }                                                                         \
+        static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#else
+    // Jumper and portable Stages tail call between each other by following
+    // program, an interlaced sequence of Stage pointers and context pointers.
+    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
 
-// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
-// It's just a convenience to make a valid, spliceable Stage, nothing magic.
-#define STAGE(name)                                                           \
-    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
-    C void name(size_t x, size_t limit, void* ctx, K* k,                      \
-                F r, F g, F b, F a, F dr, F dg, F db, F da) {                 \
-        name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
-        done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
-    }                                                                         \
-    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+    static void* load_and_inc(void**& program) {
+    #if defined(__GNUC__) && defined(__x86_64__)
+        // Passing program as the second Stage argument makes it likely that it's in %rsi,
+        // so this is usually a single instruction *program++.
+        void* rax;
+        asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
+        return rax;
+        // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
+        //    mov    (%rsi), %rcx     // ctx  = program[0]
+        //    ...
+        //    mov 0x8(%rsi), %rax     // next = program[1]
+        //    add $0x10, %rsi         // program += 2
+        //    jmpq *%rax              // JUMP!
+        // becomes
+        //    lods   %ds:(%rsi),%rax  // ctx  = *program++;
+        //    ...
+        //    lods   %ds:(%rsi),%rax  // next = *program++;
+        //    jmpq *%rax              // JUMP!
+        //
+        // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
+        // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
+    #else
+        // On ARM *program++ compiles into a single instruction without any handholding.
+        return *program++;
+    #endif
+    }
+
+    #define STAGE(name)                                                           \
+        static void name##_k(size_t& x, void* ctx, K* k,                          \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+        C void name(size_t x, void** program, K* k,                               \
+                    F r, F g, F b, F a, F dr, F dg, F db, F da) {                 \
+            auto ctx = load_and_inc(program);                                     \
+            name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);                              \
+            auto next = (Stage*)load_and_inc(program);                            \
+            next(x,program,k, r,g,b,a, dr,dg,db,da);                              \
+        }                                                                         \
+        static void name##_k(size_t& x, void* ctx, K* k,                          \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
 
 // We can now define Stages!
 
 // Some things to keep in mind while writing Stages:
 //   - do not branch;                                       (i.e. avoid jmp)
-//   - do not call functions that don't inline;             (i.e. avoid call, ret, stack use)
+//   - do not call functions that don't inline;             (i.e. avoid call, ret)
 //   - do not use constant literals other than 0 and 0.0f.  (i.e. avoid rip relative addressing)
 //
 // Some things that should work fine:
-//   - 0 and 0.0f;
+//   - 0, ~0, and 0.0f;
 //   - arithmetic;
 //   - functions of F and U32 that we've defined above;
 //   - temporary values;
@@ -350,7 +438,9 @@
 STAGE(load_f16) {
     auto ptr = *(const uint64_t**)ctx + x;
 
-#if defined(__aarch64__)
+#if !defined(SPLICER) && !defined(JUMPER)
+    // TODO:
+#elif defined(__aarch64__)
     auto halfs = vld4_f16((const float16_t*)ptr);
     r = vcvt_f32_f16(halfs.val[0]);
     g = vcvt_f32_f16(halfs.val[1]);
@@ -395,8 +485,8 @@
          ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
 
     auto half_to_float = [&](U32 h) {
-        return (F)(h << 13)             // Line up the mantissa,
-             * (F)U32(k->_0x77800000);  // then fix up the exponent.
+        return bit_cast<F>(h << 13)               // Line up the mantissa,
+             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
     };
 
     r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
@@ -409,7 +499,9 @@
 STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
 
-#if defined(__aarch64__)
+#if !defined(SPLICER) && !defined(JUMPER)
+    // TODO:
+#elif defined(__aarch64__)
     float16x4x4_t halfs = {{
         vcvt_f16_f32(r),
         vcvt_f16_f32(g),
@@ -440,8 +532,8 @@
     _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
 #elif defined(__SSE2__)
     auto float_to_half = [&](F f) {
-        return (U32)(f * (F)U32(k->_0x07800000)) // Fix up the exponent,
-            >> 13;                               // then line up the mantissa.
+        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
+            >> 13;                                                  // then line up the mantissa.
     };
     U32 R = float_to_half(r),
         G = float_to_half(g),
@@ -455,7 +547,7 @@
 }
 
 static F clamp(const F& v, float limit) {
-    F l = (F)((U32)F(limit) + U32(0xffffffff));  // limit - 1 ulp
+    F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff));  // limit - 1 ulp
     return max(0, min(v, l));
 }
 STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); }
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index a6f55c3..cb7f6fa 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -17,14 +17,22 @@
 #ndk = '/home/mtklein/ndk/'
 #objdump = '/home/mtklein/binutils-2.27/binutils/objdump'
 
-cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
+cflags = '-std=c++11 -Os -fomit-frame-pointer -DSPLICER'.split()
+
+# This won't stay here long-term.
+# It's a stand-in for building SkSplicer_stages.cpp into Skia.
+portable = '-USPLICER'.split()
+subprocess.check_call(['clang++'] + cflags + portable +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'portable.o'])
 
 sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
 subprocess.check_call(['clang++'] + cflags + sse2 +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'sse2.o'])
 
-sse41 = '-mno-red-zone -msse4.1'.split()
+# We're not using sse41.o right now, so use it to test -DJUMPER.
+sse41 = '-mno-red-zone -msse4.1 -USPLICER -DJUMPER'.split()
 subprocess.check_call(['clang++'] + cflags + sse41 +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'sse41.o'])