move skvm interpreter to SkOpts again

This is the easiest way to guarantee that Op::fma_f32
actually fuses: it lets us use platform intrinsics.
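
For reference only, a minimal sketch of the difference,
assuming an AVX2+FMA build (illustrative, not part of
this change):

    #include <immintrin.h>

    // Whether a plain mul+add fuses is up to the compiler
    // (e.g. -ffp-contract), so it may or may not be one FMA:
    __m256 maybe_fused(__m256 a, __m256 b, __m256 c) {
        return _mm256_add_ps(_mm256_mul_ps(a, b), c);
    }

    // The intrinsic always lowers to a single fused instruction:
    __m256 always_fused(__m256 a, __m256 b, __m256 c) {
        return _mm256_fmadd_ps(a, b, c);
    }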

While implementing this, we noticed that quad-pumping
was actually about 25% slower than double-pumping,
with single-pumping in between.  Switch from
quad-pumping to double-pumping.
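
For context, "pumping" here roughly means how many native
registers each skvx::Vec op expands to, i.e. K over the
native lane count.  A rough sketch of the K selection
(mirrors src/opts/SkVM_opts.h below; illustrative only):

    //   4-lane SSE/NEON: single/double/quad pumping -> K of 4/8/16
    //   8-lane AVX2:     double-pumping             -> K of 16
    #if defined(__AVX2__)
        constexpr int K = 16;
    #else
        constexpr int K = 8;
    #endif
    using F32 = skvx::Vec<K, float>;   // from include/private/SkVx.h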

Change-Id: Ib93fd175fb8f6aaf49f769a95edfa9fd6b2674f6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/275299
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index 9f813a5..0f571a3 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -28,7 +28,7 @@
 #include <cstring>           // memcpy()
 #include <initializer_list>  // std::initializer_list
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__AVX2__)
     #include <immintrin.h>
 #elif defined(__ARM_NEON)
     #include <arm_neon.h>
@@ -416,6 +416,20 @@
 #endif
 }
 
+// fma() delivers a fused mul-add, even if that's really expensive.  Call it when you know it's not.
+static inline Vec<1,float> fma(const Vec<1,float>& x,
+                               const Vec<1,float>& y,
+                               const Vec<1,float>& z) {
+    return std::fma(x.val, y.val, z.val);
+}
+template <int N>
+static inline Vec<N,float> fma(const Vec<N,float>& x,
+                               const Vec<N,float>& y,
+                               const Vec<N,float>& z) {
+    return join(fma(x.lo, y.lo, z.lo),
+                fma(x.hi, y.hi, z.hi));
+}
+
 // div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
 template <int N>
 static inline Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
@@ -521,6 +535,33 @@
         }
     #endif
 
+    #if defined(__AVX2__)
+        static inline Vec<4,float> fma(const Vec<4,float>& x,
+                                       const Vec<4,float>& y,
+                                       const Vec<4,float>& z) {
+            return bit_pun<Vec<4,float>>(_mm_fmadd_ps(bit_pun<__m128>(x),
+                                                      bit_pun<__m128>(y),
+                                                      bit_pun<__m128>(z)));
+        }
+
+        static inline Vec<8,float> fma(const Vec<8,float>& x,
+                                       const Vec<8,float>& y,
+                                       const Vec<8,float>& z) {
+            return bit_pun<Vec<8,float>>(_mm256_fmadd_ps(bit_pun<__m256>(x),
+                                                         bit_pun<__m256>(y),
+                                                         bit_pun<__m256>(z)));
+        }
+    #elif defined(__aarch64__)
+        static inline Vec<4,float> fma(const Vec<4,float>& x,
+                                       const Vec<4,float>& y,
+                                       const Vec<4,float>& z) {
+            // These instructions tend to work like z += xy, so the order here is z,x,y.
+            return bit_pun<Vec<4,float>>(vfmaq_f32(bit_pun<float32x4_t>(z),
+                                                   bit_pun<float32x4_t>(x),
+                                                   bit_pun<float32x4_t>(y)));
+        }
+    #endif
+
 #endif // !defined(SKNX_NO_SIMD)
 
 }  // namespace skvx
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index fe4344d..1f54593 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -36,6 +36,7 @@
     #define SK_OPTS_NS portable
 #endif
 
+#include "src/core/SkCubicSolver.h"
 #include "src/opts/SkBitmapProcState_opts.h"
 #include "src/opts/SkBlitMask_opts.h"
 #include "src/opts/SkBlitRow_opts.h"
@@ -43,10 +44,9 @@
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkSwizzler_opts.h"
 #include "src/opts/SkUtils_opts.h"
+#include "src/opts/SkVM_opts.h"
 #include "src/opts/SkXfermode_opts.h"
 
-#include "src/core/SkCubicSolver.h"
-
 namespace SkOpts {
     // Define default function pointer values here...
     // If our global compile options are set high enough, these defaults might even be
@@ -85,6 +85,8 @@
 
     DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
     DEFINE_DEFAULT(S32_alpha_D32_filter_DXDY);
+
+    DEFINE_DEFAULT(interpret_skvm);
 #undef DEFINE_DEFAULT
 
 #define M(st) (StageFn)SK_OPTS_NS::st,
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index dd1cc9d..5b8e773 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -13,6 +13,7 @@
 #include "src/core/SkXfermodePriv.h"
 
 struct SkBitmapProcState;
+namespace skvm { struct InterpreterInstruction; }
 
 namespace SkOpts {
     // Call to replace pointers to portable functions with pointers to CPU-specific functions.
@@ -77,6 +78,9 @@
     extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
 #undef M
 
+    extern void (*interpret_skvm)(const skvm::InterpreterInstruction insts[], int ninsts,
+                                  int nregs, int loop, const int strides[], int nargs,
+                                  int n, void* args[]);
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index bb437a9..d07dc62 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -48,10 +48,10 @@
 namespace skvm {
 
     struct Program::Impl {
-        std::vector<Instruction> instructions;
-        int                      regs = 0;
-        int                      loop = 0;
-        std::vector<int>         strides;
+        std::vector<InterpreterInstruction> instructions;
+        int regs = 0;
+        int loop = 0;
+        std::vector<int> strides;
 
         std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
         size_t jit_size = 0;
@@ -272,7 +272,7 @@
             o->writeDecAsText(i);
             o->writeText("\t");
             if (i >= fImpl->loop) { write(o, "    "); }
-            const Program::Instruction& inst = fImpl->instructions[i];
+            const InterpreterInstruction& inst = fImpl->instructions[i];
             Op   op = inst.op;
             Reg   d = inst.d,
                   x = inst.x,
@@ -377,7 +377,7 @@
 
     std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
         // If requested, first specialize for our JIT backend.
-        auto specialize_for_jit = [&]() -> std::vector<Instruction> {
+        auto specialize_for_jit = [&]() -> std::vector<Builder::Instruction> {
             Builder specialized;
             for (int i = 0; i < (int)fProgram.size(); i++) {
                 Builder::Instruction inst = fProgram[i];
@@ -1678,270 +1678,9 @@
         }
 
         // So we'll sometimes use the interpreter here even if later calls will use the JIT.
-        this->interpret(n, args);
-    }
-
-    void Program::interpret(int n, void* args[]) const {
-        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
-        constexpr int K = 16;
-        using I32 = skvx::Vec<K, int>;
-        using F32 = skvx::Vec<K, float>;
-        using U32 = skvx::Vec<K, uint32_t>;
-        using U16 = skvx::Vec<K, uint16_t>;
-        using  U8 = skvx::Vec<K, uint8_t>;
-
-        using I16x2 = skvx::Vec<2*K,  int16_t>;
-        using U16x2 = skvx::Vec<2*K, uint16_t>;
-
-        union Slot {
-            F32   f32;
-            I32   i32;
-            U32   u32;
-            I16x2 i16x2;
-            U16x2 u16x2;
-        };
-
-        Slot                     few_regs[16];
-        std::unique_ptr<char[]> many_regs;
-
-        Slot* regs = few_regs;
-
-        if (fImpl->regs > (int)SK_ARRAY_COUNT(few_regs)) {
-            // Annoyingly we can't trust that malloc() or new will work with Slot because
-            // the skvx::Vec types may have alignment greater than what they provide.
-            // We'll overallocate one extra register so we can align manually.
-            many_regs.reset(new char[ sizeof(Slot) * (fImpl->regs + 1) ]);
-
-            uintptr_t addr = (uintptr_t)many_regs.get();
-            addr += alignof(Slot) -
-                     (addr & (alignof(Slot) - 1));
-            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
-            regs = (Slot*)addr;
-        }
-
-
-        auto r = [&](Reg id) -> Slot& {
-            SkASSERT(0 <= id && id < fImpl->regs);
-            return regs[id];
-        };
-        auto arg = [&](int ix) {
-            SkASSERT(0 <= ix && ix < (int)fImpl->strides.size());
-            return args[ix];
-        };
-
-        // Step each argument pointer ahead by its stride a number of times.
-        auto step_args = [&](int times) {
-            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
-                args[i] = (void*)( (char*)args[i] + times * fImpl->strides[i] );
-            }
-        };
-
-        int start = 0,
-            stride;
-        for ( ; n > 0; start = fImpl->loop, n -= stride, step_args(stride)) {
-            stride = n >= K ? K : 1;
-
-            for (int i = start; i < (int)fImpl->instructions.size(); i++) {
-                Instruction inst = fImpl->instructions[i];
-
-                // d = op(x,y/imm,z/imm)
-                Reg   d = inst.d,
-                      x = inst.x,
-                      y = inst.y,
-                      z = inst.z;
-                int immy = inst.immy,
-                    immz = inst.immz;
-
-                // Ops that interact with memory need to know whether we're stride=1 or K,
-                // but all non-memory ops can run the same code no matter the stride.
-                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
-                    default: SkUNREACHABLE;
-
-                #define STRIDE_1(op) case 2*(int)op
-                #define STRIDE_K(op) case 2*(int)op + 1
-                    STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
-                    STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
-                    STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;
-
-                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
-                    STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
-                    STRIDE_K(Op::store32):                     (r(x).i32).store(arg(immy)); break;
-
-                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
-                    STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
-                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;
-
-                    STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
-                    STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
-                    STRIDE_K(Op::load32): r(d).i32=                 I32::Load(arg(immy)) ; break;
-
-                    // The pointer we base our gather on is loaded indirectly from a uniform:
-                    //     - arg(immy) is the uniform holding our gather base pointer somewhere;
-                    //     - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
-                    //     - memcpy() loads the gather base and into a pointer of the right type.
-                    // After all that we have an ordinary (uniform) pointer `ptr` to load from,
-                    // and we then gather from it using the varying indices in r(x).
-                    STRIDE_1(Op::gather8):
-                        for (int i = 0; i < K; i++) {
-                            const uint8_t* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
-                        } break;
-                    STRIDE_1(Op::gather16):
-                        for (int i = 0; i < K; i++) {
-                            const uint16_t* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
-                        } break;
-                    STRIDE_1(Op::gather32):
-                        for (int i = 0; i < K; i++) {
-                            const int* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
-                        } break;
-
-                    STRIDE_K(Op::gather8):
-                        for (int i = 0; i < K; i++) {
-                            const uint8_t* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = ptr[ r(x).i32[i] ];
-                        } break;
-                    STRIDE_K(Op::gather16):
-                        for (int i = 0; i < K; i++) {
-                            const uint16_t* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = ptr[ r(x).i32[i] ];
-                        } break;
-                    STRIDE_K(Op::gather32):
-                        for (int i = 0; i < K; i++) {
-                            const int* ptr;
-                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
-                            r(d).i32[i] = ptr[ r(x).i32[i] ];
-                        } break;
-
-                #undef STRIDE_1
-                #undef STRIDE_K
-
-                    // Ops that don't interact with memory should never care about the stride.
-                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
-
-                    CASE(Op::assert_true):
-                    #ifdef SK_DEBUG
-                        if (!all(r(x).i32)) {
-                            this->dump();
-                            SkDebugf("inst %d, register %d\n", i, y);
-                            for (int i = 0; i < K; i++) {
-                                SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
-                            }
-                        }
-                        SkASSERT(all(r(x).i32));
-                    #endif
-                    break;
-
-                    CASE(Op::index): static_assert(K == 16, "");
-                                     r(d).i32 = n - I32{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-                                     break;
-
-                    CASE(Op::uniform8):
-                        r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
-                        break;
-                    CASE(Op::uniform16):
-                        r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
-                        break;
-                    CASE(Op::uniform32):
-                        r(d).i32 = *(const int*     )( (const char*)arg(immy) + immz );
-                        break;
-
-                    CASE(Op::splat): r(d).i32 = immy; break;
-
-                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
-                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
-                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
-                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
-                    CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
-                    CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
-
-                    // These _imm instructions are all x86/JIT only.
-                    CASE(Op::add_f32_imm):
-                    CASE(Op::sub_f32_imm):
-                    CASE(Op::mul_f32_imm):
-                    CASE(Op::min_f32_imm):
-                    CASE(Op::max_f32_imm):
-                    CASE(Op::bit_and_imm):
-                    CASE(Op::bit_or_imm ):
-                    CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
-
-                    CASE(Op::fma_f32): {
-                        // TODO: vectorized skvx calls
-                        for (int i = 0; i < K; i++) {
-                            r(d).f32[i] = std::fma(r(x).f32[i], r(y).f32[i], r(z).f32[i]);
-                        }
-                    } break;
-
-                    CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;
-
-                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
-                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
-                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
-
-                    CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
-                    CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
-                    CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
-
-                    CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
-                    CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
-                    CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;
-
-                    CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
-                    CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
-                    CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;
-
-                    CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
-                    CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
-                    CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
-                    CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
-
-                    CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
-                    CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
-                    CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
-                    CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
-
-                    CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
-                    CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
-                    CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
-                    CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
-
-                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
-                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
-                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
-                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
-
-                    CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
-                                      break;
-
-                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
-
-                    CASE(Op::bytes): {
-                        const U32 table[] = {
-                            0,
-                            (r(x).u32      ) & 0xff,
-                            (r(x).u32 >>  8) & 0xff,
-                            (r(x).u32 >> 16) & 0xff,
-                            (r(x).u32 >> 24) & 0xff,
-                        };
-                        r(d).u32 = table[(immy >>  0) & 0xf] <<  0
-                                 | table[(immy >>  4) & 0xf] <<  8
-                                 | table[(immy >>  8) & 0xf] << 16
-                                 | table[(immy >> 12) & 0xf] << 24;
-                    } break;
-
-                    CASE(Op::floor):  r(d).f32 = skvx::floor(r(x).f32); break;
-                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
-                    CASE(Op::trunc):  r(d).i32 = skvx::cast<int>  (r(x).f32); break;
-                #undef CASE
-                }
-            }
-        }
+        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
+                               this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
+                               n, args);
     }
 
 #if defined(SKVM_LLVM)
@@ -2433,13 +2172,13 @@
     #endif
     }
 
-    std::vector<Program::Instruction> Program::instructions() const { return fImpl->instructions; }
+    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
     int  Program::nargs() const { return (int)fImpl->strides.size(); }
     int  Program::nregs() const { return fImpl->regs; }
     int  Program::loop () const { return fImpl->loop; }
     bool Program::empty() const { return fImpl->instructions.empty(); }
 
-    // Translate OptimizedInstructions to Program::Instructions used by the interpreter.
+    // Translate OptimizedInstructions to InterpreterInstructions.
     void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
         // Register each instruction is assigned to.
         std::vector<Reg> reg(instructions.size());
@@ -2496,7 +2235,7 @@
             if (!hoisted(id)) { assign_register(id); }
         }
 
-        // Translate OptimizedInstructions to Program::Instructions by mapping values to
+        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
 
         // The loop begins at the fImpl->loop'th Instruction.
@@ -2511,7 +2250,7 @@
         };
 
         auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
-            Program::Instruction pinst{
+            InterpreterInstruction pinst{
                 inst.op,
                 lookup_register(id),
                 lookup_register(inst.x),
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 1bd3f21..0f3c706 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -611,15 +611,16 @@
 
     using Reg = int;
 
+    // d = op(x, y/imm, z/imm)
+    struct InterpreterInstruction {
+        Op  op;
+        Reg d,x;
+        union { Reg y; int immy; };
+        union { Reg z; int immz; };
+    };
+
     class Program {
     public:
-        struct Instruction {   // d = op(x, y/imm, z/imm)
-            Op  op;
-            Reg d,x;
-            union { Reg y; int immy; };
-            union { Reg z; int immz; };
-        };
-
         Program(const std::vector<OptimizedInstruction>& interpreter,
                 const std::vector<int>& strides);
 
@@ -647,7 +648,7 @@
             this->eval(n, args);
         }
 
-        std::vector<Instruction> instructions() const;
+        std::vector<InterpreterInstruction> instructions() const;
         int  nargs() const;
         int  nregs() const;
         int  loop () const;
@@ -663,8 +664,6 @@
         void setupJIT        (const std::vector<OptimizedInstruction>&, const char* debug_name);
         void setupLLVM       (const std::vector<OptimizedInstruction>&, const char* debug_name);
 
-        void interpret(int n, void* args[]) const;
-
         bool jit(const std::vector<OptimizedInstruction>&,
                  bool try_hoisting,
                  Assembler*) const;
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index 1cf40c5..21f4109 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -13,6 +13,7 @@
 #include "src/opts/SkBlitRow_opts.h"
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkUtils_opts.h"
+#include "src/opts/SkVM_opts.h"
 
 namespace SkOpts {
     void Init_hsw() {
@@ -34,5 +35,7 @@
         just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
         start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
     #undef M
+
+        interpret_skvm = SK_OPTS_NS::interpret_skvm;
     }
 }
diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h
new file mode 100644
index 0000000..ef3e446
--- /dev/null
+++ b/src/opts/SkVM_opts.h
@@ -0,0 +1,284 @@
+// Copyright 2020 Google LLC.
+
+#ifndef SkVM_opts_DEFINED
+#define SkVM_opts_DEFINED
+
+#include "include/private/SkVx.h"
+#include "src/core/SkVM.h"
+
+namespace SK_OPTS_NS {
+
+    inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
+                               const int nregs, const int loop,
+                               const int strides[], const int nargs,
+                               int n, void* args[]) {
+        using namespace skvm;
+
+        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
+        // We noticed quad-pumping was slower than single-pumping, and both were slower than double.
+    #if defined(__AVX2__)
+        constexpr int K = 16;
+    #else
+        constexpr int K = 8;
+    #endif
+        using I32 = skvx::Vec<K, int>;
+        using F32 = skvx::Vec<K, float>;
+        using U32 = skvx::Vec<K, uint32_t>;
+        using U16 = skvx::Vec<K, uint16_t>;
+        using  U8 = skvx::Vec<K, uint8_t>;
+
+        using I16x2 = skvx::Vec<2*K,  int16_t>;
+        using U16x2 = skvx::Vec<2*K, uint16_t>;
+
+        union Slot {
+            F32   f32;
+            I32   i32;
+            U32   u32;
+            I16x2 i16x2;
+            U16x2 u16x2;
+        };
+
+        Slot                     few_regs[16];
+        std::unique_ptr<char[]> many_regs;
+
+        Slot* regs = few_regs;
+
+        if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
+            // Annoyingly we can't trust that malloc() or new will work with Slot because
+            // the skvx::Vec types may have alignment greater than what they provide.
+            // We'll overallocate one extra register so we can align manually.
+            many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
+
+            uintptr_t addr = (uintptr_t)many_regs.get();
+            addr += alignof(Slot) -
+                     (addr & (alignof(Slot) - 1));
+            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
+            regs = (Slot*)addr;
+        }
+
+
+        auto r = [&](Reg id) -> Slot& {
+            SkASSERT(0 <= id && id < nregs);
+            return regs[id];
+        };
+        auto arg = [&](int ix) {
+            SkASSERT(0 <= ix && ix < nargs);
+            return args[ix];
+        };
+
+        // Step each argument pointer ahead by its stride a number of times.
+        auto step_args = [&](int times) {
+            for (int i = 0; i < nargs; i++) {
+                args[i] = (void*)( (char*)args[i] + times * strides[i] );
+            }
+        };
+
+        int start = 0,
+            stride;
+        for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
+            stride = n >= K ? K : 1;
+
+            for (int i = start; i < ninsts; i++) {
+                InterpreterInstruction inst = insts[i];
+
+                // d = op(x,y/imm,z/imm)
+                Reg   d = inst.d,
+                      x = inst.x,
+                      y = inst.y,
+                      z = inst.z;
+                int immy = inst.immy,
+                    immz = inst.immz;
+
+                // Ops that interact with memory need to know whether we're stride=1 or K,
+                // but all non-memory ops can run the same code no matter the stride.
+                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
+                    default: SkUNREACHABLE;
+
+                #define STRIDE_1(op) case 2*(int)op
+                #define STRIDE_K(op) case 2*(int)op + 1
+                    STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
+                    STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
+                    STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;
+
+                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
+                    STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
+                    STRIDE_K(Op::store32):                     (r(x).i32).store(arg(immy)); break;
+
+                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
+                    STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
+                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;
+
+                    STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
+                    STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
+                    STRIDE_K(Op::load32): r(d).i32=                 I32::Load(arg(immy)) ; break;
+
+                    // The pointer we base our gather on is loaded indirectly from a uniform:
+                    //     - arg(immy) is the uniform holding our gather base pointer somewhere;
+                    //     - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
+                    //     - memcpy() loads the gather base into a pointer of the right type.
+                    // After all that we have an ordinary (uniform) pointer `ptr` to load from,
+                    // and we then gather from it using the varying indices in r(x).
+                    STRIDE_1(Op::gather8):
+                        for (int i = 0; i < K; i++) {
+                            const uint8_t* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+                        } break;
+                    STRIDE_1(Op::gather16):
+                        for (int i = 0; i < K; i++) {
+                            const uint16_t* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+                        } break;
+                    STRIDE_1(Op::gather32):
+                        for (int i = 0; i < K; i++) {
+                            const int* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+                        } break;
+
+                    STRIDE_K(Op::gather8):
+                        for (int i = 0; i < K; i++) {
+                            const uint8_t* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = ptr[ r(x).i32[i] ];
+                        } break;
+                    STRIDE_K(Op::gather16):
+                        for (int i = 0; i < K; i++) {
+                            const uint16_t* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = ptr[ r(x).i32[i] ];
+                        } break;
+                    STRIDE_K(Op::gather32):
+                        for (int i = 0; i < K; i++) {
+                            const int* ptr;
+                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+                            r(d).i32[i] = ptr[ r(x).i32[i] ];
+                        } break;
+
+                #undef STRIDE_1
+                #undef STRIDE_K
+
+                    // Ops that don't interact with memory should never care about the stride.
+                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
+
+                    CASE(Op::assert_true):
+                    #ifdef SK_DEBUG
+                        if (!all(r(x).i32)) {
+                            SkDebugf("inst %d, register %d\n", i, y);
+                            for (int i = 0; i < K; i++) {
+                                SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
+                            }
+                        }
+                        SkASSERT(all(r(x).i32));
+                    #endif
+                    break;
+
+                    CASE(Op::index): {
+                        const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+                                            16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+                        static_assert(K <= SK_ARRAY_COUNT(iota), "");
+
+                        r(d).i32 = n - I32::Load(iota);
+                    } break;
+
+                    CASE(Op::uniform8):
+                        r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
+                        break;
+                    CASE(Op::uniform16):
+                        r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
+                        break;
+                    CASE(Op::uniform32):
+                        r(d).i32 = *(const int*     )( (const char*)arg(immy) + immz );
+                        break;
+
+                    CASE(Op::splat): r(d).i32 = immy; break;
+
+                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
+                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
+                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
+                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+                    CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
+                    CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
+
+                    // These _imm instructions are all x86/JIT only.
+                    CASE(Op::add_f32_imm):
+                    CASE(Op::sub_f32_imm):
+                    CASE(Op::mul_f32_imm):
+                    CASE(Op::min_f32_imm):
+                    CASE(Op::max_f32_imm):
+                    CASE(Op::bit_and_imm):
+                    CASE(Op::bit_or_imm ):
+                    CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
+
+                    CASE(Op::fma_f32): r(d).f32 = fma(r(x).f32, r(y).f32, r(z).f32); break;
+
+                    CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;
+
+                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
+                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
+                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
+
+                    CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
+                    CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
+                    CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
+
+                    CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
+                    CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
+                    CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;
+
+                    CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
+                    CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
+                    CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;
+
+                    CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
+                    CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
+                    CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
+                    CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
+
+                    CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
+                    CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
+                    CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
+                    CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
+
+                    CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
+                    CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
+                    CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
+                    CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
+
+                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
+                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
+                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
+                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
+
+                    CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
+                                      break;
+
+                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
+
+                    CASE(Op::bytes): {
+                        const U32 table[] = {
+                            0,
+                            (r(x).u32      ) & 0xff,
+                            (r(x).u32 >>  8) & 0xff,
+                            (r(x).u32 >> 16) & 0xff,
+                            (r(x).u32 >> 24) & 0xff,
+                        };
+                        r(d).u32 = table[(immy >>  0) & 0xf] <<  0
+                                 | table[(immy >>  4) & 0xf] <<  8
+                                 | table[(immy >>  8) & 0xf] << 16
+                                 | table[(immy >> 12) & 0xf] << 24;
+                    } break;
+
+                    CASE(Op::floor):  r(d).f32 = skvx::floor(r(x).f32); break;
+                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
+                    CASE(Op::trunc):  r(d).i32 = skvx::cast<int>  (r(x).f32); break;
+                #undef CASE
+                }
+            }
+        }
+    }
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkVM_opts_DEFINED