Interpreter: Vectorized interpreter

This converts the SkSL interpreter to operate in SIMT fashion. It handles
all the same features as the previous scalar implementation, but operates
on N lanes at a time. (Currently 16, set by VecWidth.)

It's modeled after GPUs and other parallel architectures, using execution
masks to handle control flow, including divergent control flow.

Change-Id: Ieb38ffe2f55a10f72bdab844c297126fe9bedb6c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217122
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
diff --git a/src/sksl/SkSLInterpreter.cpp b/src/sksl/SkSLInterpreter.cpp
index deea8c3..22fec06 100644
--- a/src/sksl/SkSLInterpreter.cpp
+++ b/src/sksl/SkSLInterpreter.cpp
@@ -8,6 +8,7 @@
 #ifndef SKSL_STANDALONE
 
 #include "include/core/SkPoint3.h"
+#include "include/private/SkVx.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLByteCodeGenerator.h"
 #include "src/sksl/SkSLExternalValue.h"
@@ -18,6 +19,16 @@
 namespace SkSL {
 namespace Interpreter {
 
+constexpr int VecWidth = 16;
+
+using F32 = skvx::Vec<VecWidth, float>;
+using I32 = skvx::Vec<VecWidth, int32_t>;
+using U32 = skvx::Vec<VecWidth, uint32_t>;
+
+// Needs to be the first N non-negative integers, at least as large as VecWidth
+static const I32 gLanes = {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+
 template <typename T>
 static T unaligned_load(const void* ptr) {
     T val;
@@ -72,9 +83,6 @@
         VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
         VECTOR_DISASSEMBLE(kCompareULT, "compareult")
         VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
-        case ByteCodeInstruction::kConditionalBranch:
-            printf("conditionalbranch %d", READ16());
-            break;
         VECTOR_DISASSEMBLE(kConvertFtoI, "convertftoi")
         VECTOR_DISASSEMBLE(kConvertStoF, "convertstof")
         VECTOR_DISASSEMBLE(kConvertUtoF, "convertutof")
@@ -132,7 +140,7 @@
         VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
         VECTOR_MATRIX_DISASSEMBLE(kNegateF, "negatef")
         VECTOR_DISASSEMBLE(kNegateI, "negatei")
-        case ByteCodeInstruction::kNot: printf("not"); break;
+        case ByteCodeInstruction::kNotB: printf("notb"); break;
         case ByteCodeInstruction::kOrB: printf("orb"); break;
         VECTOR_MATRIX_DISASSEMBLE(kPop, "pop")
         case ByteCodeInstruction::kPushImmediate: {
@@ -218,6 +226,20 @@
         case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ8()); break;
         case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ8()); break;
         case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ8()); break;
+        case ByteCodeInstruction::kXorB: printf("xorb"); break;
+        case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
+        case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
+        case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
+        case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
+        case ByteCodeInstruction::kBranchIfAllFalse:
+            printf("branchifallfalse %d", READ16());
+            break;
+        case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
+        case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
+        case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
+        case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
+        case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
+        case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
         default: printf("unknown(%d)\n", *(ip - 1)); SkASSERT(false);
     }
     return ip;
@@ -294,27 +316,80 @@
     case ByteCodeInstruction::base:      sp[ 0] = fn(sp[ 0].field); \
                                          break;
 
+#define VECTOR_UNARY_FN_VEC(base, fn)                               \
+    case ByteCodeInstruction::base ## 4:                            \
+    case ByteCodeInstruction::base ## 3:                            \
+    case ByteCodeInstruction::base ## 2:                            \
+    case ByteCodeInstruction::base     : {                          \
+        int count = (int)inst - (int)ByteCodeInstruction::base + 1; \
+        float* v = (float*)sp - count + 1;                          \
+        for (int i = VecWidth * count; i > 0; --i, ++v) {           \
+            *v = fn(*v);                                            \
+        }                                                           \
+        break;                                                      \
+    }
+
+union VValue {
+    VValue() {}
+
+    VValue(F32 f)
+        : fFloat(f) {
+    }
+
+    VValue(I32 s)
+        : fSigned(s) {
+    }
+
+    VValue(U32 u)
+        : fUnsigned(u) {
+    }
+
+    F32 fFloat;
+    I32 fSigned;
+    U32 fUnsigned;
+};
+
 struct StackFrame {
     const uint8_t* fCode;
     const uint8_t* fIP;
-    Interpreter::Value* fStack;
+    VValue* fStack;
 };
 
-static float mix(float start, float end, float t) {
+static F32 mix(F32 start, F32 end, F32 t) {
     return start * (1 - t) + end * t;
 }
 
-void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack, Value* outReturn,
-              Value globals[], int globalCount) {
-    Value* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+// TODO: trunc on integers?
+template <typename T>
+static T vec_mod(T a, T b) {
+    return a - skvx::trunc(a / b) * b;
+}
 
-    auto POP =  [&]          { SkASSERT(sp     >= stack); return *(sp--); };
-    auto PUSH = [&](Value v) { SkASSERT(sp + 1 >= stack); *(++sp) = v;    };
+void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack, Value* outReturn,
+              I32 initMask, VValue globals[], int globalCount) {
+    VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+
+    auto POP =  [&]           { SkASSERT(sp     >= stack); return *(sp--); };
+    auto PUSH = [&](VValue v) { SkASSERT(sp + 1 >= stack); *(++sp) = v;    };
 
     const uint8_t* code = f->fCode.data();
     const uint8_t* ip = code;
     std::vector<StackFrame> frames;
 
+    I32 condStack[16];  // Independent condition masks
+    I32 maskStack[16];  // Combined masks (eg condStack[0] & condStack[1] & ...)
+    I32 contStack[16];  // Continue flags for loops
+    I32 loopStack[16];  // Loop execution masks
+    condStack[0] = maskStack[0] = initMask;
+    contStack[0] = I32( 0);
+    loopStack[0] = I32(~0);
+    I32* condPtr = condStack;
+    I32* maskPtr = maskStack;
+    I32* contPtr = contStack;
+    I32* loopPtr = loopStack;
+
+    auto mask = [&]() { return *maskPtr & *loopPtr; };
+
     for (;;) {
 #ifdef TRACE
         printf("at %3d  ", (int) (ip - code));
@@ -325,8 +400,21 @@
         switch (inst) {
             VECTOR_BINARY_OP(kAddI, fSigned, +)
             VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+
+            // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
             case ByteCodeInstruction::kAndB:
-                sp[-1] = sp[-1].fBool && sp[0].fBool;
+                sp[-1] = sp[-1].fSigned & sp[0].fSigned;
+                POP();
+                break;
+            case ByteCodeInstruction::kNotB:
+                sp[0] = ~sp[0].fSigned;
+                break;
+            case ByteCodeInstruction::kOrB:
+                sp[-1] = sp[-1].fSigned | sp[0].fSigned;
+                POP();
+                break;
+            case ByteCodeInstruction::kXorB:
+                sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
                 POP();
                 break;
 
@@ -340,10 +428,15 @@
                 // (plus space for locals).
                 int target = READ8();
                 const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
-                frames.push_back({ code, ip, stack });
-                ip = code = fun->fCode.data();
-                stack = sp - fun->fParameterCount + 1;
-                sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+                if (skvx::any(mask())) {
+                    frames.push_back({ code, ip, stack });
+                    ip = code = fun->fCode.data();
+                    stack = sp - fun->fParameterCount + 1;
+                    sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+                } else {
+                    sp -= fun->fParameterCount;
+                    sp += fun->fReturnCount;
+                }
                 break;
             }
 
@@ -354,10 +447,23 @@
                 ExternalValue* v = byteCode->fExternalValues[target];
                 sp -= argumentCount - 1;
 
-                Value tmp[4];
-                SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmp));
-                v->call(sp, tmp);
-                memcpy(sp, tmp, returnCount * sizeof(Value));
+                Value tmpArgs[4];
+                Value tmpReturn[4];
+                SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(tmpArgs));
+                SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmpReturn));
+
+                I32 m = mask();
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        for (int j = 0; j < argumentCount; ++j) {
+                            tmpArgs[j].fSigned = sp[j].fSigned[i];
+                        }
+                        v->call(tmpArgs, tmpReturn);
+                        for (int j = 0; j < returnCount; ++j) {
+                            sp[j].fSigned[i] = tmpReturn[j].fSigned;
+                        }
+                    }
+                }
                 sp += returnCount - 1;
                 break;
             }
@@ -379,45 +485,36 @@
             VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
             VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
 
-            case ByteCodeInstruction::kConditionalBranch: {
-                int target = READ16();
-                if (POP().fBool) {
-                    ip = code + target;
-                }
-                break;
-            }
-
-            case ByteCodeInstruction::kConvertFtoI4: sp[-3].fSigned = (int)sp[-3].fFloat;
-            case ByteCodeInstruction::kConvertFtoI3: sp[-2].fSigned = (int)sp[-2].fFloat;
-            case ByteCodeInstruction::kConvertFtoI2: sp[-1].fSigned = (int)sp[-1].fFloat;
-            case ByteCodeInstruction::kConvertFtoI:  sp[ 0].fSigned = (int)sp[ 0].fFloat;
+            case ByteCodeInstruction::kConvertFtoI4: sp[-3] = skvx::cast<int>(sp[-3].fFloat);
+            case ByteCodeInstruction::kConvertFtoI3: sp[-2] = skvx::cast<int>(sp[-2].fFloat);
+            case ByteCodeInstruction::kConvertFtoI2: sp[-1] = skvx::cast<int>(sp[-1].fFloat);
+            case ByteCodeInstruction::kConvertFtoI:  sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
                                                      break;
 
-            case ByteCodeInstruction::kConvertStoF4: sp[-3].fFloat = sp[-3].fSigned;
-            case ByteCodeInstruction::kConvertStoF3: sp[-2].fFloat = sp[-2].fSigned;
-            case ByteCodeInstruction::kConvertStoF2: sp[-1].fFloat = sp[-1].fSigned;
-            case ByteCodeInstruction::kConvertStoF : sp[ 0].fFloat = sp[ 0].fSigned;
+            case ByteCodeInstruction::kConvertStoF4: sp[-3] = skvx::cast<float>(sp[-3].fSigned);
+            case ByteCodeInstruction::kConvertStoF3: sp[-2] = skvx::cast<float>(sp[-2].fSigned);
+            case ByteCodeInstruction::kConvertStoF2: sp[-1] = skvx::cast<float>(sp[-1].fSigned);
+            case ByteCodeInstruction::kConvertStoF : sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
                                                      break;
 
-            case ByteCodeInstruction::kConvertUtoF4: sp[-3].fFloat = sp[-3].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF3: sp[-2].fFloat = sp[-2].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF2: sp[-1].fFloat = sp[-1].fUnsigned;
-            case ByteCodeInstruction::kConvertUtoF : sp[ 0].fFloat = sp[ 0].fUnsigned;
+            case ByteCodeInstruction::kConvertUtoF4: sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF3: sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF2: sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
+            case ByteCodeInstruction::kConvertUtoF : sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
                                                      break;
 
-            VECTOR_UNARY_FN(kCos, cosf, fFloat)
+            VECTOR_UNARY_FN_VEC(kCos, cosf)
 
             case ByteCodeInstruction::kCross: {
-                SkPoint3 cross = SkPoint3::CrossProduct(SkPoint3::Make(sp[-5].fFloat,
-                                                                       sp[-4].fFloat,
-                                                                       sp[-3].fFloat),
-                                                        SkPoint3::Make(sp[-2].fFloat,
-                                                                       sp[-1].fFloat,
-                                                                       sp[ 0].fFloat));
+                F32 ax = sp[-5].fFloat, ay = sp[-4].fFloat, az = sp[-3].fFloat,
+                    bx = sp[-2].fFloat, by = sp[-1].fFloat, bz = sp[ 0].fFloat;
+                F32 cx = ay*bz - az*by,
+                    cy = az*bx - ax*bz,
+                    cz = ax*by - ay*bx;
                 sp -= 3;
-                sp[-2] = cross.fX;
-                sp[-1] = cross.fY;
-                sp[ 0] = cross.fZ;
+                sp[-2] = cx;
+                sp[-1] = cy;
+                sp[ 0] = cz;
                 break;
             }
 
@@ -433,7 +530,7 @@
 
             case ByteCodeInstruction::kDupN: {
                 int count = READ8();
-                memcpy(sp + 1, sp - count + 1, count * sizeof(Value));
+                memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
                 sp += count;
                 break;
             }
@@ -457,17 +554,30 @@
 
             case ByteCodeInstruction::kLoadExtended: {
                 int count = READ8();
-                int src = POP().fSigned;
-                memcpy(sp + 1, &stack[src], count * sizeof(Value));
+                I32 src = POP().fSigned;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
+                        }
+                    }
+                }
                 sp += count;
                 break;
             }
 
             case ByteCodeInstruction::kLoadExtendedGlobal: {
                 int count = READ8();
-                int src = POP().fSigned;
-                SkASSERT(src + count <= globalCount);
-                memcpy(sp + 1, &globals[src], count * sizeof(Value));
+                I32 src = POP().fSigned;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
+                        }
+                    }
+                }
                 sp += count;
                 break;
             }
@@ -502,15 +612,17 @@
                 SkASSERT(srcRows >= 2 && srcRows <= 4);
                 SkASSERT(dstCols >= 2 && dstCols <= 4);
                 SkASSERT(dstRows >= 2 && dstRows <= 4);
-                SkMatrix44 m;
+                F32 tmp[16];
+                memset(tmp, 0, sizeof(tmp));
+                tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
                 for (int c = srcCols - 1; c >= 0; --c) {
                     for (int r = srcRows - 1; r >= 0; --r) {
-                        m.set(r, c, POP().fFloat);
+                        tmp[c*4 + r] = POP().fFloat;
                     }
                 }
                 for (int c = 0; c < dstCols; ++c) {
                     for (int r = 0; r < dstRows; ++r) {
-                        PUSH(m.get(r, c));
+                        PUSH(tmp[c*4 + r]);
                     }
                 }
                 break;
@@ -521,9 +633,9 @@
                 int lRows = READ8();
                 int rCols = READ8();
                 int rRows = lCols;
-                float tmp[16] = { 0.0f };
-                float* B = &(sp - (rCols * rRows) + 1)->fFloat;
-                float* A = B - (lCols * lRows);
+                F32 tmp[16] = { 0.0f };
+                F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
+                F32* A = B - (lCols * lRows);
                 for (int c = 0; c < rCols; ++c) {
                     for (int r = 0; r < lRows; ++r) {
                         for (int j = 0; j < lCols; ++j) {
@@ -532,7 +644,7 @@
                     }
                 }
                 sp -= (lCols * lRows) + (rCols * rRows);
-                memcpy(sp + 1, tmp, rCols * lRows * sizeof(Value));
+                memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
                 sp += (rCols * lRows);
                 break;
             }
@@ -564,10 +676,6 @@
             VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
             VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
 
-            case ByteCodeInstruction::kNot:
-                sp[0].fBool = !sp[0].fBool;
-                break;
-
             case ByteCodeInstruction::kNegateF4: sp[-3] = -sp[-3].fFloat;
             case ByteCodeInstruction::kNegateF3: sp[-2] = -sp[-2].fFloat;
             case ByteCodeInstruction::kNegateF2: sp[-1] = -sp[-1].fFloat;
@@ -585,14 +693,9 @@
             case ByteCodeInstruction::kNegateI4: sp[-3] = -sp[-3].fSigned;
             case ByteCodeInstruction::kNegateI3: sp[-2] = -sp[-2].fSigned;
             case ByteCodeInstruction::kNegateI2: sp[-1] = -sp[-1].fSigned;
-            case ByteCodeInstruction::kNegateI : sp[ 0] = -sp [0].fSigned;
+            case ByteCodeInstruction::kNegateI : sp[ 0] = -sp[ 0].fSigned;
                                                  break;
 
-            case ByteCodeInstruction::kOrB:
-                sp[-1] = sp[-1].fBool || sp[0].fBool;
-                POP();
-                break;
-
             case ByteCodeInstruction::kPop4: POP();
             case ByteCodeInstruction::kPop3: POP();
             case ByteCodeInstruction::kPop2: POP();
@@ -604,34 +707,56 @@
                 break;
 
             case ByteCodeInstruction::kPushImmediate:
-                PUSH(READ32());
+                PUSH(U32(READ32()));
                 break;
 
-            case ByteCodeInstruction::kReadExternal:  // fall through
-            case ByteCodeInstruction::kReadExternal2: // fall through
-            case ByteCodeInstruction::kReadExternal3: // fall through
+            case ByteCodeInstruction::kReadExternal:
+            case ByteCodeInstruction::kReadExternal2:
+            case ByteCodeInstruction::kReadExternal3:
             case ByteCodeInstruction::kReadExternal4: {
+                // TODO: Support striped external values, or passing lane index? This model is odd.
+                int count = (int)inst - (int)ByteCodeInstruction::kReadExternal + 1;
                 int src = READ8();
-                byteCode->fExternalValues[src]->read(sp + 1);
-                sp += (int) inst - (int) ByteCodeInstruction::kReadExternal + 1;
+                int32_t tmp[4];
+                I32 m = mask();
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        byteCode->fExternalValues[src]->read(tmp);
+                        for (int j = 0; j < count; ++j) {
+                            sp[j + 1].fSigned[i] = tmp[j];
+                        }
+                    }
+                }
+                sp += count;
                 break;
             }
 
-            VECTOR_BINARY_FN(kRemainderF, fFloat, fmodf)
-            VECTOR_BINARY_OP(kRemainderS, fSigned, %)
-            VECTOR_BINARY_OP(kRemainderU,  fUnsigned, %)
+            VECTOR_BINARY_FN(kRemainderF, fFloat, vec_mod<F32>)
+            VECTOR_BINARY_FN(kRemainderS, fSigned, vec_mod<I32>)
+            VECTOR_BINARY_FN(kRemainderU, fUnsigned, vec_mod<U32>)
 
             case ByteCodeInstruction::kReturn: {
                 int count = READ8();
                 if (frames.empty()) {
                     if (outReturn) {
-                        memcpy(outReturn, sp - count + 1, count * sizeof(Value));
+                        // TODO: This can be smarter, knowing that mask is left-justified
+                        I32 m = mask();
+                        VValue* src = sp - count + 1;
+                        for (int i = 0; i < count; ++i) {
+                            for (int j = 0; j < VecWidth; ++j) {
+                                if (m[j]) {
+                                    outReturn[count * j].fSigned = src->fSigned[j];
+                                }
+                            }
+                            ++outReturn;
+                            ++src;
+                        }
                     }
                     return;
                 } else {
                     // When we were called, 'stack' was positioned at the old top-of-stack (where
                     // our parameters were placed). So copy our return values to that same spot.
-                    memmove(stack, sp - count + 1, count * sizeof(Value));
+                    memmove(stack, sp - count + 1, count * sizeof(VValue));
 
                     // Now move the stack pointer to the end of the just-pushed return values,
                     // and restore everything else.
@@ -648,44 +773,67 @@
             case ByteCodeInstruction::kScalarToMatrix: {
                 int cols = READ8();
                 int rows = READ8();
-                Value v = POP();
+                VValue v = POP();
                 for (int c = 0; c < cols; ++c) {
                     for (int r = 0; r < rows; ++r) {
-                        PUSH(c == r ? v : 0.0f);
+                        PUSH(c == r ? v : F32(0.0f));
                     }
                 }
                 break;
             }
 
-            VECTOR_UNARY_FN(kSin, sinf, fFloat)
-            VECTOR_UNARY_FN(kSqrt, sqrtf, fFloat)
+            VECTOR_UNARY_FN_VEC(kSin, sinf)
+            VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
 
-            case ByteCodeInstruction::kStore4: stack[*ip + 3] = POP();
-            case ByteCodeInstruction::kStore3: stack[*ip + 2] = POP();
-            case ByteCodeInstruction::kStore2: stack[*ip + 1] = POP();
-            case ByteCodeInstruction::kStore : stack[*ip + 0] = POP();
-                                               ++ip;
-                                               break;
+            case ByteCodeInstruction::kStore4:
+                stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
+            case ByteCodeInstruction::kStore3:
+                stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
+            case ByteCodeInstruction::kStore2:
+                stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
+            case ByteCodeInstruction::kStore :
+                stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
+                ++ip;
+                break;
 
-            case ByteCodeInstruction::kStoreGlobal4: globals[*ip + 3] = POP();
-            case ByteCodeInstruction::kStoreGlobal3: globals[*ip + 2] = POP();
-            case ByteCodeInstruction::kStoreGlobal2: globals[*ip + 1] = POP();
-            case ByteCodeInstruction::kStoreGlobal : globals[*ip + 0] = POP();
-                                                     ++ip;
-                                                     break;
+            case ByteCodeInstruction::kStoreGlobal4:
+                globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
+            case ByteCodeInstruction::kStoreGlobal3:
+                globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
+            case ByteCodeInstruction::kStoreGlobal2:
+                globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
+            case ByteCodeInstruction::kStoreGlobal :
+                globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
+                ++ip;
+                break;
 
             case ByteCodeInstruction::kStoreExtended: {
                 int count = READ8();
-                int target = POP().fSigned;
-                memcpy(&stack[target], sp - count + 1, count * sizeof(Value));
+                I32 target = POP().fSigned;
+                VValue* src = sp - count + 1;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                        }
+                    }
+                }
                 sp -= count;
                 break;
             }
             case ByteCodeInstruction::kStoreExtendedGlobal: {
                 int count = READ8();
-                int target = POP().fSigned;
-                SkASSERT(target + count <= globalCount);
-                memcpy(&globals[target], sp - count + 1, count * sizeof(Value));
+                I32 target = POP().fSigned;
+                VValue* src = sp - count + 1;
+                I32 m = mask();
+                for (int i = 0; i < count; ++i) {
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                        }
+                    }
+                }
                 sp -= count;
                 break;
             }
@@ -694,7 +842,8 @@
                 int target = READ8();
                 int count = READ8();
                 for (int i = count - 1; i >= 0; --i) {
-                    stack[target + *(ip + i)] = POP();
+                    stack[target + *(ip + i)] = skvx::if_then_else(
+                            mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
                 }
                 ip += count;
                 break;
@@ -704,25 +853,40 @@
                 int target = READ8();
                 int count = READ8();
                 for (int i = count - 1; i >= 0; --i) {
-                    globals[target + *(ip + i)] = POP();
+                    globals[target + *(ip + i)] = skvx::if_then_else(
+                            mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
                 }
                 ip += count;
                 break;
             }
+
             case ByteCodeInstruction::kStoreSwizzleIndirect: {
-                int target = POP().fSigned;
                 int count = READ8();
+                I32 target = POP().fSigned;
+                I32 m = mask();
                 for (int i = count - 1; i >= 0; --i) {
-                    stack[target + *(ip + i)] = POP();
+                    I32 v = POP().fSigned;
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            stack[target[j] + *(ip + i)].fSigned[j] = v[j];
+                        }
+                    }
                 }
                 ip += count;
                 break;
             }
+
             case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
-                int target = POP().fSigned;
                 int count = READ8();
+                I32 target = POP().fSigned;
+                I32 m = mask();
                 for (int i = count - 1; i >= 0; --i) {
-                    globals[target + *(ip + i)] = POP();
+                    I32 v = POP().fSigned;
+                    for (int j = 0; j < VecWidth; ++j) {
+                        if (m[j]) {
+                            globals[target[j] + *(ip + i)].fSigned[j] = v[j];
+                        }
+                    }
                 }
                 ip += count;
                 break;
@@ -732,7 +896,7 @@
             VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
 
             case ByteCodeInstruction::kSwizzle: {
-                Value tmp[4];
+                VValue tmp[4];
                 for (int i = READ8() - 1; i >= 0; --i) {
                     tmp[i] = POP();
                 }
@@ -742,70 +906,150 @@
                 break;
             }
 
-            VECTOR_UNARY_FN(kTan, tanf, fFloat)
+            VECTOR_UNARY_FN_VEC(kTan, tanf)
 
-            case ByteCodeInstruction::kWriteExternal:  // fall through
-            case ByteCodeInstruction::kWriteExternal2: // fall through
-            case ByteCodeInstruction::kWriteExternal3: // fall through
+            case ByteCodeInstruction::kWriteExternal:
+            case ByteCodeInstruction::kWriteExternal2:
+            case ByteCodeInstruction::kWriteExternal3:
             case ByteCodeInstruction::kWriteExternal4: {
-                int count = (int) inst - (int) ByteCodeInstruction::kWriteExternal + 1;
+                int count = (int)inst - (int)ByteCodeInstruction::kWriteExternal + 1;
                 int target = READ8();
-                byteCode->fExternalValues[target]->write(sp - count + 1);
+                int32_t tmp[4];
+                I32 m = mask();
                 sp -= count;
+                for (int i = 0; i < VecWidth; ++i) {
+                    if (m[i]) {
+                        for (int j = 0; j < count; ++j) {
+                            tmp[j] = sp[j + 1].fSigned[i];
+                        }
+                        byteCode->fExternalValues[target]->write(tmp);
+                    }
+                }
+                break;
+            }
+
+            case ByteCodeInstruction::kMaskPush:
+                condPtr[1] = POP().fSigned;
+                maskPtr[1] = maskPtr[0] & condPtr[1];
+                ++condPtr; ++maskPtr;
+                break;
+            case ByteCodeInstruction::kMaskPop:
+                --condPtr; --maskPtr;
+                break;
+            case ByteCodeInstruction::kMaskNegate:
+                maskPtr[0] = maskPtr[-1] & ~condPtr[0];
+                break;
+            case ByteCodeInstruction::kMaskBlend: {
+                int count = READ8();
+                I32 m = condPtr[0];
+                --condPtr; --maskPtr;
+                for (int i = 0; i < count; ++i) {
+                    sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
+                    --sp;
+                }
+                break;
+            }
+            case ByteCodeInstruction::kBranchIfAllFalse: {
+                int target = READ16();
+                if (!skvx::any(mask())) {
+                    ip = code + target;
+                }
+                break;
+            }
+
+            case ByteCodeInstruction::kLoopBegin:
+                *(++contPtr) =  0;
+                *(++loopPtr) = ~0;
+                break;
+            case ByteCodeInstruction::kLoopNext:
+                *loopPtr |= *contPtr;
+                *contPtr = 0;
+                break;
+            case ByteCodeInstruction::kLoopMask:
+                *loopPtr &= POP().fSigned;
+                break;
+            case ByteCodeInstruction::kLoopEnd:
+                --contPtr; --loopPtr;
+                break;
+            case ByteCodeInstruction::kLoopBreak:
+                *loopPtr &= ~mask();
+                break;
+            case ByteCodeInstruction::kLoopContinue: {
+                I32 m = mask();
+                *contPtr |=  m;
+                *loopPtr &= ~m;
                 break;
             }
 
             default:
                 SkDEBUGFAILF("unsupported instruction %d\n", (int) inst);
         }
+    }
+}
+
+void VecRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
+            int N, Value uniforms[], int uniformCount) {
 #ifdef TRACE
-        int stackSize = (int) (sp - stack + 1);
-        printf("STACK(%d):", stackSize);
-        for (int i = 0; i < stackSize; ++i) {
-            printf(" %d(%g)", stack[i].fSigned, stack[i].fFloat);
-        }
-        printf("\n");
+    disassemble(f);
 #endif
+    VValue smallStack[128];
+
+    SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
+    VValue smallGlobals[32];
+    VValue* globals = smallGlobals;
+    SkASSERT((int)SK_ARRAY_COUNT(smallGlobals) >= byteCode->fGlobalCount);
+    for (uint8_t slot : byteCode->fInputSlots) {
+        globals[slot].fUnsigned = (uniforms++)->fUnsigned;
+    }
+
+    while (N) {
+        VValue* stack = smallStack;
+
+        int w = std::min(N, VecWidth);
+        N -= w;
+
+        // Transpose args into stack
+        {
+            uint32_t* src = (uint32_t*)args;
+            for (int i = 0; i < w; ++i) {
+                uint32_t* dst = (uint32_t*)stack + i;
+                for (int j = f->fParameterCount; j > 0; --j) {
+                    *dst = *src++;
+                    dst += VecWidth;
+                }
+            }
+        }
+
+        auto mask = w > gLanes;
+        innerRun(byteCode, f, stack, outReturn, mask, globals, byteCode->fGlobalCount);
+
+        // Transpose out parameters back
+        {
+            uint32_t* dst = (uint32_t*)args;
+            for (int i = 0; i < w; ++i) {
+                uint32_t* src = (uint32_t*)stack + i;
+                for (const auto& p : f->fParameters) {
+                    if (p.fIsOutParameter) {
+                        for (int j = p.fSlotCount; j > 0; --j) {
+                            *dst++ = *src;
+                            src += VecWidth;
+                        }
+                    } else {
+                        dst += p.fSlotCount;
+                        src += p.fSlotCount * VecWidth;
+                    }
+                }
+            }
+        }
+
+        args += f->fParameterCount * w;
+        outReturn += f->fReturnCount * w;
     }
 }
 
 void Run(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
          Value uniforms[], int uniformCount) {
-#ifdef TRACE
-    disassemble(f);
-#endif
-    Value smallStack[128];
-    std::unique_ptr<Value[]> largeStack;
-    Value* stack = smallStack;
-    if ((int)SK_ARRAY_COUNT(smallStack) < f->fStackCount) {
-        largeStack.reset(new Value[f->fStackCount]);
-        stack = largeStack.get();
-    }
-
-    if (f->fParameterCount) {
-        memcpy(stack, args, f->fParameterCount * sizeof(Value));
-    }
-
-    SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
-    Value smallGlobals[32];
-    std::unique_ptr<Value[]> largeGlobals;
-    Value* globals = smallGlobals;
-    if ((int)SK_ARRAY_COUNT(smallGlobals) < byteCode->fGlobalCount) {
-        largeGlobals.reset(new Value[byteCode->fGlobalCount]);
-        globals = largeGlobals.get();
-    }
-    for (uint8_t slot : byteCode->fInputSlots) {
-        globals[slot] = *uniforms++;
-    }
-    innerRun(byteCode, f, stack, outReturn, globals, byteCode->fGlobalCount);
-
-    for (const auto& p : f->fParameters) {
-        if (p.fIsOutParameter) {
-            memcpy(args, stack, p.fSlotCount * sizeof(Value));
-        }
-        args += p.fSlotCount;
-        stack += p.fSlotCount;
-    }
+    VecRun(byteCode, f, args, outReturn, 1, uniforms, uniformCount);
 }
 
 } // namespace Interpreter