Interpreter: Vectorized interpreter
This converts the SkSL interpreter to operate in SIMT fashion. It handles
all the same features as the previous scalar implementation, but operates
on N lanes at a time. (Currently 16, per VecWidth below.)
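As a rough illustration — a standalone sketch, not the interpreter's code
(the real implementation uses skvx::Vec): values live in structure-of-arrays
form, one entry per lane, so each bytecode instruction advances every lane
at once:

    #include <cstdio>

    constexpr int kVecWidth = 4;  // stand-in for VecWidth; the real code uses 16

    // One stack slot: a value for every lane (structure-of-arrays layout).
    struct Lanes { float v[kVecWidth]; };

    // A single conceptual "multiplyf" instruction, applied across all lanes.
    static Lanes mul(const Lanes& a, const Lanes& b) {
        Lanes r;
        for (int i = 0; i < kVecWidth; ++i) { r.v[i] = a.v[i] * b.v[i]; }
        return r;
    }

    int main() {
        Lanes x = {{1, 2, 3, 4}}, y = {{10, 20, 30, 40}};
        Lanes r = mul(x, y);  // one instruction, kVecWidth results
        for (int i = 0; i < kVecWidth; ++i) { printf("lane %d: %g\n", i, r.v[i]); }
        return 0;
    }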
It's modeled after GPUs and other parallel architectures, using execution
masks to handle control flow, including divergent control flow.
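Roughly, an if/else executes both branches with per-lane write masks instead
of branching. A scalar sketch of what kMaskPush, kMaskNegate, and kMaskPop
accomplish (illustrative only):

    #include <cstdint>
    #include <cstdio>

    constexpr int kVecWidth = 4;

    int main() {
        float   x[kVecWidth] = {1, -2, 3, -4};
        int32_t mask[kVecWidth];               // per lane: 0 = inactive, ~0 = active

        // kMaskPush: the comparison result becomes the execution mask.
        for (int i = 0; i < kVecWidth; ++i) { mask[i] = (x[i] > 0) ? ~0 : 0; }
        // True branch: all lanes run it, but stores land only where mask is set.
        for (int i = 0; i < kVecWidth; ++i) { if (mask[i])  { x[i] *= 2; } }
        // kMaskNegate: invert, then run the false branch the same way.
        for (int i = 0; i < kVecWidth; ++i) { if (!mask[i]) { x[i] = -x[i]; } }
        // kMaskPop would restore the enclosing mask here.
        for (int i = 0; i < kVecWidth; ++i) { printf("lane %d: %g\n", i, x[i]); }
        return 0;
    }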
Change-Id: Ieb38ffe2f55a10f72bdab844c297126fe9bedb6c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217122
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
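Loops get the same treatment: lanes that fail the loop test (kLoopMask) or
hit a break (kLoopBreak) go inactive, and kBranchIfAllFalse exits once no
lane is live. A scalar sketch of the protocol (again illustrative, not the
interpreter's code):

    #include <cstdint>
    #include <cstdio>

    constexpr int kVecWidth = 4;

    int main() {
        int     n[kVecWidth]   = {1, 2, 3, 4};  // per-lane trip counts
        int     sum[kVecWidth] = {0, 0, 0, 0};
        int32_t loop[kVecWidth];                // kLoopBegin: all lanes start active
        for (int i = 0; i < kVecWidth; ++i) { loop[i] = ~0; }

        for (;;) {
            // kLoopMask: AND the loop test into the mask; failing lanes go inactive.
            bool anyActive = false;
            for (int i = 0; i < kVecWidth; ++i) {
                if (n[i] <= 0) { loop[i] = 0; }
                anyActive |= (loop[i] != 0);
            }
            // kBranchIfAllFalse: leave once every lane is done.
            if (!anyActive) { break; }
            // Loop body: stores are predicated on the loop mask.
            for (int i = 0; i < kVecWidth; ++i) {
                if (loop[i]) { sum[i] += n[i]; --n[i]; }
            }
        }
        for (int i = 0; i < kVecWidth; ++i) { printf("lane %d: sum=%d\n", i, sum[i]); }
        return 0;
    }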
diff --git a/src/sksl/SkSLInterpreter.cpp b/src/sksl/SkSLInterpreter.cpp
index deea8c3..22fec06 100644
--- a/src/sksl/SkSLInterpreter.cpp
+++ b/src/sksl/SkSLInterpreter.cpp
@@ -8,6 +8,7 @@
#ifndef SKSL_STANDALONE
#include "include/core/SkPoint3.h"
+#include "include/private/SkVx.h"
#include "src/sksl/SkSLByteCode.h"
#include "src/sksl/SkSLByteCodeGenerator.h"
#include "src/sksl/SkSLExternalValue.h"
@@ -18,6 +19,16 @@
namespace SkSL {
namespace Interpreter {
+constexpr int VecWidth = 16;
+
+using F32 = skvx::Vec<VecWidth, float>;
+using I32 = skvx::Vec<VecWidth, int32_t>;
+using U32 = skvx::Vec<VecWidth, uint32_t>;
+
+// Needs to be the first N non-negative integers, where N is at least VecWidth
+static const I32 gLanes = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
template <typename T>
static T unaligned_load(const void* ptr) {
T val;
@@ -72,9 +83,6 @@
VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
VECTOR_DISASSEMBLE(kCompareULT, "compareult")
VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
- case ByteCodeInstruction::kConditionalBranch:
- printf("conditionalbranch %d", READ16());
- break;
VECTOR_DISASSEMBLE(kConvertFtoI, "convertftoi")
VECTOR_DISASSEMBLE(kConvertStoF, "convertstof")
VECTOR_DISASSEMBLE(kConvertUtoF, "convertutof")
@@ -132,7 +140,7 @@
VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
VECTOR_MATRIX_DISASSEMBLE(kNegateF, "negatef")
VECTOR_DISASSEMBLE(kNegateI, "negatei")
- case ByteCodeInstruction::kNot: printf("not"); break;
+ case ByteCodeInstruction::kNotB: printf("notb"); break;
case ByteCodeInstruction::kOrB: printf("orb"); break;
VECTOR_MATRIX_DISASSEMBLE(kPop, "pop")
case ByteCodeInstruction::kPushImmediate: {
@@ -218,6 +226,20 @@
case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ8()); break;
case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ8()); break;
case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ8()); break;
+ case ByteCodeInstruction::kXorB: printf("xorb"); break;
+ case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
+ case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
+ case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
+ case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
+ case ByteCodeInstruction::kBranchIfAllFalse:
+ printf("branchifallfalse %d", READ16());
+ break;
+ case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
+ case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
+ case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
+ case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
+ case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
+ case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
default: printf("unknown(%d)\n", *(ip - 1)); SkASSERT(false);
}
return ip;
@@ -294,27 +316,80 @@
case ByteCodeInstruction::base: sp[ 0] = fn(sp[ 0].field); \
break;
+#define VECTOR_UNARY_FN_VEC(base, fn) \
+ case ByteCodeInstruction::base ## 4: \
+ case ByteCodeInstruction::base ## 3: \
+ case ByteCodeInstruction::base ## 2: \
+ case ByteCodeInstruction::base : { \
+ int count = (int)ByteCodeInstruction::base - (int)inst + 1; \
+ float* v = (float*)(sp - count + 1); \
+ for (int i = VecWidth * count; i > 0; --i, ++v) { \
+ *v = fn(*v); \
+ } \
+ break; \
+ }
+
+union VValue {
+ VValue() {}
+
+ VValue(F32 f)
+ : fFloat(f) {
+ }
+
+ VValue(I32 s)
+ : fSigned(s) {
+ }
+
+ VValue(U32 u)
+ : fUnsigned(u) {
+ }
+
+ F32 fFloat;
+ I32 fSigned;
+ U32 fUnsigned;
+};
+
struct StackFrame {
const uint8_t* fCode;
const uint8_t* fIP;
- Interpreter::Value* fStack;
+ VValue* fStack;
};
-static float mix(float start, float end, float t) {
+static F32 mix(F32 start, F32 end, F32 t) {
return start * (1 - t) + end * t;
}
-void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value* stack, Value* outReturn,
- Value globals[], int globalCount) {
- Value* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+// TODO: trunc on integers?
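+// Truncation-based remainder (fmod semantics): a - trunc(a / b) * b.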
+template <typename T>
+static T vec_mod(T a, T b) {
+ return a - skvx::trunc(a / b) * b;
+}
- auto POP = [&] { SkASSERT(sp >= stack); return *(sp--); };
- auto PUSH = [&](Value v) { SkASSERT(sp + 1 >= stack); *(++sp) = v; };
+void innerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack, Value* outReturn,
+ I32 initMask, VValue globals[], int globalCount) {
+ VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+
+ auto POP = [&] { SkASSERT(sp >= stack); return *(sp--); };
+ auto PUSH = [&](VValue v) { SkASSERT(sp + 1 >= stack); *(++sp) = v; };
const uint8_t* code = f->fCode.data();
const uint8_t* ip = code;
std::vector<StackFrame> frames;
+ I32 condStack[16]; // Independent condition masks
+ I32 maskStack[16]; // Combined masks (eg maskStack[0] & maskStack[1] & ...)
+ I32 contStack[16]; // Continue flags for loops
+ I32 loopStack[16]; // Loop execution masks
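+ // Invariant: the live execution mask at any point is *maskPtr & *loopPtr
+ // (see the mask() lambda below).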
+ condStack[0] = maskStack[0] = initMask;
+ contStack[0] = I32( 0);
+ loopStack[0] = I32(~0);
+ I32* condPtr = condStack;
+ I32* maskPtr = maskStack;
+ I32* contPtr = contStack;
+ I32* loopPtr = loopStack;
+
+ auto mask = [&]() { return *maskPtr & *loopPtr; };
+
for (;;) {
#ifdef TRACE
printf("at %3d ", (int) (ip - code));
@@ -325,8 +400,21 @@
switch (inst) {
VECTOR_BINARY_OP(kAddI, fSigned, +)
VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+
+ // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
case ByteCodeInstruction::kAndB:
- sp[-1] = sp[-1].fBool && sp[0].fBool;
+ sp[-1] = sp[-1].fSigned & sp[0].fSigned;
+ POP();
+ break;
+ case ByteCodeInstruction::kNotB:
+ sp[0] = ~sp[0].fSigned;
+ break;
+ case ByteCodeInstruction::kOrB:
+ sp[-1] = sp[-1].fSigned | sp[0].fSigned;
+ POP();
+ break;
+ case ByteCodeInstruction::kXorB:
+ sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
POP();
break;
@@ -340,10 +428,15 @@
// (plus space for locals).
int target = READ8();
const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
- frames.push_back({ code, ip, stack });
- ip = code = fun->fCode.data();
- stack = sp - fun->fParameterCount + 1;
- sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+ if (skvx::any(mask())) {
+ frames.push_back({ code, ip, stack });
+ ip = code = fun->fCode.data();
+ stack = sp - fun->fParameterCount + 1;
+ sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+ } else {
+ sp -= fun->fParameterCount;
+ sp += fun->fReturnCount;
+ }
break;
}
@@ -354,10 +447,23 @@
ExternalValue* v = byteCode->fExternalValues[target];
sp -= argumentCount - 1;
- Value tmp[4];
- SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmp));
- v->call(sp, tmp);
- memcpy(sp, tmp, returnCount * sizeof(Value));
+ Value tmpArgs[4];
+ Value tmpReturn[4];
+ SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(tmpArgs));
+ SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(tmpReturn));
+
+ I32 m = mask();
+ for (int i = 0; i < VecWidth; ++i) {
+ if (m[i]) {
+ for (int j = 0; j < argumentCount; ++j) {
+ tmpArgs[j].fSigned = sp[j].fSigned[i];
+ }
+ v->call(tmpArgs, tmpReturn);
+ for (int j = 0; j < returnCount; ++j) {
+ sp[j].fSigned[i] = tmpReturn[j].fSigned;
+ }
+ }
+ }
sp += returnCount - 1;
break;
}
@@ -379,45 +485,36 @@
VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
- case ByteCodeInstruction::kConditionalBranch: {
- int target = READ16();
- if (POP().fBool) {
- ip = code + target;
- }
- break;
- }
-
- case ByteCodeInstruction::kConvertFtoI4: sp[-3].fSigned = (int)sp[-3].fFloat;
- case ByteCodeInstruction::kConvertFtoI3: sp[-2].fSigned = (int)sp[-2].fFloat;
- case ByteCodeInstruction::kConvertFtoI2: sp[-1].fSigned = (int)sp[-1].fFloat;
- case ByteCodeInstruction::kConvertFtoI: sp[ 0].fSigned = (int)sp[ 0].fFloat;
+ case ByteCodeInstruction::kConvertFtoI4: sp[-3] = skvx::cast<int>(sp[-3].fFloat);
+ case ByteCodeInstruction::kConvertFtoI3: sp[-2] = skvx::cast<int>(sp[-2].fFloat);
+ case ByteCodeInstruction::kConvertFtoI2: sp[-1] = skvx::cast<int>(sp[-1].fFloat);
+ case ByteCodeInstruction::kConvertFtoI: sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
break;
- case ByteCodeInstruction::kConvertStoF4: sp[-3].fFloat = sp[-3].fSigned;
- case ByteCodeInstruction::kConvertStoF3: sp[-2].fFloat = sp[-2].fSigned;
- case ByteCodeInstruction::kConvertStoF2: sp[-1].fFloat = sp[-1].fSigned;
- case ByteCodeInstruction::kConvertStoF : sp[ 0].fFloat = sp[ 0].fSigned;
+ case ByteCodeInstruction::kConvertStoF4: sp[-3] = skvx::cast<float>(sp[-3].fSigned);
+ case ByteCodeInstruction::kConvertStoF3: sp[-2] = skvx::cast<float>(sp[-2].fSigned);
+ case ByteCodeInstruction::kConvertStoF2: sp[-1] = skvx::cast<float>(sp[-1].fSigned);
+ case ByteCodeInstruction::kConvertStoF : sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
break;
- case ByteCodeInstruction::kConvertUtoF4: sp[-3].fFloat = sp[-3].fUnsigned;
- case ByteCodeInstruction::kConvertUtoF3: sp[-2].fFloat = sp[-2].fUnsigned;
- case ByteCodeInstruction::kConvertUtoF2: sp[-1].fFloat = sp[-1].fUnsigned;
- case ByteCodeInstruction::kConvertUtoF : sp[ 0].fFloat = sp[ 0].fUnsigned;
+ case ByteCodeInstruction::kConvertUtoF4: sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
+ case ByteCodeInstruction::kConvertUtoF3: sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
+ case ByteCodeInstruction::kConvertUtoF2: sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
+ case ByteCodeInstruction::kConvertUtoF : sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
break;
- VECTOR_UNARY_FN(kCos, cosf, fFloat)
+ VECTOR_UNARY_FN_VEC(kCos, cosf)
case ByteCodeInstruction::kCross: {
- SkPoint3 cross = SkPoint3::CrossProduct(SkPoint3::Make(sp[-5].fFloat,
- sp[-4].fFloat,
- sp[-3].fFloat),
- SkPoint3::Make(sp[-2].fFloat,
- sp[-1].fFloat,
- sp[ 0].fFloat));
+ F32 ax = sp[-5].fFloat, ay = sp[-4].fFloat, az = sp[-3].fFloat,
+ bx = sp[-2].fFloat, by = sp[-1].fFloat, bz = sp[ 0].fFloat;
+ F32 cx = ay*bz - az*by,
+ cy = az*bx - ax*bz,
+ cz = ax*by - ay*bx;
sp -= 3;
- sp[-2] = cross.fX;
- sp[-1] = cross.fY;
- sp[ 0] = cross.fZ;
+ sp[-2] = cx;
+ sp[-1] = cy;
+ sp[ 0] = cz;
break;
}
@@ -433,7 +530,7 @@
case ByteCodeInstruction::kDupN: {
int count = READ8();
- memcpy(sp + 1, sp - count + 1, count * sizeof(Value));
+ memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
sp += count;
break;
}
@@ -457,17 +554,30 @@
case ByteCodeInstruction::kLoadExtended: {
int count = READ8();
- int src = POP().fSigned;
- memcpy(sp + 1, &stack[src], count * sizeof(Value));
+ I32 src = POP().fSigned;
+ I32 m = mask();
+ for (int i = 0; i < count; ++i) {
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
+ }
+ }
+ }
sp += count;
break;
}
case ByteCodeInstruction::kLoadExtendedGlobal: {
int count = READ8();
- int src = POP().fSigned;
- SkASSERT(src + count <= globalCount);
- memcpy(sp + 1, &globals[src], count * sizeof(Value));
+ I32 src = POP().fSigned;
+ I32 m = mask();
+ for (int i = 0; i < count; ++i) {
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
+ }
+ }
+ }
sp += count;
break;
}
@@ -502,15 +612,17 @@
SkASSERT(srcRows >= 2 && srcRows <= 4);
SkASSERT(dstCols >= 2 && dstCols <= 4);
SkASSERT(dstRows >= 2 && dstRows <= 4);
- SkMatrix44 m;
+ F32 tmp[16];
+ memset(tmp, 0, sizeof(tmp));
+ tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
for (int c = srcCols - 1; c >= 0; --c) {
for (int r = srcRows - 1; r >= 0; --r) {
- m.set(r, c, POP().fFloat);
+ tmp[c*4 + r] = POP().fFloat;
}
}
for (int c = 0; c < dstCols; ++c) {
for (int r = 0; r < dstRows; ++r) {
- PUSH(m.get(r, c));
+ PUSH(tmp[c*4 + r]);
}
}
break;
@@ -521,9 +633,9 @@
int lRows = READ8();
int rCols = READ8();
int rRows = lCols;
- float tmp[16] = { 0.0f };
- float* B = &(sp - (rCols * rRows) + 1)->fFloat;
- float* A = B - (lCols * lRows);
+ F32 tmp[16] = { 0.0f };
+ F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
+ F32* A = B - (lCols * lRows);
for (int c = 0; c < rCols; ++c) {
for (int r = 0; r < lRows; ++r) {
for (int j = 0; j < lCols; ++j) {
@@ -532,7 +644,7 @@
}
}
sp -= (lCols * lRows) + (rCols * rRows);
- memcpy(sp + 1, tmp, rCols * lRows * sizeof(Value));
+ memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
sp += (rCols * lRows);
break;
}
@@ -564,10 +676,6 @@
VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
- case ByteCodeInstruction::kNot:
- sp[0].fBool = !sp[0].fBool;
- break;
-
case ByteCodeInstruction::kNegateF4: sp[-3] = -sp[-3].fFloat;
case ByteCodeInstruction::kNegateF3: sp[-2] = -sp[-2].fFloat;
case ByteCodeInstruction::kNegateF2: sp[-1] = -sp[-1].fFloat;
@@ -585,14 +693,9 @@
case ByteCodeInstruction::kNegateI4: sp[-3] = -sp[-3].fSigned;
case ByteCodeInstruction::kNegateI3: sp[-2] = -sp[-2].fSigned;
case ByteCodeInstruction::kNegateI2: sp[-1] = -sp[-1].fSigned;
- case ByteCodeInstruction::kNegateI : sp[ 0] = -sp [0].fSigned;
+ case ByteCodeInstruction::kNegateI : sp[ 0] = -sp[ 0].fSigned;
break;
- case ByteCodeInstruction::kOrB:
- sp[-1] = sp[-1].fBool || sp[0].fBool;
- POP();
- break;
-
case ByteCodeInstruction::kPop4: POP();
case ByteCodeInstruction::kPop3: POP();
case ByteCodeInstruction::kPop2: POP();
@@ -604,34 +707,56 @@
break;
case ByteCodeInstruction::kPushImmediate:
- PUSH(READ32());
+ PUSH(U32(READ32()));
break;
- case ByteCodeInstruction::kReadExternal: // fall through
- case ByteCodeInstruction::kReadExternal2: // fall through
- case ByteCodeInstruction::kReadExternal3: // fall through
+ case ByteCodeInstruction::kReadExternal:
+ case ByteCodeInstruction::kReadExternal2:
+ case ByteCodeInstruction::kReadExternal3:
case ByteCodeInstruction::kReadExternal4: {
+ // TODO: Support striped external values, or passing lane index? This model is odd.
+ int count = (int)inst - (int)ByteCodeInstruction::kReadExternal + 1;
int src = READ8();
- byteCode->fExternalValues[src]->read(sp + 1);
- sp += (int) inst - (int) ByteCodeInstruction::kReadExternal + 1;
+ int32_t tmp[4];
+ I32 m = mask();
+ for (int i = 0; i < VecWidth; ++i) {
+ if (m[i]) {
+ byteCode->fExternalValues[src]->read(tmp);
+ for (int j = 0; j < count; ++j) {
+ sp[j + 1].fSigned[i] = tmp[j];
+ }
+ }
+ }
+ sp += count;
break;
}
- VECTOR_BINARY_FN(kRemainderF, fFloat, fmodf)
- VECTOR_BINARY_OP(kRemainderS, fSigned, %)
- VECTOR_BINARY_OP(kRemainderU, fUnsigned, %)
+ VECTOR_BINARY_FN(kRemainderF, fFloat, vec_mod<F32>)
+ VECTOR_BINARY_FN(kRemainderS, fSigned, vec_mod<I32>)
+ VECTOR_BINARY_FN(kRemainderU, fUnsigned, vec_mod<U32>)
case ByteCodeInstruction::kReturn: {
int count = READ8();
if (frames.empty()) {
if (outReturn) {
- memcpy(outReturn, sp - count + 1, count * sizeof(Value));
+ // TODO: This can be smarter, knowing that mask is left-justified
+ I32 m = mask();
+ VValue* src = sp - count + 1;
+ for (int i = 0; i < count; ++i) {
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ outReturn[count * j].fSigned = src->fSigned[j];
+ }
+ }
+ ++outReturn;
+ ++src;
+ }
}
return;
} else {
// When we were called, 'stack' was positioned at the old top-of-stack (where
// our parameters were placed). So copy our return values to that same spot.
- memmove(stack, sp - count + 1, count * sizeof(Value));
+ memmove(stack, sp - count + 1, count * sizeof(VValue));
// Now move the stack pointer to the end of the just-pushed return values,
// and restore everything else.
@@ -648,44 +773,67 @@
case ByteCodeInstruction::kScalarToMatrix: {
int cols = READ8();
int rows = READ8();
- Value v = POP();
+ VValue v = POP();
for (int c = 0; c < cols; ++c) {
for (int r = 0; r < rows; ++r) {
- PUSH(c == r ? v : 0.0f);
+ PUSH(c == r ? v : F32(0.0f));
}
}
break;
}
- VECTOR_UNARY_FN(kSin, sinf, fFloat)
- VECTOR_UNARY_FN(kSqrt, sqrtf, fFloat)
+ VECTOR_UNARY_FN_VEC(kSin, sinf)
+ VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
- case ByteCodeInstruction::kStore4: stack[*ip + 3] = POP();
- case ByteCodeInstruction::kStore3: stack[*ip + 2] = POP();
- case ByteCodeInstruction::kStore2: stack[*ip + 1] = POP();
- case ByteCodeInstruction::kStore : stack[*ip + 0] = POP();
- ++ip;
- break;
+ case ByteCodeInstruction::kStore4:
+ stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
+ case ByteCodeInstruction::kStore3:
+ stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
+ case ByteCodeInstruction::kStore2:
+ stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
+ case ByteCodeInstruction::kStore :
+ stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
+ ++ip;
+ break;
- case ByteCodeInstruction::kStoreGlobal4: globals[*ip + 3] = POP();
- case ByteCodeInstruction::kStoreGlobal3: globals[*ip + 2] = POP();
- case ByteCodeInstruction::kStoreGlobal2: globals[*ip + 1] = POP();
- case ByteCodeInstruction::kStoreGlobal : globals[*ip + 0] = POP();
- ++ip;
- break;
+ case ByteCodeInstruction::kStoreGlobal4:
+ globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
+ case ByteCodeInstruction::kStoreGlobal3:
+ globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
+ case ByteCodeInstruction::kStoreGlobal2:
+ globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
+ case ByteCodeInstruction::kStoreGlobal :
+ globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
+ ++ip;
+ break;
case ByteCodeInstruction::kStoreExtended: {
int count = READ8();
- int target = POP().fSigned;
- memcpy(&stack[target], sp - count + 1, count * sizeof(Value));
+ I32 target = POP().fSigned;
+ VValue* src = sp - count + 1;
+ I32 m = mask();
+ for (int i = 0; i < count; ++i) {
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
+ }
+ }
+ }
sp -= count;
break;
}
case ByteCodeInstruction::kStoreExtendedGlobal: {
int count = READ8();
- int target = POP().fSigned;
- SkASSERT(target + count <= globalCount);
- memcpy(&globals[target], sp - count + 1, count * sizeof(Value));
+ I32 target = POP().fSigned;
+ VValue* src = sp - count + 1;
+ I32 m = mask();
+ for (int i = 0; i < count; ++i) {
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
+ }
+ }
+ }
sp -= count;
break;
}
@@ -694,7 +842,8 @@
int target = READ8();
int count = READ8();
for (int i = count - 1; i >= 0; --i) {
- stack[target + *(ip + i)] = POP();
+ stack[target + *(ip + i)] = skvx::if_then_else(
+ mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
}
ip += count;
break;
@@ -704,25 +853,40 @@
int target = READ8();
int count = READ8();
for (int i = count - 1; i >= 0; --i) {
- globals[target + *(ip + i)] = POP();
+ globals[target + *(ip + i)] = skvx::if_then_else(
+ mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
}
ip += count;
break;
}
+
case ByteCodeInstruction::kStoreSwizzleIndirect: {
- int target = POP().fSigned;
int count = READ8();
+ I32 target = POP().fSigned;
+ I32 m = mask();
for (int i = count - 1; i >= 0; --i) {
- stack[target + *(ip + i)] = POP();
+ I32 v = POP().fSigned;
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ stack[target[j] + *(ip + i)].fSigned[j] = v[j];
+ }
+ }
}
ip += count;
break;
}
+
case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
- int target = POP().fSigned;
int count = READ8();
+ I32 target = POP().fSigned;
+ I32 m = mask();
for (int i = count - 1; i >= 0; --i) {
- globals[target + *(ip + i)] = POP();
+ I32 v = POP().fSigned;
+ for (int j = 0; j < VecWidth; ++j) {
+ if (m[j]) {
+ globals[target[j] + *(ip + i)].fSigned[j] = v[j];
+ }
+ }
}
ip += count;
break;
@@ -732,7 +896,7 @@
VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
case ByteCodeInstruction::kSwizzle: {
- Value tmp[4];
+ VValue tmp[4];
for (int i = READ8() - 1; i >= 0; --i) {
tmp[i] = POP();
}
@@ -742,70 +906,150 @@
break;
}
- VECTOR_UNARY_FN(kTan, tanf, fFloat)
+ VECTOR_UNARY_FN_VEC(kTan, tanf)
- case ByteCodeInstruction::kWriteExternal: // fall through
- case ByteCodeInstruction::kWriteExternal2: // fall through
- case ByteCodeInstruction::kWriteExternal3: // fall through
+ case ByteCodeInstruction::kWriteExternal:
+ case ByteCodeInstruction::kWriteExternal2:
+ case ByteCodeInstruction::kWriteExternal3:
case ByteCodeInstruction::kWriteExternal4: {
- int count = (int) inst - (int) ByteCodeInstruction::kWriteExternal + 1;
+ int count = (int)inst - (int)ByteCodeInstruction::kWriteExternal + 1;
int target = READ8();
- byteCode->fExternalValues[target]->write(sp - count + 1);
+ int32_t tmp[4];
+ I32 m = mask();
sp -= count;
+ for (int i = 0; i < VecWidth; ++i) {
+ if (m[i]) {
+ for (int j = 0; j < count; ++j) {
+ tmp[j] = sp[j + 1].fSigned[i];
+ }
+ byteCode->fExternalValues[target]->write(tmp);
+ }
+ }
+ break;
+ }
+
+ case ByteCodeInstruction::kMaskPush:
+ condPtr[1] = POP().fSigned;
+ maskPtr[1] = maskPtr[0] & condPtr[1];
+ ++condPtr; ++maskPtr;
+ break;
+ case ByteCodeInstruction::kMaskPop:
+ --condPtr; --maskPtr;
+ break;
+ case ByteCodeInstruction::kMaskNegate:
+ maskPtr[0] = maskPtr[-1] & ~condPtr[0];
+ break;
+ case ByteCodeInstruction::kMaskBlend: {
+ int count = READ8();
+ I32 m = condPtr[0];
+ --condPtr; --maskPtr;
+ for (int i = 0; i < count; ++i) {
+ sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
+ --sp;
+ }
+ break;
+ }
+ case ByteCodeInstruction::kBranchIfAllFalse: {
+ int target = READ16();
+ if (!skvx::any(mask())) {
+ ip = code + target;
+ }
+ break;
+ }
+
+ case ByteCodeInstruction::kLoopBegin:
+ *(++contPtr) = 0;
+ *(++loopPtr) = ~0;
+ break;
+ case ByteCodeInstruction::kLoopNext:
+ *loopPtr |= *contPtr;
+ *contPtr = 0;
+ break;
+ case ByteCodeInstruction::kLoopMask:
+ *loopPtr &= POP().fSigned;
+ break;
+ case ByteCodeInstruction::kLoopEnd:
+ --contPtr; --loopPtr;
+ break;
+ case ByteCodeInstruction::kLoopBreak:
+ *loopPtr &= ~mask();
+ break;
+ case ByteCodeInstruction::kLoopContinue: {
+ I32 m = mask();
+ *contPtr |= m;
+ *loopPtr &= ~m;
break;
}
default:
SkDEBUGFAILF("unsupported instruction %d\n", (int) inst);
}
+ }
+}
+
+void VecRun(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
+ int N, Value uniforms[], int uniformCount) {
#ifdef TRACE
- int stackSize = (int) (sp - stack + 1);
- printf("STACK(%d):", stackSize);
- for (int i = 0; i < stackSize; ++i) {
- printf(" %d(%g)", stack[i].fSigned, stack[i].fFloat);
- }
- printf("\n");
+ disassemble(f);
#endif
+ VValue smallStack[128];
+
+ SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
+ VValue smallGlobals[32];
+ VValue* globals = smallGlobals;
+ SkASSERT((int)SK_ARRAY_COUNT(smallGlobals) >= byteCode->fGlobalCount);
+ for (uint8_t slot : byteCode->fInputSlots) {
+ globals[slot].fUnsigned = (uniforms++)->fUnsigned;
+ }
+
+ while (N) {
+ VValue* stack = smallStack;
+
+ int w = std::min(N, VecWidth);
+ N -= w;
+
+ // Transpose args into stack
+ {
+ uint32_t* src = (uint32_t*)args;
+ for (int i = 0; i < w; ++i) {
+ uint32_t* dst = (uint32_t*)stack + i;
+ for (int j = f->fParameterCount; j > 0; --j) {
+ *dst = *src++;
+ dst += VecWidth;
+ }
+ }
+ }
+
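+ // gLanes is {0, 1, 2, ...}, so (w > gLanes) yields an all-ones mask in
+ // lanes [0, w) and zeroes in the tail, keeping unused lanes inactive when
+ // fewer than VecWidth invocations remain.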
+ auto mask = w > gLanes;
+ innerRun(byteCode, f, stack, outReturn, mask, globals, byteCode->fGlobalCount);
+
+ // Transpose out parameters back
+ {
+ uint32_t* dst = (uint32_t*)args;
+ for (int i = 0; i < w; ++i) {
+ uint32_t* src = (uint32_t*)stack + i;
+ for (const auto& p : f->fParameters) {
+ if (p.fIsOutParameter) {
+ for (int j = p.fSlotCount; j > 0; --j) {
+ *dst++ = *src;
+ src += VecWidth;
+ }
+ } else {
+ dst += p.fSlotCount;
+ src += p.fSlotCount * VecWidth;
+ }
+ }
+ }
+ }
+
+ args += f->fParameterCount * w;
+ outReturn += f->fReturnCount * w;
}
}
void Run(const ByteCode* byteCode, const ByteCodeFunction* f, Value args[], Value* outReturn,
Value uniforms[], int uniformCount) {
-#ifdef TRACE
- disassemble(f);
-#endif
- Value smallStack[128];
- std::unique_ptr<Value[]> largeStack;
- Value* stack = smallStack;
- if ((int)SK_ARRAY_COUNT(smallStack) < f->fStackCount) {
- largeStack.reset(new Value[f->fStackCount]);
- stack = largeStack.get();
- }
-
- if (f->fParameterCount) {
- memcpy(stack, args, f->fParameterCount * sizeof(Value));
- }
-
- SkASSERT(uniformCount == (int)byteCode->fInputSlots.size());
- Value smallGlobals[32];
- std::unique_ptr<Value[]> largeGlobals;
- Value* globals = smallGlobals;
- if ((int)SK_ARRAY_COUNT(smallGlobals) < byteCode->fGlobalCount) {
- largeGlobals.reset(new Value[byteCode->fGlobalCount]);
- globals = largeGlobals.get();
- }
- for (uint8_t slot : byteCode->fInputSlots) {
- globals[slot] = *uniforms++;
- }
- innerRun(byteCode, f, stack, outReturn, globals, byteCode->fGlobalCount);
-
- for (const auto& p : f->fParameters) {
- if (p.fIsOutParameter) {
- memcpy(args, stack, p.fSlotCount * sizeof(Value));
- }
- args += p.fSlotCount;
- stack += p.fSlotCount;
- }
+ VecRun(byteCode, f, args, outReturn, 1, uniforms, uniformCount);
}
} // namespace Interpreter