move skvm interpreter to SkOpts again
Using platform intrinsics is the easiest way to
guarantee Op::fma_f32 actually fuses.
While implementing this we noticed that quad-pumping
was actually about 25% slower than double-pumping,
and single-pumping fell between the two. Switch from
quad- to double-pumping.
Change-Id: Ib93fd175fb8f6aaf49f769a95edfa9fd6b2674f6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/275299
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Herb Derby <herb@google.com>
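For context before the diff: a minimal sketch, not part of the change, of what "fuses" means here and why intrinsics guarantee it. The function names and toy signatures are illustrative only, and the AVX2 lines assume a compiler invoked with -mavx2 -mfma.

#include <immintrin.h>   // _mm256_fmadd_ps, _mm256_mul_ps, _mm256_add_ps
#include <cmath>         // std::fma

// One instruction, one rounding: x*y + z is fused by construction.
__m256 fused(__m256 x, __m256 y, __m256 z) { return _mm256_fmadd_ps(x, y, z); }

// Separate multiply and add: two roundings, and the compiler may or may not
// contract them into an FMA depending on its flags.
__m256 unfused(__m256 x, __m256 y, __m256 z) { return _mm256_add_ps(_mm256_mul_ps(x, y), z); }

// Always mathematically fused, but may lower to a slow library call without hardware FMA.
float scalar(float x, float y, float z) { return std::fma(x, y, z); }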
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index 9f813a5..0f571a3 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -28,7 +28,7 @@
#include <cstring> // memcpy()
#include <initializer_list> // std::initializer_list
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__AVX2__)
#include <immintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
@@ -416,6 +416,20 @@
#endif
}
+// fma() delivers a fused mul-add, even if that's really expensive. Call it only when you know it's not.
+static inline Vec<1,float> fma(const Vec<1,float>& x,
+ const Vec<1,float>& y,
+ const Vec<1,float>& z) {
+ return std::fma(x.val, y.val, z.val);
+}
+template <int N>
+static inline Vec<N,float> fma(const Vec<N,float>& x,
+ const Vec<N,float>& y,
+ const Vec<N,float>& z) {
+ return join(fma(x.lo, y.lo, z.lo),
+ fma(x.hi, y.hi, z.hi));
+}
+
// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
template <int N>
static inline Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
@@ -521,6 +535,33 @@
}
#endif
+ #if defined(__AVX2__)
+ static inline Vec<4,float> fma(const Vec<4,float>& x,
+ const Vec<4,float>& y,
+ const Vec<4,float>& z) {
+ return bit_pun<Vec<4,float>>(_mm_fmadd_ps(bit_pun<__m128>(x),
+ bit_pun<__m128>(y),
+ bit_pun<__m128>(z)));
+ }
+
+ static inline Vec<8,float> fma(const Vec<8,float>& x,
+ const Vec<8,float>& y,
+ const Vec<8,float>& z) {
+ return bit_pun<Vec<8,float>>(_mm256_fmadd_ps(bit_pun<__m256>(x),
+ bit_pun<__m256>(y),
+ bit_pun<__m256>(z)));
+ }
+ #elif defined(__aarch64__)
+ static inline Vec<4,float> fma(const Vec<4,float>& x,
+ const Vec<4,float>& y,
+ const Vec<4,float>& z) {
+ // These instructions tend to work like z += xy, so the order here is z,x,y.
+ return bit_pun<Vec<4,float>>(vfmaq_f32(bit_pun<float32x4_t>(z),
+ bit_pun<float32x4_t>(x),
+ bit_pun<float32x4_t>(y)));
+ }
+ #endif
+
#endif // !defined(SKNX_NO_SIMD)
} // namespace skvx
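A hedged usage sketch of the new skvx::fma(); the helper mad8() is made up for illustration. With __AVX2__ the N=4 and N=8 overloads lower to _mm_fmadd_ps/_mm256_fmadd_ps, on aarch64 the N=4 overload lowers to vfmaq_f32, and otherwise the generic template recurses through join() down to the scalar std::fma() overload.

#include "include/private/SkVx.h"

// Illustrative helper, not part of the patch.
static skvx::Vec<8,float> mad8(skvx::Vec<8,float> x,
                               skvx::Vec<8,float> y,
                               skvx::Vec<8,float> z) {
    // A single vfmadd*ps under __AVX2__; split into halves + std::fma per lane otherwise.
    return skvx::fma(x, y, z);
}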
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index fe4344d..1f54593 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -36,6 +36,7 @@
#define SK_OPTS_NS portable
#endif
+#include "src/core/SkCubicSolver.h"
#include "src/opts/SkBitmapProcState_opts.h"
#include "src/opts/SkBlitMask_opts.h"
#include "src/opts/SkBlitRow_opts.h"
@@ -43,10 +44,9 @@
#include "src/opts/SkRasterPipeline_opts.h"
#include "src/opts/SkSwizzler_opts.h"
#include "src/opts/SkUtils_opts.h"
+#include "src/opts/SkVM_opts.h"
#include "src/opts/SkXfermode_opts.h"
-#include "src/core/SkCubicSolver.h"
-
namespace SkOpts {
// Define default function pointer values here...
// If our global compile options are set high enough, these defaults might even be
@@ -85,6 +85,8 @@
DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
DEFINE_DEFAULT(S32_alpha_D32_filter_DXDY);
+
+ DEFINE_DEFAULT(interpret_skvm);
#undef DEFINE_DEFAULT
#define M(st) (StageFn)SK_OPTS_NS::st,
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index dd1cc9d..5b8e773 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -13,6 +13,7 @@
#include "src/core/SkXfermodePriv.h"
struct SkBitmapProcState;
+namespace skvm { struct InterpreterInstruction; }
namespace SkOpts {
// Call to replace pointers to portable functions with pointers to CPU-specific functions.
@@ -77,6 +78,9 @@
extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
#undef M
+ extern void (*interpret_skvm)(const skvm::InterpreterInstruction insts[], int ninsts,
+ int nregs, int loop, const int strides[], int nargs,
+ int n, void* args[]);
}
#endif//SkOpts_DEFINED
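To illustrate the function-pointer pattern this header relies on, here is a simplified standalone sketch with toy names, not Skia's actual SkOpts machinery: a portable default is installed at static-init time and swapped for a CPU-specific build once features are detected.

#include <cstdio>

// Toy version of the SkOpts pattern: a global function pointer starts at a
// portable default, and an Init_*() routine re-points it at a faster build.
static void interpret_portable(int n) { std::printf("portable x%d\n", n); }
static void interpret_hsw     (int n) { std::printf("avx2+fma x%d\n", n); }

static void (*interpret)(int) = interpret_portable;

static void InitForHaswell() { interpret = interpret_hsw; }

int main() {
    interpret(8);        // portable default
    InitForHaswell();    // e.g. after runtime CPU feature detection
    interpret(8);        // now the AVX2+FMA version
}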
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index bb437a9..d07dc62 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -48,10 +48,10 @@
namespace skvm {
struct Program::Impl {
- std::vector<Instruction> instructions;
- int regs = 0;
- int loop = 0;
- std::vector<int> strides;
+ std::vector<InterpreterInstruction> instructions;
+ int regs = 0;
+ int loop = 0;
+ std::vector<int> strides;
std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders
size_t jit_size = 0;
@@ -272,7 +272,7 @@
o->writeDecAsText(i);
o->writeText("\t");
if (i >= fImpl->loop) { write(o, " "); }
- const Program::Instruction& inst = fImpl->instructions[i];
+ const InterpreterInstruction& inst = fImpl->instructions[i];
Op op = inst.op;
Reg d = inst.d,
x = inst.x,
@@ -377,7 +377,7 @@
std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
// If requested, first specialize for our JIT backend.
- auto specialize_for_jit = [&]() -> std::vector<Instruction> {
+ auto specialize_for_jit = [&]() -> std::vector<Builder::Instruction> {
Builder specialized;
for (int i = 0; i < (int)fProgram.size(); i++) {
Builder::Instruction inst = fProgram[i];
@@ -1678,270 +1678,9 @@
}
// So we'll sometimes use the interpreter here even if later calls will use the JIT.
- this->interpret(n, args);
- }
-
- void Program::interpret(int n, void* args[]) const {
- // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
- constexpr int K = 16;
- using I32 = skvx::Vec<K, int>;
- using F32 = skvx::Vec<K, float>;
- using U32 = skvx::Vec<K, uint32_t>;
- using U16 = skvx::Vec<K, uint16_t>;
- using U8 = skvx::Vec<K, uint8_t>;
-
- using I16x2 = skvx::Vec<2*K, int16_t>;
- using U16x2 = skvx::Vec<2*K, uint16_t>;
-
- union Slot {
- F32 f32;
- I32 i32;
- U32 u32;
- I16x2 i16x2;
- U16x2 u16x2;
- };
-
- Slot few_regs[16];
- std::unique_ptr<char[]> many_regs;
-
- Slot* regs = few_regs;
-
- if (fImpl->regs > (int)SK_ARRAY_COUNT(few_regs)) {
- // Annoyingly we can't trust that malloc() or new will work with Slot because
- // the skvx::Vec types may have alignment greater than what they provide.
- // We'll overallocate one extra register so we can align manually.
- many_regs.reset(new char[ sizeof(Slot) * (fImpl->regs + 1) ]);
-
- uintptr_t addr = (uintptr_t)many_regs.get();
- addr += alignof(Slot) -
- (addr & (alignof(Slot) - 1));
- SkASSERT((addr & (alignof(Slot) - 1)) == 0);
- regs = (Slot*)addr;
- }
-
-
- auto r = [&](Reg id) -> Slot& {
- SkASSERT(0 <= id && id < fImpl->regs);
- return regs[id];
- };
- auto arg = [&](int ix) {
- SkASSERT(0 <= ix && ix < (int)fImpl->strides.size());
- return args[ix];
- };
-
- // Step each argument pointer ahead by its stride a number of times.
- auto step_args = [&](int times) {
- for (int i = 0; i < (int)fImpl->strides.size(); i++) {
- args[i] = (void*)( (char*)args[i] + times * fImpl->strides[i] );
- }
- };
-
- int start = 0,
- stride;
- for ( ; n > 0; start = fImpl->loop, n -= stride, step_args(stride)) {
- stride = n >= K ? K : 1;
-
- for (int i = start; i < (int)fImpl->instructions.size(); i++) {
- Instruction inst = fImpl->instructions[i];
-
- // d = op(x,y/imm,z/imm)
- Reg d = inst.d,
- x = inst.x,
- y = inst.y,
- z = inst.z;
- int immy = inst.immy,
- immz = inst.immz;
-
- // Ops that interact with memory need to know whether we're stride=1 or K,
- // but all non-memory ops can run the same code no matter the stride.
- switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
- default: SkUNREACHABLE;
-
- #define STRIDE_1(op) case 2*(int)op
- #define STRIDE_K(op) case 2*(int)op + 1
- STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
- STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
- STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;
-
- STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
- STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
- STRIDE_K(Op::store32): (r(x).i32).store(arg(immy)); break;
-
- STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
- STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
- STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;
-
- STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
- STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
- STRIDE_K(Op::load32): r(d).i32= I32::Load(arg(immy)) ; break;
-
- // The pointer we base our gather on is loaded indirectly from a uniform:
- // - arg(immy) is the uniform holding our gather base pointer somewhere;
- // - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
- // - memcpy() loads the gather base and into a pointer of the right type.
- // After all that we have an ordinary (uniform) pointer `ptr` to load from,
- // and we then gather from it using the varying indices in r(x).
- STRIDE_1(Op::gather8):
- for (int i = 0; i < K; i++) {
- const uint8_t* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
- } break;
- STRIDE_1(Op::gather16):
- for (int i = 0; i < K; i++) {
- const uint16_t* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
- } break;
- STRIDE_1(Op::gather32):
- for (int i = 0; i < K; i++) {
- const int* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
- } break;
-
- STRIDE_K(Op::gather8):
- for (int i = 0; i < K; i++) {
- const uint8_t* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = ptr[ r(x).i32[i] ];
- } break;
- STRIDE_K(Op::gather16):
- for (int i = 0; i < K; i++) {
- const uint16_t* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = ptr[ r(x).i32[i] ];
- } break;
- STRIDE_K(Op::gather32):
- for (int i = 0; i < K; i++) {
- const int* ptr;
- memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
- r(d).i32[i] = ptr[ r(x).i32[i] ];
- } break;
-
- #undef STRIDE_1
- #undef STRIDE_K
-
- // Ops that don't interact with memory should never care about the stride.
- #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
-
- CASE(Op::assert_true):
- #ifdef SK_DEBUG
- if (!all(r(x).i32)) {
- this->dump();
- SkDebugf("inst %d, register %d\n", i, y);
- for (int i = 0; i < K; i++) {
- SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
- }
- }
- SkASSERT(all(r(x).i32));
- #endif
- break;
-
- CASE(Op::index): static_assert(K == 16, "");
- r(d).i32 = n - I32{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
- break;
-
- CASE(Op::uniform8):
- r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
- break;
- CASE(Op::uniform16):
- r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
- break;
- CASE(Op::uniform32):
- r(d).i32 = *(const int* )( (const char*)arg(immy) + immz );
- break;
-
- CASE(Op::splat): r(d).i32 = immy; break;
-
- CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
- CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
- CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
- CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
- CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
- CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
-
- // These _imm instructions are all x86/JIT only.
- CASE(Op::add_f32_imm):
- CASE(Op::sub_f32_imm):
- CASE(Op::mul_f32_imm):
- CASE(Op::min_f32_imm):
- CASE(Op::max_f32_imm):
- CASE(Op::bit_and_imm):
- CASE(Op::bit_or_imm ):
- CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
-
- CASE(Op::fma_f32): {
- // TODO: vectorized skvx calls
- for (int i = 0; i < K; i++) {
- r(d).f32[i] = std::fma(r(x).f32[i], r(y).f32[i], r(z).f32[i]);
- }
- } break;
-
- CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;
-
- CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
- CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
- CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
-
- CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
- CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
- CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
-
- CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
- CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
- CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;
-
- CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
- CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
- CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;
-
- CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
- CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
- CASE(Op:: gt_f32): r(d).i32 = r(x).f32 > r(y).f32; break;
- CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
-
- CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
- CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
- CASE(Op:: gt_i32): r(d).i32 = r(x).i32 > r(y).i32; break;
- CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
-
- CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
- CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
- CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 > r(y).i16x2; break;
- CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
-
- CASE(Op::bit_and ): r(d).i32 = r(x).i32 & r(y).i32; break;
- CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
- CASE(Op::bit_xor ): r(d).i32 = r(x).i32 ^ r(y).i32; break;
- CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
-
- CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
- break;
-
- CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
-
- CASE(Op::bytes): {
- const U32 table[] = {
- 0,
- (r(x).u32 ) & 0xff,
- (r(x).u32 >> 8) & 0xff,
- (r(x).u32 >> 16) & 0xff,
- (r(x).u32 >> 24) & 0xff,
- };
- r(d).u32 = table[(immy >> 0) & 0xf] << 0
- | table[(immy >> 4) & 0xf] << 8
- | table[(immy >> 8) & 0xf] << 16
- | table[(immy >> 12) & 0xf] << 24;
- } break;
-
- CASE(Op::floor): r(d).f32 = skvx::floor(r(x).f32); break;
- CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
- CASE(Op::trunc): r(d).i32 = skvx::cast<int> (r(x).f32); break;
- #undef CASE
- }
- }
- }
+ SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
+ this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
+ n, args);
}
#if defined(SKVM_LLVM)
@@ -2433,13 +2172,13 @@
#endif
}
- std::vector<Program::Instruction> Program::instructions() const { return fImpl->instructions; }
+ std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
int Program::nargs() const { return (int)fImpl->strides.size(); }
int Program::nregs() const { return fImpl->regs; }
int Program::loop () const { return fImpl->loop; }
bool Program::empty() const { return fImpl->instructions.empty(); }
- // Translate OptimizedInstructions to Program::Instructions used by the interpreter.
+ // Translate OptimizedInstructions to InterpreterInstructions.
void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
// Register each instruction is assigned to.
std::vector<Reg> reg(instructions.size());
@@ -2496,7 +2235,7 @@
if (!hoisted(id)) { assign_register(id); }
}
- // Translate OptimizedInstructions to Program::Instructions by mapping values to
+ // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
// registers. This will be two passes, first hoisted instructions, then inside the loop.
// The loop begins at the fImpl->loop'th Instruction.
@@ -2511,7 +2250,7 @@
};
auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
- Program::Instruction pinst{
+ InterpreterInstruction pinst{
inst.op,
lookup_register(id),
lookup_register(inst.x),
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 1bd3f21..0f3c706 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -611,15 +611,16 @@
using Reg = int;
+ // d = op(x, y/imm, z/imm)
+ struct InterpreterInstruction {
+ Op op;
+ Reg d,x;
+ union { Reg y; int immy; };
+ union { Reg z; int immz; };
+ };
+
class Program {
public:
- struct Instruction { // d = op(x, y/imm, z/imm)
- Op op;
- Reg d,x;
- union { Reg y; int immy; };
- union { Reg z; int immz; };
- };
-
Program(const std::vector<OptimizedInstruction>& interpreter,
const std::vector<int>& strides);
@@ -647,7 +648,7 @@
this->eval(n, args);
}
- std::vector<Instruction> instructions() const;
+ std::vector<InterpreterInstruction> instructions() const;
int nargs() const;
int nregs() const;
int loop () const;
@@ -663,8 +664,6 @@
void setupJIT (const std::vector<OptimizedInstruction>&, const char* debug_name);
void setupLLVM (const std::vector<OptimizedInstruction>&, const char* debug_name);
- void interpret(int n, void* args[]) const;
-
bool jit(const std::vector<OptimizedInstruction>&,
bool try_hoisting,
Assembler*) const;
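To make the d = op(x, y/imm, z/imm) convention concrete, here is a hypothetical encoding of a single fma_f32 instruction; the register numbers and the helper example_fma() are invented for illustration.

#include "src/core/SkVM.h"

// Illustrative only: r3 = fma(r0, r1, r2) as the interpreter would see it.
static skvm::InterpreterInstruction example_fma() {
    skvm::InterpreterInstruction inst;
    inst.op = skvm::Op::fma_f32;
    inst.d  = 3;   // destination register
    inst.x  = 0;   // first source register
    inst.y  = 1;   // second source register (immy shares this storage, unused here)
    inst.z  = 2;   // third source register  (immz shares this storage, unused here)
    return inst;
}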
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index 1cf40c5..21f4109 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -13,6 +13,7 @@
#include "src/opts/SkBlitRow_opts.h"
#include "src/opts/SkRasterPipeline_opts.h"
#include "src/opts/SkUtils_opts.h"
+#include "src/opts/SkVM_opts.h"
namespace SkOpts {
void Init_hsw() {
@@ -34,5 +35,7 @@
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
+
+ interpret_skvm = SK_OPTS_NS::interpret_skvm;
}
}
diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h
new file mode 100644
index 0000000..ef3e446
--- /dev/null
+++ b/src/opts/SkVM_opts.h
@@ -0,0 +1,284 @@
+// Copyright 2020 Google LLC.
+
+#ifndef SkVM_opts_DEFINED
+#define SkVM_opts_DEFINED
+
+#include "include/private/SkVx.h"
+#include "src/core/SkVM.h"
+
+namespace SK_OPTS_NS {
+
+ inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
+ const int nregs, const int loop,
+ const int strides[], const int nargs,
+ int n, void* args[]) {
+ using namespace skvm;
+
+ // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
+ // We noticed quad-pumping was slower than single-pumping, and both were slower than double-pumping.
+ #if defined(__AVX2__)
+ constexpr int K = 16;
+ #else
+ constexpr int K = 8;
+ #endif
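+ // K counts lanes per loop iteration: an AVX2 register holds 8 floats, so K=16 is
+ // double-pumped; SSE/NEON registers hold 4 floats, so K=8 double-pumps there too.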
+ using I32 = skvx::Vec<K, int>;
+ using F32 = skvx::Vec<K, float>;
+ using U32 = skvx::Vec<K, uint32_t>;
+ using U16 = skvx::Vec<K, uint16_t>;
+ using U8 = skvx::Vec<K, uint8_t>;
+
+ using I16x2 = skvx::Vec<2*K, int16_t>;
+ using U16x2 = skvx::Vec<2*K, uint16_t>;
+
+ union Slot {
+ F32 f32;
+ I32 i32;
+ U32 u32;
+ I16x2 i16x2;
+ U16x2 u16x2;
+ };
+
+ Slot few_regs[16];
+ std::unique_ptr<char[]> many_regs;
+
+ Slot* regs = few_regs;
+
+ if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
+ // Annoyingly we can't trust that malloc() or new will work with Slot because
+ // the skvx::Vec types may have alignment greater than what they provide.
+ // We'll overallocate one extra register so we can align manually.
+ many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
+
+ uintptr_t addr = (uintptr_t)many_regs.get();
+ addr += alignof(Slot) -
+ (addr & (alignof(Slot) - 1));
+ SkASSERT((addr & (alignof(Slot) - 1)) == 0);
+ regs = (Slot*)addr;
+ }
+
+
+ auto r = [&](Reg id) -> Slot& {
+ SkASSERT(0 <= id && id < nregs);
+ return regs[id];
+ };
+ auto arg = [&](int ix) {
+ SkASSERT(0 <= ix && ix < nargs);
+ return args[ix];
+ };
+
+ // Step each argument pointer ahead by its stride a number of times.
+ auto step_args = [&](int times) {
+ for (int i = 0; i < nargs; i++) {
+ args[i] = (void*)( (char*)args[i] + times * strides[i] );
+ }
+ };
+
+ int start = 0,
+ stride;
+ for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
+ stride = n >= K ? K : 1;
+
+ for (int i = start; i < ninsts; i++) {
+ InterpreterInstruction inst = insts[i];
+
+ // d = op(x,y/imm,z/imm)
+ Reg d = inst.d,
+ x = inst.x,
+ y = inst.y,
+ z = inst.z;
+ int immy = inst.immy,
+ immz = inst.immz;
+
+ // Ops that interact with memory need to know whether we're stride=1 or K,
+ // but all non-memory ops can run the same code no matter the stride.
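+ // Each op therefore gets two adjacent cases: 2*op for stride==1, and 2*op+1 for stride==K.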
+ switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
+ default: SkUNREACHABLE;
+
+ #define STRIDE_1(op) case 2*(int)op
+ #define STRIDE_K(op) case 2*(int)op + 1
+ STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
+ STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
+ STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;
+
+ STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
+ STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
+ STRIDE_K(Op::store32): (r(x).i32).store(arg(immy)); break;
+
+ STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
+ STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
+ STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;
+
+ STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
+ STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
+ STRIDE_K(Op::load32): r(d).i32= I32::Load(arg(immy)) ; break;
+
+ // The pointer we base our gather on is loaded indirectly from a uniform:
+ // - arg(immy) is the uniform holding our gather base pointer somewhere;
+ // - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
+ // - memcpy() loads the gather base into a pointer of the right type.
+ // After all that we have an ordinary (uniform) pointer `ptr` to load from,
+ // and we then gather from it using the varying indices in r(x).
+ STRIDE_1(Op::gather8):
+ for (int i = 0; i < K; i++) {
+ const uint8_t* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+ } break;
+ STRIDE_1(Op::gather16):
+ for (int i = 0; i < K; i++) {
+ const uint16_t* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+ } break;
+ STRIDE_1(Op::gather32):
+ for (int i = 0; i < K; i++) {
+ const int* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+ } break;
+
+ STRIDE_K(Op::gather8):
+ for (int i = 0; i < K; i++) {
+ const uint8_t* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = ptr[ r(x).i32[i] ];
+ } break;
+ STRIDE_K(Op::gather16):
+ for (int i = 0; i < K; i++) {
+ const uint16_t* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = ptr[ r(x).i32[i] ];
+ } break;
+ STRIDE_K(Op::gather32):
+ for (int i = 0; i < K; i++) {
+ const int* ptr;
+ memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
+ r(d).i32[i] = ptr[ r(x).i32[i] ];
+ } break;
+
+ #undef STRIDE_1
+ #undef STRIDE_K
+
+ // Ops that don't interact with memory should never care about the stride.
+ #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
+
+ CASE(Op::assert_true):
+ #ifdef SK_DEBUG
+ if (!all(r(x).i32)) {
+ SkDebugf("inst %d, register %d\n", i, y);
+ for (int i = 0; i < K; i++) {
+ SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
+ }
+ }
+ SkASSERT(all(r(x).i32));
+ #endif
+ break;
+
+ CASE(Op::index): {
+ const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+ static_assert(K <= SK_ARRAY_COUNT(iota), "");
+
+ r(d).i32 = n - I32::Load(iota);
+ } break;
+
+ CASE(Op::uniform8):
+ r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
+ break;
+ CASE(Op::uniform16):
+ r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
+ break;
+ CASE(Op::uniform32):
+ r(d).i32 = *(const int* )( (const char*)arg(immy) + immz );
+ break;
+
+ CASE(Op::splat): r(d).i32 = immy; break;
+
+ CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
+ CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
+ CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
+ CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+ CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
+ CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
+
+ // These _imm instructions are all x86/JIT only.
+ CASE(Op::add_f32_imm):
+ CASE(Op::sub_f32_imm):
+ CASE(Op::mul_f32_imm):
+ CASE(Op::min_f32_imm):
+ CASE(Op::max_f32_imm):
+ CASE(Op::bit_and_imm):
+ CASE(Op::bit_or_imm ):
+ CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
+
+ CASE(Op::fma_f32): r(d).f32 = fma(r(x).f32, r(y).f32, r(z).f32); break;
+
+ CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;
+
+ CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
+ CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
+ CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
+
+ CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
+ CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
+ CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
+
+ CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
+ CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
+ CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;
+
+ CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
+ CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
+ CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;
+
+ CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
+ CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
+ CASE(Op:: gt_f32): r(d).i32 = r(x).f32 > r(y).f32; break;
+ CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
+
+ CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
+ CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
+ CASE(Op:: gt_i32): r(d).i32 = r(x).i32 > r(y).i32; break;
+ CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
+
+ CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
+ CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
+ CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 > r(y).i16x2; break;
+ CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
+
+ CASE(Op::bit_and ): r(d).i32 = r(x).i32 & r(y).i32; break;
+ CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
+ CASE(Op::bit_xor ): r(d).i32 = r(x).i32 ^ r(y).i32; break;
+ CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
+
+ CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
+ break;
+
+ CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
+
+ CASE(Op::bytes): {
+ const U32 table[] = {
+ 0,
+ (r(x).u32 ) & 0xff,
+ (r(x).u32 >> 8) & 0xff,
+ (r(x).u32 >> 16) & 0xff,
+ (r(x).u32 >> 24) & 0xff,
+ };
+ r(d).u32 = table[(immy >> 0) & 0xf] << 0
+ | table[(immy >> 4) & 0xf] << 8
+ | table[(immy >> 8) & 0xf] << 16
+ | table[(immy >> 12) & 0xf] << 24;
+ } break;
+
+ CASE(Op::floor): r(d).f32 = skvx::floor(r(x).f32); break;
+ CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
+ CASE(Op::trunc): r(d).i32 = skvx::cast<int> (r(x).f32); break;
+ #undef CASE
+ }
+ }
+ }
+ }
+
+}
+
+#endif//SkVM_opts_DEFINED
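Finally, a minimal standalone toy of the dispatch trick interpret_skvm uses; the ops, messages, and lane count below are invented for illustration. Each opcode owns two adjacent switch cases, one for the stride==1 tail and one for the full-width stride==K body, so memory ops can pick the right load/store width without an extra branch, while non-memory ops share one body for both cases.

#include <cstdio>

enum class Op { load32, add_i32 };

void run(Op op, int stride, int K) {
    switch (2*(int)op + (stride == K ? 1 : 0)) {
        case 2*(int)Op::load32    : std::printf("scalar load\n");        break;
        case 2*(int)Op::load32 + 1: std::printf("K-wide vector load\n"); break;

        // Non-memory ops handle both strides with the same body.
        case 2*(int)Op::add_i32: case 2*(int)Op::add_i32 + 1:
            std::printf("add, stride-independent\n");
            break;
    }
}

int main() {
    constexpr int K = 8;
    run(Op::load32,  K, K);  // full-width chunk
    run(Op::load32,  1, K);  // tail, one lane at a time
    run(Op::add_i32, 1, K);
}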