first sksl on skvm
Exactly enough is implemented to run:
fm --skvm -b cpu -s runtime_shader -w foo
This shader (shown as SkSL bytecode):
0: 0077 load2 0
10: 00a2 pushimmediate 998277249(0.0039215688593685627)
22: 0070 dup
31: 0094 multiplyf2
40: 0080 loaduniform 2
50: 00a2 pushimmediate 1065353216(1.0)
62: 00c1 store4 2
71: 00b4 return 0
becomes this blitter, including matrix, blending, asserts, etc:
17 registers, 57 instructions:
0 r0 = uniform32 arg(0) 4
1 r0 = to_f32 r0
2 r1 = splat 3F000000 (0.5)
3 r0 = add_f32 r0 r1
4 r2 = uniform32 arg(0) 2C
5 r3 = uniform32 arg(0) 28
6 r2 = fma_f32 r0 r3 r2
7 r3 = uniform32 arg(0) 0
8 r4 = uniform32 arg(0) 24
9 r5 = splat 3F800000 (1)
10 r6 = uniform32 arg(0) 38
11 r6 = min_f32 r6 r5
12 r7 = splat 0 (0)
13 r6 = max_f32 r7 r6
14 r8 = splat 437F0000 (255)
15 r9 = mul_f32 r6 r8
16 r9 = round r9
17 r10 = splat FF (3.5733111e-43)
18 r10 = pack r9 r10 8
19 r9 = splat 3B808081 (0.0039215689)
20 r11 = uniform32 arg(0) 20
21 r12 = uniform32 arg(0) 1C
22 r11 = fma_f32 r0 r12 r11
23 r12 = uniform32 arg(0) 18
24 r0 = splat 3F800001 (1.0000001)
25 r13 = min_f32 r6 r0
26 r14 = splat B4000000 (-1.1920929e-07)
27 r13 = max_f32 r14 r13
28 r13 = eq_f32 r6 r13
29 assert_true r13 r6
loop:
30 r6 = index
31 r6 = sub_i32 r3 r6
32 r6 = to_f32 r6
33 r6 = add_f32 r6 r1
34 r13 = fma_f32 r6 r4 r2
35 r13 = mul_f32 r9 r13
36 r6 = fma_f32 r6 r12 r11
37 r13 = min_f32 r13 r5
38 r13 = max_f32 r7 r13
39 r15 = mul_f32 r13 r8
40 r15 = round r15
41 r6 = mul_f32 r9 r6
42 r6 = min_f32 r6 r5
43 r6 = max_f32 r7 r6
44 r16 = mul_f32 r6 r8
45 r16 = round r16
46 r15 = pack r16 r15 8
47 r15 = pack r15 r10 16
48 store32 arg(1) r15
49 r15 = min_f32 r13 r0
50 r15 = max_f32 r14 r15
51 r15 = eq_f32 r13 r15
52 assert_true r15 r13
53 r13 = min_f32 r6 r0
54 r13 = max_f32 r14 r13
55 r13 = eq_f32 r6 r13
56 assert_true r13 r6
And that JITs using 11 ymm registers.
Change-Id: Ib45b5fa6aee427f290b77d8900f10d433ad81133
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281746
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkRuntimeEffect.cpp b/src/core/SkRuntimeEffect.cpp
index 518ff74..be53472 100644
--- a/src/core/SkRuntimeEffect.cpp
+++ b/src/core/SkRuntimeEffect.cpp
@@ -12,6 +12,7 @@
#include "include/private/SkMutex.h"
#include "src/core/SkRasterPipeline.h"
#include "src/core/SkReadBuffer.h"
+#include "src/core/SkUtils.h"
#include "src/core/SkVM.h"
#include "src/core/SkWriteBuffer.h"
#include "src/sksl/SkSLByteCode.h"
@@ -268,9 +269,10 @@
fInAndUniformVars.back().sizeInBytes());
}
-SkRuntimeEffect::SpecializeResult SkRuntimeEffect::specialize(SkSL::Program& baseProgram,
- const void* inputs,
- const SkSL::SharedCompiler& compiler) {
+SkRuntimeEffect::SpecializeResult
+SkRuntimeEffect::specialize(SkSL::Program& baseProgram,
+ const void* inputs,
+ const SkSL::SharedCompiler& compiler) const {
std::unordered_map<SkSL::String, SkSL::Program::Settings::Value> inputMap;
for (const auto& v : fInAndUniformVars) {
if (v.fQualifier != Variable::Qualifier::kIn) {
@@ -343,7 +345,7 @@
}
#endif
-SkRuntimeEffect::ByteCodeResult SkRuntimeEffect::toByteCode(const void* inputs) {
+SkRuntimeEffect::ByteCodeResult SkRuntimeEffect::toByteCode(const void* inputs) const {
SkSL::SharedCompiler compiler;
auto [specialized, errorText] = this->specialize(*fBaseProgram, inputs, compiler);
@@ -494,6 +496,19 @@
}
#endif
+ const SkSL::ByteCode* byteCode() const {
// Returns the cached SkSL bytecode for this shader, generating it on first use.
// Returns nullptr (after logging the compiler's error text) if generation fails.
// fByteCodeMutex guards the fByteCode cache for the rest of this call.
SkAutoMutexExclusive ama(fByteCodeMutex);
if (!fByteCode) {
// Nothing cached yet: ask the effect to produce bytecode for our inputs.
auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
if (!byteCode) {
// Failure is not cached, so the next call will try toByteCode() again.
SkDebugf("%s\n", errorText.c_str());
return nullptr;
}
fByteCode = std::move(byteCode);
}
return fByteCode.get();
}
+
bool onAppendStages(const SkStageRec& rec) const override {
SkMatrix inverse;
if (!this->computeTotalInverse(rec.fCTM, rec.fLocalM, &inverse)) {
@@ -506,24 +521,117 @@
ctx->ninputs = fEffect->uniformSize() / 4;
ctx->shaderConvention = true;
- SkAutoMutexExclusive ama(fByteCodeMutex);
- if (!fByteCode) {
- auto[byteCode, errorText] = fEffect->toByteCode(fInputs->data());
- if (!byteCode) {
- SkDebugf("%s\n", errorText.c_str());
- return false;
- }
- fByteCode = std::move(byteCode);
+ ctx->byteCode = this->byteCode();
+ if (!ctx->byteCode) {
+ return false;
}
- ctx->byteCode = fByteCode.get();
ctx->fn = ctx->byteCode->getFunction("main");
-
rec.fPipeline->append(SkRasterPipeline::seed_shader);
rec.fPipeline->append_matrix(rec.fAlloc, inverse);
rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
return true;
}
+ skvm::Color onProgram(skvm::Builder* p, skvm::F32 x, skvm::F32 y, skvm::Color paint,
const SkMatrix& ctm, const SkMatrix* localM,
SkFilterQuality quality, const SkColorInfo& dst,
skvm::Uniforms* uniforms, SkArenaAlloc* alloc) const override {
// Translates this shader's SkSL bytecode "main" function into skvm values by
// walking the instruction stream once and simulating its value stack with
// skvm::F32 entries. Returns an empty skvm::Color ({}) if the bytecode is
// unavailable, has no "main", the total matrix cannot be inverted, or an
// instruction outside the small supported set is encountered.
// quality, dst and alloc are not used by this implementation.
const SkSL::ByteCode* bc = this->byteCode();
if (!bc) {
return {};
}

const SkSL::ByteCodeFunction* fn = bc->getFunction("main");
if (!fn) {
return {};
}

// Map x,y through the inverse of the total (ctm * local) matrix before use.
SkMatrix inv;
if (!this->computeTotalInverse(ctm, localM, &inv)) {
return {};
}
SkShaderBase::ApplyMatrix(p,inv, &x,&y,uniforms);

// Expose the effect's inputs to the program one 4-byte word at a time:
// each word is copied out as a float and pushed as an skvm uniform.
std::vector<skvm::F32> uniform;
for (int i = 0; i < (int)fEffect->uniformSize() / 4; i++) {
float f;
memcpy(&f, (const char*)fInputs->data() + 4*i, 4);
uniform.push_back(p->uniformF(uniforms->pushF(f)));
}

// Initial stack layout: slots 0-1 hold the coordinates, slots 2-5 hold the
// paint color. Slots 2-5 are also what gets returned at the end.
std::vector<skvm::F32> stack = {
x,y,
paint.r, paint.g, paint.b, paint.a,
};

// ip is advanced by the opcode read below and by each operand reader.
for (const uint8_t *ip = fn->code(), *end = ip + fn->size(); ip != end; ) {
using Inst = SkSL::ByteCodeInstruction;

// Each instruction starts with an opcode of sizeof(SkSL::instruction) bytes.
auto inst = (Inst)(uintptr_t)sk_unaligned_load<SkSL::instruction>(ip);
ip += sizeof(SkSL::instruction);

// Operand readers: fetch an unaligned value at ip and step past it.
auto u8 = [&]{ auto x = sk_unaligned_load<uint8_t >(ip); ip += sizeof(x); return x; };
//auto u16 = [&]{ auto x = sk_unaligned_load<uint16_t>(ip); ip += sizeof(x); return x; };
auto u32 = [&]{ auto x = sk_unaligned_load<uint32_t>(ip); ip += sizeof(x); return x; };

// Helpers for the simulated value stack.
auto push = [&] (skvm::F32 x) { stack.push_back(x); };
auto pop = [&] { skvm::F32 x = stack.back(); stack.pop_back(); return x; };

switch (inst) {
default:
// Unsupported instruction: give up on this program. The diagnostic
// output below is compiled in but disabled; flip the condition to debug.
if (false) {
fn->disassemble();
SkDebugf("inst %04x unimplemented\n", inst);
}
return {};

case Inst::kLoad2: {
// Operands: a byte expected to be 2, then the stack index of the first
// of two consecutive slots to copy onto the top of the stack.
SkAssertResult(u8() == 2);
int ix = u8();
push(stack[ix + 0]);
push(stack[ix + 1]);
} break;

case Inst::kPushImmediate: {
// Operand: 32 raw bits, splatted and reinterpreted as an F32.
push(bit_cast(p->splat(u32())));
} break;

case Inst::kDup: {
// Operand: distance from the top of the stack of the slot to duplicate.
int off = u8();
push(stack[stack.size() - off]);
} break;

case Inst::kMultiplyF2: {
// Operand: a byte expected to be 2. Pops two 2-component values; x is
// the top of the stack, then y, a, b. Products are pushed so that the
// component order of the operands is preserved in the result.
SkAssertResult(u8() == 2);
skvm::F32 x = pop(), y = pop(),
a = pop(), b = pop();
push(y*b);
push(x*a);
} break;

case Inst::kLoadUniform: {
// Operands: a byte expected to be 1, then an index into `uniform`.
SkAssertResult(u8() == 1);
int ix = u8();
push(uniform[ix]);
} break;

case Inst::kStore4: {
// Operand: stack index of the first of four destination slots. Values
// are popped top-first, so the top of the stack lands in slot ix + 3.
// NOTE(review): unlike kLoad2/kMultiplyF2/kLoadUniform, no leading count
// byte is read here — confirm this matches the bytecode encoding.
int ix = u8();
stack[ix + 3] = pop();
stack[ix + 2] = pop();
stack[ix + 1] = pop();
stack[ix + 0] = pop();
} break;

case Inst::kReturn: {
// Operand: a byte expected to be 0. The return is also expected to be
// the final instruction, which ends the loop via ip == end.
SkAssertResult(u8() == 0);
SkASSERT(ip == end);
} break;
}
}
// Slots 2-5 started as the paint color and hold the result if the
// bytecode stored over them.
return {stack[2], stack[3], stack[4], stack[5]};
}
+
void flatten(SkWriteBuffer& buffer) const override {
uint32_t flags = 0;
if (fIsOpaque) {
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index a9c3480..90c6671 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -55,7 +55,9 @@
case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
static const uint8_t* DisassembleInstruction(const uint8_t* ip) {
- switch ((ByteCodeInstruction) (intptr_t) READ_INST()) {
+ auto inst = (ByteCodeInstruction) (intptr_t) READ_INST();
+ printf("%04x ", (int)inst);
+ switch (inst) {
VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")
VECTOR_DISASSEMBLE(kAddI, "addi")
case ByteCodeInstruction::kAndB: printf("andb"); break;
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index f917eec..97a05dc 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -176,6 +176,9 @@
int getParameterCount() const { return fParameterCount; }
int getReturnCount() const { return fReturnCount; }
+ const uint8_t* code() const { return fCode.data(); }  // Pointer to the first byte of fCode.
// Number of bytes in fCode, i.e. the length of the range starting at code().
size_t size() const { return fCode.size(); }
+
/**
* Print bytecode disassembly to stdout.
*/