first sksl on skvm

This implements exactly enough of the SkSL bytecode on skvm to run

    fm --skvm -b cpu -s runtime_shader -w foo

This shader

     0: 0077 load2 0
    10: 00a2 pushimmediate 998277249(0.0039215688593685627)
    22: 0070 dup
    31: 0094 multiplyf2
    40: 0080 loaduniform 2
    50: 00a2 pushimmediate 1065353216(1.0)
    62: 00c1 store4 2
    71: 00b4 return 0

becomes this skvm blitter program, which also includes the matrix, blending, asserts, etc:

    17 registers, 57 instructions:
    0	r0 = uniform32 arg(0) 4
    1	r0 = to_f32 r0
    2	r1 = splat 3F000000 (0.5)
    3	r0 = add_f32 r0 r1
    4	r2 = uniform32 arg(0) 2C
    5	r3 = uniform32 arg(0) 28
    6	r2 = fma_f32 r0 r3 r2
    7	r3 = uniform32 arg(0) 0
    8	r4 = uniform32 arg(0) 24
    9	r5 = splat 3F800000 (1)
    10	r6 = uniform32 arg(0) 38
    11	r6 = min_f32 r6 r5
    12	r7 = splat 0 (0)
    13	r6 = max_f32 r7 r6
    14	r8 = splat 437F0000 (255)
    15	r9 = mul_f32 r6 r8
    16	r9 = round r9
    17	r10 = splat FF (3.5733111e-43)
    18	r10 = pack r9 r10 8
    19	r9 = splat 3B808081 (0.0039215689)
    20	r11 = uniform32 arg(0) 20
    21	r12 = uniform32 arg(0) 1C
    22	r11 = fma_f32 r0 r12 r11
    23	r12 = uniform32 arg(0) 18
    24	r0 = splat 3F800001 (1.0000001)
    25	r13 = min_f32 r6 r0
    26	r14 = splat B4000000 (-1.1920929e-07)
    27	r13 = max_f32 r14 r13
    28	r13 = eq_f32 r6 r13
    29	assert_true r13 r6
    loop:
    30	    r6 = index
    31	    r6 = sub_i32 r3 r6
    32	    r6 = to_f32 r6
    33	    r6 = add_f32 r6 r1
    34	    r13 = fma_f32 r6 r4 r2
    35	    r13 = mul_f32 r9 r13
    36	    r6 = fma_f32 r6 r12 r11
    37	    r13 = min_f32 r13 r5
    38	    r13 = max_f32 r7 r13
    39	    r15 = mul_f32 r13 r8
    40	    r15 = round r15
    41	    r6 = mul_f32 r9 r6
    42	    r6 = min_f32 r6 r5
    43	    r6 = max_f32 r7 r6
    44	    r16 = mul_f32 r6 r8
    45	    r16 = round r16
    46	    r15 = pack r16 r15 8
    47	    r15 = pack r15 r10 16
    48	    store32 arg(1) r15
    49	    r15 = min_f32 r13 r0
    50	    r15 = max_f32 r14 r15
    51	    r15 = eq_f32 r13 r15
    52	    assert_true r15 r13
    53	    r13 = min_f32 r6 r0
    54	    r13 = max_f32 r14 r13
    55	    r13 = eq_f32 r6 r13
    56	    assert_true r13 r6

And that JITs using 11 ymm registers.

Change-Id: Ib45b5fa6aee427f290b77d8900f10d433ad81133
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281746
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkRuntimeEffect.cpp b/src/core/SkRuntimeEffect.cpp
index 518ff74..be53472 100644
--- a/src/core/SkRuntimeEffect.cpp
+++ b/src/core/SkRuntimeEffect.cpp
@@ -12,6 +12,7 @@
 #include "include/private/SkMutex.h"
 #include "src/core/SkRasterPipeline.h"
 #include "src/core/SkReadBuffer.h"
+#include "src/core/SkUtils.h"
 #include "src/core/SkVM.h"
 #include "src/core/SkWriteBuffer.h"
 #include "src/sksl/SkSLByteCode.h"
@@ -268,9 +269,10 @@
                                                 fInAndUniformVars.back().sizeInBytes());
 }
 
-SkRuntimeEffect::SpecializeResult SkRuntimeEffect::specialize(SkSL::Program& baseProgram,
-                                                              const void* inputs,
-                                                              const SkSL::SharedCompiler& compiler) {
+SkRuntimeEffect::SpecializeResult
+SkRuntimeEffect::specialize(SkSL::Program& baseProgram,
+                            const void* inputs,
+                            const SkSL::SharedCompiler& compiler) const {
     std::unordered_map<SkSL::String, SkSL::Program::Settings::Value> inputMap;
     for (const auto& v : fInAndUniformVars) {
         if (v.fQualifier != Variable::Qualifier::kIn) {
@@ -343,7 +345,7 @@
 }
 #endif
 
-SkRuntimeEffect::ByteCodeResult SkRuntimeEffect::toByteCode(const void* inputs) {
+SkRuntimeEffect::ByteCodeResult SkRuntimeEffect::toByteCode(const void* inputs) const {
     SkSL::SharedCompiler compiler;
 
     auto [specialized, errorText] = this->specialize(*fBaseProgram, inputs, compiler);
@@ -494,6 +496,19 @@
     }
 #endif
 
+    const SkSL::ByteCode* byteCode() const {  // Compiled on first use, then cached.
+        SkAutoMutexExclusive ama(fByteCodeMutex);  // taken before fByteCode is read or written
+        if (!fByteCode) {
+            auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
+            if (!byteCode) {  // nothing is cached on failure, so the next call tries again
+                SkDebugf("%s\n", errorText.c_str());
+                return nullptr;
+            }
+            fByteCode = std::move(byteCode);
+        }
+        return fByteCode.get();  // non-owning; fByteCode keeps ownership
+    }
+
     bool onAppendStages(const SkStageRec& rec) const override {
         SkMatrix inverse;
         if (!this->computeTotalInverse(rec.fCTM, rec.fLocalM, &inverse)) {
@@ -506,24 +521,117 @@
         ctx->ninputs = fEffect->uniformSize() / 4;
         ctx->shaderConvention = true;
 
-        SkAutoMutexExclusive ama(fByteCodeMutex);
-        if (!fByteCode) {
-            auto[byteCode, errorText] = fEffect->toByteCode(fInputs->data());
-            if (!byteCode) {
-                SkDebugf("%s\n", errorText.c_str());
-                return false;
-            }
-            fByteCode = std::move(byteCode);
+        ctx->byteCode = this->byteCode();
+        if (!ctx->byteCode) {
+            return false;
         }
-        ctx->byteCode = fByteCode.get();
         ctx->fn = ctx->byteCode->getFunction("main");
-
         rec.fPipeline->append(SkRasterPipeline::seed_shader);
         rec.fPipeline->append_matrix(rec.fAlloc, inverse);
         rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
         return true;
     }
 
+    skvm::Color onProgram(skvm::Builder* p, skvm::F32 x, skvm::F32 y, skvm::Color paint,
+                          const SkMatrix& ctm, const SkMatrix* localM,
+                          SkFilterQuality quality, const SkColorInfo& dst,
+                          skvm::Uniforms* uniforms, SkArenaAlloc* alloc) const override {
+        const SkSL::ByteCode* bc = this->byteCode();  // nullptr if toByteCode() failed
+        if (!bc) {
+            return {};
+        }
+        // Translate main()'s bytecode into skvm ops on p; every failure path returns {}.
+        const SkSL::ByteCodeFunction* fn = bc->getFunction("main");
+        if (!fn) {
+            return {};
+        }
+        // Apply the matrix produced by computeTotalInverse() to x,y.
+        SkMatrix inv;
+        if (!this->computeTotalInverse(ctm, localM, &inv)) {
+            return {};
+        }
+        SkShaderBase::ApplyMatrix(p,inv, &x,&y,uniforms);
+        // Each 4-byte slot of fInputs is read as a float and pushed as one uniform.
+        std::vector<skvm::F32> uniform;
+        for (int i = 0; i < (int)fEffect->uniformSize() / 4; i++) {
+            float f;
+            memcpy(&f, (const char*)fInputs->data() + 4*i, 4);
+            uniform.push_back(p->uniformF(uniforms->pushF(f)));
+        }
+        // Value stack: slots 0-1 start as x,y and slots 2-5 as the paint color.
+        std::vector<skvm::F32> stack = {
+            x,y,
+            paint.r, paint.g, paint.b, paint.a,
+        };
+        // Walk the instruction stream: one opcode plus its operands per iteration.
+        for (const uint8_t *ip = fn->code(), *end = ip + fn->size(); ip != end; ) {
+            using Inst = SkSL::ByteCodeInstruction;
+
+            auto inst = (Inst)(uintptr_t)sk_unaligned_load<SkSL::instruction>(ip);
+            ip += sizeof(SkSL::instruction);
+            // Operand readers: load an unaligned value at ip and advance past it.
+            auto u8  = [&]{ auto x = sk_unaligned_load<uint8_t >(ip); ip += sizeof(x); return x; };
+          //auto u16 = [&]{ auto x = sk_unaligned_load<uint16_t>(ip); ip += sizeof(x); return x; };
+            auto u32 = [&]{ auto x = sk_unaligned_load<uint32_t>(ip); ip += sizeof(x); return x; };
+
+            auto push = [&] (skvm::F32 x) { stack.push_back(x); };
+            auto pop  = [&] { skvm::F32 x = stack.back(); stack.pop_back(); return x; };
+
+            switch (inst) {
+                default:  // any instruction without a case below is unimplemented
+                    if (false) {  // debugging aid, disabled
+                        fn->disassemble();
+                        SkDebugf("inst %04x unimplemented\n", (int)inst);  // cast: enum via varargs
+                    }
+                    return {};
+
+                case Inst::kLoad2: {  // push copies of slots ix and ix+1
+                    SkAssertResult(u8() == 2);
+                    int ix = u8();
+                    push(stack[ix + 0]);
+                    push(stack[ix + 1]);
+                } break;
+
+                case Inst::kPushImmediate: {  // push the 32-bit immediate, bits reinterpreted
+                    push(bit_cast(p->splat(u32())));
+                } break;
+
+                case Inst::kDup: {  // off counts back from the top; 1 re-pushes the top value
+                    int off = u8();
+                    push(stack[stack.size() - off]);
+                } break;
+
+                case Inst::kMultiplyF2: {  // pop two 2-vectors, push their component-wise product
+                    SkAssertResult(u8() == 2);
+                    skvm::F32 x = pop(), y = pop(),
+                              a = pop(), b = pop();
+                    push(y*b);  // stack was [.. b a y x], so b pairs with y
+                    push(x*a);  // and a pairs with x
+                } break;
+
+                case Inst::kLoadUniform: {  // push uniform slot ix
+                    SkAssertResult(u8() == 1);
+                    int ix = u8();
+                    push(uniform[ix]);
+                } break;
+
+                case Inst::kStore4: {  // pop 4 values into slots ix..ix+3, top of stack into ix+3
+                    int ix = u8();
+                    stack[ix + 3] = pop();
+                    stack[ix + 2] = pop();
+                    stack[ix + 1] = pop();
+                    stack[ix + 0] = pop();
+                } break;
+
+                case Inst::kReturn: {  // expected to be the last instruction, with operand 0
+                    SkAssertResult(u8() == 0);
+                    SkASSERT(ip == end);
+                } break;
+            }
+        }
+        return {stack[2], stack[3], stack[4], stack[5]};  // slots 2-5 are returned as the color
+    }
+
     void flatten(SkWriteBuffer& buffer) const override {
         uint32_t flags = 0;
         if (fIsOpaque) {
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index a9c3480..90c6671 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -55,7 +55,9 @@
     case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
 
 static const uint8_t* DisassembleInstruction(const uint8_t* ip) {
-    switch ((ByteCodeInstruction) (intptr_t) READ_INST()) {
+    auto inst = (ByteCodeInstruction) (intptr_t) READ_INST();
+    printf("%04x ", (int)inst);
+    switch (inst) {
         VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")
         VECTOR_DISASSEMBLE(kAddI, "addi")
         case ByteCodeInstruction::kAndB: printf("andb"); break;
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index f917eec..97a05dc 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -176,6 +176,9 @@
     int getParameterCount() const { return fParameterCount; }
     int getReturnCount() const { return fReturnCount; }
 
+    const uint8_t* code() const { return fCode.data(); }  // first byte of the stored bytecode
+    size_t         size() const { return fCode.size(); }  // element count of fCode
+
     /**
      * Print bytecode disassembly to stdout.
      */