refactor out a middle representation

Kind of brewing a big refactor here, to give me some room between
skvm::Builder and skvm::Program to do optimizations, bakend
specializations and analysis.

As a warmup, I'm trying to split up today's Builder::Instruction into
two forms, first just what the user requested in Builder (this stays
Builder::Instruction) then a new type representing any transformation or
analysis we've done to it (OptimizedInstruction).

Roughly six important optimizations happen in SkVM today, in this order:
   1) constant folding
   2) backend-specific instruction specialization
   3) common sub-expression elimination
   4) reordering + dead code elimination
   5) loop invariant and lifetime analysis
   6) register assignment

At head 1-5 all happen in Builder, and 2 is particularly
awkward to have there (e.g. mul_f32 -> mul_f32_imm).
6 happens in Program per-backend, and that seems healthy.

As of this CL, 1-3 happen in Builder, 4-5 now on this middle
OptimizedInstruction format, and 6 still in Program.

I'd like to get to the point where 1 stays in Builder, 2-5 all happen on
this middle IR, and 6 stays in Program.  That ought to let me do things
like turn mul_f32 -> mul_f32_imm when it's good to and still benefit
from things like common sub-expression elimination and code reordering
happening after that trnasformation.

And then, I hope that's also a good spot to do more complicated
transformations, like lowering gather8 into gather32 plus some fix up
when targeting an x86 JIT but not anywhere else.  Today's Builder is too
early to know whether we should do this or not, and in Program it's
actually kind of awkward to do this sort of thing while also doing
having to do register assignment.  Some middle might be right.

Change-Id: I9c00268a084f07fbab88d05eb441f1957a0d7c67
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/269181
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index cc0e27c..b4db656 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -1,5 +1,5 @@
 A8 over A8
-12 values:
+12 values (originally 16):
   v0 = load8 arg(0)
   v1 = to_f32 v0
   v2 = mul_f32 v1 3B808081 (0.0039215689)
@@ -29,7 +29,7 @@
 11	    store8 arg(1) r1
 
 A8 over G8
-17 values:
+17 values (originally 22):
   v0 = load8 arg(1)
   v1 = to_f32 v0
   v2 = mul_f32 v1 3B808081 (0.0039215689)
@@ -69,7 +69,7 @@
 16	    store8 arg(1) r3
 
 A8 over RGBA_8888
-36 values:
+36 values (originally 40):
   v0 = load32 arg(1)
   v1 = bit_and v0 FF
   v2 = to_f32 v1
@@ -147,7 +147,7 @@
 35	    store32 arg(1) r3
 
 G8 over A8
-9 values:
+9 values (originally 15):
 ↑ v0 = splat 3F800000 (1)
 ↑ v1 = splat 0 (0)
   v2 = load8 arg(1)
@@ -171,7 +171,7 @@
 8	    store8 arg(1) r2
 
 G8 over G8
-16 values:
+16 values (originally 20):
   v0 = load8 arg(0)
   v1 = to_f32 v0
   v2 = mul_f32 v1 3B808081 (0.0039215689)
@@ -209,7 +209,7 @@
 15	    store8 arg(1) r4
 
 G8 over RGBA_8888
-36 values:
+36 values (originally 39):
   v0 = load8 arg(0)
   v1 = to_f32 v0
   v2 = mul_f32 v1 3B808081 (0.0039215689)
@@ -287,7 +287,7 @@
 35	    store32 arg(1) r3
 
 RGBA_8888 over A8
-13 values:
+13 values (originally 31):
   v0 = load32 arg(0)
   v1 = shr_i32 v0 24
   v2 = to_f32 v1
@@ -319,7 +319,7 @@
 12	    store8 arg(1) r1
 
 RGBA_8888 over G8
-31 values:
+31 values (originally 36):
   v0 = load32 arg(0)
   v1 = bit_and v0 FF
   v2 = to_f32 v1
@@ -387,7 +387,7 @@
 30	    store8 arg(1) r3
 
 RGBA_8888 over RGBA_8888
-48 values:
+48 values (originally 51):
   v0 = load32 arg(0)
   v1 = bit_and v0 FF
   v2 = to_f32 v1
@@ -489,7 +489,7 @@
 47	    store32 arg(1) r5
 
 I32 (Naive) 8888 over 8888
-32 values:
+32 values (originally 33):
   v0 = load32 arg(0)
   v1 = bit_and v0 FF
   v2 = load32 arg(1)
@@ -559,7 +559,7 @@
 31	    store32 arg(1) r6
 
 I32 8888 over 8888
-28 values:
+28 values (originally 29):
   v0 = load32 arg(0)
   v1 = bit_and v0 FF
   v2 = load32 arg(1)
@@ -621,7 +621,7 @@
 27	    store32 arg(1) r6
 
 I32 (SWAR) 8888 over 8888
-14 values:
+14 values (originally 16):
   v0 = load32 arg(0)
   v1 = bytes v0 404
 ↑ v2 = splat 1000100 (2.3510604e-38)
@@ -654,7 +654,7 @@
 12	    r2 = add_i32 r1 r2
 13	    store32 arg(1) r2
 
-6 values:
+6 values (originally 6):
 ↟ v0 = splat 1 (1.4012985e-45)
 ↟ v1 = splat 2 (2.8025969e-45)
 ↑ v2 = add_i32 v0 v1
@@ -671,7 +671,7 @@
 4	    r0 = mul_i32 r0 r1
 5	    store32 arg(0) r0
 
-22 values:
+22 values (originally 23):
   v0 = load32 arg(0)
   v1 = bit_and v0 FF
   v2 = load32 arg(1)
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 7015e9b..81498e0 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -117,10 +117,18 @@
         }
     }
 
-    static void dump_builder_program(const std::vector<Builder::Instruction>& program,
-                                     SkWStream* o) {
-        for (Val id = 0; id < (Val)program.size(); id++) {
-            const Builder::Instruction& inst = program[id];
+
+    void Builder::dump(SkWStream* o) const {
+        SkDebugfStream debug;
+        if (!o) { o = &debug; }
+
+        std::vector<OptimizedInstruction> optimized = this->optimize();
+        o->writeDecAsText(optimized.size());
+        o->writeText(" values (originally ");
+        o->writeDecAsText(fProgram.size());
+        o->writeText("):\n");
+        for (Val id = 0; id < (Val)optimized.size(); id++) {
+            const OptimizedInstruction& inst = optimized[id];
             Op  op = inst.op;
             Val  x = inst.x,
                  y = inst.y,
@@ -225,15 +233,6 @@
         }
     }
 
-    void Builder::dump(SkWStream* o) const {
-        SkDebugfStream debug;
-        if (!o) { o = &debug; }
-
-        o->writeDecAsText(fProgram.size());
-        o->writeText(" values:\n");
-        dump_builder_program(fProgram, o);
-    }
-
     void Program::dump(SkWStream* o) const {
         SkDebugfStream debug;
         if (!o) { o = &debug; }
@@ -351,16 +350,16 @@
         }
     }
 
-    // Builder -> Program, with liveness and loop hoisting analysis.
-
-    Program Builder::done(const char* debug_name) {
-        // First rewrite the program by issuing instructions as late as possible:
+    std::vector<OptimizedInstruction> Builder::optimize() const {
+        // First rewrite the program order by issuing instructions as late as possible:
         //    - any side-effect-only (i.e. store) instruction in order as we see them;
         //    - any other instruction only once it's shown to be needed.
         // This elides all dead code and helps minimize value lifetime / register pressure.
-        std::vector<Instruction> rewritten;
-        rewritten.reserve(fProgram.size());
-        std::vector<Val> new_index(fProgram.size(), NA);  // Map old Val index to rewritten index.
+        std::vector<OptimizedInstruction> optimized;
+        optimized.reserve(fProgram.size());
+
+        // Map old Val index to rewritten index in optimized.
+        std::vector<Val> new_index(fProgram.size(), NA);
 
         auto rewrite = [&](Val id, auto& recurse) -> Val {
             auto rewrite_input = [&](Val input) -> Val {
@@ -376,8 +375,8 @@
             // The order we rewrite inputs is somewhat arbitrary; we could just go x,y,z.
             // But we try to preserve the original program order as much as possible by
             // rewriting inst's inputs in the order they were themselves originally issued.
-            // This makes debugging program dumps a little easier.
-            Instruction inst = fProgram[id];
+            // This makes debugging  dumps a little easier.
+            Builder::Instruction inst = fProgram[id];
             Val *min = &inst.x,
                 *mid = &inst.y,
                 *max = &inst.z;
@@ -387,8 +386,11 @@
             *min = rewrite_input(*min);
             *mid = rewrite_input(*mid);
             *max = rewrite_input(*max);
-            rewritten.push_back(inst);
-            return (Val)rewritten.size()-1;
+            optimized.push_back({inst.op,
+                                 inst.x, inst.y, inst.z,
+                                 inst.immy, inst.immz,
+                                 /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false});
+            return (Val)optimized.size()-1;
         };
 
         // Here we go with the actual rewriting, starting with all the store instructions
@@ -398,29 +400,27 @@
                 rewrite(id, rewrite);
             }
         }
-        // We're done with the original order now... everything below will analyze the new program.
-        fProgram = std::move(rewritten);
 
+        // We're done with our original fProgram now... everything below will analyze `optimized`.
 
         // We'll want to know when it's safe to recycle registers holding the values
         // produced by each instruction, that is, when no future instruction needs it.
-        for (Val id = 0; id < (Val)fProgram.size(); id++) {
-            Instruction& inst = fProgram[id];
+        for (Val id = 0; id < (Val)optimized.size(); id++) {
+            OptimizedInstruction& inst = optimized[id];
             // Stores don't really produce values.  Just mark them as dying on issue.
             if (inst.op <= Op::store32) {
                 inst.death = id;
             }
             // Extend the lifetime of this instruction's inputs to live until it issues.
             // (We're walking in order, so this is the same as max()ing.)
-            if (inst.x != NA) { fProgram[inst.x].death = id; }
-            if (inst.y != NA) { fProgram[inst.y].death = id; }
-            if (inst.z != NA) { fProgram[inst.z].death = id; }
+            if (inst.x != NA) { optimized[inst.x].death = id; }
+            if (inst.y != NA) { optimized[inst.y].death = id; }
+            if (inst.z != NA) { optimized[inst.z].death = id; }
         }
 
-
         // Mark which values don't depend on the loop and can be hoisted.
-        for (Val id = 0; id < (Val)fProgram.size(); id++) {
-            Builder::Instruction& inst = fProgram[id];
+        for (Val id = 0; id < (Val)optimized.size(); id++) {
+            OptimizedInstruction& inst = optimized[id];
 
             // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
             if (inst.op <= Op::gather32 && inst.op != Op::assert_true) {
@@ -429,31 +429,32 @@
 
             // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
             if (inst.can_hoist) {
-                if (inst.x != NA) { inst.can_hoist &= fProgram[inst.x].can_hoist; }
-                if (inst.y != NA) { inst.can_hoist &= fProgram[inst.y].can_hoist; }
-                if (inst.z != NA) { inst.can_hoist &= fProgram[inst.z].can_hoist; }
+                if (inst.x != NA) { inst.can_hoist &= optimized[inst.x].can_hoist; }
+                if (inst.y != NA) { inst.can_hoist &= optimized[inst.y].can_hoist; }
+                if (inst.z != NA) { inst.can_hoist &= optimized[inst.z].can_hoist; }
             }
 
             // We'll want to know if hoisted values are used in the loop;
             // if not, we can recycle their registers like we do loop values.
             if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) {
-                if (inst.x != NA) { fProgram[inst.x].used_in_loop = true; }
-                if (inst.y != NA) { fProgram[inst.y].used_in_loop = true; }
-                if (inst.z != NA) { fProgram[inst.z].used_in_loop = true; }
+                if (inst.x != NA) { optimized[inst.x].used_in_loop = true; }
+                if (inst.y != NA) { optimized[inst.y].used_in_loop = true; }
+                if (inst.z != NA) { optimized[inst.z].used_in_loop = true; }
             }
         }
 
+        return optimized;
+    }
+
+    Program Builder::done(const char* debug_name) const {
         char buf[64] = "skvm-jit-";
         if (!debug_name) {
             *SkStrAppendU32(buf+9, this->hash()) = '\0';
             debug_name = buf;
         }
-
-        return {fProgram, fStrides, debug_name};
+        return {this->optimize(), fStrides, debug_name};
     }
 
-    // We skip fields only written after Builder::done() (death, can_hoist, used_in_loop) here
-    // for equality and hashing.  They'll always have the same default values in Builder::push().
     static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
         return a.op   == b.op
             && a.x    == b.x
@@ -464,7 +465,7 @@
     }
 
     uint32_t Builder::InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
-        return SkOpts::hash(&inst, offsetof(Instruction, death), seed);
+        return SkOpts::hash(&inst, sizeof(inst), seed);
     }
 
     uint64_t Builder::hash() const { return (uint64_t)fHashLo | (uint64_t)fHashHi << 32; }
@@ -472,8 +473,7 @@
     // Most instructions produce a value and return it by ID,
     // the value-producing instruction's own index in the program vector.
     Val Builder::push(Op op, Val x, Val y, Val z, int immy, int immz) {
-        Instruction inst{op, x, y, z, immy, immz,
-                         /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false};
+        Instruction inst{op, x, y, z, immy, immz};
 
         // This first InstructionHash{}() call should be free given we're about to use fIndex below.
         fHashLo ^= InstructionHash{}(inst, 0);    // Two hash streams with different seeds.
@@ -1926,7 +1926,7 @@
 
     Program::Program() {}
 
-    Program::Program(const std::vector<Builder::Instruction>& instructions,
+    Program::Program(const std::vector<OptimizedInstruction>& instructions,
                      const std::vector<int>& strides,
                      const char* debug_name)
         : fStrides(strides)
@@ -1937,8 +1937,8 @@
     #endif
     }
 
-    // Translate Builder::Instructions to Program::Instructions used by the interpreter.
-    void Program::setupInterpreter(const std::vector<Builder::Instruction>& instructions) {
+    // Translate OptimizedInstructions to Program::Instructions used by the interpreter.
+    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
         // Register each instruction is assigned to.
         std::vector<Reg> reg(instructions.size());
 
@@ -1957,7 +1957,7 @@
 
         // Assign this value to a register, recycling them where we can.
         auto assign_register = [&](Val id) {
-            const Builder::Instruction& inst = instructions[id];
+            const OptimizedInstruction& inst = instructions[id];
 
             // If this is a real input and it's lifetime ends at this instruction,
             // we can recycle the register it's occupying.
@@ -1994,7 +1994,7 @@
             if (!hoisted(id)) { assign_register(id); }
         }
 
-        // Translate Builder::Instructions to Program::Instructions by mapping values to
+        // Translate OptimizedInstructions to Program::Instructions by mapping values to
         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
 
         // The loop begins at the fLoop'th Instruction.
@@ -2008,7 +2008,7 @@
                             : reg[id];
         };
 
-        auto push_instruction = [&](Val id, const Builder::Instruction& inst) {
+        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
             Program::Instruction pinst{
                 inst.op,
                 lookup_register(id),
@@ -2022,14 +2022,14 @@
         };
 
         for (Val id = 0; id < (Val)instructions.size(); id++) {
-            const Builder::Instruction& inst = instructions[id];
+            const OptimizedInstruction& inst = instructions[id];
             if (hoisted(id)) {
                 push_instruction(id, inst);
                 fLoop++;
             }
         }
         for (Val id = 0; id < (Val)instructions.size(); id++) {
-            const Builder::Instruction& inst = instructions[id];
+            const OptimizedInstruction& inst = instructions[id];
             if (!hoisted(id)) {
                 push_instruction(id, inst);
             }
@@ -2071,7 +2071,7 @@
         }
     }
 
-    bool Program::jit(const std::vector<Builder::Instruction>& instructions,
+    bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
                       const bool try_hoisting,
                       Assembler* a) const {
         using A = Assembler;
@@ -2126,7 +2126,7 @@
         LabelAndReg                  iota;         // Exists _only_ to vary per-lane.
 
         auto warmup = [&](Val id) {
-            const Builder::Instruction& inst = instructions[id];
+            const OptimizedInstruction& inst = instructions[id];
 
             switch (inst.op) {
                 default: break;
@@ -2154,7 +2154,7 @@
         };
 
         auto emit = [&](Val id, bool scalar) {
-            const Builder::Instruction& inst = instructions[id];
+            const OptimizedInstruction& inst = instructions[id];
 
             Op op = inst.op;
             Val x = inst.x,
@@ -2654,7 +2654,7 @@
         return true;
     }
 
-    void Program::setupJIT(const std::vector<Builder::Instruction>& instructions,
+    void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
                            const char* debug_name) {
         // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
         Assembler a{nullptr};
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 53ce6e5..e5d34cf 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -9,6 +9,7 @@
 #define SkVM_DEFINED
 
 #include "include/core/SkTypes.h"
+#include "include/private/SkMacros.h"
 #include "include/private/SkTHash.h"
 #include "src/core/SkVM_fwd.h"
 #include <vector>      // std::vector
@@ -336,26 +337,31 @@
 
     struct Color { skvm::F32 r,g,b,a; };
 
+    struct OptimizedInstruction {
+        Op op;
+        Val x,y,z;
+        int immy,immz;
+
+        int  death;
+        bool can_hoist;
+        bool used_in_loop;
+    };
+
     class Builder {
     public:
+        SK_BEGIN_REQUIRE_DENSE
         struct Instruction {
-            // Tightly packed for hashing:
             Op  op;         // v* = op(x,y,z,imm), where * == index of this Instruction.
             Val x,y,z;      // Enough arguments for mad().
             int immy,immz;  // Immediate bit pattern, shift count, argument index, etc.
-            // End tightly packed for hashing.
-
-            // Not populated until done() has been called.
-            int  death;         // Index of last live instruction taking this input; live if != 0.
-            bool can_hoist;     // Value independent of all loop variables?
-            bool used_in_loop;  // Is the value used in the loop (or only by hoisted values)?
         };
+        SK_END_REQUIRE_DENSE
 
-        Program done(const char* debug_name = nullptr);
+        Program done(const char* debug_name = nullptr) const;
 
         // Mostly for debugging, tests, etc.
         std::vector<Instruction> program() const { return fProgram; }
-
+        std::vector<OptimizedInstruction> optimize() const;
 
         // Declare an argument with given stride (use stride=0 for uniforms).
         // TODO: different types for varying and uniforms?
@@ -613,7 +619,7 @@
             union { Reg z; int immz; };
         };
 
-        Program(const std::vector<Builder::Instruction>& instructions,
+        Program(const std::vector<OptimizedInstruction>& instructions,
                 const std::vector<int>                 & strides,
                 const char* debug_name);
 
@@ -645,12 +651,12 @@
         void dump(SkWStream* = nullptr) const;
 
     private:
-        void setupInterpreter(const std::vector<Builder::Instruction>&);
-        void setupJIT        (const std::vector<Builder::Instruction>&, const char* debug_name);
+        void setupInterpreter(const std::vector<OptimizedInstruction>&);
+        void setupJIT        (const std::vector<OptimizedInstruction>&, const char* debug_name);
 
         void interpret(int n, void* args[]) const;
 
-        bool jit(const std::vector<Builder::Instruction>&,
+        bool jit(const std::vector<OptimizedInstruction>&,
                  bool try_hoisting,
                  Assembler*) const;
 
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 2f69e16..4bfb0f7 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -105,7 +105,7 @@
         skvm::Program program = b.done();
         REPORTER_ASSERT(r, program.nregs() == 2);
 
-        std::vector<skvm::Builder::Instruction> insts = b.program();
+        std::vector<skvm::OptimizedInstruction> insts = b.optimize();
         REPORTER_ASSERT(r, insts.size() == 6);
         REPORTER_ASSERT(r,  insts[0].can_hoist && insts[0].death == 2 && !insts[0].used_in_loop);
         REPORTER_ASSERT(r,  insts[1].can_hoist && insts[1].death == 2 && !insts[1].used_in_loop);
@@ -277,7 +277,7 @@
         }
     });
 
-    for (const skvm::Builder::Instruction& inst : b.program()) {
+    for (const skvm::OptimizedInstruction& inst : b.optimize()) {
         REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
     }
 }