more SkSL interpreter performance work

Bug: skia:
Change-Id: I21da1f1473fb73e8ba1371e7301f3fff2e33e8cc
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/212199
Commit-Queue: Ethan Nicholas <ethannicholas@google.com>
Reviewed-by: Mike Reed <reed@google.com>
diff --git a/src/sksl/SkSLInterpreter.cpp b/src/sksl/SkSLInterpreter.cpp
index ea926f6..1d9d593 100644
--- a/src/sksl/SkSLInterpreter.cpp
+++ b/src/sksl/SkSLInterpreter.cpp
@@ -29,39 +29,43 @@
 
 static constexpr int UNINITIALIZED = 0xDEADBEEF;
 
-Interpreter::Value* Interpreter::run(const ByteCodeFunction& f, Interpreter::Value args[],
-                                    Interpreter::Value inputs[]) {
+Interpreter::Interpreter(std::unique_ptr<Program> program, std::unique_ptr<ByteCode> byteCode,
+                         Interpreter::Value inputs[])
+    : fProgram(std::move(program))  // the interpreter takes ownership of both unique_ptrs
+    , fByteCode(std::move(byteCode)) {
+    for (int i = 0; i < fByteCode->fGlobalCount; ++i) {
+        fGlobals.push_back(Value((int) UNINITIALIZED));  // sentinel-fill every global slot
+    }
+    for (int i = fByteCode->fInputSlots.size() - 1; i >= 0; --i) {
+        fGlobals[fByteCode->fInputSlots[i]] = inputs[i];  // input i overwrites its global slot
+    }
+}
+
+void Interpreter::run(const ByteCodeFunction& f, Interpreter::Value args[],
+                      Interpreter::Value* outReturn) {  // null outReturn: result is not copied
     fCurrentFunction = &f;
-    fStack.clear();
-    fGlobals.clear();
 #ifdef TRACE
     this->disassemble(f);
 #endif
-    for (int i = 0; i < f.fParameterCount; ++i) {
-        this->push(args[i]);
+    Value smallStack[128];  // local buffer, used whenever fStackCount fits in it
+    std::unique_ptr<Value[]> largeStack;  // owns the heap fallback, if one is needed
+    Value* stack = smallStack;
+    if ((int) SK_ARRAY_COUNT(smallStack) < fCurrentFunction->fStackCount) {
+        largeStack.reset(new Value[fCurrentFunction->fStackCount]);  // too big: use the heap
+        stack = largeStack.get();
     }
-    for (int i = 0; i < f.fLocalCount; ++i) {
-        this->push(Value((int) UNINITIALIZED));
-    }
-    for (int i = 0; i < f.fOwner.fGlobalCount; ++i) {
-        fGlobals.push_back(Value((int) UNINITIALIZED));
-    }
-    for (int i = f.fOwner.fInputSlots.size() - 1; i >= 0; --i) {
-        fGlobals[f.fOwner.fInputSlots[i]] = inputs[i];
-    }
-    run();
+    run(stack, args, outReturn);  // interprets fCurrentFunction's bytecode on that buffer
     int offset = 0;
     for (const auto& p : f.fDeclaration.fParameters) {
         if (p->fModifiers.fFlags & Modifiers::kOut_Flag) {
             for (int i = p->fType.columns() * p->fType.rows() - 1; i >= 0; --i) {
-                args[offset] = fStack[offset];
+                args[offset] = stack[offset];  // copy each out-parameter slot back to the caller
                 ++offset;
             }
         } else {
             offset += p->fType.columns() * p->fType.rows();
         }
     }
-    return fStack.data();
 }
 
 struct CallbackCtx : public SkRasterPipeline_CallbackCtx {
@@ -69,27 +73,25 @@
     const FunctionDefinition* fFunction;
 };
 
-#define READ8() code[ip++]
+#define STACK_SIZE() (int) (sp - stack + 1)  // slots from stack[0] through *sp, inclusive
+
+#define TOP() (*sp)  // sp points at the top element, not one past it
+
+#define POP() (*(sp--))  // yields the top element, then moves sp down one slot
+
+#define PUSH(v) (*(++sp) = v)  // moves sp up one slot, then stores v there
+
+#define READ8() (*(ip++))  // yields the byte at ip, then advances ip by one
 
 #define READ16()                                                  \
-    (SkASSERT(ip % 2 == 0),                                       \
+    (SkASSERT((intptr_t) ip % 2 == 0),                            \
      ip += 2,                                                     \
-     *(uint16_t*) &code[ip - 2])
+     *(uint16_t*) (ip - 2))
 
 #define READ32()                                                  \
-    (SkASSERT(ip % 4 == 0),                                       \
+    (SkASSERT((intptr_t) ip % 4 == 0),                            \
      ip += 4,                                                     \
-     *(uint32_t*) &code[ip - 4])
-
-void Interpreter::push(Value v) {
-    fStack.push_back(v);
-}
-
-Interpreter::Value Interpreter::pop() {
-    Value v = fStack.back();
-    fStack.pop_back();
-    return v;
-}
+     *(uint32_t*) (ip - 4))
 
 static String value_string(uint32_t v) {
     union { uint32_t u; float f; } pun = { v };
@@ -97,10 +99,9 @@
 }
 
 void Interpreter::disassemble(const ByteCodeFunction& f) {
-    int ip = 0;
-    const uint8_t* code = f.fCode.data();
-    while (ip < (int) f.fCode.size()) {
-        printf("%d: ", ip);
+    const uint8_t* ip = f.fCode.data();
+    while (ip < f.fCode.data() + f.fCode.size()) {
+        printf("%d: ", (int) (ip - f.fCode.data()));
         switch ((ByteCodeInstruction) READ8()) {
             case ByteCodeInstruction::kAddF: printf("addf"); break;
             case ByteCodeInstruction::kAddI: printf("addi"); break;
@@ -148,6 +149,9 @@
             case ByteCodeInstruction::kMultiplyU: printf("multiplyu"); break;
             case ByteCodeInstruction::kNegateF: printf("negatef"); break;
             case ByteCodeInstruction::kNegateS: printf("negates"); break;
+            case ByteCodeInstruction::kNop1: printf("nop1"); break;
+            case ByteCodeInstruction::kNop2: printf("nop2"); break;
+            case ByteCodeInstruction::kNop3: printf("nop3"); break;
             case ByteCodeInstruction::kNot: printf("not"); break;
             case ByteCodeInstruction::kOrB: printf("orb"); break;
             case ByteCodeInstruction::kOrI: printf("ori"); break;
@@ -183,25 +187,16 @@
             }
             case ByteCodeInstruction::kUnsignedToFloat: printf("unsignedtofloat"); break;
             case ByteCodeInstruction::kVector: printf("vector%d", READ8()); break;
-            default: printf("%d\n", code[ip - 1]);
-                     SkASSERT(false);
+            default: printf("unknown(%d)\n", *(ip - 1)); SkASSERT(false);
         }
         printf("\n");
     }
 }
 
-void Interpreter::dumpStack() {
-    printf("STACK:");
-    for (size_t i = 0; i < fStack.size(); ++i) {
-        printf(" %d(%f)", fStack[i].fSigned, fStack[i].fFloat);
-    }
-    printf("\n");
-}
-
 #define BINARY_OP(inst, type, field, op) \
     case ByteCodeInstruction::inst: {    \
-        type b = this->pop().field;      \
-        Value* a = &fStack.back();       \
+        type b = POP().field;      \
+        Value* a = &TOP();       \
         *a = Value(a->field op b);       \
         break;                           \
     }
@@ -212,31 +207,34 @@
     case ByteCodeInstruction::inst: {                         \
         Value result[VECTOR_MAX];                             \
         for (int i = count - 1; i >= 0; --i) {                \
-            result[i] = this->pop();                          \
+            result[i] = POP();                                \
         }                                                     \
         for (int i = count - 1; i >= 0; --i) {                \
-            result[i] = this->pop().field op result[i].field; \
+            result[i] = POP().field op result[i].field;       \
         }                                                     \
         for (int i = 0; i < count; ++i) {                     \
-            this->push(result[i]);                            \
+            PUSH(result[i]);                                  \
         }                                                     \
         break;                                                \
     }
 
-void Interpreter::run() {
-    int ip = 0;
+void Interpreter::run(Value* stack, Value args[], Value* outReturn) {
     const uint8_t* code = fCurrentFunction->fCode.data();
+    const uint8_t* ip = code;
+    memcpy(stack, args, fCurrentFunction->fParameterCount * sizeof(Value));
+    Value* sp = stack + fCurrentFunction->fParameterCount + fCurrentFunction->fLocalCount - 1;
     for (;;) {
-#ifdef TRACE
-        printf("at %d\n", ip);
-#endif
         ByteCodeInstruction inst = (ByteCodeInstruction) READ8();
+#ifdef TRACE
+        printf("at %d\n", (int) (ip - fCurrentFunction->fCode.data() - 1));
+#endif
         switch (inst) {
             BINARY_OP(kAddI, int32_t, fSigned, +)
             BINARY_OP(kAddF, float, fFloat, +)
-            case ByteCodeInstruction::kBranch:
-                ip = READ16();
+            case ByteCodeInstruction::kBranch: {
+                ip = code + READ16();
                 break;
+            }
             BINARY_OP(kCompareIEQ, int32_t, fSigned, ==)
             BINARY_OP(kCompareFEQ, float, fFloat, ==)
             BINARY_OP(kCompareINEQ, int32_t, fSigned, !=)
@@ -255,62 +253,65 @@
             BINARY_OP(kCompareFLTEQ, float, fFloat, <=)
             case ByteCodeInstruction::kConditionalBranch: {
                 int target = READ16();
-                if (this->pop().fBool) {
-                    ip = target;
+                if (POP().fBool) {
+                    ip = code + target;
                 }
                 break;
             }
             case ByteCodeInstruction::kDebugPrint: {
-                Value v = this->pop();
+                Value v = POP();
                 printf("Debug: %d(int), %d(uint), %f(float)\n", v.fSigned, v.fUnsigned, v.fFloat);
                 break;
             }
             BINARY_OP(kDivideS, int32_t, fSigned, /)
             BINARY_OP(kDivideU, uint32_t, fUnsigned, /)
             BINARY_OP(kDivideF, float, fFloat, /)
-            case ByteCodeInstruction::kDup:
-                this->push(fStack.back());
+            case ByteCodeInstruction::kDup: {
+                Value& top = TOP();
+                PUSH(top);
                 break;
+            }
             case ByteCodeInstruction::kDupDown: {
                 int count = READ8();
-                for (int i = 0; i < count; ++i) {
-                    fStack.insert(fStack.end() - i - count - 1, fStack[fStack.size() - i - 1]);
-                }
+                // before dupdown 4: X A B C D
+                // after dupdown 4:  A B C D X A B C D
+                memmove(sp, sp - count, sizeof(Value) * (count + 1));
+                sp += count;
+                memcpy(sp - count * 2, sp - count + 1, sizeof(Value) * count);
                 break;
             }
             case ByteCodeInstruction::kFloatToInt: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fSigned = (int) top.fFloat;
                 break;
             }
             case ByteCodeInstruction::kSignedToFloat: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fFloat = (float) top.fSigned;
                 break;
             }
             case ByteCodeInstruction::kUnsignedToFloat: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fFloat = (float) top.fUnsigned;
                 break;
             }
             case ByteCodeInstruction::kLoad: {
-                int target = this->pop().fSigned;
-                SkASSERT(target < (int) fStack.size());
-                this->push(fStack[target]);
+                int target = POP().fSigned;
+                SkASSERT(target < STACK_SIZE());
+                PUSH(stack[target]);
                 break;
             }
             case ByteCodeInstruction::kLoadGlobal: {
                 int target = READ8();
                 SkASSERT(target < (int) fGlobals.size());
-                this->push(fGlobals[target]);
+                PUSH(fGlobals[target]);
                 break;
             }
             case ByteCodeInstruction::kLoadSwizzle: {
-                Value target = this->pop();
+                Value target = POP();
                 int count = READ8();
                 for (int i = 0; i < count; ++i) {
-                    SkASSERT(target.fSigned + fCurrentFunction->fCode[ip + i] < (int) fStack.size());
-                    this->push(fStack[target.fSigned + fCurrentFunction->fCode[ip + i]]);
+                    PUSH(stack[target.fSigned + *(ip + i)]);
                 }
                 ip += count;
                 break;
@@ -319,61 +320,66 @@
             BINARY_OP(kMultiplyU, uint32_t, fUnsigned, *)
             BINARY_OP(kMultiplyF, float, fFloat, *)
             case ByteCodeInstruction::kNot: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fBool = !top.fBool;
                 break;
             }
             case ByteCodeInstruction::kNegateF: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fFloat = -top.fFloat;
                 break;
             }
             case ByteCodeInstruction::kNegateS: {
-                Value& top = fStack.back();
+                Value& top = TOP();
                 top.fSigned = -top.fSigned;
                 break;
             }
-            case ByteCodeInstruction::kNop:
-                break;
+            case ByteCodeInstruction::kNop1:
+                continue;
+            case ByteCodeInstruction::kNop2:
+                ++ip;
+                continue;
+            case ByteCodeInstruction::kNop3:
+                ip += 2;
+                continue;
             case ByteCodeInstruction::kPop:
                 for (int i = READ8(); i > 0; --i) {
-                    this->pop();
+                    POP();
                 }
                 break;
             case ByteCodeInstruction::kPushImmediate:
-                this->push(Value((int) READ32()));
+                PUSH(Value((int) READ32()));
                 break;
             BINARY_OP(kRemainderS, int32_t, fSigned, %)
             BINARY_OP(kRemainderU, uint32_t, fUnsigned, %)
             case ByteCodeInstruction::kReturn: {
-                int count = READ8();
-                for (int i = 0; i < count; ++i) {
-                    fStack[i] = fStack[fStack.size() - count + i];
+                if (outReturn) {
+                    int count = READ8();
+                    memcpy(outReturn, sp - count + 1, count * sizeof(Value));
                 }
                 return;
             }
             case ByteCodeInstruction::kStore: {
-                Value value = this->pop();
-                int target = this->pop().fSigned;
-                SkASSERT(target < (int) fStack.size());
-                fStack[target] = value;
+                Value value = POP();
+                int target = POP().fSigned;
+                SkASSERT(target < STACK_SIZE());
+                stack[target] = value;
                 break;
             }
             case ByteCodeInstruction::kStoreGlobal: {
-                Value value = this->pop();
-                int target = this->pop().fSigned;
+                Value value = POP();
+                int target = POP().fSigned;
                 SkASSERT(target < (int) fGlobals.size());
                 fGlobals[target] = value;
                 break;
             }
             case ByteCodeInstruction::kStoreSwizzle: {
                 int count = READ8();
-                int target = fStack[fStack.size() - count - 1].fSigned;
+                int target = (sp - count)->fSigned;
                 for (int i = count - 1; i >= 0; --i) {
-                    SkASSERT(target + fCurrentFunction->fCode[ip + i] < (int) fStack.size());
-                    fStack[target + fCurrentFunction->fCode[ip + i]] = this->pop();
+                    stack[target + *(ip + i)] = POP();
                 }
-                this->pop();
+                POP();
                 ip += count;
                 break;
             }
@@ -382,10 +388,10 @@
             case ByteCodeInstruction::kSwizzle: {
                 Value vec[4];
                 for (int i = READ8() - 1; i >= 0; --i) {
-                    vec[i] = this->pop();
+                    vec[i] = POP();
                 }
                 for (int i = READ8() - 1; i >= 0; --i) {
-                    this->push(vec[READ8()]);
+                    PUSH(vec[READ8()]);
                 }
                 break;
             }
@@ -395,9 +401,10 @@
                 switch (inst) {
                     VECTOR_BINARY_OP(kAddI, int32_t, fSigned, +)
                     VECTOR_BINARY_OP(kAddF, float, fFloat, +)
-                    case ByteCodeInstruction::kBranch:
-                        ip = READ16();
+                    case ByteCodeInstruction::kBranch: {
+                        ip = code + READ16();
                         break;
+                    }
                     VECTOR_BINARY_OP(kCompareIEQ, int32_t, fSigned, ==)
                     VECTOR_BINARY_OP(kCompareFEQ, float, fFloat, ==)
                     VECTOR_BINARY_OP(kCompareINEQ, int32_t, fSigned, !=)
@@ -415,9 +422,9 @@
                     VECTOR_BINARY_OP(kCompareULTEQ, uint32_t, fUnsigned, <=)
                     VECTOR_BINARY_OP(kCompareFLTEQ, float, fFloat, <=)
                     case ByteCodeInstruction::kConditionalBranch: {
-                        int target = READ16();
-                        if (this->pop().fBool) {
-                            ip = target;
+                        uint16_t target = READ16();
+                        if (POP().fBool) {
+                            ip = code + target;
                         }
                         break;
                     }
@@ -426,37 +433,35 @@
                     VECTOR_BINARY_OP(kDivideF, float, fFloat, /)
                     case ByteCodeInstruction::kFloatToInt: {
                         for (int i = 0; i < count; ++i) {
-                            Value& v = fStack[fStack.size() - i - 1];
+                            Value& v = sp[-i];
                             v.fSigned = (int) v.fFloat;
                         }
                         break;
                     }
                     case ByteCodeInstruction::kSignedToFloat: {
                         for (int i = 0; i < count; ++i) {
-                            Value& v = fStack[fStack.size() - i - 1];
+                            Value& v = sp[-i];
                             v.fFloat = (float) v.fSigned;
                         }
                         break;
                     }
                     case ByteCodeInstruction::kUnsignedToFloat: {
                         for (int i = 0; i < count; ++i) {
-                            Value& v = fStack[fStack.size() - i - 1];
+                            Value& v = sp[-i];  // walk down from the top of stack
                             v.fFloat = (float) v.fUnsigned;
                         }
                         break;
                     }
                     case ByteCodeInstruction::kLoad: {
-                        int target = this->pop().fSigned;
-                        for (int i = 0; i < count; ++i) {
-                            SkASSERT(target < (int) fStack.size());
-                            this->push(fStack[target++]);
-                        }
+                        int src = POP().fSigned;
+                        memcpy(sp + 1, &stack[src], count * sizeof(Value));
+                        sp += count;
                         break;
                     }
                     case ByteCodeInstruction::kLoadGlobal: {
                         int target = READ8();
                         SkASSERT(target < (int) fGlobals.size());
-                        this->push(fGlobals[target]);
+                        PUSH(fGlobals[target]);
                         break;
                     }
                     VECTOR_BINARY_OP(kMultiplyS, int32_t, fSigned, *)
@@ -465,11 +470,9 @@
                     VECTOR_BINARY_OP(kRemainderS, int32_t, fSigned, %)
                     VECTOR_BINARY_OP(kRemainderU, uint32_t, fUnsigned, %)
                     case ByteCodeInstruction::kStore: {
-                        int target = fStack[fStack.size() - count - 1].fSigned + count;
-                        for (int i = count - 1; i >= 0; --i) {
-                            SkASSERT(target < (int) fStack.size());
-                            fStack[--target] = this->pop();
-                        }
+                        memcpy(&stack[(sp - count)->fSigned], sp - count + 1,
+                               count * sizeof(Value));
+                        sp -= count;
                         break;
                     }
                     VECTOR_BINARY_OP(kSubtractI, int32_t, fSigned, -)
@@ -485,7 +488,11 @@
                 SkASSERT(false);
         }
 #ifdef TRACE
-        this->dumpStack();
+        printf("STACK:");
+        for (int i = 0; i < STACK_SIZE(); ++i) {
+            printf(" %d(%f)", stack[i].fSigned, stack[i].fFloat);
+        }
+        printf("\n");
 #endif
     }
 }