Reland "Reland "Interpreter: Support striped inputs for less overhead""

This reverts commit 2c59b4e9ea856231e6c75608b66f202d16201679.

Change-Id: I2b06936994430722b8fc3890ff9b4a6f4710db04
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/221998
Reviewed-by: Brian Osman <brianosman@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index 8b9fc3e..23730c0 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -1041,6 +1041,57 @@
     }
 }
 
+// Runs f over N sets of inputs, at most Interpreter::VecWidth per pass. args[i] points at the
+// floats for parameter slot i; slots of out parameters are copied back in place. As a side
+// effect, each pointer in args[] is advanced past the data consumed. N <= 0 does nothing.
+void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, int N,
+                          const float* uniforms, int uniformCount) const {
+#ifdef TRACE
+    disassemble(f);
+#endif
+    Interpreter::VValue stack[128];
+    // Lane indices: (w > gLanes) selects the lanes whose index is below w. Needs >= VecWidth entries.
+    static const Interpreter::I32 gLanes = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+    };
+
+    SkASSERT(f->fReturnCount == 0);
+    SkASSERT(nargs == f->fParameterCount);
+    SkASSERT(uniformCount == (int)fInputSlots.size());
+    Interpreter::VValue globals[32];
+    SkASSERT((int)SK_ARRAY_COUNT(globals) >= fGlobalCount);
+    for (uint8_t slot : fInputSlots) {
+        globals[slot].fFloat = *uniforms++;
+    }
+
+    while (N > 0) {  // not `while (N)`: a negative N must never reach the memcpy sizes below
+        int w = std::min(N, Interpreter::VecWidth);
+        // Load w values of every argument slot into the bottom of the stack
+        for (int i = 0; i < nargs; ++i) {
+            memcpy(stack + i, args[i], w * sizeof(float));
+        }
+
+        auto mask = w > gLanes;
+        innerRun(this, f, stack, nullptr, mask, globals);
+        // Write the slots that belong to out parameters back to the caller's buffers
+        int slot = 0;
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = slot; i < slot + p.fSlotCount; ++i) {
+                    memcpy(args[i], stack + i, w * sizeof(float));
+                }
+            }
+            slot += p.fSlotCount;
+        }
+
+        // Step each argument pointer ahead to the next stripe
+        for (int i = 0; i < nargs; ++i) {
+            args[i] += w;
+        }
+        N -= w;
+    }
+}
+
 } // namespace SkSL
 
 #endif