impl gather32 for x86

Some TODOs are left over to make the scalar
tail case better: as is, it issues a
256-bit gather for each 32-bit load!

I added a trimmed-down variant of the existing
SkVM_gathers unit test to test just gather32,
covering this new JIT code.

Change-Id: Iabd2e6a61f0213b6d02d222b9f7aec2be000b70b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264217
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 68c4c82..b6e1c8e 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -2307,6 +2307,43 @@
                                  else        { a->vmovups(        dst(), arg[immy]); }
                                  break;
 
+                case Op::gather32: {
+                    // Emits a 32-bit gather.  AVX2 gathers fault (#UD) if any two of dst(),
+                    // index, and mask share a register, so we allocate all three by hand here.
+
+                    // index is argument x and has already been maybe_recycle_register()'d,
+                    // so its bit may be set in avail; we mask that bit out for this op.
+                    A::Ymm index = r[x];
+                    uint32_t avail_during_gather = avail & ~(1<<index);  // index is never a candidate
+
+                    // Choose dst() as the lowest-numbered free register that is not index.
+                    if (int found = __builtin_ffs(avail_during_gather)) {  // 1-based bit, 0 if none
+                        set_dst((A::Ymm)(found-1));
+                        avail_during_gather ^= (1<<dst());  // keep mask from landing on dst()
+                    } else {
+                        ok = false;  // no register left for dst(): clear ok and leave the case
+                        break;
+                    }
+
+                    // Choose mask the same way; it is a temporary, so avail itself is not updated.
+                    A::Ymm mask;
+                    if (int found = __builtin_ffs(avail_during_gather)) {  // 1-based bit, 0 if none
+                        mask = (A::Ymm)(found-1);
+                    } else {
+                        ok = false;  // no register left for mask: clear ok and leave the case
+                        break;
+                    }
+
+                    // Load the gather base pointer, stored immz bytes into uniform immy.
+                    a->movq(scratch, arg[immy], immz);
+                    a->vpcmpeqd(mask, mask, mask);   // x == x in every lane: all lanes enabled.
+                    a->vgatherdps(dst(), A::FOUR, index, scratch, mask);  // A::FOUR: presumably the 4-byte index scale
+
+                    // TODO: simpler impl. when scalar == true?
+                    // TODO: at least disable the other mask lanes?
+                }
+                break;
+
                 case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
                                    a->vmovd_direct((A::Xmm)dst(), scratch);
                                    a->vbroadcastss(dst(), (A::Xmm)dst());
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 8967928..dfde64b 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -310,6 +310,59 @@
     });
 }
 
+DEF_TEST(SkVM_gather32, r) {  // Exercises gather32 on its own: buf[i] = img[buf[i] & 7].
+    skvm::Builder b;
+    {
+        skvm::Arg uniforms = b.uniform(),  // carries the gather base pointer
+                  buf      = b.varying<int>();  // read as indices, overwritten with results
+        skvm::I32 x = b.load32(buf);
+        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));  // offset 0, index x & 7
+    }
+
+#if defined(SK_CPU_X86)  // selects the harness; the shared argument list follows #endif
+    test_jit_and_interpreter
+#else
+    test_interpreter_only
+#endif
+    (r, b.done(), [&](const skvm::Program& program) {
+        const int img[] = {12,34,56,78, 90,98,76,54};  // 8 entries, so indices masked with 7 stay in range
+
+        int buf[20];  // 20 is not a multiple of 8; presumably hits both the vector body and the scalar tail - confirm
+        for (int i = 0; i < 20; i++) {
+            buf[i] = i;  // each entry starts out as its own index
+        }
+
+        struct Uniforms {
+            const int* img;  // sits at offset 0, matching the gather32 offset above
+        } uniforms{img};
+
+        program.eval(20, &uniforms, buf);  // rewrites buf in place
+        int i = 0;  // expect buf[i] == img[i & 7]: the 8 img values repeating
+        REPORTER_ASSERT(r, buf[i] == 12); i++;
+        REPORTER_ASSERT(r, buf[i] == 34); i++;
+        REPORTER_ASSERT(r, buf[i] == 56); i++;
+        REPORTER_ASSERT(r, buf[i] == 78); i++;
+        REPORTER_ASSERT(r, buf[i] == 90); i++;
+        REPORTER_ASSERT(r, buf[i] == 98); i++;
+        REPORTER_ASSERT(r, buf[i] == 76); i++;
+        REPORTER_ASSERT(r, buf[i] == 54); i++;
+
+        REPORTER_ASSERT(r, buf[i] == 12); i++;
+        REPORTER_ASSERT(r, buf[i] == 34); i++;
+        REPORTER_ASSERT(r, buf[i] == 56); i++;
+        REPORTER_ASSERT(r, buf[i] == 78); i++;
+        REPORTER_ASSERT(r, buf[i] == 90); i++;
+        REPORTER_ASSERT(r, buf[i] == 98); i++;
+        REPORTER_ASSERT(r, buf[i] == 76); i++;
+        REPORTER_ASSERT(r, buf[i] == 54); i++;
+
+        REPORTER_ASSERT(r, buf[i] == 12); i++;
+        REPORTER_ASSERT(r, buf[i] == 34); i++;
+        REPORTER_ASSERT(r, buf[i] == 56); i++;
+        REPORTER_ASSERT(r, buf[i] == 78); i++;
+    });
+}
+
 DEF_TEST(SkVM_gathers, r) {
     skvm::Builder b;
     {