impl gather32 for x86
Some TODOs left over to make the scalar tail case better... as-is,
it issues a full 256-bit gather for each 32-bit load!
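
(For intuition, here is a minimal C++ model of what gather32 computes;
the function and names are illustrative, not SkVM API. The base pointer
is read from the uniform block at byte offset immz, and each active lane
i loads base[ix[i]]. In the scalar tail only one lane is live, so a
single 32-bit load would do instead of a full gather.)

    #include <cstring>

    // Illustrative model of gather32 semantics (not SkVM's actual API).
    static void gather32_model(const char* uniforms, int immz,
                               const int* ix, int* dst, int n) {
        const int* base;
        std::memcpy(&base, uniforms + immz, sizeof(base));  // ~ movq scratch, arg[immy], immz
        for (int i = 0; i < n; i++) {
            dst[i] = base[ix[i]];                           // ~ vgatherdps, scale FOUR
        }
    }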
I added a trimmed-down variant of the existing SkVM_gathers unit
test that exercises just gather32, covering this new JIT code.
Change-Id: Iabd2e6a61f0213b6d02d222b9f7aec2be000b70b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264217
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 68c4c82..b6e1c8e 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -2307,6 +2307,43 @@
else { a->vmovups( dst(), arg[immy]); }
break;
+ case Op::gather32: {
+ // No two of dst(), index, and mask may share a register,
+ // so we must allocate registers manually and very carefully.
+
+ // index is argument x and has already been maybe_recycle_register()'d,
+ // so we explicitly ignore its availability during this op.
+ A::Ymm index = r[x];
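+ // (Ymm enumerators are just small integers here, so a register can index a bit in the avail mask.)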
+ uint32_t avail_during_gather = avail & ~(1<<index);
+
+ // Choose dst() to not overlap with index.
+ if (int found = __builtin_ffs(avail_during_gather)) {
+ set_dst((A::Ymm)(found-1));
+ avail_during_gather ^= (1<<dst());
+ } else {
+ ok = false;
+ break;
+ }
+
+ // Choose (temporary) mask to not overlap with dst() or index.
+ A::Ymm mask;
+ if (int found = __builtin_ffs(avail_during_gather)) {
+ mask = (A::Ymm)(found-1);
+ } else {
+ ok = false;
+ break;
+ }
+
+ // Our gather base pointer is immz bytes off of uniform immy.
+ a->movq(scratch, arg[immy], immz);
+ a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
+ a->vgatherdps(dst(), A::FOUR, index, scratch, mask);
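+ // (vgatherdps also zeroes mask as it completes, so mask would need resetting before any reuse.)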
+
+ // TODO: simpler impl. when scalar == true?
+ // TODO: at least disable the other mask lanes?
+ }
+ break;
+
case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
a->vmovd_direct((A::Xmm)dst(), scratch);
a->vbroadcastss(dst(), (A::Xmm)dst());
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 8967928..dfde64b 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -310,6 +310,59 @@
});
}
+DEF_TEST(SkVM_gather32, r) {
+ skvm::Builder b;
+ {
+ skvm::Arg uniforms = b.uniform(),
+ buf = b.varying<int>();
+ skvm::I32 x = b.load32(buf);
+ b.store32(buf, b.gather32(uniforms, 0, b.bit_and(x, b.splat(7))));
+ }
+
+#if defined(SK_CPU_X86)
+ test_jit_and_interpreter
+#else
+ test_interpreter_only
+#endif
+ (r, b.done(), [&](const skvm::Program& program) {
+ const int img[] = {12,34,56,78, 90,98,76,54};
+
+ int buf[20];
+ for (int i = 0; i < 20; i++) {
+ buf[i] = i;
+ }
+
+ struct Uniforms {
+ const int* img;
+ } uniforms{img};
+
+ program.eval(20, &uniforms, buf);
+ int i = 0;
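+ // buf[i] started out as i, so each lane gathered img[i & 7]; the expected values cycle through img.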
+ REPORTER_ASSERT(r, buf[i] == 12); i++;
+ REPORTER_ASSERT(r, buf[i] == 34); i++;
+ REPORTER_ASSERT(r, buf[i] == 56); i++;
+ REPORTER_ASSERT(r, buf[i] == 78); i++;
+ REPORTER_ASSERT(r, buf[i] == 90); i++;
+ REPORTER_ASSERT(r, buf[i] == 98); i++;
+ REPORTER_ASSERT(r, buf[i] == 76); i++;
+ REPORTER_ASSERT(r, buf[i] == 54); i++;
+
+ REPORTER_ASSERT(r, buf[i] == 12); i++;
+ REPORTER_ASSERT(r, buf[i] == 34); i++;
+ REPORTER_ASSERT(r, buf[i] == 56); i++;
+ REPORTER_ASSERT(r, buf[i] == 78); i++;
+ REPORTER_ASSERT(r, buf[i] == 90); i++;
+ REPORTER_ASSERT(r, buf[i] == 98); i++;
+ REPORTER_ASSERT(r, buf[i] == 76); i++;
+ REPORTER_ASSERT(r, buf[i] == 54); i++;
+
+ REPORTER_ASSERT(r, buf[i] == 12); i++;
+ REPORTER_ASSERT(r, buf[i] == 34); i++;
+ REPORTER_ASSERT(r, buf[i] == 56); i++;
+ REPORTER_ASSERT(r, buf[i] == 78); i++;
+ });
+}
+
DEF_TEST(SkVM_gathers, r) {
skvm::Builder b;
{