add vgatherdps

A complicated instruction to say the least!

A "fun" wrinkle is that all the ymm registers must be unique!
(And the mask register is cleared by the instruction...)

One remaining TODO is what the 0b100 r/m field in the mod_rm() byte means.
Every variant of the instruction I've assembled seems to have it set to 0b100
(e.g. ModRM bytes 0x0c or 0x04) but I'd feel better if I knew what it meant.

Change-Id: Ia4ff5f8175bff545e2d10bb2d1b14f49073445a3
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264116
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 1ef16a4..cc17074 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -934,13 +934,10 @@
         SkUNREACHABLE;
     }
 
-#if 0
     // SIB byte encodes a memory address, base + (index * scale).
-    enum class Scale { One, Two, Four, Eight };
-    static uint8_t sib(Scale scale, int index, int base) {
+    static uint8_t sib(Assembler::Scale scale, int index, int base) {
         return _233((int)scale, index, base);
     }
-#endif
 
     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
@@ -1381,6 +1378,23 @@
         this->byte(imm);
     }
 
+    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
+        // Unlike most instructions, no aliasing is permitted here: dst, ix, and mask must all be distinct.
+        SkASSERT(dst != ix);
+        SkASSERT(dst != mask);
+        SkASSERT(mask != ix);
+
+        int prefix = 0x66,
+            map    = 0x380f,
+            opcode = 0x92;
+        VEX v = vex(0, dst>>3, ix>>3, base>>3,  // Bit 3 of dst, ix, and base goes to vex(); their low 3 bits are emitted below.
+                    map, mask, /*ymm?*/1, prefix);
+        this->bytes(v.bytes, v.len);
+        this->byte(opcode);
+        this->byte(mod_rm(Mod::Indirect, dst&7, 0b100/*r/m=0b100 with an indirect mod means a SIB byte follows*/));
+        this->byte(sib(scale, ix&7, base&7));  // NOTE(review): if Mod::Indirect is mod=0b00, base&7 == 0b101 (rbp/r13) would encode disp32 with no base register -- confirm callers never pass those.
+    }
+
     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
 
     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 1f75b0e..012bd97 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -154,6 +154,13 @@
         void vpextrw(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           , 16-bit
         void vpextrb(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           ,  8-bit
 
+        // For each 32-bit lane: if (mask & 0x8000'0000) {
+        //     dst = base[scale*ix];
+        // }
+        // mask = 0;
+        enum Scale { ONE, TWO, FOUR, EIGHT };
+        void vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask);
+
         // aarch64
 
         // d = op(n,m)
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 353acb9..2990a53 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -1114,6 +1114,24 @@
         0xc5,0xfd,0x5b,0xda,
     });
 
+    test_asm(r, [&](A& a) {
+        a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
+        a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
+        a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
+        a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
+        a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
+        a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
+        a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
+    },{
+        0xc4,0xe2,0x6d,0x92,0x0c,0x87,
+        0xc4,0xe2,0x75,0x92,0x04,0x10,
+        0xc4,0x62,0x75,0x92,0x14,0x10,
+        0xc4,0xa2,0x75,0x92,0x04,0x20,
+        0xc4,0xc2,0x75,0x92,0x04,0x11,
+        0xc4,0xe2,0x1d,0x92,0x04,0x10,
+        0xc4,0xe2,0x1d,0x92,0x04,0xd0,
+    });
+
     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
 
     test_asm(r, [&](A& a) {