impl gather8/gather16 with gather32

This is our quick path to JIT small gathers.

The idea is roughly:

   const uint32_t* ptr32 = ptr8;
   uint32_t abcd = ptr32[ix/4];
   switch (ix & 3) {
     case 3: return (abcd >> 24)       ;
     case 2: return (abcd >> 16) & 0xff;
     case 1: return (abcd >>  8) & 0xff;
     case 0: return (abcd      ) & 0xff;
   }

This rests on the idea that if we may load a given byte,
we should also be allowed to load the four-byte-aligned
word that byte falls within.

Change-Id: I7fb1085306050c918ccf505f1d2e1e87db3b8c9a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/268381
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 10bd38e..be91326 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -143,8 +143,6 @@
                 case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break;
                 case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break;
 
-                case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
-                case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                 case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
 
                 case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
@@ -268,8 +266,6 @@
                 case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break;
                 case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break;
 
-                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
-                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                 case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
 
                 case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
@@ -529,10 +525,24 @@
     I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }
 
     I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
-        return {this->push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
+        // We'll gather32() the right 4 bytes, then select the correct byte.
+        skvm::I32 val = gather32(ptr, offset, shr(index,2)),
+                  rem = bit_and(index, splat(3));
+
+        val = bit_or(bit_and(eq(rem, splat(3)), shr(val, 24)),
+              bit_or(bit_and(eq(rem, splat(2)), shr(val, 16)),
+              bit_or(bit_and(eq(rem, splat(1)), shr(val,  8)),
+                     bit_and(eq(rem, splat(0)),     val     ))));
+        return bit_and(val, splat(0xff));
     }
     I32 Builder::gather16(Arg ptr, int offset, I32 index) {
-        return {this->push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
+        // We'll gather32() the right 4 bytes, then select the correct two.
+        skvm::I32 val = gather32(ptr, offset, shr(index,1)),
+                  rem = bit_and(index, splat(1));
+
+        val = bit_or(bit_and(eq(rem, splat(1)), shr(val, 16)),
+                     bit_and(eq(rem, splat(0)),     val     ));
+        return bit_and(val, splat(0xffff));
     }
     I32 Builder::gather32(Arg ptr, int offset, I32 index) {
         return {this->push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
@@ -1700,32 +1710,12 @@
                     //     - *(const T**)foo loads the gather base and casts it to the right type.
                     // After all that we have an ordinary (uniform) pointer `ptr` to load from,
                     // and we then gather from it using the varying indices in r(x).
-                    STRIDE_1(Op::gather8):
-                        for (int i = 0; i < K; i++) {
-                            auto ptr = *(const uint8_t**)((const uint8_t*)arg(immy) + immz);
-                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
-                        } break;
-                    STRIDE_1(Op::gather16):
-                        for (int i = 0; i < K; i++) {
-                            auto ptr = *(const uint16_t**)((const uint8_t*)arg(immy) + immz);
-                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
-                        } break;
                     STRIDE_1(Op::gather32):
                         for (int i = 0; i < K; i++) {
                             auto ptr = *(const int**)((const uint8_t*)arg(immy) + immz);
                             r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
                         } break;
 
-                    STRIDE_K(Op::gather8):
-                        for (int i = 0; i < K; i++) {
-                            auto ptr = *(const uint8_t**)((const uint8_t*)arg(immy) + immz);
-                            r(d).i32[i] = ptr[ r(x).i32[i] ];
-                        } break;
-                    STRIDE_K(Op::gather16):
-                        for (int i = 0; i < K; i++) {
-                            auto ptr = *(const uint16_t**)((const uint8_t*)arg(immy) + immz);
-                            r(d).i32[i] = ptr[ r(x).i32[i] ];
-                        } break;
                     STRIDE_K(Op::gather32):
                         for (int i = 0; i < K; i++) {
                             auto ptr = *(const int**)((const uint8_t*)arg(immy) + immz);
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 47cda38..5f733be 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -286,7 +286,7 @@
         M(store8)   M(store16)   M(store32)   \
         M(index)                              \
         M(load8)    M(load16)    M(load32)    \
-        M(gather8)  M(gather16)  M(gather32)  \
+                                 M(gather32)  \
         M(uniform8) M(uniform16) M(uniform32) \
         M(splat)                              \
         M(add_f32) M(add_i32) M(add_i16x2)    \
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 2f69e16..08f9540 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -378,7 +378,7 @@
         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
     }
 
-    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
+    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
         const int img[] = {12,34,56,78, 90,98,76,54};
 
         constexpr int N = 20;