Revert "impl gather8/gather16 with gather32"

This reverts commit d4e3b9e8bcfc5e4cd4ea0b587a3b0cb76f32d0c0.

Reason for revert: will reland with fixes

Original change's description:
> impl gather8/gather16 with gather32
> 
> This is our quick path to JIT small gathers.
> 
> The idea is roughly:
> 
>    const uint32_t* ptr32 = ptr8;
>    uint32_t abcd = ptr32[ix/4];
>    switch (ix & 3) {
>      case 3: return (abcd >> 24)       ;
>      case 2: return (abcd >> 16) & 0xff;
>      case 1: return (abcd >>  8) & 0xff;
>      case 0: return (abcd      ) & 0xff;
>    }
> 
> The reasoning is that if we may load a given byte,
> we should also be allowed to load the four-byte-aligned
> word that byte falls within.
> 
> Change-Id: I7fb1085306050c918ccf505f1d2e1e87db3b8c9a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/268381
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>

TBR=mtklein@google.com,herb@google.com,reed@google.com

Change-Id: I48d800edc6517f37e04752c91616b666a5e0f384
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/268490
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index ecbf8c4..f05f89b 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -143,6 +143,8 @@
                 case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break;
                 case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break;
 
+                case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
+                case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                 case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
 
                 case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
@@ -266,6 +268,8 @@
                 case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break;
                 case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break;
 
+                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
+                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                 case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
 
                 case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
@@ -525,24 +529,10 @@
     I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }
 
     I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
-        // We'll gather32() the right 4 bytes, then select the correct byte.
-        skvm::I32 val = gather32(ptr, offset, shr(index,2)),
-                  rem = bit_and(index, splat(3));
-
-        val = bit_or(bit_and(eq(rem, splat(3)), shr(val, 24)),
-              bit_or(bit_and(eq(rem, splat(2)), shr(val, 16)),
-              bit_or(bit_and(eq(rem, splat(1)), shr(val,  8)),
-                     bit_and(eq(rem, splat(0)),     val     ))));
-        return bit_and(val, splat(0xff));
+        return {this->push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
     }
     I32 Builder::gather16(Arg ptr, int offset, I32 index) {
-        // We'll gather32() the right 4 bytes, then select the correct two.
-        skvm::I32 val = gather32(ptr, offset, shr(index,1)),
-                  rem = bit_and(index, splat(1));
-
-        val = bit_or(bit_and(eq(rem, splat(1)), shr(val, 16)),
-                     bit_and(eq(rem, splat(0)),     val     ));
-        return bit_and(val, splat(0xffff));
+        return {this->push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
     }
     I32 Builder::gather32(Arg ptr, int offset, I32 index) {
         return {this->push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
@@ -1710,12 +1700,32 @@
                     //     - *(const T**)foo loads the gather base and casts it to the right type.
                     // After all that we have an ordinary (uniform) pointer `ptr` to load from,
                     // and we then gather from it using the varying indices in r(x).
+                    STRIDE_1(Op::gather8):
+                        for (int i = 0; i < K; i++) {
+                            auto ptr = *(const uint8_t**)((const uint8_t*)arg(immy) + immz);
+                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+                        } break;
+                    STRIDE_1(Op::gather16):
+                        for (int i = 0; i < K; i++) {
+                            auto ptr = *(const uint16_t**)((const uint8_t*)arg(immy) + immz);
+                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
+                        } break;
                     STRIDE_1(Op::gather32):
                         for (int i = 0; i < K; i++) {
                             auto ptr = *(const int**)((const uint8_t*)arg(immy) + immz);
                             r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
                         } break;
 
+                    STRIDE_K(Op::gather8):
+                        for (int i = 0; i < K; i++) {
+                            auto ptr = *(const uint8_t**)((const uint8_t*)arg(immy) + immz);
+                            r(d).i32[i] = ptr[ r(x).i32[i] ];
+                        } break;
+                    STRIDE_K(Op::gather16):
+                        for (int i = 0; i < K; i++) {
+                            auto ptr = *(const uint16_t**)((const uint8_t*)arg(immy) + immz);
+                            r(d).i32[i] = ptr[ r(x).i32[i] ];
+                        } break;
                     STRIDE_K(Op::gather32):
                         for (int i = 0; i < K; i++) {
                             auto ptr = *(const int**)((const uint8_t*)arg(immy) + immz);
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 5ad7716..0503d1f 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -286,7 +286,7 @@
         M(store8)   M(store16)   M(store32)   \
         M(index)                              \
         M(load8)    M(load16)    M(load32)    \
-                                 M(gather32)  \
+        M(gather8)  M(gather16)  M(gather32)  \
         M(uniform8) M(uniform16) M(uniform32) \
         M(splat)                              \
         M(add_f32) M(add_i32) M(add_i16x2)    \
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 08f9540..2f69e16 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -378,7 +378,7 @@
         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
     }
 
-    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
+    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
         const int img[] = {12,34,56,78, 90,98,76,54};
 
         constexpr int N = 20;