revise extract instruction

Redefine extract(x,bits,z) as (x >> bits) & z,
making it a more explicit parallel to pack().

This lets us eliminate the lowest-set-bit counting (__builtin_ctz or
_BitScanForward) that the old instruction required, but more
importantly it makes it more likely that the masks we AND with will be
the same value.

Ultimately, down at the x86 or ARM ISA level, the AND instructions don't
really benefit from having an immediate argument (while the shifts do).
We might as well treat the mask as a normal value, letting it be
deduplicated with identical values, hoisted out of loops, etc.

Change-Id: I48a38468b46f2c730574c025f412262296472447
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/219597
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected
index 38629c1..126a1ac 100644
--- a/resources/SkVMTest.expected
+++ b/resources/SkVMTest.expected
@@ -43,45 +43,46 @@
 store8 arg(1) r8
 
 A8 over RGBA_8888
-10 registers, 37 instructions:
+11 registers, 38 instructions:
 r0 = splat 3B808081 (0.0039215689)
-r1 = splat 3F800000 (1)
-r2 = splat 437F0000 (255)
-r3 = splat 3F000000 (0.5)
+r1 = splat FF (3.5733111e-43)
+r2 = splat 3F800000 (1)
+r3 = splat 437F0000 (255)
+r4 = splat 3F000000 (0.5)
 loop:
-r4 = load8 arg(0)
-r4 = to_f32 r4
-r4 = mul_f32 r0 r4
-r5 = load32 arg(1)
-r6 = extract r5 FF
-r6 = to_f32 r6
-r6 = mul_f32 r0 r6
-r7 = extract r5 FF00
-r7 = to_f32 r7
-r7 = mul_f32 r0 r7
-r8 = extract r5 FF0000
-r8 = to_f32 r8
-r8 = mul_f32 r0 r8
-r5 = shr r5 24
+r5 = load8 arg(0)
 r5 = to_f32 r5
 r5 = mul_f32 r0 r5
-r9 = sub_f32 r1 r4
-r6 = mul_f32 r6 r9
-r7 = mul_f32 r7 r9
-r8 = mul_f32 r8 r9
-r9 = mad_f32 r5 r9 r4
-r6 = mad_f32 r6 r2 r3
-r6 = to_i32 r6
-r7 = mad_f32 r7 r2 r3
+r6 = load32 arg(1)
+r7 = extract r6 0 r1
+r7 = to_f32 r7
+r7 = mul_f32 r0 r7
+r8 = extract r6 8 r1
+r8 = to_f32 r8
+r8 = mul_f32 r0 r8
+r9 = extract r6 16 r1
+r9 = to_f32 r9
+r9 = mul_f32 r0 r9
+r6 = extract r6 24 r1
+r6 = to_f32 r6
+r6 = mul_f32 r0 r6
+r10 = sub_f32 r2 r5
+r7 = mul_f32 r7 r10
+r8 = mul_f32 r8 r10
+r9 = mul_f32 r9 r10
+r10 = mad_f32 r6 r10 r5
+r7 = mad_f32 r7 r3 r4
 r7 = to_i32 r7
-r8 = mad_f32 r8 r2 r3
+r8 = mad_f32 r8 r3 r4
 r8 = to_i32 r8
-r9 = mad_f32 r9 r2 r3
+r9 = mad_f32 r9 r3 r4
 r9 = to_i32 r9
-r7 = pack r6 r7 8
-r9 = pack r8 r9 8
-r9 = pack r7 r9 16
-store32 arg(1) r9
+r10 = mad_f32 r10 r3 r4
+r10 = to_i32 r10
+r8 = pack r7 r8 8
+r10 = pack r9 r10 8
+r10 = pack r8 r10 16
+store32 arg(1) r10
 
 G8 over A8
 6 registers, 12 instructions:
@@ -125,168 +126,172 @@
 store8 arg(1) r8
 
 G8 over RGBA_8888
-10 registers, 37 instructions:
+11 registers, 38 instructions:
 r0 = splat 3B808081 (0.0039215689)
 r1 = splat 3F800000 (1)
-r2 = sub_f32 r1 r1
+r2 = splat FF (3.5733111e-43)
+r3 = sub_f32 r1 r1
+r4 = splat 437F0000 (255)
+r5 = splat 3F000000 (0.5)
+loop:
+r6 = load8 arg(0)
+r6 = to_f32 r6
+r6 = mul_f32 r0 r6
+r7 = load32 arg(1)
+r8 = extract r7 0 r2
+r8 = to_f32 r8
+r8 = mul_f32 r0 r8
+r9 = extract r7 8 r2
+r9 = to_f32 r9
+r9 = mul_f32 r0 r9
+r10 = extract r7 16 r2
+r10 = to_f32 r10
+r10 = mul_f32 r0 r10
+r7 = extract r7 24 r2
+r7 = to_f32 r7
+r7 = mul_f32 r0 r7
+r8 = mad_f32 r8 r3 r6
+r9 = mad_f32 r9 r3 r6
+r10 = mad_f32 r10 r3 r6
+r7 = mad_f32 r7 r3 r1
+r8 = mad_f32 r8 r4 r5
+r8 = to_i32 r8
+r9 = mad_f32 r9 r4 r5
+r9 = to_i32 r9
+r10 = mad_f32 r10 r4 r5
+r10 = to_i32 r10
+r7 = mad_f32 r7 r4 r5
+r7 = to_i32 r7
+r9 = pack r8 r9 8
+r7 = pack r10 r7 8
+r7 = pack r9 r7 16
+store32 arg(1) r7
+
+RGBA_8888 over A8
+8 registers, 17 instructions:
+r0 = splat FF (3.5733111e-43)
+r1 = splat 3B808081 (0.0039215689)
+r2 = splat 3F800000 (1)
 r3 = splat 437F0000 (255)
 r4 = splat 3F000000 (0.5)
 loop:
-r5 = load8 arg(0)
+r5 = load32 arg(0)
+r5 = extract r5 24 r0
 r5 = to_f32 r5
-r5 = mul_f32 r0 r5
-r6 = load32 arg(1)
-r7 = extract r6 FF
-r7 = to_f32 r7
-r7 = mul_f32 r0 r7
-r8 = extract r6 FF00
-r8 = to_f32 r8
-r8 = mul_f32 r0 r8
-r9 = extract r6 FF0000
-r9 = to_f32 r9
-r9 = mul_f32 r0 r9
-r6 = shr r6 24
+r5 = mul_f32 r1 r5
+r6 = load8 arg(1)
 r6 = to_f32 r6
-r6 = mul_f32 r0 r6
-r7 = mad_f32 r7 r2 r5
-r8 = mad_f32 r8 r2 r5
-r9 = mad_f32 r9 r2 r5
-r6 = mad_f32 r6 r2 r1
+r6 = mul_f32 r1 r6
+r7 = sub_f32 r2 r5
+r7 = mad_f32 r6 r7 r5
 r7 = mad_f32 r7 r3 r4
 r7 = to_i32 r7
-r8 = mad_f32 r8 r3 r4
-r8 = to_i32 r8
-r9 = mad_f32 r9 r3 r4
-r9 = to_i32 r9
-r6 = mad_f32 r6 r3 r4
-r6 = to_i32 r6
-r8 = pack r7 r8 8
-r6 = pack r9 r6 8
-r6 = pack r8 r6 16
-store32 arg(1) r6
-
-RGBA_8888 over A8
-7 registers, 16 instructions:
-r0 = splat 3B808081 (0.0039215689)
-r1 = splat 3F800000 (1)
-r2 = splat 437F0000 (255)
-r3 = splat 3F000000 (0.5)
-loop:
-r4 = load32 arg(0)
-r4 = shr r4 24
-r4 = to_f32 r4
-r4 = mul_f32 r0 r4
-r5 = load8 arg(1)
-r5 = to_f32 r5
-r5 = mul_f32 r0 r5
-r6 = sub_f32 r1 r4
-r6 = mad_f32 r5 r6 r4
-r6 = mad_f32 r6 r2 r3
-r6 = to_i32 r6
-store8 arg(1) r6
-
-RGBA_8888 over G8
-12 registers, 33 instructions:
-r0 = splat 3B808081 (0.0039215689)
-r1 = splat 3F800000 (1)
-r2 = splat 3E59B3D0 (0.21259999)
-r3 = splat 3F371759 (0.71520001)
-r4 = splat 3D93DD98 (0.0722)
-r5 = splat 437F0000 (255)
-r6 = splat 3F000000 (0.5)
-loop:
-r7 = load32 arg(0)
-r8 = extract r7 FF
-r8 = to_f32 r8
-r8 = mul_f32 r0 r8
-r9 = extract r7 FF00
-r9 = to_f32 r9
-r9 = mul_f32 r0 r9
-r10 = extract r7 FF0000
-r10 = to_f32 r10
-r10 = mul_f32 r0 r10
-r7 = shr r7 24
-r7 = to_f32 r7
-r7 = mul_f32 r0 r7
-r11 = load8 arg(1)
-r11 = to_f32 r11
-r11 = mul_f32 r0 r11
-r7 = sub_f32 r1 r7
-r8 = mad_f32 r11 r7 r8
-r9 = mad_f32 r11 r7 r9
-r7 = mad_f32 r11 r7 r10
-r7 = mul_f32 r7 r4
-r7 = mad_f32 r9 r3 r7
-r7 = mad_f32 r8 r2 r7
-r7 = mad_f32 r7 r5 r6
-r7 = to_i32 r7
 store8 arg(1) r7
 
-RGBA_8888 over RGBA_8888
-13 registers, 47 instructions:
-r0 = splat 3B808081 (0.0039215689)
-r1 = splat 3F800000 (1)
-r2 = splat 437F0000 (255)
-r3 = splat 3F000000 (0.5)
+RGBA_8888 over G8
+13 registers, 34 instructions:
+r0 = splat FF (3.5733111e-43)
+r1 = splat 3B808081 (0.0039215689)
+r2 = splat 3F800000 (1)
+r3 = splat 3E59B3D0 (0.21259999)
+r4 = splat 3F371759 (0.71520001)
+r5 = splat 3D93DD98 (0.0722)
+r6 = splat 437F0000 (255)
+r7 = splat 3F000000 (0.5)
 loop:
-r4 = load32 arg(0)
-r5 = extract r4 FF
-r5 = to_f32 r5
-r5 = mul_f32 r0 r5
-r6 = extract r4 FF00
-r6 = to_f32 r6
-r6 = mul_f32 r0 r6
-r7 = extract r4 FF0000
-r7 = to_f32 r7
-r7 = mul_f32 r0 r7
-r4 = shr r4 24
-r4 = to_f32 r4
-r4 = mul_f32 r0 r4
-r8 = load32 arg(1)
-r9 = extract r8 FF
+r8 = load32 arg(0)
+r9 = extract r8 0 r0
 r9 = to_f32 r9
-r9 = mul_f32 r0 r9
-r10 = extract r8 FF00
+r9 = mul_f32 r1 r9
+r10 = extract r8 8 r0
 r10 = to_f32 r10
-r10 = mul_f32 r0 r10
-r11 = extract r8 FF0000
+r10 = mul_f32 r1 r10
+r11 = extract r8 16 r0
 r11 = to_f32 r11
-r11 = mul_f32 r0 r11
-r8 = shr r8 24
+r11 = mul_f32 r1 r11
+r8 = extract r8 24 r0
 r8 = to_f32 r8
-r8 = mul_f32 r0 r8
-r12 = sub_f32 r1 r4
-r9 = mad_f32 r9 r12 r5
-r10 = mad_f32 r10 r12 r6
-r11 = mad_f32 r11 r12 r7
-r12 = mad_f32 r8 r12 r4
-r9 = mad_f32 r9 r2 r3
-r9 = to_i32 r9
-r10 = mad_f32 r10 r2 r3
+r8 = mul_f32 r1 r8
+r12 = load8 arg(1)
+r12 = to_f32 r12
+r12 = mul_f32 r1 r12
+r8 = sub_f32 r2 r8
+r9 = mad_f32 r12 r8 r9
+r10 = mad_f32 r12 r8 r10
+r8 = mad_f32 r12 r8 r11
+r8 = mul_f32 r8 r5
+r8 = mad_f32 r10 r4 r8
+r8 = mad_f32 r9 r3 r8
+r8 = mad_f32 r8 r6 r7
+r8 = to_i32 r8
+store8 arg(1) r8
+
+RGBA_8888 over RGBA_8888
+14 registers, 48 instructions:
+r0 = splat FF (3.5733111e-43)
+r1 = splat 3B808081 (0.0039215689)
+r2 = splat 3F800000 (1)
+r3 = splat 437F0000 (255)
+r4 = splat 3F000000 (0.5)
+loop:
+r5 = load32 arg(0)
+r6 = extract r5 0 r0
+r6 = to_f32 r6
+r6 = mul_f32 r1 r6
+r7 = extract r5 8 r0
+r7 = to_f32 r7
+r7 = mul_f32 r1 r7
+r8 = extract r5 16 r0
+r8 = to_f32 r8
+r8 = mul_f32 r1 r8
+r5 = extract r5 24 r0
+r5 = to_f32 r5
+r5 = mul_f32 r1 r5
+r9 = load32 arg(1)
+r10 = extract r9 0 r0
+r10 = to_f32 r10
+r10 = mul_f32 r1 r10
+r11 = extract r9 8 r0
+r11 = to_f32 r11
+r11 = mul_f32 r1 r11
+r12 = extract r9 16 r0
+r12 = to_f32 r12
+r12 = mul_f32 r1 r12
+r9 = extract r9 24 r0
+r9 = to_f32 r9
+r9 = mul_f32 r1 r9
+r13 = sub_f32 r2 r5
+r10 = mad_f32 r10 r13 r6
+r11 = mad_f32 r11 r13 r7
+r12 = mad_f32 r12 r13 r8
+r13 = mad_f32 r9 r13 r5
+r10 = mad_f32 r10 r3 r4
 r10 = to_i32 r10
-r11 = mad_f32 r11 r2 r3
+r11 = mad_f32 r11 r3 r4
 r11 = to_i32 r11
-r12 = mad_f32 r12 r2 r3
+r12 = mad_f32 r12 r3 r4
 r12 = to_i32 r12
-r10 = pack r9 r10 8
-r12 = pack r11 r12 8
-r12 = pack r10 r12 16
-store32 arg(1) r12
+r13 = mad_f32 r13 r3 r4
+r13 = to_i32 r13
+r11 = pack r10 r11 8
+r13 = pack r12 r13 8
+r13 = pack r11 r13 16
+store32 arg(1) r13
 
 I32 8888 over 8888
 10 registers, 20 instructions:
 r0 = splat FF (3.5733111e-43)
 loop:
 r1 = load32 arg(0)
-r2 = extract r1 FF
-r3 = extract r1 FF00
-r4 = extract r1 FF0000
-r1 = shr r1 24
+r2 = extract r1 0 r0
+r3 = extract r1 8 r0
+r4 = extract r1 16 r0
+r1 = extract r1 24 r0
 r5 = load32 arg(1)
-r6 = extract r5 FF
-r7 = extract r5 FF00
-r8 = extract r5 FF0000
-r5 = shr r5 24
+r6 = extract r5 0 r0
+r7 = extract r5 8 r0
+r8 = extract r5 16 r0
+r5 = extract r5 24 r0
 r9 = sub_i32 r0 r1
 r6 = mad_unorm8 r6 r9 r2
 r7 = mad_unorm8 r7 r9 r3
@@ -299,24 +304,24 @@
 
 I32 (SWAR) 8888 over 8888
 7 registers, 20 instructions:
-r0 = splat FF (3.5733111e-43)
-r1 = splat FF00FF (2.3418409e-38)
+r0 = splat FF00FF (2.3418409e-38)
+r1 = splat FF (3.5733111e-43)
 loop:
 r2 = load32 arg(0)
-r3 = extract r2 FF00FF
-r2 = extract r2 FF00FF00
+r3 = extract r2 0 r0
+r2 = extract r2 8 r0
 r4 = load32 arg(1)
-r5 = extract r4 FF00FF
-r4 = extract r4 FF00FF00
+r5 = extract r4 0 r0
+r4 = extract r4 8 r0
 r6 = shr r2 16
-r6 = sub_i32 r0 r6
+r6 = sub_i32 r1 r6
 r5 = mul_i32 r5 r6
-r5 = add_i32 r5 r1
-r5 = extract r5 FF00FF00
+r5 = add_i32 r5 r0
+r5 = extract r5 8 r0
 r5 = add_i32 r3 r5
 r6 = mul_i32 r4 r6
-r6 = add_i32 r6 r1
-r6 = extract r6 FF00FF00
+r6 = add_i32 r6 r0
+r6 = extract r6 8 r0
 r6 = add_i32 r2 r6
 r6 = pack r5 r6 8
 store32 arg(1) r6
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 48a4c50..3e0ebe9 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -9,9 +9,6 @@
 #include "src/core/SkOpts.h"
 #include "src/core/SkVM.h"
 #include <string.h>
-#if defined(SK_BUILD_FOR_WIN)
-    #include <intrin.h>
-#endif
 
 namespace skvm {
 
@@ -223,18 +220,8 @@
         return {this->push(Op::mad_unorm8, x.id, y.id, z.id)};
     }
 
-    I32 Builder::extract(I32 x, int mask) {
-        SkASSERT(mask != 0);
-    #if defined(SK_BUILD_FOR_WIN)
-        unsigned long shift;
-        _BitScanForward(&shift, mask);
-    #else
-        const int shift = __builtin_ctz(mask);
-    #endif
-        if ((unsigned)mask == (~0u << shift)) {
-            return this->shr(x, shift);
-        }
-        return {this->push(Op::extract, x.id,NA,NA, mask, shift)};
+    I32 Builder::extract(I32 x, int bits, I32 z) {
+        return {this->push(Op::extract, x.id,NA,z.id, bits,0)};
     }
 
     I32 Builder::pack(I32 x, I32 y, int bits) {
@@ -248,7 +235,6 @@
 
     struct R { ID id; };
     struct Shift { int bits; };
-    struct Mask  { int bits; };
     struct Splat { int bits; };
 
     static void write(SkWStream* o, const char* s) {
@@ -267,9 +253,6 @@
     static void write(SkWStream* o, Shift s) {
         o->writeDecAsText(s.bits);
     }
-    static void write(SkWStream* o, Mask m) {
-        o->writeHexAsText(m.bits);
-    }
     static void write(SkWStream* o, Splat s) {
         float f;
         memcpy(&f, &s.bits, 4);
@@ -331,8 +314,8 @@
                 case Op::mul_unorm8: write(o, R{d}, "= mul_unorm8", R{x}, R{y.id}         ); break;
                 case Op::mad_unorm8: write(o, R{d}, "= mad_unorm8", R{x}, R{y.id}, R{z.id}); break;
 
-                case Op::extract: write(o, R{d}, "= extract", R{x}, Mask{y.imm}); break;
-                case Op::pack: write(o, R{d}, "= pack", R{x}, R{y.id}, Shift{z.imm}); break;
+                case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break;
+                case Op::pack:    write(o, R{d}, "= pack",    R{x}, R{y.id}, Shift{z.imm}); break;
 
                 case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
                 case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 3c4f2d4..e18a7de 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -99,16 +99,7 @@
         I32 mul_unorm8(I32 x, I32 y);          // (x*y+255)/256, approximating (x*y+127)/255.
         I32 mad_unorm8(I32 x, I32 y, I32 z);   // mul_unorm8(x,y) + z
 
-        // (x & mask) >> k, where k is the lowest set bit of mask. E.g.
-        //    extract(x, 0xff)   == (x & 0xff)
-        //    extract(x, 0xff00) == (x & 0xff00) >> 8
-        //
-        //    extract(x, 0x00ff00ff) == (x & 0x00ff00ff)
-        //    extract(x, 0xff00ff00) == (x & 0xff00ff00) >> 8
-        //
-        //    extract(x, 0x003ff) == (x & 0x003ff)
-        //    extract(x, 0xffc00) == (x & 0xffc00) >> 10
-        I32 extract(I32 x, int mask);
+        I32 extract(I32 x, int bits, I32 z);   // (x >> bits) & z
 
         // Interlace bits from x and y as if x | (y << bits),
         // assuming no bits from x and (y << bits) collide with each other, (x & (y << bits)) == 0.
diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h
index ed36ff3..55823a1 100644
--- a/src/opts/SkVM_opts.h
+++ b/src/opts/SkVM_opts.h
@@ -136,9 +136,8 @@
                     CASE(Op::mad_unorm8): r(d).u32 = (r(x).u32 * r(y.id).u32 + 255) / 256
                                                    + r(z.id).u32; break;
 
-                    CASE(Op::extract): r(d).u32 = (r(x).u32 & y.imm) >> z.imm; break;
-
-                    CASE(Op::pack): r(d).i32 = r(x).i32 | (r(y.id).i32 << z.imm); break;
+                    CASE(Op::extract): r(d).u32 = (r(x).u32 >> y.imm) & r(z.id).u32; break;
+                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y.id).u32 << z.imm); break;
 
                     CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
                     CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp
index 11e61c9..6514a11 100644
--- a/tools/SkVMBuilders.cpp
+++ b/tools/SkVMBuilders.cpp
@@ -36,10 +36,10 @@
 
             case Fmt::RGBA_8888: {
                 skvm::I32 rgba = load32(ptr);
-                *r = byte_to_f32(extract(rgba, 0xff));
-                *g = byte_to_f32(extract(rgba, 0xff00));
-                *b = byte_to_f32(extract(rgba, 0xff0000));
-                *a = byte_to_f32(extract(rgba, 0xff000000));
+                *r = byte_to_f32(extract(rgba,  0, splat(0xff)));
+                *g = byte_to_f32(extract(rgba,  8, splat(0xff)));
+                *b = byte_to_f32(extract(rgba, 16, splat(0xff)));
+                *a = byte_to_f32(extract(rgba, 24, splat(0xff)));
             } break;
         }
     };
@@ -97,10 +97,10 @@
     auto load = [&](skvm::Arg ptr,
                     skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
         skvm::I32 rgba = load32(ptr);
-        *r = extract(rgba, 0xff);
-        *g = extract(rgba, 0xff00);
-        *b = extract(rgba, 0xff0000);
-        *a = extract(rgba, 0xff000000);
+        *r = extract(rgba,  0, splat(0xff));
+        *g = extract(rgba,  8, splat(0xff));
+        *b = extract(rgba, 16, splat(0xff));
+        *a = extract(rgba, 24, splat(0xff));
     };
 
     skvm::I32 r,g,b,a;
@@ -128,15 +128,14 @@
     auto load = [&](skvm::Arg ptr,
                     skvm::I32* rb, skvm::I32* ga) {
         skvm::I32 rgba = load32(ptr);
-        *rb = extract(rgba, 0x00ff00ff);
-        *ga = extract(rgba, 0xff00ff00);
+        *rb = extract(rgba, 0, splat(0x00ff00ff));
+        *ga = extract(rgba, 8, splat(0x00ff00ff));
     };
 
     auto mul_unorm8_SWAR = [&](skvm::I32 x, skvm::I32 y) {
         // As above, assuming x is two SWAR bytes in lanes 0 and 2, and y is a byte.
         skvm::I32 _255 = splat(0x00ff00ff);
-        return extract(add(mul(x, y), _255),
-                       0xff00ff00);
+        return extract(add(mul(x, y), _255), 8, _255);
     };
 
     skvm::I32 rb, ga;