revise extract instruction

Convert extract(x,bits,z) to be (x >> bits) & z,
now a more explicit parallel to pack().

This lets us eliminate the funky bit counting required by the old
instruction, but more saliently it makes it more likely that the masks
we AND with will be the same value.

Ultimately down at the x86 or ARM ISA level, the AND instructions don't
really benefit from having an immediate argument (while the shifts do).
We might as well treat the mask as a normal value, letting it get
commoned (deduplicated) with identical values, loop-hoisted, etc.

Change-Id: I48a38468b46f2c730574c025f412262296472447
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/219597
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp
index 11e61c9..6514a11 100644
--- a/tools/SkVMBuilders.cpp
+++ b/tools/SkVMBuilders.cpp
@@ -36,10 +36,10 @@
 
             case Fmt::RGBA_8888: {
                 skvm::I32 rgba = load32(ptr);
-                *r = byte_to_f32(extract(rgba, 0xff));
-                *g = byte_to_f32(extract(rgba, 0xff00));
-                *b = byte_to_f32(extract(rgba, 0xff0000));
-                *a = byte_to_f32(extract(rgba, 0xff000000));
+                *r = byte_to_f32(extract(rgba,  0, splat(0xff)));
+                *g = byte_to_f32(extract(rgba,  8, splat(0xff)));
+                *b = byte_to_f32(extract(rgba, 16, splat(0xff)));
+                *a = byte_to_f32(extract(rgba, 24, splat(0xff)));
             } break;
         }
     };
@@ -97,10 +97,10 @@
     auto load = [&](skvm::Arg ptr,
                     skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
         skvm::I32 rgba = load32(ptr);
-        *r = extract(rgba, 0xff);
-        *g = extract(rgba, 0xff00);
-        *b = extract(rgba, 0xff0000);
-        *a = extract(rgba, 0xff000000);
+        *r = extract(rgba,  0, splat(0xff));
+        *g = extract(rgba,  8, splat(0xff));
+        *b = extract(rgba, 16, splat(0xff));
+        *a = extract(rgba, 24, splat(0xff));
     };
 
     skvm::I32 r,g,b,a;
@@ -128,15 +128,14 @@
     auto load = [&](skvm::Arg ptr,
                     skvm::I32* rb, skvm::I32* ga) {
         skvm::I32 rgba = load32(ptr);
-        *rb = extract(rgba, 0x00ff00ff);
-        *ga = extract(rgba, 0xff00ff00);
+        *rb = extract(rgba, 0, splat(0x00ff00ff));
+        *ga = extract(rgba, 8, splat(0x00ff00ff));
     };
 
     auto mul_unorm8_SWAR = [&](skvm::I32 x, skvm::I32 y) {
         // As above, assuming x is two SWAR bytes in lanes 0 and 2, and y is a byte.
         skvm::I32 _255 = splat(0x00ff00ff);
-        return extract(add(mul(x, y), _255),
-                       0xff00ff00);
+        return extract(add(mul(x, y), _255), 8, _255);
     };
 
     skvm::I32 rb, ga;