jumper, rework callback a bit, use it for color_lookup_table

Looks like the color-space images have this path well tested (even without
lab_to_xyz), and the diffs look like rounding/FMA differences.

The old plan to keep loads and stores outside the callback was:
  1) awkward, with too many pointers and pointers-to-pointers to track
  2) misguided... the load and store stages march ahead by x,
     working at ptr+0, ptr+8, ptr+16, etc., while the callback
     always wants to be working at the same spot in the buffer.

I spent a frustrating day in lldb to understand 2).  :/

So now the stage always store4's its pixels to a buffer in the context
before the callback, and when the callback returns, it load4's them back
from a pointer in the context, defaulting to that same buffer.
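
In scalar terms the stage now works roughly like this (a sketch for
illustration only, assuming the SkJumper_CallbackCtx layout added below;
callback_stage and kMaxStride are stand-in names, not the generated code):

  static constexpr int kMaxStride = 8;            // stands in for SkJumper_kMaxStride

  struct CallbackCtx {
      void (*fn)(CallbackCtx* self, int active_pixels);   // active_pixels <= kMaxStride
      float  rgba[4*kMaxStride];     // the stage store4's interleaved r,g,b,a here
      float* read_from = rgba;       // ...and load4's pixels back from here
  };

  static void callback_stage(CallbackCtx* ctx, int active_pixels,
                             float* r, float* g, float* b, float* a) {
      for (int i = 0; i < active_pixels; i++) {    // store4: interleave into ctx->rgba
          ctx->rgba[4*i+0] = r[i];
          ctx->rgba[4*i+1] = g[i];
          ctx->rgba[4*i+2] = b[i];
          ctx->rgba[4*i+3] = a[i];
      }
      ctx->fn(ctx, active_pixels);                 // callback works in place on the buffer
      const float* src = ctx->read_from;           // load4, defaulting to that same buffer
      for (int i = 0; i < active_pixels; i++) {
          r[i] = src[4*i+0];
          g[i] = src[4*i+1];
          b[i] = src[4*i+2];
          a[i] = src[4*i+3];
      }
  }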

Instead of passing a void* into the callback, we pass the context
itself.  This lets us subclass the context and add our own data...
C-compatible object-oriented programming.
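
Concretely, a stage built on this looks something like the sketch below
(a loose paraphrase of the CLUT hunk in the diff, with simplified names;
assumes Skia's SkRasterPipeline, SkArenaAlloc, and src/jumper/SkJumper.h):

  struct MyCtx : SkJumper_CallbackCtx {
      // Extra per-stage data rides along after the base fields (fn, rgba, read_from).
      float results[4*SkJumper_kMaxStride];
  };

  static void append_my_stage(SkRasterPipeline* p, SkArenaAlloc* alloc) {
      auto ctx = alloc->make<MyCtx>();
      ctx->read_from = ctx->results;   // pipeline load4's from results once fn returns
      ctx->fn = [](SkJumper_CallbackCtx* base, int active_pixels) {
          auto self = (MyCtx*)base;    // fn is always handed the ctx we appended
          for (int i = 0; i < active_pixels; i++) {
              // read self->rgba + 4*i, write self->results + 4*i
          }
      };
      p->append(SkRasterPipeline::callback, ctx);
  }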

Change-Id: I7a03439b3abd2efb000a6973631a9336452e9a43
Reviewed-on: https://skia-review.googlesource.com/13985
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/core/SkColorSpaceXform_A2B.cpp b/src/core/SkColorSpaceXform_A2B.cpp
index a97d60b..19115d8 100644
--- a/src/core/SkColorSpaceXform_A2B.cpp
+++ b/src/core/SkColorSpaceXform_A2B.cpp
@@ -16,6 +16,7 @@
 #include "SkNx.h"
 #include "SkSRGB.h"
 #include "SkTypes.h"
+#include "../jumper/SkJumper.h"
 
 bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorFormat srcFormat,
                                     const void* src, int count, SkAlphaType alphaType) const {
@@ -183,8 +184,27 @@
             case SkColorSpace_A2B::Element::Type::kCLUT: {
                 SkCSXformPrintf("CLUT (%d -> %d) stage added\n", e.colorLUT().inputChannels(),
                                                                  e.colorLUT().outputChannels());
-                auto clut = this->copy(sk_ref_sp(&e.colorLUT()));
-                fElementsPipeline.append(SkRasterPipeline::color_lookup_table, clut->get());
+                struct CallbackCtx : SkJumper_CallbackCtx {
+                    sk_sp<const SkColorLookUpTable> clut;
+                    // clut->interp() can't always safely alias its arguments,
+                    // so we allocate a second buffer to hold our results.
+                    float results[4*SkJumper_kMaxStride];
+                };
+                auto cb = fAlloc.make<CallbackCtx>();
+                cb->clut      = sk_ref_sp(&e.colorLUT());
+                cb->read_from = cb->results;
+                cb->fn        = [](SkJumper_CallbackCtx* ctx, int active_pixels) {
+                    auto c = (CallbackCtx*)ctx;
+                    for (int i = 0; i < active_pixels; i++) {
+                        // Look up red, green, and blue for this pixel using 3-4 values from rgba.
+                        c->clut->interp(c->results+4*i, c->rgba+4*i);
+
+                        // If we used 3 inputs (rgb) preserve the fourth as alpha.
+                        // If we used 4 inputs (cmyk) force alpha to 1.
+                        c->results[4*i+3] = (3 == c->clut->inputChannels()) ? c->rgba[4*i+3] : 1.0f;
+                    }
+                };
+                fElementsPipeline.append(SkRasterPipeline::callback, cb);
                 break;
             }
             case SkColorSpace_A2B::Element::Type::kMatrix:
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 29c560d..66b4c3a 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -87,7 +87,7 @@
     M(parametric_r) M(parametric_g) M(parametric_b)              \
     M(parametric_a)                                              \
     M(table_r) M(table_g) M(table_b) M(table_a)                  \
-    M(color_lookup_table) M(lab_to_xyz)                          \
+    M(lab_to_xyz)                                                \
     M(clamp_x) M(mirror_x) M(repeat_x)                           \
     M(clamp_y) M(mirror_y) M(repeat_y)                           \
     M(gather_a8) M(gather_g8) M(gather_i8)                       \
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index d4ab968..1dc0fc4 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -82,8 +82,12 @@
 };
 
 struct SkJumper_CallbackCtx {
-    MAYBE_MSABI void (*fn)(void* arg, int active_pixels/*<= SkJumper_kMaxStride*/);
-    void* arg;
+    MAYBE_MSABI void (*fn)(SkJumper_CallbackCtx* self, int active_pixels/*<= SkJumper_kMaxStride*/);
+
+    // When called, fn() will have our active pixels available in rgba.
+    // When fn() returns, the pipeline will read back those active pixels from read_from.
+    float rgba[4*SkJumper_kMaxStride];
+    float* read_from = rgba;
 };
 
 struct SkJumper_LoadTablesCtx {
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index dad3895..0b83ad9 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -3587,32 +3587,34 @@
 .globl _sk_callback_aarch64
 FUNCTION(_sk_callback_aarch64)
 _sk_callback_aarch64:
-  .long  0xd10283ff                          // sub           sp, sp, #0xa0
-  .long  0xa90853f5                          // stp           x21, x20, [sp, #128]
-  .long  0xa9097bf3                          // stp           x19, x30, [sp, #144]
-  .long  0xad031fe6                          // stp           q6, q7, [sp, #96]
-  .long  0xad0217e4                          // stp           q4, q5, [sp, #64]
-  .long  0xad010fe2                          // stp           q2, q3, [sp, #32]
-  .long  0xad0007e0                          // stp           q0, q1, [sp]
+  .long  0xd101c3ff                          // sub           sp, sp, #0x70
+  .long  0xf90023f6                          // str           x22, [sp, #64]
+  .long  0xa90553f5                          // stp           x21, x20, [sp, #80]
+  .long  0xa9067bf3                          // stp           x19, x30, [sp, #96]
+  .long  0xad011fe6                          // stp           q6, q7, [sp, #32]
+  .long  0xad0017e4                          // stp           q4, q5, [sp]
   .long  0xaa0103f4                          // mov           x20, x1
-  .long  0xf9400288                          // ldr           x8, [x20]
-  .long  0xaa0003f5                          // mov           x21, x0
+  .long  0xf9400295                          // ldr           x21, [x20]
+  .long  0xaa0003f6                          // mov           x22, x0
   .long  0x321e03e1                          // orr           w1, wzr, #0x4
   .long  0xaa0203f3                          // mov           x19, x2
-  .long  0xa9402109                          // ldp           x9, x8, [x8]
-  .long  0xaa0803e0                          // mov           x0, x8
-  .long  0xd63f0120                          // blr           x9
+  .long  0x910022a8                          // add           x8, x21, #0x8
+  .long  0x4c000900                          // st4           {v0.4s-v3.4s}, [x8]
+  .long  0xf94002a8                          // ldr           x8, [x21]
+  .long  0xaa1503e0                          // mov           x0, x21
+  .long  0xd63f0100                          // blr           x8
+  .long  0xf94046a8                          // ldr           x8, [x21, #136]
   .long  0xf9400683                          // ldr           x3, [x20, #8]
   .long  0x91004281                          // add           x1, x20, #0x10
-  .long  0xaa1503e0                          // mov           x0, x21
+  .long  0xaa1603e0                          // mov           x0, x22
+  .long  0x4c400900                          // ld4           {v0.4s-v3.4s}, [x8]
   .long  0xaa1303e2                          // mov           x2, x19
-  .long  0xad4007e0                          // ldp           q0, q1, [sp]
-  .long  0xad410fe2                          // ldp           q2, q3, [sp, #32]
-  .long  0xad4217e4                          // ldp           q4, q5, [sp, #64]
-  .long  0xad431fe6                          // ldp           q6, q7, [sp, #96]
-  .long  0xa9497bf3                          // ldp           x19, x30, [sp, #144]
-  .long  0xa94853f5                          // ldp           x21, x20, [sp, #128]
-  .long  0x910283ff                          // add           sp, sp, #0xa0
+  .long  0xad4017e4                          // ldp           q4, q5, [sp]
+  .long  0xad411fe6                          // ldp           q6, q7, [sp, #32]
+  .long  0xa9467bf3                          // ldp           x19, x30, [sp, #96]
+  .long  0xa94553f5                          // ldp           x21, x20, [sp, #80]
+  .long  0xf94023f6                          // ldr           x22, [sp, #64]
+  .long  0x9101c3ff                          // add           sp, sp, #0x70
   .long  0xd61f0060                          // br            x3
 #elif defined(__arm__)
 .balign 4
@@ -7506,38 +7508,34 @@
 .globl _sk_callback_vfp4
 FUNCTION(_sk_callback_vfp4)
 _sk_callback_vfp4:
-  .long  0xe92d4070                          // push          {r4, r5, r6, lr}
-  .long  0xed2d8b10                          // vpush         {d8-d15}
+  .long  0xe92d48f0                          // push          {r4, r5, r6, r7, fp, lr}
+  .long  0xed2d8b08                          // vpush         {d8-d11}
   .long  0xe1a05001                          // mov           r5, r1
   .long  0xe1a06000                          // mov           r6, r0
-  .long  0xe5950000                          // ldr           r0, [r5]
+  .long  0xe5957000                          // ldr           r7, [r5]
   .long  0xe1a04002                          // mov           r4, r2
   .long  0xe3a01002                          // mov           r1, #2
   .long  0xeeb08b47                          // vmov.f64      d8, d7
-  .long  0xe5902000                          // ldr           r2, [r0]
-  .long  0xe5900004                          // ldr           r0, [r0, #4]
+  .long  0xe2870004                          // add           r0, r7, #4
+  .long  0xf400008f                          // vst4.32       {d0-d3}, [r0]
+  .long  0xe1a00007                          // mov           r0, r7
+  .long  0xe5972000                          // ldr           r2, [r7]
   .long  0xeeb09b46                          // vmov.f64      d9, d6
   .long  0xeeb0ab45                          // vmov.f64      d10, d5
   .long  0xeeb0bb44                          // vmov.f64      d11, d4
-  .long  0xeeb0cb43                          // vmov.f64      d12, d3
-  .long  0xeeb0db42                          // vmov.f64      d13, d2
-  .long  0xeeb0eb41                          // vmov.f64      d14, d1
-  .long  0xeeb0fb40                          // vmov.f64      d15, d0
   .long  0xe12fff32                          // blx           r2
+  .long  0xe5970084                          // ldr           r0, [r7, #132]
   .long  0xe2851008                          // add           r1, r5, #8
   .long  0xe5953004                          // ldr           r3, [r5, #4]
-  .long  0xe1a00006                          // mov           r0, r6
   .long  0xe1a02004                          // mov           r2, r4
-  .long  0xeeb00b4f                          // vmov.f64      d0, d15
-  .long  0xeeb01b4e                          // vmov.f64      d1, d14
-  .long  0xeeb02b4d                          // vmov.f64      d2, d13
-  .long  0xeeb03b4c                          // vmov.f64      d3, d12
   .long  0xeeb04b4b                          // vmov.f64      d4, d11
+  .long  0xf420008f                          // vld4.32       {d0-d3}, [r0]
+  .long  0xe1a00006                          // mov           r0, r6
   .long  0xeeb05b4a                          // vmov.f64      d5, d10
   .long  0xeeb06b49                          // vmov.f64      d6, d9
   .long  0xeeb07b48                          // vmov.f64      d7, d8
-  .long  0xecbd8b10                          // vpop          {d8-d15}
-  .long  0xe8bd4070                          // pop           {r4, r5, r6, lr}
+  .long  0xecbd8b08                          // vpop          {d8-d11}
+  .long  0xe8bd48f0                          // pop           {r4, r5, r6, r7, fp, lr}
   .long  0xe12fff13                          // bx            r3
 #elif defined(__x86_64__)
 
@@ -11932,44 +11930,72 @@
 _sk_callback_hsw:
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
+  .byte  65,85                               // push          %r13
   .byte  65,84                               // push          %r12
   .byte  83                                  // push          %rbx
-  .byte  72,129,236,24,1,0,0                 // sub           $0x118,%rsp
-  .byte  197,252,17,188,36,224,0,0,0         // vmovups       %ymm7,0xe0(%rsp)
-  .byte  197,252,17,180,36,192,0,0,0         // vmovups       %ymm6,0xc0(%rsp)
-  .byte  197,252,17,172,36,160,0,0,0         // vmovups       %ymm5,0xa0(%rsp)
-  .byte  197,252,17,164,36,128,0,0,0         // vmovups       %ymm4,0x80(%rsp)
-  .byte  197,252,17,92,36,96                 // vmovups       %ymm3,0x60(%rsp)
-  .byte  197,252,17,84,36,64                 // vmovups       %ymm2,0x40(%rsp)
-  .byte  197,252,17,76,36,32                 // vmovups       %ymm1,0x20(%rsp)
-  .byte  197,252,17,4,36                     // vmovups       %ymm0,(%rsp)
-  .byte  72,137,203                          // mov           %rcx,%rbx
+  .byte  72,129,236,144,0,0,0                // sub           $0x90,%rsp
+  .byte  197,252,17,124,36,96                // vmovups       %ymm7,0x60(%rsp)
+  .byte  197,252,17,116,36,64                // vmovups       %ymm6,0x40(%rsp)
+  .byte  197,252,17,108,36,32                // vmovups       %ymm5,0x20(%rsp)
+  .byte  197,252,17,36,36                    // vmovups       %ymm4,(%rsp)
+  .byte  73,137,205                          // mov           %rcx,%r13
   .byte  73,137,214                          // mov           %rdx,%r14
   .byte  73,137,255                          // mov           %rdi,%r15
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,137,195                          // mov           %rax,%rbx
   .byte  73,137,244                          // mov           %rsi,%r12
-  .byte  72,139,120,8                        // mov           0x8(%rax),%rdi
-  .byte  72,133,219                          // test          %rbx,%rbx
+  .byte  197,252,20,225                      // vunpcklps     %ymm1,%ymm0,%ymm4
+  .byte  197,252,21,193                      // vunpckhps     %ymm1,%ymm0,%ymm0
+  .byte  197,236,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm1
+  .byte  197,236,21,211                      // vunpckhps     %ymm3,%ymm2,%ymm2
+  .byte  197,221,20,217                      // vunpcklpd     %ymm1,%ymm4,%ymm3
+  .byte  197,221,21,201                      // vunpckhpd     %ymm1,%ymm4,%ymm1
+  .byte  197,253,20,226                      // vunpcklpd     %ymm2,%ymm0,%ymm4
+  .byte  197,253,21,194                      // vunpckhpd     %ymm2,%ymm0,%ymm0
+  .byte  196,227,101,24,209,1                // vinsertf128   $0x1,%xmm1,%ymm3,%ymm2
+  .byte  196,227,93,24,232,1                 // vinsertf128   $0x1,%xmm0,%ymm4,%ymm5
+  .byte  196,227,101,6,201,49                // vperm2f128    $0x31,%ymm1,%ymm3,%ymm1
+  .byte  196,227,93,6,192,49                 // vperm2f128    $0x31,%ymm0,%ymm4,%ymm0
+  .byte  197,253,17,83,8                     // vmovupd       %ymm2,0x8(%rbx)
+  .byte  197,253,17,107,40                   // vmovupd       %ymm5,0x28(%rbx)
+  .byte  197,253,17,75,72                    // vmovupd       %ymm1,0x48(%rbx)
+  .byte  197,253,17,67,104                   // vmovupd       %ymm0,0x68(%rbx)
+  .byte  77,133,237                          // test          %r13,%r13
   .byte  190,8,0,0,0                         // mov           $0x8,%esi
-  .byte  15,69,243                           // cmovne        %ebx,%esi
+  .byte  65,15,69,245                        // cmovne        %r13d,%esi
+  .byte  72,137,223                          // mov           %rbx,%rdi
   .byte  197,248,119                         // vzeroupper
-  .byte  255,16                              // callq         *(%rax)
+  .byte  255,19                              // callq         *(%rbx)
+  .byte  72,139,131,136,0,0,0                // mov           0x88(%rbx),%rax
+  .byte  197,248,16,0                        // vmovups       (%rax),%xmm0
+  .byte  197,248,16,72,16                    // vmovups       0x10(%rax),%xmm1
+  .byte  197,248,16,80,32                    // vmovups       0x20(%rax),%xmm2
+  .byte  197,248,16,88,48                    // vmovups       0x30(%rax),%xmm3
+  .byte  196,227,101,24,88,112,1             // vinsertf128   $0x1,0x70(%rax),%ymm3,%ymm3
+  .byte  196,227,109,24,80,96,1              // vinsertf128   $0x1,0x60(%rax),%ymm2,%ymm2
+  .byte  196,227,117,24,72,80,1              // vinsertf128   $0x1,0x50(%rax),%ymm1,%ymm1
+  .byte  196,227,125,24,64,64,1              // vinsertf128   $0x1,0x40(%rax),%ymm0,%ymm0
+  .byte  197,252,20,225                      // vunpcklps     %ymm1,%ymm0,%ymm4
+  .byte  197,252,21,233                      // vunpckhps     %ymm1,%ymm0,%ymm5
+  .byte  197,236,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm1
+  .byte  197,236,21,219                      // vunpckhps     %ymm3,%ymm2,%ymm3
+  .byte  197,221,20,193                      // vunpcklpd     %ymm1,%ymm4,%ymm0
+  .byte  197,221,21,201                      // vunpckhpd     %ymm1,%ymm4,%ymm1
+  .byte  197,213,20,211                      // vunpcklpd     %ymm3,%ymm5,%ymm2
+  .byte  197,213,21,219                      // vunpckhpd     %ymm3,%ymm5,%ymm3
   .byte  76,137,230                          // mov           %r12,%rsi
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,137,255                          // mov           %r15,%rdi
   .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  72,137,217                          // mov           %rbx,%rcx
-  .byte  197,252,16,4,36                     // vmovups       (%rsp),%ymm0
-  .byte  197,252,16,76,36,32                 // vmovups       0x20(%rsp),%ymm1
-  .byte  197,252,16,84,36,64                 // vmovups       0x40(%rsp),%ymm2
-  .byte  197,252,16,92,36,96                 // vmovups       0x60(%rsp),%ymm3
-  .byte  197,252,16,164,36,128,0,0,0         // vmovups       0x80(%rsp),%ymm4
-  .byte  197,252,16,172,36,160,0,0,0         // vmovups       0xa0(%rsp),%ymm5
-  .byte  197,252,16,180,36,192,0,0,0         // vmovups       0xc0(%rsp),%ymm6
-  .byte  197,252,16,188,36,224,0,0,0         // vmovups       0xe0(%rsp),%ymm7
-  .byte  72,129,196,24,1,0,0                 // add           $0x118,%rsp
+  .byte  76,137,233                          // mov           %r13,%rcx
+  .byte  197,252,16,36,36                    // vmovups       (%rsp),%ymm4
+  .byte  197,252,16,108,36,32                // vmovups       0x20(%rsp),%ymm5
+  .byte  197,252,16,116,36,64                // vmovups       0x40(%rsp),%ymm6
+  .byte  197,252,16,124,36,96                // vmovups       0x60(%rsp),%ymm7
+  .byte  72,129,196,144,0,0,0                // add           $0x90,%rsp
   .byte  91                                  // pop           %rbx
   .byte  65,92                               // pop           %r12
+  .byte  65,93                               // pop           %r13
   .byte  65,94                               // pop           %r14
   .byte  65,95                               // pop           %r15
   .byte  255,224                             // jmpq          *%rax
@@ -17777,44 +17803,72 @@
 _sk_callback_avx:
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
+  .byte  65,85                               // push          %r13
   .byte  65,84                               // push          %r12
   .byte  83                                  // push          %rbx
-  .byte  72,129,236,24,1,0,0                 // sub           $0x118,%rsp
-  .byte  197,252,17,188,36,224,0,0,0         // vmovups       %ymm7,0xe0(%rsp)
-  .byte  197,252,17,180,36,192,0,0,0         // vmovups       %ymm6,0xc0(%rsp)
-  .byte  197,252,17,172,36,160,0,0,0         // vmovups       %ymm5,0xa0(%rsp)
-  .byte  197,252,17,164,36,128,0,0,0         // vmovups       %ymm4,0x80(%rsp)
-  .byte  197,252,17,92,36,96                 // vmovups       %ymm3,0x60(%rsp)
-  .byte  197,252,17,84,36,64                 // vmovups       %ymm2,0x40(%rsp)
-  .byte  197,252,17,76,36,32                 // vmovups       %ymm1,0x20(%rsp)
-  .byte  197,252,17,4,36                     // vmovups       %ymm0,(%rsp)
-  .byte  72,137,203                          // mov           %rcx,%rbx
+  .byte  72,129,236,144,0,0,0                // sub           $0x90,%rsp
+  .byte  197,252,17,124,36,96                // vmovups       %ymm7,0x60(%rsp)
+  .byte  197,252,17,116,36,64                // vmovups       %ymm6,0x40(%rsp)
+  .byte  197,252,17,108,36,32                // vmovups       %ymm5,0x20(%rsp)
+  .byte  197,252,17,36,36                    // vmovups       %ymm4,(%rsp)
+  .byte  73,137,205                          // mov           %rcx,%r13
   .byte  73,137,214                          // mov           %rdx,%r14
   .byte  73,137,255                          // mov           %rdi,%r15
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,137,195                          // mov           %rax,%rbx
   .byte  73,137,244                          // mov           %rsi,%r12
-  .byte  72,139,120,8                        // mov           0x8(%rax),%rdi
-  .byte  72,133,219                          // test          %rbx,%rbx
+  .byte  197,252,20,225                      // vunpcklps     %ymm1,%ymm0,%ymm4
+  .byte  197,252,21,193                      // vunpckhps     %ymm1,%ymm0,%ymm0
+  .byte  197,236,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm1
+  .byte  197,236,21,211                      // vunpckhps     %ymm3,%ymm2,%ymm2
+  .byte  197,221,20,217                      // vunpcklpd     %ymm1,%ymm4,%ymm3
+  .byte  197,221,21,201                      // vunpckhpd     %ymm1,%ymm4,%ymm1
+  .byte  197,253,20,226                      // vunpcklpd     %ymm2,%ymm0,%ymm4
+  .byte  197,253,21,194                      // vunpckhpd     %ymm2,%ymm0,%ymm0
+  .byte  196,227,101,24,209,1                // vinsertf128   $0x1,%xmm1,%ymm3,%ymm2
+  .byte  196,227,93,24,232,1                 // vinsertf128   $0x1,%xmm0,%ymm4,%ymm5
+  .byte  196,227,101,6,201,49                // vperm2f128    $0x31,%ymm1,%ymm3,%ymm1
+  .byte  196,227,93,6,192,49                 // vperm2f128    $0x31,%ymm0,%ymm4,%ymm0
+  .byte  197,253,17,83,8                     // vmovupd       %ymm2,0x8(%rbx)
+  .byte  197,253,17,107,40                   // vmovupd       %ymm5,0x28(%rbx)
+  .byte  197,253,17,75,72                    // vmovupd       %ymm1,0x48(%rbx)
+  .byte  197,253,17,67,104                   // vmovupd       %ymm0,0x68(%rbx)
+  .byte  77,133,237                          // test          %r13,%r13
   .byte  190,8,0,0,0                         // mov           $0x8,%esi
-  .byte  15,69,243                           // cmovne        %ebx,%esi
+  .byte  65,15,69,245                        // cmovne        %r13d,%esi
+  .byte  72,137,223                          // mov           %rbx,%rdi
   .byte  197,248,119                         // vzeroupper
-  .byte  255,16                              // callq         *(%rax)
+  .byte  255,19                              // callq         *(%rbx)
+  .byte  72,139,131,136,0,0,0                // mov           0x88(%rbx),%rax
+  .byte  197,248,16,0                        // vmovups       (%rax),%xmm0
+  .byte  197,248,16,72,16                    // vmovups       0x10(%rax),%xmm1
+  .byte  197,248,16,80,32                    // vmovups       0x20(%rax),%xmm2
+  .byte  197,248,16,88,48                    // vmovups       0x30(%rax),%xmm3
+  .byte  196,227,101,24,88,112,1             // vinsertf128   $0x1,0x70(%rax),%ymm3,%ymm3
+  .byte  196,227,109,24,80,96,1              // vinsertf128   $0x1,0x60(%rax),%ymm2,%ymm2
+  .byte  196,227,117,24,72,80,1              // vinsertf128   $0x1,0x50(%rax),%ymm1,%ymm1
+  .byte  196,227,125,24,64,64,1              // vinsertf128   $0x1,0x40(%rax),%ymm0,%ymm0
+  .byte  197,252,20,225                      // vunpcklps     %ymm1,%ymm0,%ymm4
+  .byte  197,252,21,233                      // vunpckhps     %ymm1,%ymm0,%ymm5
+  .byte  197,236,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm1
+  .byte  197,236,21,219                      // vunpckhps     %ymm3,%ymm2,%ymm3
+  .byte  197,221,20,193                      // vunpcklpd     %ymm1,%ymm4,%ymm0
+  .byte  197,221,21,201                      // vunpckhpd     %ymm1,%ymm4,%ymm1
+  .byte  197,213,20,211                      // vunpcklpd     %ymm3,%ymm5,%ymm2
+  .byte  197,213,21,219                      // vunpckhpd     %ymm3,%ymm5,%ymm3
   .byte  76,137,230                          // mov           %r12,%rsi
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,137,255                          // mov           %r15,%rdi
   .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  72,137,217                          // mov           %rbx,%rcx
-  .byte  197,252,16,4,36                     // vmovups       (%rsp),%ymm0
-  .byte  197,252,16,76,36,32                 // vmovups       0x20(%rsp),%ymm1
-  .byte  197,252,16,84,36,64                 // vmovups       0x40(%rsp),%ymm2
-  .byte  197,252,16,92,36,96                 // vmovups       0x60(%rsp),%ymm3
-  .byte  197,252,16,164,36,128,0,0,0         // vmovups       0x80(%rsp),%ymm4
-  .byte  197,252,16,172,36,160,0,0,0         // vmovups       0xa0(%rsp),%ymm5
-  .byte  197,252,16,180,36,192,0,0,0         // vmovups       0xc0(%rsp),%ymm6
-  .byte  197,252,16,188,36,224,0,0,0         // vmovups       0xe0(%rsp),%ymm7
-  .byte  72,129,196,24,1,0,0                 // add           $0x118,%rsp
+  .byte  76,137,233                          // mov           %r13,%rcx
+  .byte  197,252,16,36,36                    // vmovups       (%rsp),%ymm4
+  .byte  197,252,16,108,36,32                // vmovups       0x20(%rsp),%ymm5
+  .byte  197,252,16,116,36,64                // vmovups       0x40(%rsp),%ymm6
+  .byte  197,252,16,124,36,96                // vmovups       0x60(%rsp),%ymm7
+  .byte  72,129,196,144,0,0,0                // add           $0x90,%rsp
   .byte  91                                  // pop           %rbx
   .byte  65,92                               // pop           %r12
+  .byte  65,93                               // pop           %r13
   .byte  65,94                               // pop           %r14
   .byte  65,95                               // pop           %r15
   .byte  255,224                             // jmpq          *%rax
@@ -22267,37 +22321,65 @@
 _sk_callback_sse41:
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
+  .byte  65,84                               // push          %r12
   .byte  83                                  // push          %rbx
-  .byte  72,129,236,128,0,0,0                // sub           $0x80,%rsp
-  .byte  15,41,124,36,112                    // movaps        %xmm7,0x70(%rsp)
-  .byte  15,41,116,36,96                     // movaps        %xmm6,0x60(%rsp)
-  .byte  15,41,108,36,80                     // movaps        %xmm5,0x50(%rsp)
-  .byte  15,41,100,36,64                     // movaps        %xmm4,0x40(%rsp)
-  .byte  15,41,92,36,48                      // movaps        %xmm3,0x30(%rsp)
-  .byte  15,41,84,36,32                      // movaps        %xmm2,0x20(%rsp)
-  .byte  15,41,76,36,16                      // movaps        %xmm1,0x10(%rsp)
-  .byte  15,41,4,36                          // movaps        %xmm0,(%rsp)
+  .byte  72,131,236,72                       // sub           $0x48,%rsp
+  .byte  15,41,124,36,48                     // movaps        %xmm7,0x30(%rsp)
+  .byte  15,41,116,36,32                     // movaps        %xmm6,0x20(%rsp)
+  .byte  15,41,108,36,16                     // movaps        %xmm5,0x10(%rsp)
+  .byte  15,41,36,36                         // movaps        %xmm4,(%rsp)
   .byte  73,137,214                          // mov           %rdx,%r14
   .byte  73,137,255                          // mov           %rdi,%r15
   .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,137,243                          // mov           %rsi,%rbx
-  .byte  72,139,120,8                        // mov           0x8(%rax),%rdi
+  .byte  72,137,195                          // mov           %rax,%rbx
+  .byte  73,137,244                          // mov           %rsi,%r12
+  .byte  15,40,224                           // movaps        %xmm0,%xmm4
+  .byte  15,20,225                           // unpcklps      %xmm1,%xmm4
+  .byte  15,40,234                           // movaps        %xmm2,%xmm5
+  .byte  15,20,235                           // unpcklps      %xmm3,%xmm5
+  .byte  15,21,193                           // unpckhps      %xmm1,%xmm0
+  .byte  15,21,211                           // unpckhps      %xmm3,%xmm2
+  .byte  15,40,204                           // movaps        %xmm4,%xmm1
+  .byte  102,15,20,205                       // unpcklpd      %xmm5,%xmm1
+  .byte  15,18,236                           // movhlps       %xmm4,%xmm5
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  102,15,20,218                       // unpcklpd      %xmm2,%xmm3
+  .byte  15,18,208                           // movhlps       %xmm0,%xmm2
+  .byte  102,15,17,75,8                      // movupd        %xmm1,0x8(%rbx)
+  .byte  15,17,107,24                        // movups        %xmm5,0x18(%rbx)
+  .byte  102,15,17,91,40                     // movupd        %xmm3,0x28(%rbx)
+  .byte  15,17,83,56                         // movups        %xmm2,0x38(%rbx)
   .byte  190,4,0,0,0                         // mov           $0x4,%esi
-  .byte  255,16                              // callq         *(%rax)
-  .byte  72,137,222                          // mov           %rbx,%rsi
+  .byte  72,137,223                          // mov           %rbx,%rdi
+  .byte  255,19                              // callq         *(%rbx)
+  .byte  72,139,131,136,0,0,0                // mov           0x88(%rbx),%rax
+  .byte  15,16,32                            // movups        (%rax),%xmm4
+  .byte  15,16,64,16                         // movups        0x10(%rax),%xmm0
+  .byte  15,16,88,32                         // movups        0x20(%rax),%xmm3
+  .byte  15,16,80,48                         // movups        0x30(%rax),%xmm2
+  .byte  15,40,236                           // movaps        %xmm4,%xmm5
+  .byte  15,20,232                           // unpcklps      %xmm0,%xmm5
+  .byte  15,40,203                           // movaps        %xmm3,%xmm1
+  .byte  15,20,202                           // unpcklps      %xmm2,%xmm1
+  .byte  15,21,224                           // unpckhps      %xmm0,%xmm4
+  .byte  15,21,218                           // unpckhps      %xmm2,%xmm3
+  .byte  15,40,197                           // movaps        %xmm5,%xmm0
+  .byte  102,15,20,193                       // unpcklpd      %xmm1,%xmm0
+  .byte  15,18,205                           // movhlps       %xmm5,%xmm1
+  .byte  15,40,212                           // movaps        %xmm4,%xmm2
+  .byte  102,15,20,211                       // unpcklpd      %xmm3,%xmm2
+  .byte  15,18,220                           // movhlps       %xmm4,%xmm3
+  .byte  76,137,230                          // mov           %r12,%rsi
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,137,255                          // mov           %r15,%rdi
   .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  15,40,4,36                          // movaps        (%rsp),%xmm0
-  .byte  15,40,76,36,16                      // movaps        0x10(%rsp),%xmm1
-  .byte  15,40,84,36,32                      // movaps        0x20(%rsp),%xmm2
-  .byte  15,40,92,36,48                      // movaps        0x30(%rsp),%xmm3
-  .byte  15,40,100,36,64                     // movaps        0x40(%rsp),%xmm4
-  .byte  15,40,108,36,80                     // movaps        0x50(%rsp),%xmm5
-  .byte  15,40,116,36,96                     // movaps        0x60(%rsp),%xmm6
-  .byte  15,40,124,36,112                    // movaps        0x70(%rsp),%xmm7
-  .byte  72,129,196,128,0,0,0                // add           $0x80,%rsp
+  .byte  15,40,36,36                         // movaps        (%rsp),%xmm4
+  .byte  15,40,108,36,16                     // movaps        0x10(%rsp),%xmm5
+  .byte  15,40,116,36,32                     // movaps        0x20(%rsp),%xmm6
+  .byte  15,40,124,36,48                     // movaps        0x30(%rsp),%xmm7
+  .byte  72,131,196,72                       // add           $0x48,%rsp
   .byte  91                                  // pop           %rbx
+  .byte  65,92                               // pop           %r12
   .byte  65,94                               // pop           %r14
   .byte  65,95                               // pop           %r15
   .byte  255,224                             // jmpq          *%rax
@@ -27052,37 +27134,65 @@
 _sk_callback_sse2:
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
+  .byte  65,84                               // push          %r12
   .byte  83                                  // push          %rbx
-  .byte  72,129,236,128,0,0,0                // sub           $0x80,%rsp
-  .byte  15,41,124,36,112                    // movaps        %xmm7,0x70(%rsp)
-  .byte  15,41,116,36,96                     // movaps        %xmm6,0x60(%rsp)
-  .byte  15,41,108,36,80                     // movaps        %xmm5,0x50(%rsp)
-  .byte  15,41,100,36,64                     // movaps        %xmm4,0x40(%rsp)
-  .byte  15,41,92,36,48                      // movaps        %xmm3,0x30(%rsp)
-  .byte  15,41,84,36,32                      // movaps        %xmm2,0x20(%rsp)
-  .byte  15,41,76,36,16                      // movaps        %xmm1,0x10(%rsp)
-  .byte  15,41,4,36                          // movaps        %xmm0,(%rsp)
+  .byte  72,131,236,72                       // sub           $0x48,%rsp
+  .byte  15,41,124,36,48                     // movaps        %xmm7,0x30(%rsp)
+  .byte  15,41,116,36,32                     // movaps        %xmm6,0x20(%rsp)
+  .byte  15,41,108,36,16                     // movaps        %xmm5,0x10(%rsp)
+  .byte  15,41,36,36                         // movaps        %xmm4,(%rsp)
   .byte  73,137,214                          // mov           %rdx,%r14
   .byte  73,137,255                          // mov           %rdi,%r15
   .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,137,243                          // mov           %rsi,%rbx
-  .byte  72,139,120,8                        // mov           0x8(%rax),%rdi
+  .byte  72,137,195                          // mov           %rax,%rbx
+  .byte  73,137,244                          // mov           %rsi,%r12
+  .byte  15,40,224                           // movaps        %xmm0,%xmm4
+  .byte  15,20,225                           // unpcklps      %xmm1,%xmm4
+  .byte  15,40,234                           // movaps        %xmm2,%xmm5
+  .byte  15,20,235                           // unpcklps      %xmm3,%xmm5
+  .byte  15,21,193                           // unpckhps      %xmm1,%xmm0
+  .byte  15,21,211                           // unpckhps      %xmm3,%xmm2
+  .byte  15,40,204                           // movaps        %xmm4,%xmm1
+  .byte  102,15,20,205                       // unpcklpd      %xmm5,%xmm1
+  .byte  15,18,236                           // movhlps       %xmm4,%xmm5
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  102,15,20,218                       // unpcklpd      %xmm2,%xmm3
+  .byte  15,18,208                           // movhlps       %xmm0,%xmm2
+  .byte  102,15,17,75,8                      // movupd        %xmm1,0x8(%rbx)
+  .byte  15,17,107,24                        // movups        %xmm5,0x18(%rbx)
+  .byte  102,15,17,91,40                     // movupd        %xmm3,0x28(%rbx)
+  .byte  15,17,83,56                         // movups        %xmm2,0x38(%rbx)
   .byte  190,4,0,0,0                         // mov           $0x4,%esi
-  .byte  255,16                              // callq         *(%rax)
-  .byte  72,137,222                          // mov           %rbx,%rsi
+  .byte  72,137,223                          // mov           %rbx,%rdi
+  .byte  255,19                              // callq         *(%rbx)
+  .byte  72,139,131,136,0,0,0                // mov           0x88(%rbx),%rax
+  .byte  15,16,32                            // movups        (%rax),%xmm4
+  .byte  15,16,64,16                         // movups        0x10(%rax),%xmm0
+  .byte  15,16,88,32                         // movups        0x20(%rax),%xmm3
+  .byte  15,16,80,48                         // movups        0x30(%rax),%xmm2
+  .byte  15,40,236                           // movaps        %xmm4,%xmm5
+  .byte  15,20,232                           // unpcklps      %xmm0,%xmm5
+  .byte  15,40,203                           // movaps        %xmm3,%xmm1
+  .byte  15,20,202                           // unpcklps      %xmm2,%xmm1
+  .byte  15,21,224                           // unpckhps      %xmm0,%xmm4
+  .byte  15,21,218                           // unpckhps      %xmm2,%xmm3
+  .byte  15,40,197                           // movaps        %xmm5,%xmm0
+  .byte  102,15,20,193                       // unpcklpd      %xmm1,%xmm0
+  .byte  15,18,205                           // movhlps       %xmm5,%xmm1
+  .byte  15,40,212                           // movaps        %xmm4,%xmm2
+  .byte  102,15,20,211                       // unpcklpd      %xmm3,%xmm2
+  .byte  15,18,220                           // movhlps       %xmm4,%xmm3
+  .byte  76,137,230                          // mov           %r12,%rsi
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,137,255                          // mov           %r15,%rdi
   .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  15,40,4,36                          // movaps        (%rsp),%xmm0
-  .byte  15,40,76,36,16                      // movaps        0x10(%rsp),%xmm1
-  .byte  15,40,84,36,32                      // movaps        0x20(%rsp),%xmm2
-  .byte  15,40,92,36,48                      // movaps        0x30(%rsp),%xmm3
-  .byte  15,40,100,36,64                     // movaps        0x40(%rsp),%xmm4
-  .byte  15,40,108,36,80                     // movaps        0x50(%rsp),%xmm5
-  .byte  15,40,116,36,96                     // movaps        0x60(%rsp),%xmm6
-  .byte  15,40,124,36,112                    // movaps        0x70(%rsp),%xmm7
-  .byte  72,129,196,128,0,0,0                // add           $0x80,%rsp
+  .byte  15,40,36,36                         // movaps        (%rsp),%xmm4
+  .byte  15,40,108,36,16                     // movaps        0x10(%rsp),%xmm5
+  .byte  15,40,116,36,32                     // movaps        0x20(%rsp),%xmm6
+  .byte  15,40,124,36,48                     // movaps        0x30(%rsp),%xmm7
+  .byte  72,131,196,72                       // add           $0x48,%rsp
   .byte  91                                  // pop           %rbx
+  .byte  65,92                               // pop           %r12
   .byte  65,94                               // pop           %r14
   .byte  65,95                               // pop           %r15
   .byte  255,224                             // jmpq          *%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 5d3c4ef..5151816 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -4203,40 +4203,68 @@
 
 PUBLIC _sk_callback_hsw
 _sk_callback_hsw LABEL PROC
+  DB  65,87                               ; push          %r15
   DB  65,86                               ; push          %r14
   DB  83                                  ; push          %rbx
-  DB  72,129,236,40,1,0,0                 ; sub           $0x128,%rsp
-  DB  197,252,17,188,36,0,1,0,0           ; vmovups       %ymm7,0x100(%rsp)
-  DB  197,252,17,180,36,224,0,0,0         ; vmovups       %ymm6,0xe0(%rsp)
-  DB  197,252,17,172,36,192,0,0,0         ; vmovups       %ymm5,0xc0(%rsp)
-  DB  197,252,17,164,36,160,0,0,0         ; vmovups       %ymm4,0xa0(%rsp)
-  DB  197,252,17,156,36,128,0,0,0         ; vmovups       %ymm3,0x80(%rsp)
-  DB  197,252,17,84,36,96                 ; vmovups       %ymm2,0x60(%rsp)
-  DB  197,252,17,76,36,64                 ; vmovups       %ymm1,0x40(%rsp)
-  DB  197,252,17,68,36,32                 ; vmovups       %ymm0,0x20(%rsp)
-  DB  72,137,203                          ; mov           %rcx,%rbx
+  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
+  DB  197,252,17,188,36,128,0,0,0         ; vmovups       %ymm7,0x80(%rsp)
+  DB  197,252,17,116,36,96                ; vmovups       %ymm6,0x60(%rsp)
+  DB  197,252,17,108,36,64                ; vmovups       %ymm5,0x40(%rsp)
+  DB  197,252,17,100,36,32                ; vmovups       %ymm4,0x20(%rsp)
+  DB  73,137,207                          ; mov           %rcx,%r15
   DB  73,137,214                          ; mov           %rdx,%r14
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,72,8                         ; mov           0x8(%rax),%rcx
-  DB  72,133,219                          ; test          %rbx,%rbx
+  DB  72,137,195                          ; mov           %rax,%rbx
+  DB  197,252,20,225                      ; vunpcklps     %ymm1,%ymm0,%ymm4
+  DB  197,252,21,193                      ; vunpckhps     %ymm1,%ymm0,%ymm0
+  DB  197,236,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm1
+  DB  197,236,21,211                      ; vunpckhps     %ymm3,%ymm2,%ymm2
+  DB  197,221,20,217                      ; vunpcklpd     %ymm1,%ymm4,%ymm3
+  DB  197,221,21,201                      ; vunpckhpd     %ymm1,%ymm4,%ymm1
+  DB  197,253,20,226                      ; vunpcklpd     %ymm2,%ymm0,%ymm4
+  DB  197,253,21,194                      ; vunpckhpd     %ymm2,%ymm0,%ymm0
+  DB  196,227,101,24,209,1                ; vinsertf128   $0x1,%xmm1,%ymm3,%ymm2
+  DB  196,227,93,24,232,1                 ; vinsertf128   $0x1,%xmm0,%ymm4,%ymm5
+  DB  196,227,101,6,201,49                ; vperm2f128    $0x31,%ymm1,%ymm3,%ymm1
+  DB  196,227,93,6,192,49                 ; vperm2f128    $0x31,%ymm0,%ymm4,%ymm0
+  DB  197,253,17,83,8                     ; vmovupd       %ymm2,0x8(%rbx)
+  DB  197,253,17,107,40                   ; vmovupd       %ymm5,0x28(%rbx)
+  DB  197,253,17,75,72                    ; vmovupd       %ymm1,0x48(%rbx)
+  DB  197,253,17,67,104                   ; vmovupd       %ymm0,0x68(%rbx)
+  DB  77,133,255                          ; test          %r15,%r15
   DB  186,8,0,0,0                         ; mov           $0x8,%edx
-  DB  15,69,211                           ; cmovne        %ebx,%edx
+  DB  65,15,69,215                        ; cmovne        %r15d,%edx
+  DB  72,137,217                          ; mov           %rbx,%rcx
   DB  197,248,119                         ; vzeroupper
-  DB  255,16                              ; callq         *(%rax)
+  DB  255,19                              ; callq         *(%rbx)
+  DB  72,139,131,136,0,0,0                ; mov           0x88(%rbx),%rax
+  DB  197,248,16,0                        ; vmovups       (%rax),%xmm0
+  DB  197,248,16,72,16                    ; vmovups       0x10(%rax),%xmm1
+  DB  197,248,16,80,32                    ; vmovups       0x20(%rax),%xmm2
+  DB  197,248,16,88,48                    ; vmovups       0x30(%rax),%xmm3
+  DB  196,227,101,24,88,112,1             ; vinsertf128   $0x1,0x70(%rax),%ymm3,%ymm3
+  DB  196,227,109,24,80,96,1              ; vinsertf128   $0x1,0x60(%rax),%ymm2,%ymm2
+  DB  196,227,117,24,72,80,1              ; vinsertf128   $0x1,0x50(%rax),%ymm1,%ymm1
+  DB  196,227,125,24,64,64,1              ; vinsertf128   $0x1,0x40(%rax),%ymm0,%ymm0
+  DB  197,252,20,225                      ; vunpcklps     %ymm1,%ymm0,%ymm4
+  DB  197,252,21,233                      ; vunpckhps     %ymm1,%ymm0,%ymm5
+  DB  197,236,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm1
+  DB  197,236,21,219                      ; vunpckhps     %ymm3,%ymm2,%ymm3
+  DB  197,221,20,193                      ; vunpcklpd     %ymm1,%ymm4,%ymm0
+  DB  197,221,21,201                      ; vunpckhpd     %ymm1,%ymm4,%ymm1
+  DB  197,213,20,211                      ; vunpcklpd     %ymm3,%ymm5,%ymm2
+  DB  197,213,21,219                      ; vunpckhpd     %ymm3,%ymm5,%ymm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,137,242                          ; mov           %r14,%rdx
-  DB  72,137,217                          ; mov           %rbx,%rcx
-  DB  197,252,16,68,36,32                 ; vmovups       0x20(%rsp),%ymm0
-  DB  197,252,16,76,36,64                 ; vmovups       0x40(%rsp),%ymm1
-  DB  197,252,16,84,36,96                 ; vmovups       0x60(%rsp),%ymm2
-  DB  197,252,16,156,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm3
-  DB  197,252,16,164,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm4
-  DB  197,252,16,172,36,192,0,0,0         ; vmovups       0xc0(%rsp),%ymm5
-  DB  197,252,16,180,36,224,0,0,0         ; vmovups       0xe0(%rsp),%ymm6
-  DB  197,252,16,188,36,0,1,0,0           ; vmovups       0x100(%rsp),%ymm7
-  DB  72,129,196,40,1,0,0                 ; add           $0x128,%rsp
+  DB  76,137,249                          ; mov           %r15,%rcx
+  DB  197,252,16,100,36,32                ; vmovups       0x20(%rsp),%ymm4
+  DB  197,252,16,108,36,64                ; vmovups       0x40(%rsp),%ymm5
+  DB  197,252,16,116,36,96                ; vmovups       0x60(%rsp),%ymm6
+  DB  197,252,16,188,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm7
+  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
   DB  91                                  ; pop           %rbx
   DB  65,94                               ; pop           %r14
+  DB  65,95                               ; pop           %r15
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_start_pipeline_avx
@@ -9849,40 +9877,68 @@
 
 PUBLIC _sk_callback_avx
 _sk_callback_avx LABEL PROC
+  DB  65,87                               ; push          %r15
   DB  65,86                               ; push          %r14
   DB  83                                  ; push          %rbx
-  DB  72,129,236,40,1,0,0                 ; sub           $0x128,%rsp
-  DB  197,252,17,188,36,0,1,0,0           ; vmovups       %ymm7,0x100(%rsp)
-  DB  197,252,17,180,36,224,0,0,0         ; vmovups       %ymm6,0xe0(%rsp)
-  DB  197,252,17,172,36,192,0,0,0         ; vmovups       %ymm5,0xc0(%rsp)
-  DB  197,252,17,164,36,160,0,0,0         ; vmovups       %ymm4,0xa0(%rsp)
-  DB  197,252,17,156,36,128,0,0,0         ; vmovups       %ymm3,0x80(%rsp)
-  DB  197,252,17,84,36,96                 ; vmovups       %ymm2,0x60(%rsp)
-  DB  197,252,17,76,36,64                 ; vmovups       %ymm1,0x40(%rsp)
-  DB  197,252,17,68,36,32                 ; vmovups       %ymm0,0x20(%rsp)
-  DB  72,137,203                          ; mov           %rcx,%rbx
+  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
+  DB  197,252,17,188,36,128,0,0,0         ; vmovups       %ymm7,0x80(%rsp)
+  DB  197,252,17,116,36,96                ; vmovups       %ymm6,0x60(%rsp)
+  DB  197,252,17,108,36,64                ; vmovups       %ymm5,0x40(%rsp)
+  DB  197,252,17,100,36,32                ; vmovups       %ymm4,0x20(%rsp)
+  DB  73,137,207                          ; mov           %rcx,%r15
   DB  73,137,214                          ; mov           %rdx,%r14
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,72,8                         ; mov           0x8(%rax),%rcx
-  DB  72,133,219                          ; test          %rbx,%rbx
+  DB  72,137,195                          ; mov           %rax,%rbx
+  DB  197,252,20,225                      ; vunpcklps     %ymm1,%ymm0,%ymm4
+  DB  197,252,21,193                      ; vunpckhps     %ymm1,%ymm0,%ymm0
+  DB  197,236,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm1
+  DB  197,236,21,211                      ; vunpckhps     %ymm3,%ymm2,%ymm2
+  DB  197,221,20,217                      ; vunpcklpd     %ymm1,%ymm4,%ymm3
+  DB  197,221,21,201                      ; vunpckhpd     %ymm1,%ymm4,%ymm1
+  DB  197,253,20,226                      ; vunpcklpd     %ymm2,%ymm0,%ymm4
+  DB  197,253,21,194                      ; vunpckhpd     %ymm2,%ymm0,%ymm0
+  DB  196,227,101,24,209,1                ; vinsertf128   $0x1,%xmm1,%ymm3,%ymm2
+  DB  196,227,93,24,232,1                 ; vinsertf128   $0x1,%xmm0,%ymm4,%ymm5
+  DB  196,227,101,6,201,49                ; vperm2f128    $0x31,%ymm1,%ymm3,%ymm1
+  DB  196,227,93,6,192,49                 ; vperm2f128    $0x31,%ymm0,%ymm4,%ymm0
+  DB  197,253,17,83,8                     ; vmovupd       %ymm2,0x8(%rbx)
+  DB  197,253,17,107,40                   ; vmovupd       %ymm5,0x28(%rbx)
+  DB  197,253,17,75,72                    ; vmovupd       %ymm1,0x48(%rbx)
+  DB  197,253,17,67,104                   ; vmovupd       %ymm0,0x68(%rbx)
+  DB  77,133,255                          ; test          %r15,%r15
   DB  186,8,0,0,0                         ; mov           $0x8,%edx
-  DB  15,69,211                           ; cmovne        %ebx,%edx
+  DB  65,15,69,215                        ; cmovne        %r15d,%edx
+  DB  72,137,217                          ; mov           %rbx,%rcx
   DB  197,248,119                         ; vzeroupper
-  DB  255,16                              ; callq         *(%rax)
+  DB  255,19                              ; callq         *(%rbx)
+  DB  72,139,131,136,0,0,0                ; mov           0x88(%rbx),%rax
+  DB  197,248,16,0                        ; vmovups       (%rax),%xmm0
+  DB  197,248,16,72,16                    ; vmovups       0x10(%rax),%xmm1
+  DB  197,248,16,80,32                    ; vmovups       0x20(%rax),%xmm2
+  DB  197,248,16,88,48                    ; vmovups       0x30(%rax),%xmm3
+  DB  196,227,101,24,88,112,1             ; vinsertf128   $0x1,0x70(%rax),%ymm3,%ymm3
+  DB  196,227,109,24,80,96,1              ; vinsertf128   $0x1,0x60(%rax),%ymm2,%ymm2
+  DB  196,227,117,24,72,80,1              ; vinsertf128   $0x1,0x50(%rax),%ymm1,%ymm1
+  DB  196,227,125,24,64,64,1              ; vinsertf128   $0x1,0x40(%rax),%ymm0,%ymm0
+  DB  197,252,20,225                      ; vunpcklps     %ymm1,%ymm0,%ymm4
+  DB  197,252,21,233                      ; vunpckhps     %ymm1,%ymm0,%ymm5
+  DB  197,236,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm1
+  DB  197,236,21,219                      ; vunpckhps     %ymm3,%ymm2,%ymm3
+  DB  197,221,20,193                      ; vunpcklpd     %ymm1,%ymm4,%ymm0
+  DB  197,221,21,201                      ; vunpckhpd     %ymm1,%ymm4,%ymm1
+  DB  197,213,20,211                      ; vunpcklpd     %ymm3,%ymm5,%ymm2
+  DB  197,213,21,219                      ; vunpckhpd     %ymm3,%ymm5,%ymm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,137,242                          ; mov           %r14,%rdx
-  DB  72,137,217                          ; mov           %rbx,%rcx
-  DB  197,252,16,68,36,32                 ; vmovups       0x20(%rsp),%ymm0
-  DB  197,252,16,76,36,64                 ; vmovups       0x40(%rsp),%ymm1
-  DB  197,252,16,84,36,96                 ; vmovups       0x60(%rsp),%ymm2
-  DB  197,252,16,156,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm3
-  DB  197,252,16,164,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm4
-  DB  197,252,16,172,36,192,0,0,0         ; vmovups       0xc0(%rsp),%ymm5
-  DB  197,252,16,180,36,224,0,0,0         ; vmovups       0xe0(%rsp),%ymm6
-  DB  197,252,16,188,36,0,1,0,0           ; vmovups       0x100(%rsp),%ymm7
-  DB  72,129,196,40,1,0,0                 ; add           $0x128,%rsp
+  DB  76,137,249                          ; mov           %r15,%rcx
+  DB  197,252,16,100,36,32                ; vmovups       0x20(%rsp),%ymm4
+  DB  197,252,16,108,36,64                ; vmovups       0x40(%rsp),%ymm5
+  DB  197,252,16,116,36,96                ; vmovups       0x60(%rsp),%ymm6
+  DB  197,252,16,188,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm7
+  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
   DB  91                                  ; pop           %rbx
   DB  65,94                               ; pop           %r14
+  DB  65,95                               ; pop           %r15
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_start_pipeline_sse41
@@ -14144,29 +14200,57 @@
 
 PUBLIC _sk_callback_sse41
 _sk_callback_sse41 LABEL PROC
+  DB  65,86                               ; push          %r14
   DB  83                                  ; push          %rbx
-  DB  72,131,236,32                       ; sub           $0x20,%rsp
+  DB  72,131,236,40                       ; sub           $0x28,%rsp
   DB  68,15,40,197                        ; movaps        %xmm5,%xmm8
   DB  68,15,40,204                        ; movaps        %xmm4,%xmm9
-  DB  68,15,40,211                        ; movaps        %xmm3,%xmm10
-  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
-  DB  68,15,40,225                        ; movaps        %xmm1,%xmm12
-  DB  68,15,40,232                        ; movaps        %xmm0,%xmm13
-  DB  72,137,211                          ; mov           %rdx,%rbx
+  DB  73,137,214                          ; mov           %rdx,%r14
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,72,8                         ; mov           0x8(%rax),%rcx
+  DB  72,137,195                          ; mov           %rax,%rbx
+  DB  15,40,224                           ; movaps        %xmm0,%xmm4
+  DB  15,20,225                           ; unpcklps      %xmm1,%xmm4
+  DB  15,40,234                           ; movaps        %xmm2,%xmm5
+  DB  15,20,235                           ; unpcklps      %xmm3,%xmm5
+  DB  15,21,193                           ; unpckhps      %xmm1,%xmm0
+  DB  15,21,211                           ; unpckhps      %xmm3,%xmm2
+  DB  15,40,204                           ; movaps        %xmm4,%xmm1
+  DB  102,15,20,205                       ; unpcklpd      %xmm5,%xmm1
+  DB  15,18,236                           ; movhlps       %xmm4,%xmm5
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  102,15,20,218                       ; unpcklpd      %xmm2,%xmm3
+  DB  15,18,208                           ; movhlps       %xmm0,%xmm2
+  DB  102,15,17,75,8                      ; movupd        %xmm1,0x8(%rbx)
+  DB  15,17,107,24                        ; movups        %xmm5,0x18(%rbx)
+  DB  102,15,17,91,40                     ; movupd        %xmm3,0x28(%rbx)
+  DB  15,17,83,56                         ; movups        %xmm2,0x38(%rbx)
   DB  186,4,0,0,0                         ; mov           $0x4,%edx
-  DB  255,16                              ; callq         *(%rax)
+  DB  72,137,217                          ; mov           %rbx,%rcx
+  DB  255,19                              ; callq         *(%rbx)
+  DB  72,139,131,136,0,0,0                ; mov           0x88(%rbx),%rax
+  DB  15,16,32                            ; movups        (%rax),%xmm4
+  DB  15,16,64,16                         ; movups        0x10(%rax),%xmm0
+  DB  15,16,88,32                         ; movups        0x20(%rax),%xmm3
+  DB  15,16,80,48                         ; movups        0x30(%rax),%xmm2
+  DB  15,40,236                           ; movaps        %xmm4,%xmm5
+  DB  15,20,232                           ; unpcklps      %xmm0,%xmm5
+  DB  15,40,203                           ; movaps        %xmm3,%xmm1
+  DB  15,20,202                           ; unpcklps      %xmm2,%xmm1
+  DB  15,21,224                           ; unpckhps      %xmm0,%xmm4
+  DB  15,21,218                           ; unpckhps      %xmm2,%xmm3
+  DB  15,40,197                           ; movaps        %xmm5,%xmm0
+  DB  102,15,20,193                       ; unpcklpd      %xmm1,%xmm0
+  DB  15,18,205                           ; movhlps       %xmm5,%xmm1
+  DB  15,40,212                           ; movaps        %xmm4,%xmm2
+  DB  102,15,20,211                       ; unpcklpd      %xmm3,%xmm2
+  DB  15,18,220                           ; movhlps       %xmm4,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,137,218                          ; mov           %rbx,%rdx
-  DB  65,15,40,197                        ; movaps        %xmm13,%xmm0
-  DB  65,15,40,204                        ; movaps        %xmm12,%xmm1
-  DB  65,15,40,211                        ; movaps        %xmm11,%xmm2
-  DB  65,15,40,218                        ; movaps        %xmm10,%xmm3
+  DB  76,137,242                          ; mov           %r14,%rdx
   DB  65,15,40,225                        ; movaps        %xmm9,%xmm4
   DB  65,15,40,232                        ; movaps        %xmm8,%xmm5
-  DB  72,131,196,32                       ; add           $0x20,%rsp
+  DB  72,131,196,40                       ; add           $0x28,%rsp
   DB  91                                  ; pop           %rbx
+  DB  65,94                               ; pop           %r14
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_start_pipeline_sse2
@@ -18726,29 +18810,57 @@
 
 PUBLIC _sk_callback_sse2
 _sk_callback_sse2 LABEL PROC
+  DB  65,86                               ; push          %r14
   DB  83                                  ; push          %rbx
-  DB  72,131,236,32                       ; sub           $0x20,%rsp
+  DB  72,131,236,40                       ; sub           $0x28,%rsp
   DB  68,15,40,197                        ; movaps        %xmm5,%xmm8
   DB  68,15,40,204                        ; movaps        %xmm4,%xmm9
-  DB  68,15,40,211                        ; movaps        %xmm3,%xmm10
-  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
-  DB  68,15,40,225                        ; movaps        %xmm1,%xmm12
-  DB  68,15,40,232                        ; movaps        %xmm0,%xmm13
-  DB  72,137,211                          ; mov           %rdx,%rbx
+  DB  73,137,214                          ; mov           %rdx,%r14
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,72,8                         ; mov           0x8(%rax),%rcx
+  DB  72,137,195                          ; mov           %rax,%rbx
+  DB  15,40,224                           ; movaps        %xmm0,%xmm4
+  DB  15,20,225                           ; unpcklps      %xmm1,%xmm4
+  DB  15,40,234                           ; movaps        %xmm2,%xmm5
+  DB  15,20,235                           ; unpcklps      %xmm3,%xmm5
+  DB  15,21,193                           ; unpckhps      %xmm1,%xmm0
+  DB  15,21,211                           ; unpckhps      %xmm3,%xmm2
+  DB  15,40,204                           ; movaps        %xmm4,%xmm1
+  DB  102,15,20,205                       ; unpcklpd      %xmm5,%xmm1
+  DB  15,18,236                           ; movhlps       %xmm4,%xmm5
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  102,15,20,218                       ; unpcklpd      %xmm2,%xmm3
+  DB  15,18,208                           ; movhlps       %xmm0,%xmm2
+  DB  102,15,17,75,8                      ; movupd        %xmm1,0x8(%rbx)
+  DB  15,17,107,24                        ; movups        %xmm5,0x18(%rbx)
+  DB  102,15,17,91,40                     ; movupd        %xmm3,0x28(%rbx)
+  DB  15,17,83,56                         ; movups        %xmm2,0x38(%rbx)
   DB  186,4,0,0,0                         ; mov           $0x4,%edx
-  DB  255,16                              ; callq         *(%rax)
+  DB  72,137,217                          ; mov           %rbx,%rcx
+  DB  255,19                              ; callq         *(%rbx)
+  DB  72,139,131,136,0,0,0                ; mov           0x88(%rbx),%rax
+  DB  15,16,32                            ; movups        (%rax),%xmm4
+  DB  15,16,64,16                         ; movups        0x10(%rax),%xmm0
+  DB  15,16,88,32                         ; movups        0x20(%rax),%xmm3
+  DB  15,16,80,48                         ; movups        0x30(%rax),%xmm2
+  DB  15,40,236                           ; movaps        %xmm4,%xmm5
+  DB  15,20,232                           ; unpcklps      %xmm0,%xmm5
+  DB  15,40,203                           ; movaps        %xmm3,%xmm1
+  DB  15,20,202                           ; unpcklps      %xmm2,%xmm1
+  DB  15,21,224                           ; unpckhps      %xmm0,%xmm4
+  DB  15,21,218                           ; unpckhps      %xmm2,%xmm3
+  DB  15,40,197                           ; movaps        %xmm5,%xmm0
+  DB  102,15,20,193                       ; unpcklpd      %xmm1,%xmm0
+  DB  15,18,205                           ; movhlps       %xmm5,%xmm1
+  DB  15,40,212                           ; movaps        %xmm4,%xmm2
+  DB  102,15,20,211                       ; unpcklpd      %xmm3,%xmm2
+  DB  15,18,220                           ; movhlps       %xmm4,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,137,218                          ; mov           %rbx,%rdx
-  DB  65,15,40,197                        ; movaps        %xmm13,%xmm0
-  DB  65,15,40,204                        ; movaps        %xmm12,%xmm1
-  DB  65,15,40,211                        ; movaps        %xmm11,%xmm2
-  DB  65,15,40,218                        ; movaps        %xmm10,%xmm3
+  DB  76,137,242                          ; mov           %r14,%rdx
   DB  65,15,40,225                        ; movaps        %xmm9,%xmm4
   DB  65,15,40,232                        ; movaps        %xmm8,%xmm5
-  DB  72,131,196,32                       ; add           $0x20,%rsp
+  DB  72,131,196,40                       ; add           $0x28,%rsp
   DB  91                                  ; pop           %rbx
+  DB  65,94                               ; pop           %r14
   DB  255,224                             ; jmpq          *%rax
 ENDIF
 END
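
(Commentary, not part of the patch.)  The bulk of the new bytes in each _sk_callback_* body is a
4x4 transpose and its inverse: the planar r,g,b,a registers are interlaced into the context's rgba
buffer before the indirect call, then de-interlaced from read_from afterwards.  A scalar model of
what the unpcklps/unpckhps/movhlps sequences above compute, for the 4-pixel SSE case; the
*_model names are illustrative only and do not appear in the source:

    // Each of r,g,b,a holds one channel for 4 pixels; the buffer holds the
    // same pixels interlaced as r0,g0,b0,a0, r1,g1,b1,a1, ...
    static void store4_model(float buf[16], const float r[4], const float g[4],
                                            const float b[4], const float a[4]) {
        for (int i = 0; i < 4; i++) {
            buf[4*i+0] = r[i];
            buf[4*i+1] = g[i];
            buf[4*i+2] = b[i];
            buf[4*i+3] = a[i];
        }
    }
    static void load4_model(const float buf[16], float r[4], float g[4],
                                                 float b[4], float a[4]) {
        for (int i = 0; i < 4; i++) {
            r[i] = buf[4*i+0];
            g[i] = buf[4*i+1];
            b[i] = buf[4*i+2];
            a[i] = buf[4*i+3];
        }
    }
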
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 76ea648..609ec6d 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -1070,6 +1070,8 @@
 STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }
 
 STAGE(callback) {
-    auto c = (const SkJumper_CallbackCtx*)ctx;
-    c->fn(c->arg, tail ? tail : kStride);
+    auto c = (SkJumper_CallbackCtx*)ctx;
+    store4(c->rgba,0, r,g,b,a);
+    c->fn(c, tail ? tail : kStride);
+    load4(c->read_from,0, &r,&g,&b,&a);
 }
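
(Commentary, not part of the patch.)  The fields the stage touches line up with the offsets in the
generated code above: fn sits at the start of the context (callq *(%rbx)), rgba immediately after
it (the stores at 0x8..0x38), and read_from past the full rgba buffer (the load from 0x88).  A
sketch of that layout, assuming SkJumper_kMaxStride pixels of headroom; the authoritative
declaration lives in src/jumper/SkJumper.h:

    struct SkJumper_CallbackCtx {
        // Called with the pipeline's active pixels already store4'd into rgba.
        void (*fn)(SkJumper_CallbackCtx* self, int active_pixels);

        // Interlaced r,g,b,a, one quad per pixel, up to SkJumper_kMaxStride pixels.
        float rgba[4*SkJumper_kMaxStride];

        // Where the stage load4's pixels from after fn() returns.  Defaults to rgba,
        // so a callback that works in place only needs to overwrite rgba.
        float* read_from = rgba;
    };
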
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index b15ebf3..2b5b2e5 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -9,7 +9,6 @@
 #define SkRasterPipeline_opts_DEFINED
 
 #include "SkColorPriv.h"
-#include "SkColorLookUpTable.h"
 #include "SkColorSpaceXform_A2B.h"
 #include "SkColorSpaceXformPriv.h"
 #include "SkHalf.h"
@@ -796,29 +795,6 @@
 STAGE_CTX(table_b, const SkTableTransferFn*) { b = table(b, *ctx); }
 STAGE_CTX(table_a, const SkTableTransferFn*) { a = table(a, *ctx); }
 
-STAGE_CTX(color_lookup_table, const SkColorLookUpTable*) {
-    const SkColorLookUpTable* colorLUT = ctx;
-    SkASSERT(3 == colorLUT->inputChannels() || 4 == colorLUT->inputChannels());
-    SkASSERT(3 == colorLUT->outputChannels());
-    float result[3][N];
-    for (int i = 0; i < N; ++i) {
-        const float in[4] = { r[i], g[i], b[i], a[i] };
-        float out[3];
-        colorLUT->interp(out, in);
-        for (int j = 0; j < colorLUT->outputChannels(); ++j) {
-            result[j][i] = out[j];
-        }
-    }
-    r = SkNf::Load(result[0]);
-    g = SkNf::Load(result[1]);
-    b = SkNf::Load(result[2]);
-    if (4 == colorLUT->inputChannels()) {
-        // we must set the pixel to opaque, as the alpha channel was used
-        // as input before this.
-        a = 1.f;
-    }
-}
-
 STAGE(lab_to_xyz) {
     const auto lab_l = r * 100.0f;
     const auto lab_a = g * 255.0f - 128.0f;
@@ -1099,8 +1075,10 @@
 }
 
 STAGE_CTX(callback, const void*) {
-    auto c = (const SkJumper_CallbackCtx*)ctx;
-    c->fn(c->arg, tail ? tail : N);
+    auto c = (SkJumper_CallbackCtx*)ctx;
+    SkNf::Store4(c->rgba, r,g,b,a);
+    c->fn(c, tail ? tail : N);
+    SkNf::Load4(c->read_from, &r,&g,&b,&a);
 }
 
 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
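
(Commentary, not part of the patch.)  With both stages rewritten this way, appending a custom
callback reduces to subclassing the context, filling in fn, and handing it to the pipeline.  A
minimal hypothetical example: a pass-through stage that dumps the active pixels.  DebugCtx and
append_debug_dump are made-up names, and an SkArenaAlloc is assumed to own the context for the
pipeline's lifetime:

    #include "SkArenaAlloc.h"
    #include "SkRasterPipeline.h"
    #include "SkTypes.h"
    #include "../jumper/SkJumper.h"

    struct DebugCtx : SkJumper_CallbackCtx {};   // no extra state needed here

    static void append_debug_dump(SkRasterPipeline* p, SkArenaAlloc* alloc) {
        auto ctx = alloc->make<DebugCtx>();
        ctx->fn = [](SkJumper_CallbackCtx* self, int active_pixels) {
            for (int i = 0; i < active_pixels; i++) {
                SkDebugf("pixel %d: r=%g g=%g b=%g a=%g\n", i,
                         self->rgba[4*i+0], self->rgba[4*i+1],
                         self->rgba[4*i+2], self->rgba[4*i+3]);
            }
            // read_from is left at its default (rgba), so pixels pass through unchanged.
        };
        p->append(SkRasterPipeline::callback, ctx);
    }
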