Revert "Generate sse2/sse4.1 splices, use them."

This reverts commit 1fe55dc9fa47a73d470438d6298dc3fa959e4528.

Reason for revert: suspected of breaking the Win GPU bots.

Original change's description:
> Generate sse2/sse4.1 splices, use them.
> 
> While we're at it, tidy up build_stages.py a bit.
> Redirecting stdout seems a lot easier than print >>f all over the place.
> 
> TODO: non-VEX-encoded before_loop() and after_loop()
> 
> CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug
> 
> Change-Id: I3f38e55f081670dd598c6050435466d9f394e5be
> Reviewed-on: https://skia-review.googlesource.com/8230
> Commit-Queue: Mike Klein <mtklein@chromium.org>
> Reviewed-by: Herb Derby <herb@google.com>
> 

TBR=mtklein@chromium.org,herb@google.com
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug

Change-Id: Iba1905c54cb2dc42a880b9e6a8093835ffd530a2
Reviewed-on: https://skia-review.googlesource.com/8347
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index 7231b5c..cba7b2a 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -112,6 +112,12 @@
         splice(buf, jb_near);      // jb <next 4 bytes>  (b == "before", unsigned less than)
         splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
     }
+    static void ret(SkWStream* buf) {
+        static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
+        static const uint8_t        ret[] = { 0xc3 };
+        splice(buf, vzeroupper);
+        splice(buf, ret);
+    }
 #endif
 
 #if defined(_MSC_VER)
@@ -121,16 +127,16 @@
             0x56,                                         // push   %rsi
             0x57,                                         // push   %rdi
             0x48,0x81,0xec,0xa8,0x00,0x00,0x00,           // sub    $0xa8,%rsp
-            0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp)
-            0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp)
-            0x44,0x0f,0x29,0x6c,0x24,0x70,                // movaps %xmm13,0x70(%rsp)
-            0x44,0x0f,0x29,0x64,0x24,0x60,                // movaps %xmm12,0x60(%rsp)
-            0x44,0x0f,0x29,0x5c,0x24,0x50,                // movaps %xmm11,0x50(%rsp)
-            0x44,0x0f,0x29,0x54,0x24,0x40,                // movaps %xmm10,0x40(%rsp)
-            0x44,0x0f,0x29,0x4c,0x24,0x30,                // movaps %xmm9,0x30(%rsp)
-            0x44,0x0f,0x29,0x44,0x24,0x20,                // movaps %xmm8,0x20(%rsp)
-            0x0f,0x29,0x7c,0x24,0x10,                     // movaps %xmm7,0x10(%rsp)
-            0x0f,0x29,0x34,0x24,                          // movaps %xmm6,(%rsp)
+            0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps %xmm15,0x90(%rsp)
+            0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps %xmm14,0x80(%rsp)
+            0xc5,0x78,0x29,0x6c,0x24,0x70,                // vmovaps %xmm13,0x70(%rsp)
+            0xc5,0x78,0x29,0x64,0x24,0x60,                // vmovaps %xmm12,0x60(%rsp)
+            0xc5,0x78,0x29,0x5c,0x24,0x50,                // vmovaps %xmm11,0x50(%rsp)
+            0xc5,0x78,0x29,0x54,0x24,0x40,                // vmovaps %xmm10,0x40(%rsp)
+            0xc5,0x78,0x29,0x4c,0x24,0x30,                // vmovaps %xmm9,0x30(%rsp)
+            0xc5,0x78,0x29,0x44,0x24,0x20,                // vmovaps %xmm8,0x20(%rsp)
+            0xc5,0xf8,0x29,0x7c,0x24,0x10,                // vmovaps %xmm7,0x10(%rsp)
+            0xc5,0xf8,0x29,0x34,0x24,                     // vmovaps %xmm6,(%rsp)
             0x48,0x89,0xcf,                               // mov    %rcx,%rdi
             0x48,0x89,0xd6,                               // mov    %rdx,%rsi
             0x4c,0x89,0xc2,                               // mov    %r8,%rdx
@@ -140,17 +146,16 @@
     }
     static void after_loop(SkWStream* buf) {
         static const uint8_t system_v_to_ms[] = {
-            // TODO: vzeroupper here?
-            0x0f,0x28,0x34,0x24,                          // movaps (%rsp),%xmm6
-            0x0f,0x28,0x7c,0x24,0x10,                     // movaps 0x10(%rsp),%xmm7
-            0x44,0x0f,0x28,0x44,0x24,0x20,                // movaps 0x20(%rsp),%xmm8
-            0x44,0x0f,0x28,0x4c,0x24,0x30,                // movaps 0x30(%rsp),%xmm9
-            0x44,0x0f,0x28,0x54,0x24,0x40,                // movaps 0x40(%rsp),%xmm10
-            0x44,0x0f,0x28,0x5c,0x24,0x50,                // movaps 0x50(%rsp),%xmm11
-            0x44,0x0f,0x28,0x64,0x24,0x60,                // movaps 0x60(%rsp),%xmm12
-            0x44,0x0f,0x28,0x6c,0x24,0x70,                // movaps 0x70(%rsp),%xmm13
-            0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14
-            0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15
+            0xc5,0xf8,0x28,0x34,0x24,                     // vmovaps (%rsp),%xmm6
+            0xc5,0xf8,0x28,0x7c,0x24,0x10,                // vmovaps 0x10(%rsp),%xmm7
+            0xc5,0x78,0x28,0x44,0x24,0x20,                // vmovaps 0x20(%rsp),%xmm8
+            0xc5,0x78,0x28,0x4c,0x24,0x30,                // vmovaps 0x30(%rsp),%xmm9
+            0xc5,0x78,0x28,0x54,0x24,0x40,                // vmovaps 0x40(%rsp),%xmm10
+            0xc5,0x78,0x28,0x5c,0x24,0x50,                // vmovaps 0x50(%rsp),%xmm11
+            0xc5,0x78,0x28,0x64,0x24,0x60,                // vmovaps 0x60(%rsp),%xmm12
+            0xc5,0x78,0x28,0x6c,0x24,0x70,                // vmovaps 0x70(%rsp),%xmm13
+            0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps 0x80(%rsp),%xmm14
+            0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps 0x90(%rsp),%xmm15
             0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,           // add    $0xa8,%rsp
             0x5f,                                         // pop    %rdi
             0x5e,                                         // pop    %rsi
@@ -236,46 +241,35 @@
     }
 #endif
 
-#define CASE(prefix, st) case SkRasterPipeline::st: splice_until_ret(buf, prefix##_##st); break
-#define DEFINE_SPLICE(prefix)                                                        \
-    static bool prefix##_##splice(SkWStream* buf, SkRasterPipeline::StockStage st) { \
-        switch (st) {                                                                \
-            default: return false;                                                   \
-            CASE(prefix, clear);                                                     \
-            CASE(prefix, plus_);                                                     \
-            CASE(prefix, srcover);                                                   \
-            CASE(prefix, dstover);                                                   \
-            CASE(prefix, clamp_0);                                                   \
-            CASE(prefix, clamp_1);                                                   \
-            CASE(prefix, clamp_a);                                                   \
-            CASE(prefix, swap);                                                      \
-            CASE(prefix, move_src_dst);                                              \
-            CASE(prefix, move_dst_src);                                              \
-            CASE(prefix, premul);                                                    \
-            CASE(prefix, unpremul);                                                  \
-            CASE(prefix, from_srgb);                                                 \
-            CASE(prefix, to_srgb);                                                   \
-            CASE(prefix, scale_u8);                                                  \
-            CASE(prefix, load_tables);                                               \
-            CASE(prefix, load_8888);                                                 \
-            CASE(prefix, store_8888);                                                \
-            CASE(prefix, load_f16);                                                  \
-            CASE(prefix, store_f16);                                                 \
-            CASE(prefix, matrix_3x4);                                                \
-        }                                                                            \
-        return true;                                                                 \
+    static bool splice(SkWStream* buf, SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return false;
+        #define CASE(st) case SkRasterPipeline::st: splice_until_ret(buf, kSplice_##st); break
+            CASE(clear);
+            CASE(plus_);
+            CASE(srcover);
+            CASE(dstover);
+            CASE(clamp_0);
+            CASE(clamp_1);
+            CASE(clamp_a);
+            CASE(swap);
+            CASE(move_src_dst);
+            CASE(move_dst_src);
+            CASE(premul);
+            CASE(unpremul);
+            CASE(from_srgb);
+            CASE(to_srgb);
+            CASE(scale_u8);
+            CASE(load_tables);
+            CASE(load_8888);
+            CASE(store_8888);
+            CASE(load_f16);
+            CASE(store_f16);
+            CASE(matrix_3x4);
+        #undef CASE
+        }
+        return true;
     }
-    #if defined(__aarch64__)
-        DEFINE_SPLICE(aarch64)
-    #elif defined(__ARM_NEON__)
-        DEFINE_SPLICE(armv7)
-    #else
-        DEFINE_SPLICE(sse2)
-        DEFINE_SPLICE(sse41)
-        DEFINE_SPLICE(hsw)
-    #endif
-#undef DEFINE_SPLICE
-#undef CASE
 
     struct Spliced {
 
@@ -288,46 +282,17 @@
             fSpliced    = nullptr;
             // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
 
-
         #if defined(__aarch64__)
-            auto splice_stage = [](SkWStream* buf, SkRasterPipeline::StockStage st) {
-                return aarch64_splice(buf, st);
-            };
-            auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, aarch64_inc_x); };
         #elif defined(__ARM_NEON__)
             // Late generation ARMv7, e.g. Cortex A15 or Krait.
             if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
                 return;
             }
-            auto splice_stage = [](SkWStream* buf, SkRasterPipeline::StockStage st) {
-                return armv7_splice(buf, st);
-            };
-            auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, armv7_inc_x); };
         #else
-            // To keep things simple, only x86-64 for now.
-            if (sizeof(void*) != 8) {
+            // To keep things simple, only one x86 target supported: Haswell+ x86-64.
+            if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
                 return;
             }
-            bool hsw   = true && SkCpu::Supports(SkCpu::HSW),
-                 sse41 = true && SkCpu::Supports(SkCpu::SSE41);
-            auto splice_stage = [&](SkWStream* buf, SkRasterPipeline::StockStage st) {
-                if (  hsw) { return   hsw_splice(buf, st); }
-                if (sse41) { return sse41_splice(buf, st); }
-                return sse2_splice(buf, st);
-            };
-            auto inc_x = [&](SkWStream* buf) {
-                if (  hsw) { splice_until_ret(buf,   hsw_inc_x); return; }
-                if (sse41) { splice_until_ret(buf, sse41_inc_x); return; }
-                splice_until_ret(buf,  sse2_inc_x);
-            };
-            auto ret = [&](SkWStream* buf) {
-                static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
-                static const uint8_t        ret[] = { 0xc3 };
-                if (hsw) {
-                    splice(buf, vzeroupper);
-                }
-                splice(buf, ret);
-            };
         #endif
 
             SkDynamicMemoryWStream buf;
@@ -347,13 +312,13 @@
                 }
 
                 // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
-                if (!splice_stage(&buf, stages[i].stage)) {
+                if (!splice(&buf, stages[i].stage)) {
                     //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
                     return;
                 }
             }
 
-            inc_x(&buf);
+            splice_until_ret(&buf, kSplice_inc_x);
             loop(&buf, loop_start);  // Loop back to handle more pixels if not done.
             after_loop(&buf);
             ret(&buf);  // We're done.
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index 2a50282..a179e1d 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -11,67 +11,69 @@
 // This file is generated semi-automatically with this command:
 //   $ src/splicer/build_stages.py
 
-static const unsigned int aarch64_inc_x[] = {
+#if defined(__aarch64__)
+
+static const unsigned int kSplice_inc_x[] = {
     0x91001000,                                 //  add           x0, x0, #0x4
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_clear[] = {
+static const unsigned int kSplice_clear[] = {
     0x6f00e400,                                 //  movi          v0.2d, #0x0
     0x6f00e401,                                 //  movi          v1.2d, #0x0
     0x6f00e402,                                 //  movi          v2.2d, #0x0
     0x6f00e403,                                 //  movi          v3.2d, #0x0
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_plus_[] = {
+static const unsigned int kSplice_plus_[] = {
     0x4e24d400,                                 //  fadd          v0.4s, v0.4s, v4.4s
     0x4e25d421,                                 //  fadd          v1.4s, v1.4s, v5.4s
     0x4e26d442,                                 //  fadd          v2.4s, v2.4s, v6.4s
     0x4e27d463,                                 //  fadd          v3.4s, v3.4s, v7.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_srcover[] = {
+static const unsigned int kSplice_srcover[] = {
     0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
     0x4ea3d610,                                 //  fsub          v16.4s, v16.4s, v3.4s
     0x4e24ce00,                                 //  fmla          v0.4s, v16.4s, v4.4s
     0x4e25ce01,                                 //  fmla          v1.4s, v16.4s, v5.4s
     0x4e26ce02,                                 //  fmla          v2.4s, v16.4s, v6.4s
     0x4e27ce03,                                 //  fmla          v3.4s, v16.4s, v7.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_dstover[] = {
+static const unsigned int kSplice_dstover[] = {
     0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
     0x4ea7d610,                                 //  fsub          v16.4s, v16.4s, v7.4s
     0x4e20ce04,                                 //  fmla          v4.4s, v16.4s, v0.4s
     0x4e21ce05,                                 //  fmla          v5.4s, v16.4s, v1.4s
     0x4e22ce06,                                 //  fmla          v6.4s, v16.4s, v2.4s
     0x4e23ce07,                                 //  fmla          v7.4s, v16.4s, v3.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_clamp_0[] = {
+static const unsigned int kSplice_clamp_0[] = {
     0x6f00e410,                                 //  movi          v16.2d, #0x0
     0x4e30f400,                                 //  fmax          v0.4s, v0.4s, v16.4s
     0x4e30f421,                                 //  fmax          v1.4s, v1.4s, v16.4s
     0x4e30f442,                                 //  fmax          v2.4s, v2.4s, v16.4s
     0x4e30f463,                                 //  fmax          v3.4s, v3.4s, v16.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_clamp_1[] = {
+static const unsigned int kSplice_clamp_1[] = {
     0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
     0x4eb0f400,                                 //  fmin          v0.4s, v0.4s, v16.4s
     0x4eb0f421,                                 //  fmin          v1.4s, v1.4s, v16.4s
     0x4eb0f442,                                 //  fmin          v2.4s, v2.4s, v16.4s
     0x4eb0f463,                                 //  fmin          v3.4s, v3.4s, v16.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_clamp_a[] = {
+static const unsigned int kSplice_clamp_a[] = {
     0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
     0x4eb0f463,                                 //  fmin          v3.4s, v3.4s, v16.4s
     0x4ea3f400,                                 //  fmin          v0.4s, v0.4s, v3.4s
     0x4ea3f421,                                 //  fmin          v1.4s, v1.4s, v3.4s
     0x4ea3f442,                                 //  fmin          v2.4s, v2.4s, v3.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_swap[] = {
+static const unsigned int kSplice_swap[] = {
     0x4ea31c70,                                 //  mov           v16.16b, v3.16b
     0x4ea21c51,                                 //  mov           v17.16b, v2.16b
     0x4ea11c32,                                 //  mov           v18.16b, v1.16b
@@ -84,29 +86,29 @@
     0x4eb21e45,                                 //  mov           v5.16b, v18.16b
     0x4eb11e26,                                 //  mov           v6.16b, v17.16b
     0x4eb01e07,                                 //  mov           v7.16b, v16.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_move_src_dst[] = {
+static const unsigned int kSplice_move_src_dst[] = {
     0x4ea01c04,                                 //  mov           v4.16b, v0.16b
     0x4ea11c25,                                 //  mov           v5.16b, v1.16b
     0x4ea21c46,                                 //  mov           v6.16b, v2.16b
     0x4ea31c67,                                 //  mov           v7.16b, v3.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_move_dst_src[] = {
+static const unsigned int kSplice_move_dst_src[] = {
     0x4ea41c80,                                 //  mov           v0.16b, v4.16b
     0x4ea51ca1,                                 //  mov           v1.16b, v5.16b
     0x4ea61cc2,                                 //  mov           v2.16b, v6.16b
     0x4ea71ce3,                                 //  mov           v3.16b, v7.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_premul[] = {
+static const unsigned int kSplice_premul[] = {
     0x6e23dc00,                                 //  fmul          v0.4s, v0.4s, v3.4s
     0x6e23dc21,                                 //  fmul          v1.4s, v1.4s, v3.4s
     0x6e23dc42,                                 //  fmul          v2.4s, v2.4s, v3.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_unpremul[] = {
+static const unsigned int kSplice_unpremul[] = {
     0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
     0x4ea0d871,                                 //  fcmeq         v17.4s, v3.4s, #0.0
     0x6e23fe10,                                 //  fdiv          v16.4s, v16.4s, v3.4s
@@ -114,9 +116,9 @@
     0x6e20de00,                                 //  fmul          v0.4s, v16.4s, v0.4s
     0x6e21de01,                                 //  fmul          v1.4s, v16.4s, v1.4s
     0x6e22de02,                                 //  fmul          v2.4s, v16.4s, v2.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_from_srgb[] = {
+static const unsigned int kSplice_from_srgb[] = {
     0x91005068,                                 //  add           x8, x3, #0x14
     0x4d40c910,                                 //  ld1r          {v16.4s}, [x8]
     0x91004068,                                 //  add           x8, x3, #0x10
@@ -146,9 +148,9 @@
     0x6e761ea0,                                 //  bsl           v0.16b, v21.16b, v22.16b
     0x6e781e41,                                 //  bsl           v1.16b, v18.16b, v24.16b
     0x6e711e62,                                 //  bsl           v2.16b, v19.16b, v17.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_to_srgb[] = {
+static const unsigned int kSplice_to_srgb[] = {
     0x6ea1d810,                                 //  frsqrte       v16.4s, v0.4s
     0x6ea1d835,                                 //  frsqrte       v21.4s, v1.4s
     0x6e30de17,                                 //  fmul          v23.4s, v16.4s, v16.4s
@@ -209,9 +211,9 @@
     0x6e701f40,                                 //  bsl           v0.16b, v26.16b, v16.16b
     0x6e721e61,                                 //  bsl           v1.16b, v19.16b, v18.16b
     0x6e741e22,                                 //  bsl           v2.16b, v17.16b, v20.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_scale_u8[] = {
+static const unsigned int kSplice_scale_u8[] = {
     0xf9400048,                                 //  ldr           x8, [x2]
     0xbd400871,                                 //  ldr           s17, [x3,#8]
     0x8b000108,                                 //  add           x8, x8, x0
@@ -231,9 +233,9 @@
     0x6e21de01,                                 //  fmul          v1.4s, v16.4s, v1.4s
     0x6e22de02,                                 //  fmul          v2.4s, v16.4s, v2.4s
     0x6e23de03,                                 //  fmul          v3.4s, v16.4s, v3.4s
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_load_tables[] = {
+static const unsigned int kSplice_load_tables[] = {
     0xa9402849,                                 //  ldp           x9, x10, [x2]
     0xd37ef408,                                 //  lsl           x8, x0, #2
     0x9100306b,                                 //  add           x11, x3, #0xc
@@ -285,9 +287,9 @@
     0x4e21d863,                                 //  scvtf         v3.4s, v3.4s
     0x6e1c0622,                                 //  mov           v2.s[3], v17.s[0]
     0x4f909063,                                 //  fmul          v3.4s, v3.4s, v16.s[0]
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_load_8888[] = {
+static const unsigned int kSplice_load_8888[] = {
     0xf9400048,                                 //  ldr           x8, [x2]
     0x91003069,                                 //  add           x9, x3, #0xc
     0x4d40c920,                                 //  ld1r          {v0.4s}, [x9]
@@ -308,9 +310,9 @@
     0x4f839021,                                 //  fmul          v1.4s, v1.4s, v3.s[0]
     0x4f839042,                                 //  fmul          v2.4s, v2.4s, v3.s[0]
     0x4f839243,                                 //  fmul          v3.4s, v18.4s, v3.s[0]
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_store_8888[] = {
+static const unsigned int kSplice_store_8888[] = {
     0xbd400470,                                 //  ldr           s16, [x3,#4]
     0xf9400048,                                 //  ldr           x8, [x2]
     0xd37ef409,                                 //  lsl           x9, x0, #2
@@ -329,9 +331,9 @@
     0x4f385610,                                 //  shl           v16.4s, v16.4s, #24
     0x4eb01e30,                                 //  orr           v16.16b, v17.16b, v16.16b
     0x3ca96910,                                 //  str           q16, [x8,x9]
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_load_f16[] = {
+static const unsigned int kSplice_load_f16[] = {
     0xf9400048,                                 //  ldr           x8, [x2]
     0x8b000d08,                                 //  add           x8, x8, x0, lsl #3
     0x0c400510,                                 //  ld4           {v16.4h-v19.4h}, [x8]
@@ -339,9 +341,9 @@
     0x0e217a21,                                 //  fcvtl         v1.4s, v17.4h
     0x0e217a42,                                 //  fcvtl         v2.4s, v18.4h
     0x0e217a63,                                 //  fcvtl         v3.4s, v19.4h
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_store_f16[] = {
+static const unsigned int kSplice_store_f16[] = {
     0xf9400048,                                 //  ldr           x8, [x2]
     0x0e216810,                                 //  fcvtn         v16.4h, v0.4s
     0x0e216831,                                 //  fcvtn         v17.4h, v1.4s
@@ -349,9 +351,9 @@
     0x8b000d08,                                 //  add           x8, x8, x0, lsl #3
     0x0e216873,                                 //  fcvtn         v19.4h, v3.4s
     0x0c000510,                                 //  st4           {v16.4h-v19.4h}, [x8]
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int aarch64_matrix_3x4[] = {
+static const unsigned int kSplice_matrix_3x4[] = {
     0xaa0203e8,                                 //  mov           x8, x2
     0x91009049,                                 //  add           x9, x2, #0x24
     0x4ddfc913,                                 //  ld1r          {v19.4s}, [x8], #4
@@ -377,69 +379,72 @@
     0x4eb01e00,                                 //  mov           v0.16b, v16.16b
     0x4eb11e21,                                 //  mov           v1.16b, v17.16b
     0x4eb21e42,                                 //  mov           v2.16b, v18.16b
-    0xd65f03c0,                                 //  return
+    0xd65f03c0,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_inc_x[] = {
+
+#elif defined(__ARM_NEON__)
+
+static const unsigned int kSplice_inc_x[] = {
     0xe2800002,                                 //  add           r0, r0, #2
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_clear[] = {
+static const unsigned int kSplice_clear[] = {
     0xf2800010,                                 //  vmov.i32      d0, #0
     0xf2801010,                                 //  vmov.i32      d1, #0
     0xf2802010,                                 //  vmov.i32      d2, #0
     0xf2803010,                                 //  vmov.i32      d3, #0
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_plus_[] = {
+static const unsigned int kSplice_plus_[] = {
     0xf2000d04,                                 //  vadd.f32      d0, d0, d4
     0xf2011d05,                                 //  vadd.f32      d1, d1, d5
     0xf2022d06,                                 //  vadd.f32      d2, d2, d6
     0xf2033d07,                                 //  vadd.f32      d3, d3, d7
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_srcover[] = {
+static const unsigned int kSplice_srcover[] = {
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
     0xf2600d83,                                 //  vsub.f32      d16, d16, d3
     0xf2040c30,                                 //  vfma.f32      d0, d4, d16
     0xf2051c30,                                 //  vfma.f32      d1, d5, d16
     0xf2062c30,                                 //  vfma.f32      d2, d6, d16
     0xf2073c30,                                 //  vfma.f32      d3, d7, d16
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_dstover[] = {
+static const unsigned int kSplice_dstover[] = {
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
     0xf2600d87,                                 //  vsub.f32      d16, d16, d7
     0xf2004c30,                                 //  vfma.f32      d4, d0, d16
     0xf2015c30,                                 //  vfma.f32      d5, d1, d16
     0xf2026c30,                                 //  vfma.f32      d6, d2, d16
     0xf2037c30,                                 //  vfma.f32      d7, d3, d16
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_clamp_0[] = {
+static const unsigned int kSplice_clamp_0[] = {
     0xf2c00010,                                 //  vmov.i32      d16, #0
     0xf2000f20,                                 //  vmax.f32      d0, d0, d16
     0xf2011f20,                                 //  vmax.f32      d1, d1, d16
     0xf2022f20,                                 //  vmax.f32      d2, d2, d16
     0xf2033f20,                                 //  vmax.f32      d3, d3, d16
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_clamp_1[] = {
+static const unsigned int kSplice_clamp_1[] = {
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
     0xf2200f20,                                 //  vmin.f32      d0, d0, d16
     0xf2211f20,                                 //  vmin.f32      d1, d1, d16
     0xf2222f20,                                 //  vmin.f32      d2, d2, d16
     0xf2233f20,                                 //  vmin.f32      d3, d3, d16
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_clamp_a[] = {
+static const unsigned int kSplice_clamp_a[] = {
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
     0xf2233f20,                                 //  vmin.f32      d3, d3, d16
     0xf2200f03,                                 //  vmin.f32      d0, d0, d3
     0xf2211f03,                                 //  vmin.f32      d1, d1, d3
     0xf2222f03,                                 //  vmin.f32      d2, d2, d3
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_swap[] = {
+static const unsigned int kSplice_swap[] = {
     0xeef00b43,                                 //  vmov.f64      d16, d3
     0xeef01b42,                                 //  vmov.f64      d17, d2
     0xeef02b41,                                 //  vmov.f64      d18, d1
@@ -452,29 +457,29 @@
     0xeeb05b62,                                 //  vmov.f64      d5, d18
     0xeeb06b61,                                 //  vmov.f64      d6, d17
     0xeeb07b60,                                 //  vmov.f64      d7, d16
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_move_src_dst[] = {
+static const unsigned int kSplice_move_src_dst[] = {
     0xeeb04b40,                                 //  vmov.f64      d4, d0
     0xeeb05b41,                                 //  vmov.f64      d5, d1
     0xeeb06b42,                                 //  vmov.f64      d6, d2
     0xeeb07b43,                                 //  vmov.f64      d7, d3
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_move_dst_src[] = {
+static const unsigned int kSplice_move_dst_src[] = {
     0xeeb00b44,                                 //  vmov.f64      d0, d4
     0xeeb01b45,                                 //  vmov.f64      d1, d5
     0xeeb02b46,                                 //  vmov.f64      d2, d6
     0xeeb03b47,                                 //  vmov.f64      d3, d7
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_premul[] = {
+static const unsigned int kSplice_premul[] = {
     0xf3000d13,                                 //  vmul.f32      d0, d0, d3
     0xf3011d13,                                 //  vmul.f32      d1, d1, d3
     0xf3022d13,                                 //  vmul.f32      d2, d2, d3
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_unpremul[] = {
+static const unsigned int kSplice_unpremul[] = {
     0xed2d8b04,                                 //  vpush         {d8-d9}
     0xed938a00,                                 //  vldr          s16, [r3]
     0xf2c00010,                                 //  vmov.i32      d16, #0
@@ -486,9 +491,9 @@
     0xf3011d91,                                 //  vmul.f32      d1, d17, d1
     0xf3012d92,                                 //  vmul.f32      d2, d17, d2
     0xecbd8b04,                                 //  vpop          {d8-d9}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_from_srgb[] = {
+static const unsigned int kSplice_from_srgb[] = {
     0xed2d8b02,                                 //  vpush         {d8}
     0xe283c018,                                 //  add           ip, r3, #24
     0xed938a07,                                 //  vldr          s16, [r3, #28]
@@ -522,9 +527,9 @@
     0xf31611b2,                                 //  vbsl          d1, d22, d18
     0xf31921b8,                                 //  vbsl          d2, d25, d24
     0xecbd8b02,                                 //  vpop          {d8}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_to_srgb[] = {
+static const unsigned int kSplice_to_srgb[] = {
     0xed2d8b02,                                 //  vpush         {d8}
     0xf3fb0580,                                 //  vrsqrte.f32   d16, d0
     0xe283c02c,                                 //  add           ip, r3, #44
@@ -590,9 +595,9 @@
     0xf31111b3,                                 //  vbsl          d1, d17, d19
     0xf31221b4,                                 //  vbsl          d2, d18, d20
     0xecbd8b02,                                 //  vpop          {d8}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_scale_u8[] = {
+static const unsigned int kSplice_scale_u8[] = {
     0xed2d8b02,                                 //  vpush         {d8}
     0xe24dd008,                                 //  sub           sp, sp, #8
     0xe592c000,                                 //  ldr           ip, [r2]
@@ -612,9 +617,9 @@
     0xf3003d93,                                 //  vmul.f32      d3, d16, d3
     0xe28dd008,                                 //  add           sp, sp, #8
     0xecbd8b02,                                 //  vpop          {d8}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_load_tables[] = {
+static const unsigned int kSplice_load_tables[] = {
     0xe92d41f0,                                 //  push          {r4, r5, r6, r7, r8, lr}
     0xe283600c,                                 //  add           r6, r3, #12
     0xe592c000,                                 //  ldr           ip, [r2]
@@ -652,9 +657,9 @@
     0xe0887105,                                 //  add           r7, r8, r5, lsl #2
     0xed972a00,                                 //  vldr          s4, [r7]
     0xe8bd41f0,                                 //  pop           {r4, r5, r6, r7, r8, lr}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_load_8888[] = {
+static const unsigned int kSplice_load_8888[] = {
     0xe92d4800,                                 //  push          {fp, lr}
     0xe592c000,                                 //  ldr           ip, [r2]
     0xe283e00c,                                 //  add           lr, r3, #12
@@ -677,9 +682,9 @@
     0xf2a119c2,                                 //  vmul.f32      d1, d17, d2[0]
     0xf2a029c2,                                 //  vmul.f32      d2, d16, d2[0]
     0xe8bd4800,                                 //  pop           {fp, lr}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_store_8888[] = {
+static const unsigned int kSplice_store_8888[] = {
     0xe283c004,                                 //  add           ip, r3, #4
     0xf2c3261f,                                 //  vmov.i32      d18, #1056964608
     0xf2c3361f,                                 //  vmov.i32      d19, #1056964608
@@ -703,9 +708,9 @@
     0xf26001b2,                                 //  vorr          d16, d16, d18
     0xf26001b1,                                 //  vorr          d16, d16, d17
     0xedcc0b00,                                 //  vstr          d16, [ip]
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_load_f16[] = {
+static const unsigned int kSplice_load_f16[] = {
     0xed2d8b04,                                 //  vpush         {d8-d9}
     0xe592c000,                                 //  ldr           ip, [r2]
     0xe08cc180,                                 //  add           ip, ip, r0, lsl #3
@@ -721,9 +726,9 @@
     0xeeb02b43,                                 //  vmov.f64      d2, d3
     0xeeb03b49,                                 //  vmov.f64      d3, d9
     0xecbd8b04,                                 //  vpop          {d8-d9}
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_store_f16[] = {
+static const unsigned int kSplice_store_f16[] = {
     0xeef00b41,                                 //  vmov.f64      d16, d1
     0xf2631113,                                 //  vorr          d17, d3, d3
     0xeef02b40,                                 //  vmov.f64      d18, d0
@@ -735,9 +740,9 @@
     0xf3f60622,                                 //  vcvt.f16.f32  d16, q9
     0xe08cc180,                                 //  add           ip, ip, r0, lsl #3
     0xf44c084f,                                 //  vst2.16       {d16-d17}, [ip]
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned int armv7_matrix_3x4[] = {
+static const unsigned int kSplice_matrix_3x4[] = {
     0xe282c020,                                 //  add           ip, r2, #32
     0xf4ec3c9f,                                 //  vld1.32       {d19[]}, [ip :32]
     0xe282c02c,                                 //  add           ip, r2, #44
@@ -773,1011 +778,72 @@
     0xf22101b1,                                 //  vorr          d0, d17, d17
     0xf22021b0,                                 //  vorr          d2, d16, d16
     0xf22211b2,                                 //  vorr          d1, d18, d18
-    0xe12fff1e,                                 //  return
+    0xe12fff1e,                                 //  return        (synthetic)
 };
-static const unsigned char sse2_inc_x[] = {
-    0x48,0x83,0xc7,0x04,                        //  add           $0x4,%rdi
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_clear[] = {
-    0x0f,0x57,0xc0,                             //  xorps         %xmm0,%xmm0
-    0x0f,0x57,0xc9,                             //  xorps         %xmm1,%xmm1
-    0x0f,0x57,0xd2,                             //  xorps         %xmm2,%xmm2
-    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_plus_[] = {
-    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
-    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
-    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
-    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_srcover[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x44,0x0f,0x5c,0xc3,                        //  subps         %xmm3,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xcc,                        //  mulps         %xmm4,%xmm9
-    0x41,0x0f,0x58,0xc1,                        //  addps         %xmm9,%xmm0
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xcd,                        //  mulps         %xmm5,%xmm9
-    0x41,0x0f,0x58,0xc9,                        //  addps         %xmm9,%xmm1
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xce,                        //  mulps         %xmm6,%xmm9
-    0x41,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm2
-    0x44,0x0f,0x59,0xc7,                        //  mulps         %xmm7,%xmm8
-    0x41,0x0f,0x58,0xd8,                        //  addps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_dstover[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x44,0x0f,0x5c,0xc7,                        //  subps         %xmm7,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x41,0x0f,0x58,0xe1,                        //  addps         %xmm9,%xmm4
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc9,                        //  mulps         %xmm1,%xmm9
-    0x41,0x0f,0x58,0xe9,                        //  addps         %xmm9,%xmm5
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x41,0x0f,0x58,0xf1,                        //  addps         %xmm9,%xmm6
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x41,0x0f,0x58,0xf8,                        //  addps         %xmm8,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_clamp_0[] = {
-    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
-    0x41,0x0f,0x5f,0xc0,                        //  maxps         %xmm8,%xmm0
-    0x41,0x0f,0x5f,0xc8,                        //  maxps         %xmm8,%xmm1
-    0x41,0x0f,0x5f,0xd0,                        //  maxps         %xmm8,%xmm2
-    0x41,0x0f,0x5f,0xd8,                        //  maxps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_clamp_1[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x5d,0xc0,                        //  minps         %xmm8,%xmm0
-    0x41,0x0f,0x5d,0xc8,                        //  minps         %xmm8,%xmm1
-    0x41,0x0f,0x5d,0xd0,                        //  minps         %xmm8,%xmm2
-    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_clamp_a[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
-    0x0f,0x5d,0xc3,                             //  minps         %xmm3,%xmm0
-    0x0f,0x5d,0xcb,                             //  minps         %xmm3,%xmm1
-    0x0f,0x5d,0xd3,                             //  minps         %xmm3,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_swap[] = {
-    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
-    0x44,0x0f,0x28,0xca,                        //  movaps        %xmm2,%xmm9
-    0x44,0x0f,0x28,0xd1,                        //  movaps        %xmm1,%xmm10
-    0x44,0x0f,0x28,0xd8,                        //  movaps        %xmm0,%xmm11
-    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
-    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
-    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
-    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
-    0x41,0x0f,0x28,0xe3,                        //  movaps        %xmm11,%xmm4
-    0x41,0x0f,0x28,0xea,                        //  movaps        %xmm10,%xmm5
-    0x41,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm6
-    0x41,0x0f,0x28,0xf8,                        //  movaps        %xmm8,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_move_src_dst[] = {
-    0x0f,0x28,0xe0,                             //  movaps        %xmm0,%xmm4
-    0x0f,0x28,0xe9,                             //  movaps        %xmm1,%xmm5
-    0x0f,0x28,0xf2,                             //  movaps        %xmm2,%xmm6
-    0x0f,0x28,0xfb,                             //  movaps        %xmm3,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_move_dst_src[] = {
-    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
-    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
-    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
-    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_premul[] = {
-    0x0f,0x59,0xc3,                             //  mulps         %xmm3,%xmm0
-    0x0f,0x59,0xcb,                             //  mulps         %xmm3,%xmm1
-    0x0f,0x59,0xd3,                             //  mulps         %xmm3,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_unpremul[] = {
-    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
-    0x44,0x0f,0xc2,0xc3,0x00,                   //  cmpeqps       %xmm3,%xmm8
-    0xf3,0x44,0x0f,0x10,0x09,                   //  movss         (%rcx),%xmm9
-    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
-    0x44,0x0f,0x5e,0xcb,                        //  divps         %xmm3,%xmm9
-    0x45,0x0f,0x55,0xc1,                        //  andnps        %xmm9,%xmm8
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_from_srgb[] = {
-    0xf3,0x44,0x0f,0x10,0x41,0x1c,              //  movss         0x1c(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x45,0x0f,0x28,0xe8,                        //  movaps        %xmm8,%xmm13
-    0x44,0x0f,0x59,0xe8,                        //  mulps         %xmm0,%xmm13
-    0x44,0x0f,0x28,0xe0,                        //  movaps        %xmm0,%xmm12
-    0x45,0x0f,0x59,0xe4,                        //  mulps         %xmm12,%xmm12
-    0xf3,0x44,0x0f,0x10,0x49,0x18,              //  movss         0x18(%rcx),%xmm9
-    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
-    0xf3,0x44,0x0f,0x10,0x51,0x10,              //  movss         0x10(%rcx),%xmm10
-    0xf3,0x44,0x0f,0x10,0x59,0x14,              //  movss         0x14(%rcx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0x45,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm14
-    0x44,0x0f,0x59,0xf0,                        //  mulps         %xmm0,%xmm14
-    0x45,0x0f,0x58,0xf3,                        //  addps         %xmm11,%xmm14
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0x45,0x0f,0x59,0xf4,                        //  mulps         %xmm12,%xmm14
-    0x45,0x0f,0x58,0xf2,                        //  addps         %xmm10,%xmm14
-    0xf3,0x44,0x0f,0x10,0x61,0x20,              //  movss         0x20(%rcx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x41,0x0f,0xc2,0xc4,0x01,                   //  cmpltps       %xmm12,%xmm0
-    0x44,0x0f,0x54,0xe8,                        //  andps         %xmm0,%xmm13
-    0x41,0x0f,0x55,0xc6,                        //  andnps        %xmm14,%xmm0
-    0x41,0x0f,0x56,0xc5,                        //  orps          %xmm13,%xmm0
-    0x45,0x0f,0x28,0xe8,                        //  movaps        %xmm8,%xmm13
-    0x44,0x0f,0x59,0xe9,                        //  mulps         %xmm1,%xmm13
-    0x44,0x0f,0x28,0xf1,                        //  movaps        %xmm1,%xmm14
-    0x45,0x0f,0x59,0xf6,                        //  mulps         %xmm14,%xmm14
-    0x45,0x0f,0x28,0xf9,                        //  movaps        %xmm9,%xmm15
-    0x44,0x0f,0x59,0xf9,                        //  mulps         %xmm1,%xmm15
-    0x45,0x0f,0x58,0xfb,                        //  addps         %xmm11,%xmm15
-    0x45,0x0f,0x59,0xfe,                        //  mulps         %xmm14,%xmm15
-    0x45,0x0f,0x58,0xfa,                        //  addps         %xmm10,%xmm15
-    0x41,0x0f,0xc2,0xcc,0x01,                   //  cmpltps       %xmm12,%xmm1
-    0x44,0x0f,0x54,0xe9,                        //  andps         %xmm1,%xmm13
-    0x41,0x0f,0x55,0xcf,                        //  andnps        %xmm15,%xmm1
-    0x41,0x0f,0x56,0xcd,                        //  orps          %xmm13,%xmm1
-    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
-    0x44,0x0f,0x28,0xea,                        //  movaps        %xmm2,%xmm13
-    0x45,0x0f,0x59,0xed,                        //  mulps         %xmm13,%xmm13
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x45,0x0f,0x58,0xcb,                        //  addps         %xmm11,%xmm9
-    0x45,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm9
-    0x45,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm9
-    0x41,0x0f,0xc2,0xd4,0x01,                   //  cmpltps       %xmm12,%xmm2
-    0x44,0x0f,0x54,0xc2,                        //  andps         %xmm2,%xmm8
-    0x41,0x0f,0x55,0xd1,                        //  andnps        %xmm9,%xmm2
-    0x41,0x0f,0x56,0xd0,                        //  orps          %xmm8,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_to_srgb[] = {
-    0x0f,0x29,0x7c,0x24,0xe8,                   //  movaps        %xmm7,-0x18(%rsp)
-    0x0f,0x29,0x74,0x24,0xd8,                   //  movaps        %xmm6,-0x28(%rsp)
-    0x0f,0x28,0xf5,                             //  movaps        %xmm5,%xmm6
-    0x0f,0x28,0xec,                             //  movaps        %xmm4,%xmm5
-    0x0f,0x28,0xe3,                             //  movaps        %xmm3,%xmm4
-    0x44,0x0f,0x52,0xc0,                        //  rsqrtps       %xmm0,%xmm8
-    0x45,0x0f,0x53,0xe8,                        //  rcpps         %xmm8,%xmm13
-    0x45,0x0f,0x52,0xf8,                        //  rsqrtps       %xmm8,%xmm15
-    0xf3,0x0f,0x10,0x19,                        //  movss         (%rcx),%xmm3
-    0xf3,0x44,0x0f,0x10,0x41,0x24,              //  movss         0x24(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x45,0x0f,0x28,0xf0,                        //  movaps        %xmm8,%xmm14
-    0x44,0x0f,0x59,0xf0,                        //  mulps         %xmm0,%xmm14
-    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
-    0xf3,0x44,0x0f,0x10,0x51,0x28,              //  movss         0x28(%rcx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x59,0x2c,              //  movss         0x2c(%rcx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x61,0x30,              //  movss         0x30(%rcx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x45,0x0f,0x59,0xeb,                        //  mulps         %xmm11,%xmm13
-    0x45,0x0f,0x58,0xec,                        //  addps         %xmm12,%xmm13
-    0x45,0x0f,0x59,0xfa,                        //  mulps         %xmm10,%xmm15
-    0x45,0x0f,0x58,0xfd,                        //  addps         %xmm13,%xmm15
-    0x44,0x0f,0x28,0xcb,                        //  movaps        %xmm3,%xmm9
-    0x45,0x0f,0x5d,0xcf,                        //  minps         %xmm15,%xmm9
-    0xf3,0x44,0x0f,0x10,0x69,0x34,              //  movss         0x34(%rcx),%xmm13
-    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
-    0x41,0x0f,0xc2,0xc5,0x01,                   //  cmpltps       %xmm13,%xmm0
-    0x44,0x0f,0x54,0xf0,                        //  andps         %xmm0,%xmm14
-    0x41,0x0f,0x55,0xc1,                        //  andnps        %xmm9,%xmm0
-    0x41,0x0f,0x56,0xc6,                        //  orps          %xmm14,%xmm0
-    0x44,0x0f,0x52,0xc9,                        //  rsqrtps       %xmm1,%xmm9
-    0x45,0x0f,0x53,0xf1,                        //  rcpps         %xmm9,%xmm14
-    0x45,0x0f,0x52,0xc9,                        //  rsqrtps       %xmm9,%xmm9
-    0x45,0x0f,0x59,0xf3,                        //  mulps         %xmm11,%xmm14
-    0x45,0x0f,0x58,0xf4,                        //  addps         %xmm12,%xmm14
-    0x45,0x0f,0x59,0xca,                        //  mulps         %xmm10,%xmm9
-    0x45,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm9
-    0x44,0x0f,0x28,0xf3,                        //  movaps        %xmm3,%xmm14
-    0x45,0x0f,0x5d,0xf1,                        //  minps         %xmm9,%xmm14
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc9,                        //  mulps         %xmm1,%xmm9
-    0x41,0x0f,0xc2,0xcd,0x01,                   //  cmpltps       %xmm13,%xmm1
-    0x44,0x0f,0x54,0xc9,                        //  andps         %xmm1,%xmm9
-    0x41,0x0f,0x55,0xce,                        //  andnps        %xmm14,%xmm1
-    0x41,0x0f,0x56,0xc9,                        //  orps          %xmm9,%xmm1
-    0x44,0x0f,0x52,0xca,                        //  rsqrtps       %xmm2,%xmm9
-    0x45,0x0f,0x53,0xf1,                        //  rcpps         %xmm9,%xmm14
-    0x45,0x0f,0x59,0xf3,                        //  mulps         %xmm11,%xmm14
-    0x45,0x0f,0x58,0xf4,                        //  addps         %xmm12,%xmm14
-    0x41,0x0f,0x52,0xf9,                        //  rsqrtps       %xmm9,%xmm7
-    0x41,0x0f,0x59,0xfa,                        //  mulps         %xmm10,%xmm7
-    0x41,0x0f,0x58,0xfe,                        //  addps         %xmm14,%xmm7
-    0x0f,0x5d,0xdf,                             //  minps         %xmm7,%xmm3
-    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
-    0x41,0x0f,0xc2,0xd5,0x01,                   //  cmpltps       %xmm13,%xmm2
-    0x44,0x0f,0x54,0xc2,                        //  andps         %xmm2,%xmm8
-    0x0f,0x55,0xd3,                             //  andnps        %xmm3,%xmm2
-    0x41,0x0f,0x56,0xd0,                        //  orps          %xmm8,%xmm2
-    0x0f,0x28,0xdc,                             //  movaps        %xmm4,%xmm3
-    0x0f,0x28,0xe5,                             //  movaps        %xmm5,%xmm4
-    0x0f,0x28,0xee,                             //  movaps        %xmm6,%xmm5
-    0x0f,0x28,0x74,0x24,0xd8,                   //  movaps        -0x28(%rsp),%xmm6
-    0x0f,0x28,0x7c,0x24,0xe8,                   //  movaps        -0x18(%rsp),%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_scale_u8[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x66,0x44,0x0f,0x6e,0x04,0x38,              //  movd          (%rax,%rdi,1),%xmm8
-    0x66,0x45,0x0f,0xef,0xc9,                   //  pxor          %xmm9,%xmm9
-    0x66,0x45,0x0f,0x60,0xc1,                   //  punpcklbw     %xmm9,%xmm8
-    0x66,0x45,0x0f,0x61,0xc1,                   //  punpcklwd     %xmm9,%xmm8
-    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
-    0xf3,0x44,0x0f,0x10,0x49,0x08,              //  movss         0x8(%rcx),%xmm9
-    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
-    0x45,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm9
-    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
-    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
-    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
-    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_load_tables[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x4c,0x8b,0x42,0x08,                        //  mov           0x8(%rdx),%r8
-    0xf3,0x44,0x0f,0x6f,0x04,0xb8,              //  movdqu        (%rax,%rdi,4),%xmm8
-    0x66,0x0f,0x6e,0x41,0x0c,                   //  movd          0xc(%rcx),%xmm0
-    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
-    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
-    0x66,0x41,0x0f,0x72,0xd1,0x08,              //  psrld         $0x8,%xmm9
-    0x66,0x44,0x0f,0xdb,0xc8,                   //  pand          %xmm0,%xmm9
-    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
-    0x66,0x41,0x0f,0x72,0xd2,0x10,              //  psrld         $0x10,%xmm10
-    0x66,0x44,0x0f,0xdb,0xd0,                   //  pand          %xmm0,%xmm10
-    0x66,0x41,0x0f,0xdb,0xc0,                   //  pand          %xmm8,%xmm0
-    0x66,0x0f,0x70,0xd8,0x4e,                   //  pshufd        $0x4e,%xmm0,%xmm3
-    0x66,0x48,0x0f,0x7e,0xd8,                   //  movq          %xmm3,%rax
-    0x41,0x89,0xc1,                             //  mov           %eax,%r9d
-    0x48,0xc1,0xe8,0x20,                        //  shr           $0x20,%rax
-    0x66,0x49,0x0f,0x7e,0xc2,                   //  movq          %xmm0,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x43,0x0f,0x10,0x1c,0x90,              //  movss         (%r8,%r10,4),%xmm3
-    0xf3,0x41,0x0f,0x10,0x04,0x80,              //  movss         (%r8,%rax,4),%xmm0
-    0x0f,0x14,0xd8,                             //  unpcklps      %xmm0,%xmm3
-    0xf3,0x43,0x0f,0x10,0x04,0x98,              //  movss         (%r8,%r11,4),%xmm0
-    0xf3,0x43,0x0f,0x10,0x0c,0x88,              //  movss         (%r8,%r9,4),%xmm1
-    0x0f,0x14,0xc1,                             //  unpcklps      %xmm1,%xmm0
-    0x0f,0x14,0xc3,                             //  unpcklps      %xmm3,%xmm0
-    0x48,0x8b,0x42,0x10,                        //  mov           0x10(%rdx),%rax
-    0x66,0x41,0x0f,0x70,0xc9,0x4e,              //  pshufd        $0x4e,%xmm9,%xmm1
-    0x66,0x49,0x0f,0x7e,0xc8,                   //  movq          %xmm1,%r8
-    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
-    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
-    0x66,0x4d,0x0f,0x7e,0xca,                   //  movq          %xmm9,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x42,0x0f,0x10,0x1c,0x90,              //  movss         (%rax,%r10,4),%xmm3
-    0xf3,0x42,0x0f,0x10,0x0c,0x80,              //  movss         (%rax,%r8,4),%xmm1
-    0x0f,0x14,0xd9,                             //  unpcklps      %xmm1,%xmm3
-    0xf3,0x42,0x0f,0x10,0x0c,0x98,              //  movss         (%rax,%r11,4),%xmm1
-    0xf3,0x42,0x0f,0x10,0x14,0x88,              //  movss         (%rax,%r9,4),%xmm2
-    0x0f,0x14,0xca,                             //  unpcklps      %xmm2,%xmm1
-    0x0f,0x14,0xcb,                             //  unpcklps      %xmm3,%xmm1
-    0x48,0x8b,0x42,0x18,                        //  mov           0x18(%rdx),%rax
-    0x66,0x41,0x0f,0x70,0xd2,0x4e,              //  pshufd        $0x4e,%xmm10,%xmm2
-    0x66,0x49,0x0f,0x7e,0xd0,                   //  movq          %xmm2,%r8
-    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
-    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
-    0x66,0x4d,0x0f,0x7e,0xd2,                   //  movq          %xmm10,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x46,0x0f,0x10,0x0c,0x90,              //  movss         (%rax,%r10,4),%xmm9
-    0xf3,0x42,0x0f,0x10,0x14,0x80,              //  movss         (%rax,%r8,4),%xmm2
-    0x44,0x0f,0x14,0xca,                        //  unpcklps      %xmm2,%xmm9
-    0xf3,0x42,0x0f,0x10,0x14,0x98,              //  movss         (%rax,%r11,4),%xmm2
-    0xf3,0x42,0x0f,0x10,0x1c,0x88,              //  movss         (%rax,%r9,4),%xmm3
-    0x0f,0x14,0xd3,                             //  unpcklps      %xmm3,%xmm2
-    0x41,0x0f,0x14,0xd1,                        //  unpcklps      %xmm9,%xmm2
-    0x66,0x41,0x0f,0x72,0xd0,0x18,              //  psrld         $0x18,%xmm8
-    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
-    0xf3,0x0f,0x10,0x59,0x08,                   //  movss         0x8(%rcx),%xmm3
-    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
-    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_load_8888[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x0f,0x6f,0x1c,0xb8,                   //  movdqu        (%rax,%rdi,4),%xmm3
-    0x66,0x0f,0x6e,0x41,0x0c,                   //  movd          0xc(%rcx),%xmm0
-    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
-    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
-    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
-    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
-    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
-    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
-    0x66,0x0f,0xdb,0xc3,                        //  pand          %xmm3,%xmm0
-    0x0f,0x5b,0xc0,                             //  cvtdq2ps      %xmm0,%xmm0
-    0xf3,0x44,0x0f,0x10,0x41,0x08,              //  movss         0x8(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x0f,0x5b,0xd2,                             //  cvtdq2ps      %xmm2,%xmm2
-    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
-    0x66,0x0f,0x72,0xd3,0x18,                   //  psrld         $0x18,%xmm3
-    0x0f,0x5b,0xdb,                             //  cvtdq2ps      %xmm3,%xmm3
-    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_store_8888[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x44,0x0f,0x10,0x41,0x04,              //  movss         0x4(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
-    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
-    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
-    0x66,0x45,0x0f,0x5b,0xd2,                   //  cvtps2dq      %xmm10,%xmm10
-    0x66,0x41,0x0f,0x72,0xf2,0x08,              //  pslld         $0x8,%xmm10
-    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
-    0x66,0x41,0x0f,0x72,0xf1,0x10,              //  pslld         $0x10,%xmm9
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x66,0x45,0x0f,0x5b,0xc0,                   //  cvtps2dq      %xmm8,%xmm8
-    0x66,0x41,0x0f,0x72,0xf0,0x18,              //  pslld         $0x18,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc1,                   //  por           %xmm9,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc2,                   //  por           %xmm10,%xmm8
-    0xf3,0x44,0x0f,0x7f,0x04,0xb8,              //  movdqu        %xmm8,(%rax,%rdi,4)
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_load_f16[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
-    0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
-    0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
-    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
-    0x66,0x45,0x0f,0xef,0xc0,                   //  pxor          %xmm8,%xmm8
-    0x66,0x0f,0x6f,0xc1,                        //  movdqa        %xmm1,%xmm0
-    0x66,0x41,0x0f,0x61,0xc0,                   //  punpcklwd     %xmm8,%xmm0
-    0x66,0x0f,0x72,0xf0,0x0d,                   //  pslld         $0xd,%xmm0
-    0x66,0x0f,0x6e,0x51,0x38,                   //  movd          0x38(%rcx),%xmm2
-    0x66,0x44,0x0f,0x70,0xca,0x00,              //  pshufd        $0x0,%xmm2,%xmm9
-    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
-    0x66,0x41,0x0f,0x69,0xc8,                   //  punpckhwd     %xmm8,%xmm1
-    0x66,0x0f,0x72,0xf1,0x0d,                   //  pslld         $0xd,%xmm1
-    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
-    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
-    0x66,0x41,0x0f,0x61,0xd0,                   //  punpcklwd     %xmm8,%xmm2
-    0x66,0x0f,0x72,0xf2,0x0d,                   //  pslld         $0xd,%xmm2
-    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
-    0x66,0x41,0x0f,0x69,0xd8,                   //  punpckhwd     %xmm8,%xmm3
-    0x66,0x0f,0x72,0xf3,0x0d,                   //  pslld         $0xd,%xmm3
-    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_store_f16[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x66,0x44,0x0f,0x6e,0x41,0x3c,              //  movd          0x3c(%rcx),%xmm8
-    0x66,0x45,0x0f,0x70,0xc0,0x00,              //  pshufd        $0x0,%xmm8,%xmm8
-    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x66,0x41,0x0f,0x72,0xd1,0x0d,              //  psrld         $0xd,%xmm9
-    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
-    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
-    0x66,0x41,0x0f,0x72,0xd2,0x0d,              //  psrld         $0xd,%xmm10
-    0x66,0x45,0x0f,0x6f,0xd8,                   //  movdqa        %xmm8,%xmm11
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x66,0x41,0x0f,0x72,0xd3,0x0d,              //  psrld         $0xd,%xmm11
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x66,0x41,0x0f,0x72,0xd0,0x0d,              //  psrld         $0xd,%xmm8
-    0x66,0x41,0x0f,0x73,0xfa,0x02,              //  pslldq        $0x2,%xmm10
-    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
-    0x66,0x41,0x0f,0x73,0xf8,0x02,              //  pslldq        $0x2,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc3,                   //  por           %xmm11,%xmm8
-    0x66,0x45,0x0f,0x6f,0xca,                   //  movdqa        %xmm10,%xmm9
-    0x66,0x45,0x0f,0x62,0xc8,                   //  punpckldq     %xmm8,%xmm9
-    0xf3,0x44,0x0f,0x7f,0x0c,0xf8,              //  movdqu        %xmm9,(%rax,%rdi,8)
-    0x66,0x45,0x0f,0x6a,0xd0,                   //  punpckhdq     %xmm8,%xmm10
-    0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10,         //  movdqu        %xmm10,0x10(%rax,%rdi,8)
-    0xc3,                                       //  return
-};
-static const unsigned char sse2_matrix_3x4[] = {
-    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
-    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
-    0xf3,0x0f,0x10,0x02,                        //  movss         (%rdx),%xmm0
-    0xf3,0x0f,0x10,0x4a,0x04,                   //  movss         0x4(%rdx),%xmm1
-    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
-    0xf3,0x44,0x0f,0x10,0x52,0x0c,              //  movss         0xc(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x18,              //  movss         0x18(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x24,              //  movss         0x24(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
-    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
-    0xf3,0x44,0x0f,0x10,0x52,0x10,              //  movss         0x10(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x1c,              //  movss         0x1c(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x28,              //  movss         0x28(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
-    0xf3,0x44,0x0f,0x10,0x52,0x08,              //  movss         0x8(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x14,              //  movss         0x14(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x20,              //  movss         0x20(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0xf3,0x44,0x0f,0x10,0x6a,0x2c,              //  movss         0x2c(%rdx),%xmm13
-    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
-    0x44,0x0f,0x59,0xe2,                        //  mulps         %xmm2,%xmm12
-    0x45,0x0f,0x58,0xe5,                        //  addps         %xmm13,%xmm12
-    0x45,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x28,0xd2,                        //  movaps        %xmm10,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_inc_x[] = {
-    0x48,0x83,0xc7,0x04,                        //  add           $0x4,%rdi
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_clear[] = {
-    0x0f,0x57,0xc0,                             //  xorps         %xmm0,%xmm0
-    0x0f,0x57,0xc9,                             //  xorps         %xmm1,%xmm1
-    0x0f,0x57,0xd2,                             //  xorps         %xmm2,%xmm2
-    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_plus_[] = {
-    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
-    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
-    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
-    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_srcover[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x44,0x0f,0x5c,0xc3,                        //  subps         %xmm3,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xcc,                        //  mulps         %xmm4,%xmm9
-    0x41,0x0f,0x58,0xc1,                        //  addps         %xmm9,%xmm0
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xcd,                        //  mulps         %xmm5,%xmm9
-    0x41,0x0f,0x58,0xc9,                        //  addps         %xmm9,%xmm1
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xce,                        //  mulps         %xmm6,%xmm9
-    0x41,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm2
-    0x44,0x0f,0x59,0xc7,                        //  mulps         %xmm7,%xmm8
-    0x41,0x0f,0x58,0xd8,                        //  addps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_dstover[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x44,0x0f,0x5c,0xc7,                        //  subps         %xmm7,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x41,0x0f,0x58,0xe1,                        //  addps         %xmm9,%xmm4
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc9,                        //  mulps         %xmm1,%xmm9
-    0x41,0x0f,0x58,0xe9,                        //  addps         %xmm9,%xmm5
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x41,0x0f,0x58,0xf1,                        //  addps         %xmm9,%xmm6
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x41,0x0f,0x58,0xf8,                        //  addps         %xmm8,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_clamp_0[] = {
-    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
-    0x41,0x0f,0x5f,0xc0,                        //  maxps         %xmm8,%xmm0
-    0x41,0x0f,0x5f,0xc8,                        //  maxps         %xmm8,%xmm1
-    0x41,0x0f,0x5f,0xd0,                        //  maxps         %xmm8,%xmm2
-    0x41,0x0f,0x5f,0xd8,                        //  maxps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_clamp_1[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x5d,0xc0,                        //  minps         %xmm8,%xmm0
-    0x41,0x0f,0x5d,0xc8,                        //  minps         %xmm8,%xmm1
-    0x41,0x0f,0x5d,0xd0,                        //  minps         %xmm8,%xmm2
-    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_clamp_a[] = {
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
-    0x0f,0x5d,0xc3,                             //  minps         %xmm3,%xmm0
-    0x0f,0x5d,0xcb,                             //  minps         %xmm3,%xmm1
-    0x0f,0x5d,0xd3,                             //  minps         %xmm3,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_swap[] = {
-    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
-    0x44,0x0f,0x28,0xca,                        //  movaps        %xmm2,%xmm9
-    0x44,0x0f,0x28,0xd1,                        //  movaps        %xmm1,%xmm10
-    0x44,0x0f,0x28,0xd8,                        //  movaps        %xmm0,%xmm11
-    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
-    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
-    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
-    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
-    0x41,0x0f,0x28,0xe3,                        //  movaps        %xmm11,%xmm4
-    0x41,0x0f,0x28,0xea,                        //  movaps        %xmm10,%xmm5
-    0x41,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm6
-    0x41,0x0f,0x28,0xf8,                        //  movaps        %xmm8,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_move_src_dst[] = {
-    0x0f,0x28,0xe0,                             //  movaps        %xmm0,%xmm4
-    0x0f,0x28,0xe9,                             //  movaps        %xmm1,%xmm5
-    0x0f,0x28,0xf2,                             //  movaps        %xmm2,%xmm6
-    0x0f,0x28,0xfb,                             //  movaps        %xmm3,%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_move_dst_src[] = {
-    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
-    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
-    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
-    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_premul[] = {
-    0x0f,0x59,0xc3,                             //  mulps         %xmm3,%xmm0
-    0x0f,0x59,0xcb,                             //  mulps         %xmm3,%xmm1
-    0x0f,0x59,0xd3,                             //  mulps         %xmm3,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_unpremul[] = {
-    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
-    0x45,0x0f,0x57,0xc9,                        //  xorps         %xmm9,%xmm9
-    0xf3,0x44,0x0f,0x10,0x11,                   //  movss         (%rcx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0x44,0x0f,0x5e,0xd3,                        //  divps         %xmm3,%xmm10
-    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
-    0x41,0x0f,0xc2,0xc1,0x00,                   //  cmpeqps       %xmm9,%xmm0
-    0x66,0x45,0x0f,0x38,0x14,0xd1,              //  blendvps      %xmm0,%xmm9,%xmm10
-    0x45,0x0f,0x59,0xc2,                        //  mulps         %xmm10,%xmm8
-    0x41,0x0f,0x59,0xca,                        //  mulps         %xmm10,%xmm1
-    0x41,0x0f,0x59,0xd2,                        //  mulps         %xmm10,%xmm2
-    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_from_srgb[] = {
-    0xf3,0x44,0x0f,0x10,0x59,0x1c,              //  movss         0x1c(%rcx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0x45,0x0f,0x28,0xd3,                        //  movaps        %xmm11,%xmm10
-    0x44,0x0f,0x59,0xd0,                        //  mulps         %xmm0,%xmm10
-    0x44,0x0f,0x28,0xf0,                        //  movaps        %xmm0,%xmm14
-    0x45,0x0f,0x59,0xf6,                        //  mulps         %xmm14,%xmm14
-    0xf3,0x44,0x0f,0x10,0x41,0x18,              //  movss         0x18(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0xf3,0x44,0x0f,0x10,0x61,0x10,              //  movss         0x10(%rcx),%xmm12
-    0xf3,0x44,0x0f,0x10,0x69,0x14,              //  movss         0x14(%rcx),%xmm13
-    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x45,0x0f,0x58,0xcd,                        //  addps         %xmm13,%xmm9
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x45,0x0f,0x59,0xce,                        //  mulps         %xmm14,%xmm9
-    0x45,0x0f,0x58,0xcc,                        //  addps         %xmm12,%xmm9
-    0xf3,0x44,0x0f,0x10,0x71,0x20,              //  movss         0x20(%rcx),%xmm14
-    0x45,0x0f,0xc6,0xf6,0x00,                   //  shufps        $0x0,%xmm14,%xmm14
-    0x41,0x0f,0xc2,0xc6,0x01,                   //  cmpltps       %xmm14,%xmm0
-    0x66,0x45,0x0f,0x38,0x14,0xca,              //  blendvps      %xmm0,%xmm10,%xmm9
-    0x45,0x0f,0x28,0xfb,                        //  movaps        %xmm11,%xmm15
-    0x44,0x0f,0x59,0xf9,                        //  mulps         %xmm1,%xmm15
-    0x0f,0x28,0xc1,                             //  movaps        %xmm1,%xmm0
-    0x0f,0x59,0xc0,                             //  mulps         %xmm0,%xmm0
-    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
-    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
-    0x45,0x0f,0x58,0xd5,                        //  addps         %xmm13,%xmm10
-    0x44,0x0f,0x59,0xd0,                        //  mulps         %xmm0,%xmm10
-    0x45,0x0f,0x58,0xd4,                        //  addps         %xmm12,%xmm10
-    0x41,0x0f,0xc2,0xce,0x01,                   //  cmpltps       %xmm14,%xmm1
-    0x0f,0x28,0xc1,                             //  movaps        %xmm1,%xmm0
-    0x66,0x45,0x0f,0x38,0x14,0xd7,              //  blendvps      %xmm0,%xmm15,%xmm10
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x0f,0x28,0xc2,                             //  movaps        %xmm2,%xmm0
-    0x0f,0x59,0xc0,                             //  mulps         %xmm0,%xmm0
-    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
-    0x45,0x0f,0x58,0xc5,                        //  addps         %xmm13,%xmm8
-    0x44,0x0f,0x59,0xc0,                        //  mulps         %xmm0,%xmm8
-    0x45,0x0f,0x58,0xc4,                        //  addps         %xmm12,%xmm8
-    0x41,0x0f,0xc2,0xd6,0x01,                   //  cmpltps       %xmm14,%xmm2
-    0x0f,0x28,0xc2,                             //  movaps        %xmm2,%xmm0
-    0x66,0x45,0x0f,0x38,0x14,0xc3,              //  blendvps      %xmm0,%xmm11,%xmm8
-    0x41,0x0f,0x28,0xc1,                        //  movaps        %xmm9,%xmm0
-    0x41,0x0f,0x28,0xca,                        //  movaps        %xmm10,%xmm1
-    0x41,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_to_srgb[] = {
-    0x0f,0x29,0x7c,0x24,0xe8,                   //  movaps        %xmm7,-0x18(%rsp)
-    0x0f,0x28,0xfe,                             //  movaps        %xmm6,%xmm7
-    0x0f,0x28,0xf5,                             //  movaps        %xmm5,%xmm6
-    0x0f,0x28,0xec,                             //  movaps        %xmm4,%xmm5
-    0x0f,0x28,0xe3,                             //  movaps        %xmm3,%xmm4
-    0x0f,0x28,0xda,                             //  movaps        %xmm2,%xmm3
-    0x0f,0x28,0xd1,                             //  movaps        %xmm1,%xmm2
-    0x44,0x0f,0x52,0xc0,                        //  rsqrtps       %xmm0,%xmm8
-    0x45,0x0f,0x53,0xc8,                        //  rcpps         %xmm8,%xmm9
-    0x45,0x0f,0x52,0xd0,                        //  rsqrtps       %xmm8,%xmm10
-    0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
-    0xf3,0x44,0x0f,0x10,0x59,0x24,              //  movss         0x24(%rcx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0x41,0x0f,0x28,0xcb,                        //  movaps        %xmm11,%xmm1
-    0x0f,0x59,0xc8,                             //  mulps         %xmm0,%xmm1
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0xf3,0x44,0x0f,0x10,0x61,0x28,              //  movss         0x28(%rcx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0xf3,0x44,0x0f,0x10,0x69,0x2c,              //  movss         0x2c(%rcx),%xmm13
-    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
-    0xf3,0x44,0x0f,0x10,0x71,0x30,              //  movss         0x30(%rcx),%xmm14
-    0x45,0x0f,0xc6,0xf6,0x00,                   //  shufps        $0x0,%xmm14,%xmm14
-    0x45,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm9
-    0x45,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm9
-    0x45,0x0f,0x59,0xd4,                        //  mulps         %xmm12,%xmm10
-    0x45,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm10
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x45,0x0f,0x5d,0xca,                        //  minps         %xmm10,%xmm9
-    0xf3,0x44,0x0f,0x10,0x79,0x34,              //  movss         0x34(%rcx),%xmm15
-    0x45,0x0f,0xc6,0xff,0x00,                   //  shufps        $0x0,%xmm15,%xmm15
-    0x41,0x0f,0xc2,0xc7,0x01,                   //  cmpltps       %xmm15,%xmm0
-    0x66,0x44,0x0f,0x38,0x14,0xc9,              //  blendvps      %xmm0,%xmm1,%xmm9
-    0x0f,0x52,0xc2,                             //  rsqrtps       %xmm2,%xmm0
-    0x0f,0x53,0xc8,                             //  rcpps         %xmm0,%xmm1
-    0x0f,0x52,0xc0,                             //  rsqrtps       %xmm0,%xmm0
-    0x41,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm1
-    0x41,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm1
-    0x41,0x0f,0x59,0xc4,                        //  mulps         %xmm12,%xmm0
-    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
-    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
-    0x44,0x0f,0x5d,0xd0,                        //  minps         %xmm0,%xmm10
-    0x41,0x0f,0x28,0xcb,                        //  movaps        %xmm11,%xmm1
-    0x0f,0x59,0xca,                             //  mulps         %xmm2,%xmm1
-    0x41,0x0f,0xc2,0xd7,0x01,                   //  cmpltps       %xmm15,%xmm2
-    0x0f,0x28,0xc2,                             //  movaps        %xmm2,%xmm0
-    0x66,0x44,0x0f,0x38,0x14,0xd1,              //  blendvps      %xmm0,%xmm1,%xmm10
-    0x0f,0x52,0xc3,                             //  rsqrtps       %xmm3,%xmm0
-    0x0f,0x53,0xc8,                             //  rcpps         %xmm0,%xmm1
-    0x41,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm1
-    0x41,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm1
-    0x0f,0x52,0xc0,                             //  rsqrtps       %xmm0,%xmm0
-    0x41,0x0f,0x59,0xc4,                        //  mulps         %xmm12,%xmm0
-    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
-    0x44,0x0f,0x5d,0xc0,                        //  minps         %xmm0,%xmm8
-    0x44,0x0f,0x59,0xdb,                        //  mulps         %xmm3,%xmm11
-    0x41,0x0f,0xc2,0xdf,0x01,                   //  cmpltps       %xmm15,%xmm3
-    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
-    0x66,0x45,0x0f,0x38,0x14,0xc3,              //  blendvps      %xmm0,%xmm11,%xmm8
-    0x41,0x0f,0x28,0xc1,                        //  movaps        %xmm9,%xmm0
-    0x41,0x0f,0x28,0xca,                        //  movaps        %xmm10,%xmm1
-    0x41,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm2
-    0x0f,0x28,0xdc,                             //  movaps        %xmm4,%xmm3
-    0x0f,0x28,0xe5,                             //  movaps        %xmm5,%xmm4
-    0x0f,0x28,0xee,                             //  movaps        %xmm6,%xmm5
-    0x0f,0x28,0xf7,                             //  movaps        %xmm7,%xmm6
-    0x0f,0x28,0x7c,0x24,0xe8,                   //  movaps        -0x18(%rsp),%xmm7
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_scale_u8[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x66,0x44,0x0f,0x38,0x31,0x04,0x38,         //  pmovzxbd      (%rax,%rdi,1),%xmm8
-    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
-    0xf3,0x44,0x0f,0x10,0x49,0x08,              //  movss         0x8(%rcx),%xmm9
-    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
-    0x45,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm9
-    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
-    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
-    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
-    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_load_tables[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x4c,0x8b,0x42,0x08,                        //  mov           0x8(%rdx),%r8
-    0xf3,0x44,0x0f,0x6f,0x04,0xb8,              //  movdqu        (%rax,%rdi,4),%xmm8
-    0x66,0x0f,0x6e,0x41,0x0c,                   //  movd          0xc(%rcx),%xmm0
-    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
-    0x66,0x41,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm1
-    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
-    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
-    0x66,0x41,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm2
-    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
-    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
-    0x66,0x41,0x0f,0xdb,0xc0,                   //  pand          %xmm8,%xmm0
-    0x66,0x48,0x0f,0x3a,0x16,0xc0,0x01,         //  pextrq        $0x1,%xmm0,%rax
-    0x41,0x89,0xc1,                             //  mov           %eax,%r9d
-    0x48,0xc1,0xe8,0x20,                        //  shr           $0x20,%rax
-    0x66,0x49,0x0f,0x7e,0xc2,                   //  movq          %xmm0,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x43,0x0f,0x10,0x04,0x98,              //  movss         (%r8,%r11,4),%xmm0
-    0x66,0x43,0x0f,0x3a,0x21,0x04,0x90,0x10,    //  insertps      $0x10,(%r8,%r10,4),%xmm0
-    0xf3,0x43,0x0f,0x10,0x1c,0x88,              //  movss         (%r8,%r9,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xc3,0x20,              //  insertps      $0x20,%xmm3,%xmm0
-    0xf3,0x41,0x0f,0x10,0x1c,0x80,              //  movss         (%r8,%rax,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xc3,0x30,              //  insertps      $0x30,%xmm3,%xmm0
-    0x48,0x8b,0x42,0x10,                        //  mov           0x10(%rdx),%rax
-    0x66,0x49,0x0f,0x3a,0x16,0xc8,0x01,         //  pextrq        $0x1,%xmm1,%r8
-    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
-    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
-    0x66,0x49,0x0f,0x7e,0xca,                   //  movq          %xmm1,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x42,0x0f,0x10,0x0c,0x98,              //  movss         (%rax,%r11,4),%xmm1
-    0x66,0x42,0x0f,0x3a,0x21,0x0c,0x90,0x10,    //  insertps      $0x10,(%rax,%r10,4),%xmm1
-    0xf3,0x42,0x0f,0x10,0x1c,0x88,              //  movss         (%rax,%r9,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xcb,0x20,              //  insertps      $0x20,%xmm3,%xmm1
-    0xf3,0x42,0x0f,0x10,0x1c,0x80,              //  movss         (%rax,%r8,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xcb,0x30,              //  insertps      $0x30,%xmm3,%xmm1
-    0x48,0x8b,0x42,0x18,                        //  mov           0x18(%rdx),%rax
-    0x66,0x49,0x0f,0x3a,0x16,0xd0,0x01,         //  pextrq        $0x1,%xmm2,%r8
-    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
-    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
-    0x66,0x49,0x0f,0x7e,0xd2,                   //  movq          %xmm2,%r10
-    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
-    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
-    0xf3,0x42,0x0f,0x10,0x14,0x98,              //  movss         (%rax,%r11,4),%xmm2
-    0x66,0x42,0x0f,0x3a,0x21,0x14,0x90,0x10,    //  insertps      $0x10,(%rax,%r10,4),%xmm2
-    0xf3,0x42,0x0f,0x10,0x1c,0x88,              //  movss         (%rax,%r9,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xd3,0x20,              //  insertps      $0x20,%xmm3,%xmm2
-    0xf3,0x42,0x0f,0x10,0x1c,0x80,              //  movss         (%rax,%r8,4),%xmm3
-    0x66,0x0f,0x3a,0x21,0xd3,0x30,              //  insertps      $0x30,%xmm3,%xmm2
-    0x66,0x41,0x0f,0x72,0xd0,0x18,              //  psrld         $0x18,%xmm8
-    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
-    0xf3,0x0f,0x10,0x59,0x08,                   //  movss         0x8(%rcx),%xmm3
-    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
-    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_load_8888[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x0f,0x6f,0x1c,0xb8,                   //  movdqu        (%rax,%rdi,4),%xmm3
-    0x66,0x0f,0x6e,0x41,0x0c,                   //  movd          0xc(%rcx),%xmm0
-    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
-    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
-    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
-    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
-    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
-    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
-    0x66,0x0f,0xdb,0xc3,                        //  pand          %xmm3,%xmm0
-    0x0f,0x5b,0xc0,                             //  cvtdq2ps      %xmm0,%xmm0
-    0xf3,0x44,0x0f,0x10,0x41,0x08,              //  movss         0x8(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x0f,0x5b,0xd2,                             //  cvtdq2ps      %xmm2,%xmm2
-    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
-    0x66,0x0f,0x72,0xd3,0x18,                   //  psrld         $0x18,%xmm3
-    0x0f,0x5b,0xdb,                             //  cvtdq2ps      %xmm3,%xmm3
-    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_store_8888[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x44,0x0f,0x10,0x41,0x04,              //  movss         0x4(%rcx),%xmm8
-    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
-    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
-    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
-    0x66,0x45,0x0f,0x5b,0xd2,                   //  cvtps2dq      %xmm10,%xmm10
-    0x66,0x41,0x0f,0x72,0xf2,0x08,              //  pslld         $0x8,%xmm10
-    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
-    0x66,0x41,0x0f,0x72,0xf1,0x10,              //  pslld         $0x10,%xmm9
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x66,0x45,0x0f,0x5b,0xc0,                   //  cvtps2dq      %xmm8,%xmm8
-    0x66,0x41,0x0f,0x72,0xf0,0x18,              //  pslld         $0x18,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc1,                   //  por           %xmm9,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc2,                   //  por           %xmm10,%xmm8
-    0xf3,0x44,0x0f,0x7f,0x04,0xb8,              //  movdqu        %xmm8,(%rax,%rdi,4)
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_load_f16[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
-    0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
-    0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
-    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
-    0x66,0x0f,0x38,0x33,0xc1,                   //  pmovzxwd      %xmm1,%xmm0
-    0x66,0x0f,0x72,0xf0,0x0d,                   //  pslld         $0xd,%xmm0
-    0x66,0x0f,0x6e,0x51,0x38,                   //  movd          0x38(%rcx),%xmm2
-    0x66,0x44,0x0f,0x70,0xc2,0x00,              //  pshufd        $0x0,%xmm2,%xmm8
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x66,0x45,0x0f,0xef,0xc9,                   //  pxor          %xmm9,%xmm9
-    0x66,0x41,0x0f,0x69,0xc9,                   //  punpckhwd     %xmm9,%xmm1
-    0x66,0x0f,0x72,0xf1,0x0d,                   //  pslld         $0xd,%xmm1
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x66,0x0f,0x38,0x33,0xd3,                   //  pmovzxwd      %xmm3,%xmm2
-    0x66,0x0f,0x72,0xf2,0x0d,                   //  pslld         $0xd,%xmm2
-    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
-    0x66,0x41,0x0f,0x69,0xd9,                   //  punpckhwd     %xmm9,%xmm3
-    0x66,0x0f,0x72,0xf3,0x0d,                   //  pslld         $0xd,%xmm3
-    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_store_f16[] = {
-    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
-    0x66,0x44,0x0f,0x6e,0x41,0x3c,              //  movd          0x3c(%rcx),%xmm8
-    0x66,0x45,0x0f,0x70,0xc0,0x00,              //  pshufd        $0x0,%xmm8,%xmm8
-    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x66,0x41,0x0f,0x72,0xd1,0x0d,              //  psrld         $0xd,%xmm9
-    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
-    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
-    0x66,0x41,0x0f,0x72,0xd2,0x0d,              //  psrld         $0xd,%xmm10
-    0x66,0x45,0x0f,0x6f,0xd8,                   //  movdqa        %xmm8,%xmm11
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x66,0x41,0x0f,0x72,0xd3,0x0d,              //  psrld         $0xd,%xmm11
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x66,0x41,0x0f,0x72,0xd0,0x0d,              //  psrld         $0xd,%xmm8
-    0x66,0x41,0x0f,0x73,0xfa,0x02,              //  pslldq        $0x2,%xmm10
-    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
-    0x66,0x41,0x0f,0x73,0xf8,0x02,              //  pslldq        $0x2,%xmm8
-    0x66,0x45,0x0f,0xeb,0xc3,                   //  por           %xmm11,%xmm8
-    0x66,0x45,0x0f,0x6f,0xca,                   //  movdqa        %xmm10,%xmm9
-    0x66,0x45,0x0f,0x62,0xc8,                   //  punpckldq     %xmm8,%xmm9
-    0xf3,0x44,0x0f,0x7f,0x0c,0xf8,              //  movdqu        %xmm9,(%rax,%rdi,8)
-    0x66,0x45,0x0f,0x6a,0xd0,                   //  punpckhdq     %xmm8,%xmm10
-    0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10,         //  movdqu        %xmm10,0x10(%rax,%rdi,8)
-    0xc3,                                       //  return
-};
-static const unsigned char sse41_matrix_3x4[] = {
-    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
-    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
-    0xf3,0x0f,0x10,0x02,                        //  movss         (%rdx),%xmm0
-    0xf3,0x0f,0x10,0x4a,0x04,                   //  movss         0x4(%rdx),%xmm1
-    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
-    0xf3,0x44,0x0f,0x10,0x52,0x0c,              //  movss         0xc(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x18,              //  movss         0x18(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x24,              //  movss         0x24(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
-    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
-    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
-    0xf3,0x44,0x0f,0x10,0x52,0x10,              //  movss         0x10(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x1c,              //  movss         0x1c(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x28,              //  movss         0x28(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
-    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
-    0xf3,0x44,0x0f,0x10,0x52,0x08,              //  movss         0x8(%rdx),%xmm10
-    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
-    0xf3,0x44,0x0f,0x10,0x5a,0x14,              //  movss         0x14(%rdx),%xmm11
-    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
-    0xf3,0x44,0x0f,0x10,0x62,0x20,              //  movss         0x20(%rdx),%xmm12
-    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
-    0xf3,0x44,0x0f,0x10,0x6a,0x2c,              //  movss         0x2c(%rdx),%xmm13
-    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
-    0x44,0x0f,0x59,0xe2,                        //  mulps         %xmm2,%xmm12
-    0x45,0x0f,0x58,0xe5,                        //  addps         %xmm13,%xmm12
-    0x45,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm11
-    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
-    0x45,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm10
-    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
-    0x41,0x0f,0x28,0xd2,                        //  movaps        %xmm10,%xmm2
-    0xc3,                                       //  return
-};
-static const unsigned char hsw_inc_x[] = {
+
+#else
+
+static const unsigned char kSplice_inc_x[] = {
     0x48,0x83,0xc7,0x08,                        //  add           $0x8,%rdi
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_clear[] = {
+static const unsigned char kSplice_clear[] = {
     0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
     0xc5,0xf4,0x57,0xc9,                        //  vxorps        %ymm1,%ymm1,%ymm1
     0xc5,0xec,0x57,0xd2,                        //  vxorps        %ymm2,%ymm2,%ymm2
     0xc5,0xe4,0x57,0xdb,                        //  vxorps        %ymm3,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_plus_[] = {
+static const unsigned char kSplice_plus_[] = {
     0xc5,0xfc,0x58,0xc4,                        //  vaddps        %ymm4,%ymm0,%ymm0
     0xc5,0xf4,0x58,0xcd,                        //  vaddps        %ymm5,%ymm1,%ymm1
     0xc5,0xec,0x58,0xd6,                        //  vaddps        %ymm6,%ymm2,%ymm2
     0xc5,0xe4,0x58,0xdf,                        //  vaddps        %ymm7,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_srcover[] = {
+static const unsigned char kSplice_srcover[] = {
     0xc4,0x62,0x7d,0x18,0x01,                   //  vbroadcastss  (%rcx),%ymm8
     0xc5,0x3c,0x5c,0xc3,                        //  vsubps        %ymm3,%ymm8,%ymm8
     0xc4,0xc2,0x5d,0xb8,0xc0,                   //  vfmadd231ps   %ymm8,%ymm4,%ymm0
     0xc4,0xc2,0x55,0xb8,0xc8,                   //  vfmadd231ps   %ymm8,%ymm5,%ymm1
     0xc4,0xc2,0x4d,0xb8,0xd0,                   //  vfmadd231ps   %ymm8,%ymm6,%ymm2
     0xc4,0xc2,0x45,0xb8,0xd8,                   //  vfmadd231ps   %ymm8,%ymm7,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_dstover[] = {
+static const unsigned char kSplice_dstover[] = {
     0xc4,0x62,0x7d,0x18,0x01,                   //  vbroadcastss  (%rcx),%ymm8
     0xc5,0x3c,0x5c,0xc7,                        //  vsubps        %ymm7,%ymm8,%ymm8
     0xc4,0xc2,0x7d,0xb8,0xe0,                   //  vfmadd231ps   %ymm8,%ymm0,%ymm4
     0xc4,0xc2,0x75,0xb8,0xe8,                   //  vfmadd231ps   %ymm8,%ymm1,%ymm5
     0xc4,0xc2,0x6d,0xb8,0xf0,                   //  vfmadd231ps   %ymm8,%ymm2,%ymm6
     0xc4,0xc2,0x65,0xb8,0xf8,                   //  vfmadd231ps   %ymm8,%ymm3,%ymm7
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_clamp_0[] = {
+static const unsigned char kSplice_clamp_0[] = {
     0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
     0xc4,0xc1,0x7c,0x5f,0xc0,                   //  vmaxps        %ymm8,%ymm0,%ymm0
     0xc4,0xc1,0x74,0x5f,0xc8,                   //  vmaxps        %ymm8,%ymm1,%ymm1
     0xc4,0xc1,0x6c,0x5f,0xd0,                   //  vmaxps        %ymm8,%ymm2,%ymm2
     0xc4,0xc1,0x64,0x5f,0xd8,                   //  vmaxps        %ymm8,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_clamp_1[] = {
+static const unsigned char kSplice_clamp_1[] = {
     0xc4,0x62,0x7d,0x18,0x01,                   //  vbroadcastss  (%rcx),%ymm8
     0xc4,0xc1,0x7c,0x5d,0xc0,                   //  vminps        %ymm8,%ymm0,%ymm0
     0xc4,0xc1,0x74,0x5d,0xc8,                   //  vminps        %ymm8,%ymm1,%ymm1
     0xc4,0xc1,0x6c,0x5d,0xd0,                   //  vminps        %ymm8,%ymm2,%ymm2
     0xc4,0xc1,0x64,0x5d,0xd8,                   //  vminps        %ymm8,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_clamp_a[] = {
+static const unsigned char kSplice_clamp_a[] = {
     0xc4,0x62,0x7d,0x18,0x01,                   //  vbroadcastss  (%rcx),%ymm8
     0xc4,0xc1,0x64,0x5d,0xd8,                   //  vminps        %ymm8,%ymm3,%ymm3
     0xc5,0xfc,0x5d,0xc3,                        //  vminps        %ymm3,%ymm0,%ymm0
     0xc5,0xf4,0x5d,0xcb,                        //  vminps        %ymm3,%ymm1,%ymm1
     0xc5,0xec,0x5d,0xd3,                        //  vminps        %ymm3,%ymm2,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_swap[] = {
+static const unsigned char kSplice_swap[] = {
     0xc5,0x7c,0x28,0xc3,                        //  vmovaps       %ymm3,%ymm8
     0xc5,0x7c,0x28,0xca,                        //  vmovaps       %ymm2,%ymm9
     0xc5,0x7c,0x28,0xd1,                        //  vmovaps       %ymm1,%ymm10
@@ -1790,29 +856,29 @@
     0xc5,0x7c,0x29,0xd5,                        //  vmovaps       %ymm10,%ymm5
     0xc5,0x7c,0x29,0xce,                        //  vmovaps       %ymm9,%ymm6
     0xc5,0x7c,0x29,0xc7,                        //  vmovaps       %ymm8,%ymm7
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_move_src_dst[] = {
+static const unsigned char kSplice_move_src_dst[] = {
     0xc5,0xfc,0x28,0xe0,                        //  vmovaps       %ymm0,%ymm4
     0xc5,0xfc,0x28,0xe9,                        //  vmovaps       %ymm1,%ymm5
     0xc5,0xfc,0x28,0xf2,                        //  vmovaps       %ymm2,%ymm6
     0xc5,0xfc,0x28,0xfb,                        //  vmovaps       %ymm3,%ymm7
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_move_dst_src[] = {
+static const unsigned char kSplice_move_dst_src[] = {
     0xc5,0xfc,0x28,0xc4,                        //  vmovaps       %ymm4,%ymm0
     0xc5,0xfc,0x28,0xcd,                        //  vmovaps       %ymm5,%ymm1
     0xc5,0xfc,0x28,0xd6,                        //  vmovaps       %ymm6,%ymm2
     0xc5,0xfc,0x28,0xdf,                        //  vmovaps       %ymm7,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_premul[] = {
+static const unsigned char kSplice_premul[] = {
     0xc5,0xfc,0x59,0xc3,                        //  vmulps        %ymm3,%ymm0,%ymm0
     0xc5,0xf4,0x59,0xcb,                        //  vmulps        %ymm3,%ymm1,%ymm1
     0xc5,0xec,0x59,0xd3,                        //  vmulps        %ymm3,%ymm2,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_unpremul[] = {
+static const unsigned char kSplice_unpremul[] = {
     0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
     0xc4,0x41,0x64,0xc2,0xc8,0x00,              //  vcmpeqps      %ymm8,%ymm3,%ymm9
     0xc4,0x62,0x7d,0x18,0x11,                   //  vbroadcastss  (%rcx),%ymm10
@@ -1821,9 +887,9 @@
     0xc5,0xbc,0x59,0xc0,                        //  vmulps        %ymm0,%ymm8,%ymm0
     0xc5,0xbc,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm1
     0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_from_srgb[] = {
+static const unsigned char kSplice_from_srgb[] = {
     0xc4,0x62,0x7d,0x18,0x41,0x1c,              //  vbroadcastss  0x1c(%rcx),%ymm8
     0xc5,0x3c,0x59,0xc8,                        //  vmulps        %ymm0,%ymm8,%ymm9
     0xc5,0x7c,0x59,0xd0,                        //  vmulps        %ymm0,%ymm0,%ymm10
@@ -1849,9 +915,9 @@
     0xc4,0x42,0x35,0xa8,0xde,                   //  vfmadd213ps   %ymm14,%ymm9,%ymm11
     0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              //  vcmpltps      %ymm10,%ymm2,%ymm2
     0xc4,0xc3,0x25,0x4a,0xd0,0x20,              //  vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_to_srgb[] = {
+static const unsigned char kSplice_to_srgb[] = {
     0xc5,0x7c,0x52,0xc0,                        //  vrsqrtps      %ymm0,%ymm8
     0xc4,0x41,0x7c,0x53,0xc8,                   //  vrcpps        %ymm8,%ymm9
     0xc4,0x41,0x7c,0x52,0xd0,                   //  vrsqrtps      %ymm8,%ymm10
@@ -1885,9 +951,9 @@
     0xc5,0x3c,0x59,0xc2,                        //  vmulps        %ymm2,%ymm8,%ymm8
     0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              //  vcmpltps      %ymm10,%ymm2,%ymm2
     0xc4,0xc3,0x35,0x4a,0xd0,0x20,              //  vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_scale_u8[] = {
+static const unsigned char kSplice_scale_u8[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc4,0x62,0x7d,0x31,0x04,0x38,              //  vpmovzxbd     (%rax,%rdi,1),%ymm8
     0xc4,0x41,0x7c,0x5b,0xc0,                   //  vcvtdq2ps     %ymm8,%ymm8
@@ -1897,9 +963,9 @@
     0xc5,0xbc,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm1
     0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
     0xc5,0xbc,0x59,0xdb,                        //  vmulps        %ymm3,%ymm8,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_load_tables[] = {
+static const unsigned char kSplice_load_tables[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0x4c,0x8b,0x42,0x08,                        //  mov           0x8(%rdx),%r8
     0xc5,0xfc,0x10,0x1c,0xb8,                   //  vmovups       (%rax,%rdi,4),%ymm3
@@ -1922,9 +988,9 @@
     0xc5,0xfc,0x5b,0xdb,                        //  vcvtdq2ps     %ymm3,%ymm3
     0xc4,0x62,0x7d,0x18,0x41,0x08,              //  vbroadcastss  0x8(%rcx),%ymm8
     0xc4,0xc1,0x64,0x59,0xd8,                   //  vmulps        %ymm8,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_load_8888[] = {
+static const unsigned char kSplice_load_8888[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc5,0xfc,0x10,0x1c,0xb8,                   //  vmovups       (%rax,%rdi,4),%ymm3
     0xc4,0xe2,0x7d,0x18,0x51,0x0c,              //  vbroadcastss  0xc(%rcx),%ymm2
@@ -1943,9 +1009,9 @@
     0xc5,0xe5,0x72,0xd3,0x18,                   //  vpsrld        $0x18,%ymm3,%ymm3
     0xc5,0xfc,0x5b,0xdb,                        //  vcvtdq2ps     %ymm3,%ymm3
     0xc4,0xc1,0x64,0x59,0xd8,                   //  vmulps        %ymm8,%ymm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_store_8888[] = {
+static const unsigned char kSplice_store_8888[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc4,0x62,0x7d,0x18,0x41,0x04,              //  vbroadcastss  0x4(%rcx),%ymm8
     0xc5,0x3c,0x59,0xc8,                        //  vmulps        %ymm0,%ymm8,%ymm9
@@ -1963,9 +1029,9 @@
     0xc4,0x41,0x2d,0xeb,0xc0,                   //  vpor          %ymm8,%ymm10,%ymm8
     0xc4,0x41,0x35,0xeb,0xc0,                   //  vpor          %ymm8,%ymm9,%ymm8
     0xc5,0x7e,0x7f,0x04,0xb8,                   //  vmovdqu       %ymm8,(%rax,%rdi,4)
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_load_f16[] = {
+static const unsigned char kSplice_load_f16[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc5,0xfa,0x6f,0x04,0xf8,                   //  vmovdqu       (%rax,%rdi,8),%xmm0
     0xc5,0xfa,0x6f,0x4c,0xf8,0x10,              //  vmovdqu       0x10(%rax,%rdi,8),%xmm1
@@ -1987,9 +1053,9 @@
     0xc4,0xe2,0x7d,0x13,0xd2,                   //  vcvtph2ps     %xmm2,%ymm2
     0xc4,0xc1,0x39,0x6d,0xda,                   //  vpunpckhqdq   %xmm10,%xmm8,%xmm3
     0xc4,0xe2,0x7d,0x13,0xdb,                   //  vcvtph2ps     %xmm3,%ymm3
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_store_f16[] = {
+static const unsigned char kSplice_store_f16[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc4,0xc3,0x7d,0x1d,0xc0,0x04,              //  vcvtps2ph     $0x4,%ymm0,%xmm8
     0xc4,0xc3,0x7d,0x1d,0xc9,0x04,              //  vcvtps2ph     $0x4,%ymm1,%xmm9
@@ -2007,9 +1073,9 @@
     0xc5,0x7a,0x7f,0x4c,0xf8,0x20,              //  vmovdqu       %xmm9,0x20(%rax,%rdi,8)
     0xc4,0x41,0x39,0x6a,0xc2,                   //  vpunpckhdq    %xmm10,%xmm8,%xmm8
     0xc5,0x7a,0x7f,0x44,0xf8,0x30,              //  vmovdqu       %xmm8,0x30(%rax,%rdi,8)
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
-static const unsigned char hsw_matrix_3x4[] = {
+static const unsigned char kSplice_matrix_3x4[] = {
     0xc4,0x62,0x7d,0x18,0x0a,                   //  vbroadcastss  (%rdx),%ymm9
     0xc4,0x62,0x7d,0x18,0x52,0x0c,              //  vbroadcastss  0xc(%rdx),%ymm10
     0xc4,0x62,0x7d,0x18,0x5a,0x18,              //  vbroadcastss  0x18(%rdx),%ymm11
@@ -2034,6 +1100,9 @@
     0xc5,0x7c,0x29,0xc0,                        //  vmovaps       %ymm8,%ymm0
     0xc5,0x7c,0x29,0xc9,                        //  vmovaps       %ymm9,%ymm1
     0xc5,0x7c,0x29,0xd2,                        //  vmovaps       %ymm10,%ymm2
-    0xc3,                                       //  return
+    0xc3,                                       //  return        (synthetic)
 };
+
+#endif
+
 #endif//SkSplicer_generated_DEFINED
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index b54503b..a4fd97d 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -9,8 +9,6 @@
 import subprocess
 import sys
 
-sys.stdout = open('src/splicer/SkSplicer_generated.h', 'w')
-
 ndk = '/Users/mtklein/brew/opt/android-ndk/'
 objdump = 'gobjdump'
 
@@ -53,9 +51,8 @@
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'armv7.o'])
 
-def parse_object_file(dot_o, array_type, jump, ret, target=None):
-  prefix = dot_o.replace('.o', '_')
-  cmd = [ objdump, '-d', '--insn-width=8', dot_o]
+def parse_object_file(dst, dot_o, array_type, jump, ret, target=None):
+  cmd = [ objdump, '-d', dot_o]
   if target:
     cmd += ['--target', target]
   for line in subprocess.check_output(cmd).split('\n'):
@@ -66,7 +63,7 @@
     # E.g. 00000000000003a4 <_load_f16>:
     m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
     if m:
-      print 'static const', array_type, prefix + m.group(1) + '[] = {'
+      print >>dst,'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
       continue
 
     columns = line.split('\t')
@@ -87,15 +84,16 @@
     if code == jump:
       code = ret
       inst = 'return'
-      args = ''
+      args = '(synthetic)'
 
     hexed = ''.join('0x'+x+',' for x in code.split(' '))
-    print '    ' + hexed + ' '*(44-len(hexed)) + \
-          '//  ' + inst  + (' '*(14-len(inst)) + args if args else '')
+    print >>dst,'    ' + hexed + ' '*(44-len(hexed)) + \
+                '//  ' + inst  + ' '*(14-len(inst))  + args
     if code == ret:
-      print '};'
+      print >>dst,'};'
 
-print '''/*
+with open('src/splicer/SkSplicer_generated.h', 'w') as f:
+  print >>f,'''/*
  * Copyright 2017 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
@@ -107,11 +105,14 @@
 
 // This file is generated semi-automatically with this command:
 //   $ src/splicer/build_stages.py
+
+#if defined(__aarch64__)
 '''
-parse_object_file('aarch64.o', 'unsigned int', '14000000', 'd65f03c0')
-parse_object_file(  'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e',
+  parse_object_file(f, 'aarch64.o', 'unsigned int', '14000000', 'd65f03c0')
+  print >>f,'\n#elif defined(__ARM_NEON__)\n'
+  parse_object_file(f, 'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e',
                   target='elf32-littlearm')
-parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
-parse_object_file('sse41.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
-parse_object_file(  'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
-print '#endif//SkSplicer_generated_DEFINED'
+  print >>f,'\n#else\n'
+  parse_object_file(f, 'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
+  print >>f,'\n#endif\n'
+  print >>f,'#endif//SkSplicer_generated_DEFINED'