SkSplicer

I think I may have cracked the compile-ahead-of-time-splice-at-runtime nut.

This compiles stages ahead of time using clang, then splices them together at runtime.  This means the stages can be written in simple C++, with some mild restrictions.

This performs identically to our Xbyak experiment, and already supports more stages.  As written this stands alone from SkRasterPipeline_opts.h, but I'm fairly confident that the bulk (the STAGE implementations) can ultimately be shared.

As of PS 25 or so, this also supports all the stages used by bench/SkRasterPipelineBench.cpp:

    SkRasterPipeline_…
    400  …f16_compile 1x  …f16_run 1.38x  …srgb_compile 1.89x  …srgb_run 2.21x

That is, ~30% faster than baseline for f16, ~15% faster for sRGB.

Change-Id: I1ec7dcb769613713ce56978c58038f606f87d63d
Reviewed-on: https://skia-review.googlesource.com/6733
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/BUILD.gn b/BUILD.gn
index 82751cf..3ed9ab2 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -31,6 +31,7 @@
   skia_enable_discrete_gpu = true
   skia_enable_gpu = true
   skia_enable_pdf = true
+  skia_enable_splicer = false
   skia_enable_tools = is_skia_standalone
   skia_enable_vulkan_debug_layers = is_skia_standalone && is_debug
   skia_vulkan_sdk = getenv("VULKAN_SDK")
@@ -481,6 +482,15 @@
   ]
 }
 
+optional("splicer") {
+  enabled = skia_enable_splicer
+  public_defines = [ "SK_RASTER_PIPELINE_HAS_JIT" ]
+
+  sources = [
+    "src/splicer/SkSplicer.cpp",
+  ]
+}
+
 optional("typeface_freetype") {
   enabled = skia_use_freetype
 
@@ -509,7 +519,7 @@
 
 optional("xbyak") {
   enabled = skia_use_xbyak
-  public_defines = [ "SK_XBYAK" ]
+  public_defines = [ "SK_RASTER_PIPELINE_HAS_JIT" ]
 
   deps = [
     "//third_party/xbyak",
@@ -557,6 +567,7 @@
     ":pdf",
     ":png",
     ":raw",
+    ":splicer",
     ":sse2",
     ":sse41",
     ":sse42",
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
index 13a7cae..7c3536c 100644
--- a/src/core/SkRasterPipeline.cpp
+++ b/src/core/SkRasterPipeline.cpp
@@ -27,7 +27,7 @@
 }
 
 std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
-#ifdef SK_XBYAK
+#ifdef SK_RASTER_PIPELINE_HAS_JIT
     if (auto fn = this->jit()) {
         return fn;
     }
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
new file mode 100644
index 0000000..167a89a
--- /dev/null
+++ b/src/splicer/SkSplicer.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkCpu.h"
+#include "SkOpts.h"
+#include "SkRasterPipeline.h"
+#include "SkStream.h"
+#include <sys/mman.h>
+
+#include "SkSplicer_generated.h"
+#include "SkSplicer_shared.h"
+
+// Uncomment to dump output with IACA markers.
+// #define IACA_DUMP "/tmp/dump.o"
+// https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
+// $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.o | less
+
+namespace {
+
+    // Stages expect these constants to be set to these values.
+    // It's fine to rearrange and add new ones if you update SkSplicer_constants.
+    static const SkSplicer_constants kConstants = {
+        0x000000ff, 1.0f, 255.0f, 1/255.0f,
+        0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
+        12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
+    };
+
+    // Short x86-64 instruction sequences that we'll use as glue to splice together Stages.
+    static const uint8_t   vzeroupper[] = { 0xc5, 0xf8, 0x77 };        // clear top half of all ymm
+    static const uint8_t          ret[] = { 0xc3 };                    // return
+    static const uint8_t  movabsq_rcx[] = { 0x48, 0xb9 };              // move next 8 bytes into rcx
+    static const uint8_t  movabsq_rdx[] = { 0x48, 0xba };              // move next 8 bytes into rdx
+    static const uint8_t   addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };  // rdi += 8
+    static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 };        // rdi cmp? rsi
+    static const uint8_t      jb_near[] = { 0x0f, 0x8c };              // jump relative next 4 bytes
+                                                                       //  if cmp set unsigned < bit
+
+    // We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
+    template <typename T>
+    void splice(SkWStream* stream, const T& val) {
+        stream->write(&val, sizeof(val));
+    }
+
+#ifdef IACA_DUMP
+    static const uint8_t      ud2[] = { 0x0f, 0x0b };         // undefined... crashes when run
+    static const uint8_t     nop3[] = { 0x64, 0x67, 0x90 };   // 3 byte no-op
+    static const uint8_t movl_ebx[] = { 0xbb };               // move next 4 bytes into ebx
+
+    static void iaca_start(SkWStream* stream) {
+        splice(stream, ud2);
+        splice(stream, movl_ebx);
+        splice(stream, 111);
+        splice(stream, nop3);
+    }
+    static void iaca_end(SkWStream* stream) {
+        splice(stream, movl_ebx);
+        splice(stream, 222);
+        splice(stream, nop3);
+        splice(stream, ud2);
+    }
+#else
+    static void iaca_start(SkWStream*) {}
+    static void iaca_end  (SkWStream*) {}
+#endif
+
+    // Copy len bytes from src to memory that's executable.  cleanup with cleanup_executable_mem().
+    static void* copy_to_executable_mem(const void* src, size_t len) {
+        if (src && len) {
+            // TODO: w^x
+            auto fn = mmap(nullptr, len, PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
+            return memcpy(fn, src, len);
+        }
+        return nullptr;
+    }
+    static void cleanup_executable_mem(void* fn, size_t len) {
+        if (fn) {
+            munmap(fn, len);
+        }
+    }
+
+    struct Spliced {
+
+        Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
+            // We always create a backup interpreter pipeline,
+            //   - to handle any program we can't, and
+            //   - to handle the n < 8 tails.
+            fBackup     = SkOpts::compile_pipeline(stages, nstages);
+            fSplicedLen = 0;
+            fSpliced    = nullptr;
+            // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
+
+            // To keep things simple, only one target supported: Haswell+ x86-64.
+            if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
+                return;
+            }
+
+            SkDynamicMemoryWStream buf;
+
+            // Put the address of kConstants in rcx, Stage argument 4 "k".
+            splice(&buf, movabsq_rcx);
+            splice(&buf, &kConstants);
+
+            // We'll loop back to here as long as x<n after x+=8.
+            iaca_start(&buf);
+            auto loop_start = buf.bytesWritten();  // Think of this like a label, loop_start:
+
+            for (int i = 0; i < nstages; i++) {
+                // If a stage has a context pointer, load it into rdx, Stage argument 3 "ctx".
+                if (stages[i].ctx) {
+                    splice(&buf, movabsq_rdx);
+                    splice(&buf, stages[i].ctx);
+                }
+
+                // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
+                switch(stages[i].stage) {
+                    case SkRasterPipeline::clear:        splice(&buf, kSplice_clear       ); break;
+                    case SkRasterPipeline::plus_:        splice(&buf, kSplice_plus        ); break;
+                    case SkRasterPipeline::srcover:      splice(&buf, kSplice_srcover     ); break;
+                    case SkRasterPipeline::dstover:      splice(&buf, kSplice_dstover     ); break;
+                    case SkRasterPipeline::clamp_0:      splice(&buf, kSplice_clamp_0     ); break;
+                    case SkRasterPipeline::clamp_1:      splice(&buf, kSplice_clamp_1     ); break;
+                    case SkRasterPipeline::clamp_a:      splice(&buf, kSplice_clamp_a     ); break;
+                    case SkRasterPipeline::swap:         splice(&buf, kSplice_swap        ); break;
+                    case SkRasterPipeline::move_src_dst: splice(&buf, kSplice_move_src_dst); break;
+                    case SkRasterPipeline::move_dst_src: splice(&buf, kSplice_move_dst_src); break;
+                    case SkRasterPipeline::premul:       splice(&buf, kSplice_premul      ); break;
+                    case SkRasterPipeline::unpremul:     splice(&buf, kSplice_unpremul    ); break;
+                    case SkRasterPipeline::from_srgb:    splice(&buf, kSplice_from_srgb   ); break;
+                    case SkRasterPipeline::to_srgb:      splice(&buf, kSplice_to_srgb     ); break;
+                    case SkRasterPipeline::scale_u8:     splice(&buf, kSplice_scale_u8    ); break;
+                    case SkRasterPipeline::load_8888:    splice(&buf, kSplice_load_8888   ); break;
+                    case SkRasterPipeline::store_8888:   splice(&buf, kSplice_store_8888  ); break;
+                    case SkRasterPipeline::load_f16:     splice(&buf, kSplice_load_f16    ); break;
+                    case SkRasterPipeline::store_f16:    splice(&buf, kSplice_store_f16   ); break;
+
+                    // No joy (probably just not yet implemented).
+                    default:
+                        //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
+                        return;
+                }
+            }
+
+            // See if we should loop back to handle more pixels.
+            splice(&buf, addq_8_rdi);    // x += 8
+            splice(&buf, cmpq_rsi_rdi);  // if (x < n)
+            splice(&buf, jb_near);       //     goto loop_start;
+            splice(&buf, (int)loop_start - (int)(buf.bytesWritten() + 4));
+            iaca_end(&buf);
+
+            // Nope!  We're done.
+            splice(&buf, vzeroupper);
+            splice(&buf, ret);
+
+            auto data = buf.detachAsData();
+            fSplicedLen = data->size();
+            fSpliced    = copy_to_executable_mem(data->data(), fSplicedLen);
+
+        #ifdef IACA_DUMP
+            SkFILEWStream(IACA_DUMP).write(data->data(), data->size());
+        #endif
+        }
+
+        // Spliced is stored in a std::function, so it needs to be copyable.
+        Spliced(const Spliced& o) : fBackup    (o.fBackup)
+                                  , fSplicedLen(o.fSplicedLen)
+                                  , fSpliced   (copy_to_executable_mem(o.fSpliced, fSplicedLen)) {}
+
+        ~Spliced() {
+            cleanup_executable_mem(fSpliced, fSplicedLen);
+        }
+
+        // Here's where we call fSpliced if we created it, fBackup if not.
+        void operator()(size_t x, size_t y, size_t n) const {
+            // TODO: The looping logic is probably not correct for handling n<8 tails.
+            if (fSpliced) {
+                // TODO: At some point we will want to pass in y...
+                using Fn = void(size_t x, size_t n);
+                ((Fn*)fSpliced)(x,n);
+
+                // Fall through to fBackup for any n<8 last pixels.
+                size_t body = n/8*8;
+                x += body;
+                n -= body;
+            }
+            fBackup(x,y,n);
+        }
+
+        std::function<void(size_t, size_t, size_t)> fBackup;
+        size_t                                      fSplicedLen;
+        void*                                       fSpliced;
+    };
+
+}
+
+std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
+    return Spliced(fStages.data(), SkToInt(fStages.size()));
+}
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
new file mode 100644
index 0000000..ab04583
--- /dev/null
+++ b/src/splicer/SkSplicer_generated.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkSplicer_generated_DEFINED
+#define SkSplicer_generated_DEFINED
+
+// This file is generated semi-automatically with this command:
+//   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+
+static const unsigned char kSplice_clear[] = {
+    0xc5,0xfc,0x57,0xc0,                        // vxorps       %ymm0, %ymm0, %ymm0
+    0xc5,0xf4,0x57,0xc9,                        // vxorps       %ymm1, %ymm1, %ymm1
+    0xc5,0xec,0x57,0xd2,                        // vxorps       %ymm2, %ymm2, %ymm2
+    0xc5,0xe4,0x57,0xdb,                        // vxorps       %ymm3, %ymm3, %ymm3
+};
+static const unsigned char kSplice_plus[] = {
+    0xc5,0xfc,0x58,0xc4,                        // vaddps       %ymm4, %ymm0, %ymm0
+    0xc5,0xf4,0x58,0xcd,                        // vaddps       %ymm5, %ymm1, %ymm1
+    0xc5,0xec,0x58,0xd6,                        // vaddps       %ymm6, %ymm2, %ymm2
+    0xc5,0xe4,0x58,0xdf,                        // vaddps       %ymm7, %ymm3, %ymm3
+};
+static const unsigned char kSplice_srcover[] = {
+    0xc4,0x62,0x7d,0x18,0x41,0x04,              // vbroadcastss 0x4(%rcx), %ymm8
+    0xc5,0x3c,0x5c,0xc3,                        // vsubps       %ymm3, %ymm8, %ymm8
+    0xc4,0xc2,0x5d,0xb8,0xc0,                   // vfmadd231ps  %ymm8, %ymm4, %ymm0
+    0xc4,0xc2,0x55,0xb8,0xc8,                   // vfmadd231ps  %ymm8, %ymm5, %ymm1
+    0xc4,0xc2,0x4d,0xb8,0xd0,                   // vfmadd231ps  %ymm8, %ymm6, %ymm2
+    0xc4,0xc2,0x4d,0xb8,0xd8,                   // vfmadd231ps  %ymm8, %ymm6, %ymm3
+};
+static const unsigned char kSplice_dstover[] = {
+    0xc4,0x62,0x7d,0x18,0x41,0x04,              // vbroadcastss 0x4(%rcx), %ymm8
+    0xc5,0x3c,0x5c,0xc7,                        // vsubps       %ymm7, %ymm8, %ymm8
+    0xc4,0xc2,0x7d,0xb8,0xe0,                   // vfmadd231ps  %ymm8, %ymm0, %ymm4
+    0xc4,0xc2,0x75,0xb8,0xe8,                   // vfmadd231ps  %ymm8, %ymm1, %ymm5
+    0xc4,0xc2,0x6d,0xb8,0xf0,                   // vfmadd231ps  %ymm8, %ymm2, %ymm6
+    0xc4,0xc2,0x6d,0xb8,0xf8,                   // vfmadd231ps  %ymm8, %ymm2, %ymm7
+};
+static const unsigned char kSplice_clamp_0[] = {
+    0xc4,0x41,0x3c,0x57,0xc0,                   // vxorps       %ymm8, %ymm8, %ymm8
+    0xc4,0xc1,0x7c,0x5f,0xc0,                   // vmaxps       %ymm8, %ymm0, %ymm0
+    0xc4,0xc1,0x74,0x5f,0xc8,                   // vmaxps       %ymm8, %ymm1, %ymm1
+    0xc4,0xc1,0x6c,0x5f,0xd0,                   // vmaxps       %ymm8, %ymm2, %ymm2
+    0xc4,0xc1,0x64,0x5f,0xd8,                   // vmaxps       %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_clamp_1[] = {
+    0xc4,0x62,0x7d,0x18,0x41,0x04,              // vbroadcastss 0x4(%rcx), %ymm8
+    0xc4,0xc1,0x7c,0x5d,0xc0,                   // vminps       %ymm8, %ymm0, %ymm0
+    0xc4,0xc1,0x74,0x5d,0xc8,                   // vminps       %ymm8, %ymm1, %ymm1
+    0xc4,0xc1,0x6c,0x5d,0xd0,                   // vminps       %ymm8, %ymm2, %ymm2
+    0xc4,0xc1,0x64,0x5d,0xd8,                   // vminps       %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_clamp_a[] = {
+    0xc4,0x62,0x7d,0x18,0x41,0x04,              // vbroadcastss 0x4(%rcx), %ymm8
+    0xc4,0xc1,0x64,0x5d,0xd8,                   // vminps       %ymm8, %ymm3, %ymm3
+    0xc5,0xfc,0x5d,0xc3,                        // vminps       %ymm3, %ymm0, %ymm0
+    0xc5,0xf4,0x5d,0xcb,                        // vminps       %ymm3, %ymm1, %ymm1
+    0xc5,0xec,0x5d,0xd3,                        // vminps       %ymm3, %ymm2, %ymm2
+};
+static const unsigned char kSplice_swap[] = {
+    0xc5,0x7c,0x28,0xc3,                        // vmovaps      %ymm3, %ymm8
+    0xc5,0x7c,0x28,0xca,                        // vmovaps      %ymm2, %ymm9
+    0xc5,0x7c,0x28,0xd1,                        // vmovaps      %ymm1, %ymm10
+    0xc5,0x7c,0x28,0xd8,                        // vmovaps      %ymm0, %ymm11
+    0xc5,0xfc,0x28,0xc4,                        // vmovaps      %ymm4, %ymm0
+    0xc5,0xfc,0x28,0xcd,                        // vmovaps      %ymm5, %ymm1
+    0xc5,0xfc,0x28,0xd6,                        // vmovaps      %ymm6, %ymm2
+    0xc5,0xfc,0x28,0xdf,                        // vmovaps      %ymm7, %ymm3
+    0xc5,0x7c,0x29,0xdc,                        // vmovaps      %ymm11, %ymm4
+    0xc5,0x7c,0x29,0xd5,                        // vmovaps      %ymm10, %ymm5
+    0xc5,0x7c,0x29,0xce,                        // vmovaps      %ymm9, %ymm6
+    0xc5,0x7c,0x29,0xc7,                        // vmovaps      %ymm8, %ymm7
+};
+static const unsigned char kSplice_move_src_dst[] = {
+    0xc5,0xfc,0x28,0xe0,                        // vmovaps      %ymm0, %ymm4
+    0xc5,0xfc,0x28,0xe9,                        // vmovaps      %ymm1, %ymm5
+    0xc5,0xfc,0x28,0xf2,                        // vmovaps      %ymm2, %ymm6
+    0xc5,0xfc,0x28,0xfb,                        // vmovaps      %ymm3, %ymm7
+};
+static const unsigned char kSplice_move_dst_src[] = {
+    0xc5,0xfc,0x28,0xc4,                        // vmovaps      %ymm4, %ymm0
+    0xc5,0xfc,0x28,0xcd,                        // vmovaps      %ymm5, %ymm1
+    0xc5,0xfc,0x28,0xd6,                        // vmovaps      %ymm6, %ymm2
+    0xc5,0xfc,0x28,0xdf,                        // vmovaps      %ymm7, %ymm3
+};
+static const unsigned char kSplice_premul[] = {
+    0xc5,0xfc,0x59,0xc3,                        // vmulps       %ymm3, %ymm0, %ymm0
+    0xc5,0xf4,0x59,0xcb,                        // vmulps       %ymm3, %ymm1, %ymm1
+    0xc5,0xec,0x59,0xd3,                        // vmulps       %ymm3, %ymm2, %ymm2
+};
+static const unsigned char kSplice_unpremul[] = {
+    0xc4,0x41,0x3c,0x57,0xc0,                   // vxorps       %ymm8, %ymm8, %ymm8
+    0xc4,0x41,0x64,0xc2,0xc8,0x00,              // vcmpeqps     %ymm8, %ymm3, %ymm9
+    0xc4,0x62,0x7d,0x18,0x51,0x04,              // vbroadcastss 0x4(%rcx), %ymm10
+    0xc5,0x2c,0x5e,0xd3,                        // vdivps       %ymm3, %ymm10, %ymm10
+    0xc4,0x43,0x2d,0x4a,0xc0,0x90,              // vblendvps    %ymm9, %ymm8, %ymm10, %ymm8
+    0xc5,0xbc,0x59,0xc0,                        // vmulps       %ymm0, %ymm8, %ymm0
+    0xc5,0xbc,0x59,0xc9,                        // vmulps       %ymm1, %ymm8, %ymm1
+    0xc5,0xbc,0x59,0xd2,                        // vmulps       %ymm2, %ymm8, %ymm2
+};
+static const unsigned char kSplice_from_srgb[] = {
+    0xc4,0x62,0x7d,0x18,0x41,0x1c,              // vbroadcastss 0x1c(%rcx), %ymm8
+    0xc5,0x3c,0x59,0xc8,                        // vmulps       %ymm0, %ymm8, %ymm9
+    0xc5,0x7c,0x59,0xd0,                        // vmulps       %ymm0, %ymm0, %ymm10
+    0xc4,0x62,0x7d,0x18,0x59,0x18,              // vbroadcastss 0x18(%rcx), %ymm11
+    0xc4,0x62,0x7d,0x18,0x61,0x14,              // vbroadcastss 0x14(%rcx), %ymm12
+    0xc4,0x41,0x7c,0x28,0xeb,                   // vmovaps      %ymm11, %ymm13
+    0xc4,0x42,0x7d,0xa8,0xec,                   // vfmadd213ps  %ymm12, %ymm0, %ymm13
+    0xc4,0x62,0x7d,0x18,0x71,0x10,              // vbroadcastss 0x10(%rcx), %ymm14
+    0xc4,0x42,0x2d,0xa8,0xee,                   // vfmadd213ps  %ymm14, %ymm10, %ymm13
+    0xc4,0x62,0x7d,0x18,0x51,0x20,              // vbroadcastss 0x20(%rcx), %ymm10
+    0xc4,0xc1,0x7c,0xc2,0xc2,0x01,              // vcmpltps     %ymm10, %ymm0, %ymm0
+    0xc4,0xc3,0x15,0x4a,0xc1,0x00,              // vblendvps    %ymm0, %ymm9, %ymm13, %ymm0
+    0xc5,0x3c,0x59,0xc9,                        // vmulps       %ymm1, %ymm8, %ymm9
+    0xc5,0x74,0x59,0xe9,                        // vmulps       %ymm1, %ymm1, %ymm13
+    0xc4,0x41,0x7c,0x28,0xfb,                   // vmovaps      %ymm11, %ymm15
+    0xc4,0x42,0x75,0xa8,0xfc,                   // vfmadd213ps  %ymm12, %ymm1, %ymm15
+    0xc4,0x42,0x15,0xa8,0xfe,                   // vfmadd213ps  %ymm14, %ymm13, %ymm15
+    0xc4,0xc1,0x74,0xc2,0xca,0x01,              // vcmpltps     %ymm10, %ymm1, %ymm1
+    0xc4,0xc3,0x05,0x4a,0xc9,0x10,              // vblendvps    %ymm1, %ymm9, %ymm15, %ymm1
+    0xc5,0x3c,0x59,0xc2,                        // vmulps       %ymm2, %ymm8, %ymm8
+    0xc5,0x6c,0x59,0xca,                        // vmulps       %ymm2, %ymm2, %ymm9
+    0xc4,0x42,0x6d,0xa8,0xdc,                   // vfmadd213ps  %ymm12, %ymm2, %ymm11
+    0xc4,0x42,0x35,0xa8,0xde,                   // vfmadd213ps  %ymm14, %ymm9, %ymm11
+    0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              // vcmpltps     %ymm10, %ymm2, %ymm2
+    0xc4,0xc3,0x25,0x4a,0xd0,0x20,              // vblendvps    %ymm2, %ymm8, %ymm11, %ymm2
+};
+static const unsigned char kSplice_to_srgb[] = {
+    0xc5,0x7c,0x52,0xc0,                        // vrsqrtps     %ymm0, %ymm8
+    0xc4,0x41,0x7c,0x53,0xc8,                   // vrcpps       %ymm8, %ymm9
+    0xc4,0x41,0x7c,0x52,0xd0,                   // vrsqrtps     %ymm8, %ymm10
+    0xc4,0x62,0x7d,0x18,0x41,0x24,              // vbroadcastss 0x24(%rcx), %ymm8
+    0xc5,0x3c,0x59,0xd8,                        // vmulps       %ymm0, %ymm8, %ymm11
+    0xc4,0x62,0x7d,0x18,0x61,0x04,              // vbroadcastss 0x4(%rcx), %ymm12
+    0xc4,0x62,0x7d,0x18,0x69,0x28,              // vbroadcastss 0x28(%rcx), %ymm13
+    0xc4,0x62,0x7d,0x18,0x71,0x2c,              // vbroadcastss 0x2c(%rcx), %ymm14
+    0xc4,0x62,0x7d,0x18,0x79,0x30,              // vbroadcastss 0x30(%rcx), %ymm15
+    0xc4,0x42,0x0d,0xa8,0xcf,                   // vfmadd213ps  %ymm15, %ymm14, %ymm9
+    0xc4,0x42,0x15,0xb8,0xca,                   // vfmadd231ps  %ymm10, %ymm13, %ymm9
+    0xc4,0x41,0x1c,0x5d,0xc9,                   // vminps       %ymm9, %ymm12, %ymm9
+    0xc4,0x62,0x7d,0x18,0x51,0x34,              // vbroadcastss 0x34(%rcx), %ymm10
+    0xc4,0xc1,0x7c,0xc2,0xc2,0x01,              // vcmpltps     %ymm10, %ymm0, %ymm0
+    0xc4,0xc3,0x35,0x4a,0xc3,0x00,              // vblendvps    %ymm0, %ymm11, %ymm9, %ymm0
+    0xc5,0x7c,0x52,0xc9,                        // vrsqrtps     %ymm1, %ymm9
+    0xc4,0x41,0x7c,0x53,0xd9,                   // vrcpps       %ymm9, %ymm11
+    0xc4,0x41,0x7c,0x52,0xc9,                   // vrsqrtps     %ymm9, %ymm9
+    0xc4,0x42,0x0d,0xa8,0xdf,                   // vfmadd213ps  %ymm15, %ymm14, %ymm11
+    0xc4,0x42,0x15,0xb8,0xd9,                   // vfmadd231ps  %ymm9, %ymm13, %ymm11
+    0xc5,0x3c,0x59,0xc9,                        // vmulps       %ymm1, %ymm8, %ymm9
+    0xc4,0x41,0x1c,0x5d,0xdb,                   // vminps       %ymm11, %ymm12, %ymm11
+    0xc4,0xc1,0x74,0xc2,0xca,0x01,              // vcmpltps     %ymm10, %ymm1, %ymm1
+    0xc4,0xc3,0x25,0x4a,0xc9,0x10,              // vblendvps    %ymm1, %ymm9, %ymm11, %ymm1
+    0xc5,0x7c,0x52,0xca,                        // vrsqrtps     %ymm2, %ymm9
+    0xc4,0x41,0x7c,0x53,0xd9,                   // vrcpps       %ymm9, %ymm11
+    0xc4,0x42,0x0d,0xa8,0xdf,                   // vfmadd213ps  %ymm15, %ymm14, %ymm11
+    0xc4,0x41,0x7c,0x52,0xc9,                   // vrsqrtps     %ymm9, %ymm9
+    0xc4,0x42,0x15,0xb8,0xd9,                   // vfmadd231ps  %ymm9, %ymm13, %ymm11
+    0xc4,0x41,0x1c,0x5d,0xcb,                   // vminps       %ymm11, %ymm12, %ymm9
+    0xc5,0x3c,0x59,0xc2,                        // vmulps       %ymm2, %ymm8, %ymm8
+    0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              // vcmpltps     %ymm10, %ymm2, %ymm2
+    0xc4,0xc3,0x35,0x4a,0xd0,0x20,              // vblendvps    %ymm2, %ymm8, %ymm9, %ymm2
+};
+static const unsigned char kSplice_scale_u8[] = {
+    0x48,0x8b,0x02,                             // movq         (%rdx), %rax
+    0xc4,0x62,0x7d,0x31,0x04,0x38,              // vpmovzxbd    (%rax,%rdi), %ymm8
+    0xc4,0x41,0x7c,0x5b,0xc0,                   // vcvtdq2ps    %ymm8, %ymm8
+    0xc4,0x62,0x7d,0x18,0x49,0x0c,              // vbroadcastss 0xc(%rcx), %ymm9
+    0xc4,0x41,0x3c,0x59,0xc1,                   // vmulps       %ymm9, %ymm8, %ymm8
+    0xc5,0xbc,0x59,0xc0,                        // vmulps       %ymm0, %ymm8, %ymm0
+    0xc5,0xbc,0x59,0xc9,                        // vmulps       %ymm1, %ymm8, %ymm1
+    0xc5,0xbc,0x59,0xd2,                        // vmulps       %ymm2, %ymm8, %ymm2
+    0xc5,0xbc,0x59,0xdb,                        // vmulps       %ymm3, %ymm8, %ymm3
+};
+static const unsigned char kSplice_load_8888[] = {
+    0x48,0x8b,0x02,                             // movq         (%rdx), %rax
+    0xc5,0xfc,0x10,0x1c,0xb8,                   // vmovups      (%rax,%rdi,4), %ymm3
+    0xc4,0xe2,0x7d,0x18,0x11,                   // vbroadcastss (%rcx), %ymm2
+    0xc5,0xec,0x54,0xc3,                        // vandps       %ymm3, %ymm2, %ymm0
+    0xc5,0xfc,0x5b,0xc0,                        // vcvtdq2ps    %ymm0, %ymm0
+    0xc4,0x62,0x7d,0x18,0x41,0x0c,              // vbroadcastss 0xc(%rcx), %ymm8
+    0xc5,0xbc,0x59,0xc0,                        // vmulps       %ymm0, %ymm8, %ymm0
+    0xc5,0xf5,0x72,0xd3,0x08,                   // vpsrld       $0x8, %ymm3, %ymm1
+    0xc5,0xec,0x54,0xc9,                        // vandps       %ymm1, %ymm2, %ymm1
+    0xc5,0xfc,0x5b,0xc9,                        // vcvtdq2ps    %ymm1, %ymm1
+    0xc5,0xbc,0x59,0xc9,                        // vmulps       %ymm1, %ymm8, %ymm1
+    0xc5,0xb5,0x72,0xd3,0x10,                   // vpsrld       $0x10, %ymm3, %ymm9
+    0xc4,0xc1,0x6c,0x54,0xd1,                   // vandps       %ymm9, %ymm2, %ymm2
+    0xc5,0xfc,0x5b,0xd2,                        // vcvtdq2ps    %ymm2, %ymm2
+    0xc5,0xbc,0x59,0xd2,                        // vmulps       %ymm2, %ymm8, %ymm2
+    0xc5,0xe5,0x72,0xd3,0x18,                   // vpsrld       $0x18, %ymm3, %ymm3
+    0xc5,0xfc,0x5b,0xdb,                        // vcvtdq2ps    %ymm3, %ymm3
+    0xc4,0xc1,0x64,0x59,0xd8,                   // vmulps       %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_store_8888[] = {
+    0x48,0x8b,0x02,                             // movq         (%rdx), %rax
+    0xc4,0x62,0x7d,0x18,0x41,0x08,              // vbroadcastss 0x8(%rcx), %ymm8
+    0xc5,0x3c,0x59,0xc8,                        // vmulps       %ymm0, %ymm8, %ymm9
+    0xc4,0x41,0x7d,0x5b,0xc9,                   // vcvtps2dq    %ymm9, %ymm9
+    0xc5,0x3c,0x59,0xd1,                        // vmulps       %ymm1, %ymm8, %ymm10
+    0xc4,0x41,0x7d,0x5b,0xd2,                   // vcvtps2dq    %ymm10, %ymm10
+    0xc4,0xc1,0x2d,0x72,0xf2,0x08,              // vpslld       $0x8, %ymm10, %ymm10
+    0xc4,0x41,0x2d,0xeb,0xc9,                   // vpor         %ymm9, %ymm10, %ymm9
+    0xc5,0x3c,0x59,0xd2,                        // vmulps       %ymm2, %ymm8, %ymm10
+    0xc4,0x41,0x7d,0x5b,0xd2,                   // vcvtps2dq    %ymm10, %ymm10
+    0xc4,0xc1,0x2d,0x72,0xf2,0x10,              // vpslld       $0x10, %ymm10, %ymm10
+    0xc5,0x3c,0x59,0xc3,                        // vmulps       %ymm3, %ymm8, %ymm8
+    0xc4,0x41,0x7d,0x5b,0xc0,                   // vcvtps2dq    %ymm8, %ymm8
+    0xc4,0xc1,0x3d,0x72,0xf0,0x18,              // vpslld       $0x18, %ymm8, %ymm8
+    0xc4,0x41,0x2d,0xeb,0xc0,                   // vpor         %ymm8, %ymm10, %ymm8
+    0xc4,0x41,0x35,0xeb,0xc0,                   // vpor         %ymm8, %ymm9, %ymm8
+    0xc5,0x7e,0x7f,0x04,0xb8,                   // vmovdqu      %ymm8, (%rax,%rdi,4)
+};
+static const unsigned char kSplice_load_f16[] = {
+    0x48,0x8b,0x02,                             // movq         (%rdx), %rax
+    0xc5,0xfa,0x6f,0x04,0xf8,                   // vmovdqu      (%rax,%rdi,8), %xmm0
+    0xc5,0xfa,0x6f,0x4c,0xf8,0x10,              // vmovdqu      0x10(%rax,%rdi,8), %xmm1
+    0xc5,0xfa,0x6f,0x54,0xf8,0x20,              // vmovdqu      0x20(%rax,%rdi,8), %xmm2
+    0xc5,0xfa,0x6f,0x5c,0xf8,0x30,              // vmovdqu      0x30(%rax,%rdi,8), %xmm3
+    0xc5,0x79,0x61,0xc1,                        // vpunpcklwd   %xmm1, %xmm0, %xmm8
+    0xc5,0xf9,0x69,0xc1,                        // vpunpckhwd   %xmm1, %xmm0, %xmm0
+    0xc5,0xe9,0x61,0xcb,                        // vpunpcklwd   %xmm3, %xmm2, %xmm1
+    0xc5,0xe9,0x69,0xd3,                        // vpunpckhwd   %xmm3, %xmm2, %xmm2
+    0xc5,0x39,0x61,0xc8,                        // vpunpcklwd   %xmm0, %xmm8, %xmm9
+    0xc5,0x39,0x69,0xc0,                        // vpunpckhwd   %xmm0, %xmm8, %xmm8
+    0xc5,0xf1,0x61,0xda,                        // vpunpcklwd   %xmm2, %xmm1, %xmm3
+    0xc5,0x71,0x69,0xd2,                        // vpunpckhwd   %xmm2, %xmm1, %xmm10
+    0xc5,0xb1,0x6c,0xc3,                        // vpunpcklqdq  %xmm3, %xmm9, %xmm0
+    0xc4,0xe2,0x7d,0x13,0xc0,                   // vcvtph2ps    %xmm0, %ymm0
+    0xc5,0xb1,0x6d,0xcb,                        // vpunpckhqdq  %xmm3, %xmm9, %xmm1
+    0xc4,0xe2,0x7d,0x13,0xc9,                   // vcvtph2ps    %xmm1, %ymm1
+    0xc4,0xc1,0x39,0x6c,0xd2,                   // vpunpcklqdq  %xmm10, %xmm8, %xmm2
+    0xc4,0xe2,0x7d,0x13,0xd2,                   // vcvtph2ps    %xmm2, %ymm2
+    0xc4,0xc1,0x39,0x6d,0xda,                   // vpunpckhqdq  %xmm10, %xmm8, %xmm3
+    0xc4,0xe2,0x7d,0x13,0xdb,                   // vcvtph2ps    %xmm3, %ymm3
+};
+static const unsigned char kSplice_store_f16[] = {
+    0x48,0x8b,0x02,                             // movq         (%rdx), %rax
+    0xc4,0xc3,0x7d,0x1d,0xc0,0x04,              // vcvtps2ph    $0x4, %ymm0, %xmm8
+    0xc4,0xc3,0x7d,0x1d,0xc9,0x04,              // vcvtps2ph    $0x4, %ymm1, %xmm9
+    0xc4,0xc3,0x7d,0x1d,0xd2,0x04,              // vcvtps2ph    $0x4, %ymm2, %xmm10
+    0xc4,0xc3,0x7d,0x1d,0xdb,0x04,              // vcvtps2ph    $0x4, %ymm3, %xmm11
+    0xc4,0x41,0x39,0x61,0xe1,                   // vpunpcklwd   %xmm9, %xmm8, %xmm12
+    0xc4,0x41,0x39,0x69,0xc1,                   // vpunpckhwd   %xmm9, %xmm8, %xmm8
+    0xc4,0x41,0x29,0x61,0xcb,                   // vpunpcklwd   %xmm11, %xmm10, %xmm9
+    0xc4,0x41,0x29,0x69,0xd3,                   // vpunpckhwd   %xmm11, %xmm10, %xmm10
+    0xc4,0x41,0x19,0x62,0xd9,                   // vpunpckldq   %xmm9, %xmm12, %xmm11
+    0xc5,0x7a,0x7f,0x1c,0xf8,                   // vmovdqu      %xmm11, (%rax,%rdi,8)
+    0xc4,0x41,0x19,0x6a,0xc9,                   // vpunpckhdq   %xmm9, %xmm12, %xmm9
+    0xc5,0x7a,0x7f,0x4c,0xf8,0x10,              // vmovdqu      %xmm9, 0x10(%rax,%rdi,8)
+    0xc4,0x41,0x39,0x62,0xca,                   // vpunpckldq   %xmm10, %xmm8, %xmm9
+    0xc5,0x7a,0x7f,0x4c,0xf8,0x20,              // vmovdqu      %xmm9, 0x20(%rax,%rdi,8)
+    0xc4,0x41,0x39,0x6a,0xc2,                   // vpunpckhdq   %xmm10, %xmm8, %xmm8
+    0xc5,0x7a,0x7f,0x44,0xf8,0x30,              // vmovdqu      %xmm8, 0x30(%rax,%rdi,8)
+};
+#endif//SkSplicer_generated_DEFINED
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h
new file mode 100644
index 0000000..1c530c4
--- /dev/null
+++ b/src/splicer/SkSplicer_shared.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkSplicer_shared_DEFINED
+#define SkSplicer_shared_DEFINED
+
+// This file contains definitions shared by SkSplicer.cpp (compiled normally as part of Skia)
+// and SkSplicer_stages.cpp (compiled offline into SkSplicer_generated.h).  Keep it simple!
+
+#include <stdint.h>
+
+// SkSplicer Stages can use constant literals only if they end up baked into the instruction,
+// like bit shifts and rounding modes.  Any other constant values must be pulled from this struct
+// (except 0 and 0.0f, which always end up as some sort of xor instruction).
+//
+// This constraint makes it much easier to move and reorder the code for each Stage.
+
+struct SkSplicer_constants {
+    uint32_t _0x000000ff;  //  0x000000ff
+    float    _1;           //  1.0f
+    float    _255;         //  255.0f
+    float    _1_255;       //  1/255.0f
+
+    // from_srgb
+    float    _00025;       //  0.0025f
+    float    _06975;       //  0.6975f
+    float    _03000;       //  0.3000f
+    float    _1_1292;      //  1/12.92f
+    float    _0055;        //  0.055f
+
+    // to_srgb
+    float    _1246;        //  12.46f
+    float    _0411192;     //  0.411192f
+    float    _0689206;     //  0.689206f
+    float   n_00988;       // -0.0988f
+    float    _00043;       //  0.0043f
+};
+
+#endif//SkSplicer_shared_DEFINED
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
new file mode 100644
index 0000000..f3608d4
--- /dev/null
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkSplicer_shared.h"
+#include <immintrin.h>
+#include <string.h>
+
+#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
+    #error This file is not like the rest of Skia.
+    #error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
+#endif
+
+// We have very specific inlining requirements.  It helps to just take total control.
+#define AI __attribute__((always_inline)) inline
+
+// We'll be compiling this file to an object file, then extracting parts of it into
+// SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
+#define C extern "C"
+
+// Since we know we're using Clang, we can use its vector extensions.
+// These are __m256 and __m256i, but friendlier and strongly-typed.
+using F   = float    __attribute__((ext_vector_type(8)));
+using I32 =  int32_t __attribute__((ext_vector_type(8)));
+using U32 = uint32_t __attribute__((ext_vector_type(8)));
+using U8  = uint8_t  __attribute__((ext_vector_type(8)));
+
+// We polyfill a few routines that Clang doesn't build into ext_vector_types.
+AI static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F); }
+AI static U32 round (F   v) { return _mm256_cvtps_epi32(v); }
+AI static U32 expand(U8  v) { return __builtin_convertvector(v, U32); }
+
+AI static F rcp  (F v)           { return _mm256_rcp_ps  (v); }
+AI static F rsqrt(F v)           { return _mm256_rsqrt_ps(v); }
+AI static F min  (F a, F b)      { return _mm256_min_ps  (a,b); }
+AI static F max  (F a, F b)      { return _mm256_max_ps  (a,b); }
+AI static F fma  (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
+
+AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+// Stages all fit a common interface that allows SkSplicer to splice them together.
+using K = const SkSplicer_constants;
+using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);
+
+// Stage's arguments act as the working set of registers within the final spliced function.
+// Here's a little primer on the ABI:
+//   x:         rdi         x and n work to drive the loop, like for (; x < n; x += 8)
+//   n:         rsi
+//   ctx:       rdx         Look for movabsq_rdx in SkSplicer.cpp to see how this works.
+//   constants: rcx         Look for movabsq_rcx in SkSplicer.cpp to see how this works.
+//   vectors:   ymm0-ymm7
+
+
+// done() is the key to this entire splicing strategy.
+//
+// It matches the signature of Stage, so all the registers are kept live.
+// Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
+// which marks the point where we can splice one Stage onto the next.
+//
+// The lovely bit is that we don't have to define done(), just declare it.
+C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
+
+// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
+// It's just a convenience to make a valid, spliceable Stage, nothing magic.
+#define STAGE(name)                                                              \
+    AI static void name##_k(size_t x, size_t n, void* ctx, K* k,                 \
+                            F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+    C void name(size_t x, size_t n, void* ctx, K* k,                             \
+                F r, F g, F b, F a, F dr, F dg, F db, F da) {                    \
+        name##_k(x,n,ctx,k, r,g,b,a, dr,dg,db,da);                               \
+        done    (x,n,ctx,k, r,g,b,a, dr,dg,db,da);                               \
+    }                                                                            \
+    AI static void name##_k(size_t x, size_t n, void* ctx, K* k,                 \
+                            F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+// We can now define Stages!
+
+// Some things to keep in mind while writing Stages:
+//   - do not branch;                                       (i.e. avoid jmp)
+//   - do not call functions that don't inline;             (i.e. avoid call, ret, stack use)
+//   - do not use constant literals other than 0 and 0.0f.  (i.e. avoid rip relative addressing)
+//
+// Some things that should work fine:
+//   - 0 and 0.0f;
+//   - arithmetic;
+//   - functions of F and U32 that we've defined above;
+//   - temporary values;
+//   - lambdas;
+//   - memcpy() with a compile-time constant size argument.
+
+STAGE(clear) {
+    r = g = b = a = 0;
+}
+
+STAGE(plus) {
+    r = r + dr;
+    g = g + dg;
+    b = b + db;
+    a = a + da;
+}
+
+STAGE(srcover) {
+    auto A = k->_1 - a;
+    r = fma(dr, A, r);
+    g = fma(dg, A, g);
+    b = fma(db, A, b);
+    a = fma(db, A, a);
+}
+STAGE(dstover) { srcover_k(x,n,ctx,k, dr,dg,db,da, r,g,b,a); }
+
+STAGE(clamp_0) {
+    r = max(r, 0);
+    g = max(g, 0);
+    b = max(b, 0);
+    a = max(a, 0);
+}
+
+STAGE(clamp_1) {
+    r = min(r, k->_1);
+    g = min(g, k->_1);
+    b = min(b, k->_1);
+    a = min(a, k->_1);
+}
+
+STAGE(clamp_a) {
+    a = min(a, k->_1);
+    r = min(r, a);
+    g = min(g, a);
+    b = min(b, a);
+}
+
+STAGE(swap) {
+    auto swap = [](F& v, F& dv) {
+        auto tmp = v;
+        v = dv;
+        dv = tmp;
+    };
+    swap(r, dr);
+    swap(g, dg);
+    swap(b, db);
+    swap(a, da);
+}
+STAGE(move_src_dst) {
+    dr = r;
+    dg = g;
+    db = b;
+    da = a;
+}
+STAGE(move_dst_src) {
+    r = dr;
+    g = dg;
+    b = db;
+    a = da;
+}
+
+STAGE(premul) {
+    r = r * a;
+    g = g * a;
+    b = b * a;
+}
+STAGE(unpremul) {
+    auto scale = if_then_else(a == 0, 0, k->_1 / a);
+    r = r * scale;
+    g = g * scale;
+    b = b * scale;
+}
+
+STAGE(from_srgb) {
+    auto fn = [&](F s) {
+        auto lo = s * k->_1_1292;
+        auto hi = fma(s*s, fma(s, k->_03000, k->_06975), k->_00025);
+        return if_then_else(s < k->_0055, lo, hi);
+    };
+    r = fn(r);
+    g = fn(g);
+    b = fn(b);
+}
+STAGE(to_srgb) {
+    auto fn = [&](F l) {
+        F sqrt = rcp  (rsqrt(l)),
+          ftrt = rsqrt(rsqrt(l));
+        auto lo = l * k->_1246;
+        auto hi = min(k->_1, fma(k->_0411192, ftrt,
+                             fma(k->_0689206, sqrt,
+                                 k->n_00988)));
+        return if_then_else(l < k->_00043, lo, hi);
+    };
+    r = fn(r);
+    g = fn(g);
+    b = fn(b);
+}
+
+STAGE(scale_u8) {
+    auto ptr = *(const uint8_t**)ctx + x;
+
+    U8 scales;
+    memcpy(&scales, ptr, sizeof(scales));
+    auto c = cast(expand(scales)) * k->_1_255;
+
+    r = r * c;
+    g = g * c;
+    b = b * c;
+    a = a * c;
+}
+
+STAGE(load_8888) {
+    auto ptr = *(const uint32_t**)ctx + x;
+
+    U32 px;
+    memcpy(&px, ptr, sizeof(px));
+
+    r = cast((px      ) & k->_0x000000ff) * k->_1_255;
+    g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
+    b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
+    a = cast((px >> 24)                 ) * k->_1_255;
+}
+
+STAGE(store_8888) {
+    auto ptr = *(uint32_t**)ctx + x;
+
+    U32 px = round(r * k->_255)
+           | round(g * k->_255) <<  8
+           | round(b * k->_255) << 16
+           | round(a * k->_255) << 24;
+    memcpy(ptr, &px, sizeof(px));
+}
+
+STAGE(load_f16) {
+    auto ptr = *(const uint64_t**)ctx + x;
+
+    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
+         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
+         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+
+    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
+         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
+         _46 = _mm_unpacklo_epi16(_45, _67),
+         _57 = _mm_unpackhi_epi16(_45, _67);
+
+    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
+         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
+         rg4567 = _mm_unpacklo_epi16(_46, _57),
+         ba4567 = _mm_unpackhi_epi16(_46, _57);
+
+    r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
+    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
+    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
+    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+}
+
+STAGE(store_f16) {
+    auto ptr = *(uint64_t**)ctx + x;
+
+    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
+         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
+         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
+         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
+
+    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
+         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
+         ba0123 = _mm_unpacklo_epi16(B, A),
+         ba4567 = _mm_unpackhi_epi16(B, A);
+
+    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
+    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
+    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
+    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+}
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
new file mode 100755
index 0000000..0cb633d
--- /dev/null
+++ b/src/splicer/build_stages.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python2.7
+#
+# Copyright 2017 Google Inc.
+#
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import re
+import subprocess
+import sys
+
+cflags = '-std=c++11 -Os -fomit-frame-pointer -mavx2 -mfma -mf16c'
+
+subprocess.check_call(['clang++'] + cflags.split() +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'stages.o'])
+
+print '''/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkSplicer_generated_DEFINED
+#define SkSplicer_generated_DEFINED
+
+// This file is generated semi-automatically with this command:
+//   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+'''
+
+for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
+  line = line.strip()
+  if line == '' or line == 'stages.o:' or line == '(__TEXT,__text) section':
+    continue
+
+  m = re.match('_(.*):', line)
+  if m:
+    name = m.group(1)
+    print 'static const unsigned char kSplice_' + m.group(1) + '[] = {'
+    continue
+
+  # Skip the leading 16 byte address and a tab,
+  # leaving the hex and mnemonics of each instruction.
+  line = line[17:]
+  columns = line.split('\t')
+  _hex  = columns[0].strip()
+  instr = columns[1]
+  args  = columns[2:]
+
+  # We can't splice code that uses rip relative addressing.
+  for arg in args:
+    assert 'rip' not in arg
+
+  # jmp done, the end of each stage (the address of done is not yet filled in)
+  if _hex == 'e9 00 00 00 00':
+    print '};'
+    continue
+
+  sys.stdout.write('    ')
+  _bytes = _hex.split(' ')
+  # This is the meat of things: copy the code to a C unsigned char array.
+  for byte in _bytes:
+    sys.stdout.write('0x' + byte + ',')
+  # From here on we're just making the generated file readable and pretty.
+  sys.stdout.write(' ' * (44 - 5*len(_bytes)))
+  sys.stdout.write('// ' + instr)
+  if args:
+    sys.stdout.write(' ' * (13 - len(instr)))
+    sys.stdout.write(' '.join(args))
+  sys.stdout.write('\n')
+
+print '''#endif//SkSplicer_generated_DEFINED'''