SkSplicer: lowp hacking

Add lowp variants for most stages in SkSplicer.  These double the number
of pixels handled per loop iteration by representing each channel with
16 bits, ranging from 0x0000 (0.0) up to 0x8000 (1.0).  This format lets
us use the Q15 multiply
instructions available in NEON and SSSE3 at full register width, with
a little platform-specific fix-up to smooth over the fact that these
aren't quite Q15 values.
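
As a reference point, here is a minimal scalar sketch of the multiply this
format is aiming for (illustrative only, not the actual stage code; the real
stages use vqrdmulh/pmulhrs plus the fix-up mentioned above):

    #include <stdint.h>

    // a and b are in [0x0000, 0x8000], representing [0.0, 1.0].
    static inline uint16_t fixed15_mul(uint16_t a, uint16_t b) {
        // Widen, multiply, round to nearest, then shift back down by 15.
        return (uint16_t)(((uint32_t)a * (uint32_t)b + (1u << 14)) >> 15);
    }

With this encoding 0x8000 * 0x8000 comes back as 0x8000, i.e. 1.0 * 1.0 == 1.0.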

When a lowp stage is unavailable, the entire pipeline upgrades to
floats.  So by simply not implementing sRGB, f16, matrix multiplication,
etc., we naturally express that they're best handled with floats.
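
That check is just a probe over the stage list, sketched here (this mirrors
the splice_lowp() probing in SkSplicer.cpp below; can_run_lowp itself is an
illustrative name, not part of the change):

    static bool can_run_lowp(const SkRasterPipeline::Stage* stages, int nstages) {
        for (int i = 0; i < nstages; i++) {
            // Passing a null buffer asks "is a lowp splice available?"
            // without writing any code.
            if (!splice_lowp(nullptr, stages[i].stage)) {
                return false;  // Any float-only stage upgrades the whole pipeline to floats.
            }
        }
        return true;
    }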

These lowp stages ended up different enough that I've found it clearer
to have them live in their own files, noting where they differ from the
float stages.  HSW, aarch64, and armv7 are all supported.

I've seen very good performance on all three platforms so far.

Change-Id: Ib4f820c6665f2c9020f7449a2b51bbaf6c408a63
Reviewed-on: https://skia-review.googlesource.com/7098
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index 0295c4f..57ce670 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -16,10 +16,12 @@
 #endif
 
 #include "SkSplicer_generated.h"
+#include "SkSplicer_generated_lowp.h"
 #include "SkSplicer_shared.h"
 
 // Uncomment to dump output JIT'd pipeline.
 //#define DUMP "/tmp/dump.bin"
+//#define DUMP "/data/local/tmp/dump.bin"
 //
 // On x86, we'll include IACA markers too.
 //   https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
@@ -27,10 +29,10 @@
 //   $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
 //
 // To disassemble an aarch64 dump,
-//   $ gobjdump -b binary -D dump.bin -m aarch64
+//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
 //
 // To disassemble an armv7 dump,
-//   $ gobjdump -b binary -D dump.bin -m arm
+//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
 
 namespace {
 
@@ -41,11 +43,18 @@
         0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
         12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
     };
+    static const SkSplicer_constants_lowp kConstants_lowp = {
+        0x0001, 0x8000,
+    };
 
     // We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
     template <typename T>
     static void splice(SkWStream* buf, const T& val) {
-        buf->write(&val, sizeof(val));
+        // This null check makes determining whether we can drop to lowp easier.
+        // It's always known at compile time.
+        if (buf) {
+            buf->write(&val, sizeof(val));
+        }
     }
 
 #if defined(__aarch64__)
@@ -59,7 +68,6 @@
         splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2);  // merge 16-bit intermediate <<  0 into x2
     }
     static void loop(SkWStream* buf, int loop_start) {
-        splice(buf, 0x91001000);        // add x0, x0, #4
         splice(buf, 0xeb01001f);        // cmp x0, x1
         int off = loop_start - (int)buf->bytesWritten();
         off /= 4;   // bytes -> instructions, still signed
@@ -81,7 +89,6 @@
         splice(buf, 0xe3402000 | encode(parts[1]));  // movt r2,    <top 16 bits>
     }
     static void loop(SkWStream* buf, int loop_start) {
-        splice(buf, 0xe2800002);  // add r0, r0, #2
         splice(buf, 0xe1500001);  // cmp r0, r1
         int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
         off /= 4;   // bytes -> instructions, still signed
@@ -99,10 +106,8 @@
         splice(buf, ctx);
     }
     static void loop(SkWStream* buf, int loop_start) {
-        static const uint8_t  addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
         static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
         static const uint8_t     jb_near[] = { 0x0f, 0x8c };
-        splice(buf, addq_8_rdi);   // addq $8, %rdi
         splice(buf, cmp_rsi_rdi);  // cmp %rsi, %rdi
         splice(buf, jb_near);      // jb <next 4 bytes>  (b == "before", unsigned less than)
         splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
@@ -236,12 +241,64 @@
     }
 #endif
 
+    static bool splice_lowp(SkWStream* buf, SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return false;
+            case SkRasterPipeline::clamp_0: break;  // lowp can't go below 0.
+        #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st##_lowp); break
+            CASE(clear);
+            CASE(plus_);
+            CASE(srcover);
+            CASE(dstover);
+            CASE(clamp_1);
+            CASE(clamp_a);
+            CASE(swap);
+            CASE(move_src_dst);
+            CASE(move_dst_src);
+            CASE(premul);
+            CASE(load_8888);
+            CASE(store_8888);
+        #undef CASE
+        }
+        return true;
+    }
+
+    static bool splice_highp(SkWStream* buf, SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return false;
+        #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st); break
+            CASE(clear);
+            CASE(plus_);
+            CASE(srcover);
+            CASE(dstover);
+            CASE(clamp_0);
+            CASE(clamp_1);
+            CASE(clamp_a);
+            CASE(swap);
+            CASE(move_src_dst);
+            CASE(move_dst_src);
+            CASE(premul);
+            CASE(unpremul);
+            CASE(from_srgb);
+            CASE(to_srgb);
+            CASE(scale_u8);
+            CASE(load_tables);
+            CASE(load_8888);
+            CASE(store_8888);
+            CASE(load_f16);
+            CASE(store_f16);
+            CASE(matrix_3x4);
+        #undef CASE
+        }
+        return true;
+    }
+
     struct Spliced {
 
         Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
             // We always create a backup interpreter pipeline,
             //   - to handle any program we can't, and
-            //   - to handle the n < kStride tails.
+            //   - to handle the n < stride tails.
             fBackup     = SkOpts::compile_pipeline(stages, nstages);
             fSplicedLen = 0;
             fSpliced    = nullptr;
@@ -260,12 +317,23 @@
             }
         #endif
 
+            // See if all the stages can run in lowp mode.  If so, we can run at ~2x speed.
+            bool lowp = true;
+            for (int i = 0; i < nstages; i++) {
+                if (!splice_lowp(nullptr, stages[i].stage)) {
+                    //SkDebugf("SkSplicer can't yet handle stage %d in lowp.\n", stages[i].stage);
+                    lowp = false;
+                    break;
+                }
+            }
+            fLowp = lowp;
+
             SkDynamicMemoryWStream buf;
 
             // Our loop is the equivalent of this C++ code:
             //    do {
             //        ... run spliced stages...
-            //        x += kStride;
+            //        x += stride;
             //    } while(x < limit);
             before_loop(&buf);
             auto loop_start = buf.bytesWritten();  // Think of this like a label, loop_start:
@@ -277,36 +345,18 @@
                 }
 
                 // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
-                switch(stages[i].stage) {
-                    case SkRasterPipeline::clear:        splice(&buf, kSplice_clear       ); break;
-                    case SkRasterPipeline::plus_:        splice(&buf, kSplice_plus        ); break;
-                    case SkRasterPipeline::srcover:      splice(&buf, kSplice_srcover     ); break;
-                    case SkRasterPipeline::dstover:      splice(&buf, kSplice_dstover     ); break;
-                    case SkRasterPipeline::clamp_0:      splice(&buf, kSplice_clamp_0     ); break;
-                    case SkRasterPipeline::clamp_1:      splice(&buf, kSplice_clamp_1     ); break;
-                    case SkRasterPipeline::clamp_a:      splice(&buf, kSplice_clamp_a     ); break;
-                    case SkRasterPipeline::swap:         splice(&buf, kSplice_swap        ); break;
-                    case SkRasterPipeline::move_src_dst: splice(&buf, kSplice_move_src_dst); break;
-                    case SkRasterPipeline::move_dst_src: splice(&buf, kSplice_move_dst_src); break;
-                    case SkRasterPipeline::premul:       splice(&buf, kSplice_premul      ); break;
-                    case SkRasterPipeline::unpremul:     splice(&buf, kSplice_unpremul    ); break;
-                    case SkRasterPipeline::from_srgb:    splice(&buf, kSplice_from_srgb   ); break;
-                    case SkRasterPipeline::to_srgb:      splice(&buf, kSplice_to_srgb     ); break;
-                    case SkRasterPipeline::scale_u8:     splice(&buf, kSplice_scale_u8    ); break;
-                    case SkRasterPipeline::load_tables:  splice(&buf, kSplice_load_tables ); break;
-                    case SkRasterPipeline::load_8888:    splice(&buf, kSplice_load_8888   ); break;
-                    case SkRasterPipeline::store_8888:   splice(&buf, kSplice_store_8888  ); break;
-                    case SkRasterPipeline::load_f16:     splice(&buf, kSplice_load_f16    ); break;
-                    case SkRasterPipeline::store_f16:    splice(&buf, kSplice_store_f16   ); break;
-                    case SkRasterPipeline::matrix_3x4:   splice(&buf, kSplice_matrix_3x4  ); break;
-
-                    // No joy (probably just not yet implemented).
-                    default:
-                        //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
-                        return;
+                if (lowp) {
+                    SkAssertResult(splice_lowp(&buf, stages[i].stage));
+                    continue;
+                }
+                if (!splice_highp(&buf, stages[i].stage)) {
+                    //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
+                    return;
                 }
             }
 
+            lowp ? splice(&buf, kSplice_inc_x_lowp)
+                 : splice(&buf, kSplice_inc_x);
             loop(&buf, loop_start);  // Loop back to handle more pixels if not done.
             after_loop(&buf);
             ret(&buf);  // We're done.
@@ -323,7 +373,8 @@
         // Spliced is stored in a std::function, so it needs to be copyable.
         Spliced(const Spliced& o) : fBackup    (o.fBackup)
                                   , fSplicedLen(o.fSplicedLen)
-                                  , fSpliced   (copy_to_executable_mem(o.fSpliced, &fSplicedLen)) {}
+                                  , fSpliced   (copy_to_executable_mem(o.fSpliced, &fSplicedLen))
+                                  , fLowp      (o.fLowp) {}
 
         ~Spliced() {
             cleanup_executable_mem(fSpliced, fSplicedLen);
@@ -331,13 +382,17 @@
 
         // Here's where we call fSpliced if we created it, fBackup if not.
         void operator()(size_t x, size_t y, size_t n) const {
-            size_t body = n/kStride*kStride;   // Largest multiple of kStride (4 or 8) <= n.
-            if (fSpliced && body) {            // Can we run fSpliced for at least one kStride?
+            size_t stride = fLowp ? kStride*2
+                                  : kStride;
+            size_t body = n/stride*stride;     // Largest multiple of stride (2, 4, 8, or 16) <= n.
+            if (fSpliced && body) {            // Can we run fSpliced for at least one stride?
                 // TODO: At some point we will want to pass in y...
-                using Fn = void(size_t x, size_t limit, void* ctx, const SkSplicer_constants* k);
-                ((Fn*)fSpliced)(x, x+body, nullptr, &kConstants);
+                using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
+                auto k = fLowp ? (const void*)&kConstants_lowp
+                               : (const void*)&kConstants;
+                ((Fn*)fSpliced)(x, x+body, nullptr, k);
 
-                // Fall through to fBackup for any n<kStride last pixels.
+                // Fall through to fBackup for any n<stride last pixels.
                 x += body;
                 n -= body;
             }
@@ -347,6 +402,7 @@
         std::function<void(size_t, size_t, size_t)> fBackup;
         size_t                                      fSplicedLen;
         void*                                       fSpliced;
+        bool                                        fLowp;
     };
 
 }
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index e08c428..8549247 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -9,17 +9,20 @@
 #define SkSplicer_generated_DEFINED
 
 // This file is generated semi-automatically with this command:
-//   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+//   $ src/splicer/build_stages.py
 
 #if defined(__aarch64__)
 
+static const unsigned int kSplice_inc_x[] = {
+    0x91001000,                                 //  add           x0, x0, #0x4
+};
 static const unsigned int kSplice_clear[] = {
     0x6f00e400,                                 //  movi          v0.2d, #0x0
     0x6f00e401,                                 //  movi          v1.2d, #0x0
     0x6f00e402,                                 //  movi          v2.2d, #0x0
     0x6f00e403,                                 //  movi          v3.2d, #0x0
 };
-static const unsigned int kSplice_plus[] = {
+static const unsigned int kSplice_plus_[] = {
     0x4e24d400,                                 //  fadd          v0.4s, v0.4s, v4.4s
     0x4e25d421,                                 //  fadd          v1.4s, v1.4s, v5.4s
     0x4e26d442,                                 //  fadd          v2.4s, v2.4s, v6.4s
@@ -363,13 +366,16 @@
 
 #elif defined(__ARM_NEON__)
 
+static const unsigned int kSplice_inc_x[] = {
+    0xe2800002,                                 //  add           r0, r0, #2
+};
 static const unsigned int kSplice_clear[] = {
     0xf2800010,                                 //  vmov.i32      d0, #0
     0xf2801010,                                 //  vmov.i32      d1, #0
     0xf2802010,                                 //  vmov.i32      d2, #0
     0xf2803010,                                 //  vmov.i32      d3, #0
 };
-static const unsigned int kSplice_plus[] = {
+static const unsigned int kSplice_plus_[] = {
     0xf2000d04,                                 //  vadd.f32      d0, d0, d4
     0xf2011d05,                                 //  vadd.f32      d1, d1, d5
     0xf2022d06,                                 //  vadd.f32      d2, d2, d6
@@ -738,13 +744,16 @@
 
 #else
 
+static const unsigned char kSplice_inc_x[] = {
+    0x48,0x83,0xc7,0x08,                        //  add           $0x8,%rdi
+};
 static const unsigned char kSplice_clear[] = {
     0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
     0xc5,0xf4,0x57,0xc9,                        //  vxorps        %ymm1,%ymm1,%ymm1
     0xc5,0xec,0x57,0xd2,                        //  vxorps        %ymm2,%ymm2,%ymm2
     0xc5,0xe4,0x57,0xdb,                        //  vxorps        %ymm3,%ymm3,%ymm3
 };
-static const unsigned char kSplice_plus[] = {
+static const unsigned char kSplice_plus_[] = {
     0xc5,0xfc,0x58,0xc4,                        //  vaddps        %ymm4,%ymm0,%ymm0
     0xc5,0xf4,0x58,0xcd,                        //  vaddps        %ymm5,%ymm1,%ymm1
     0xc5,0xec,0x58,0xd6,                        //  vaddps        %ymm6,%ymm2,%ymm2
diff --git a/src/splicer/SkSplicer_generated_lowp.h b/src/splicer/SkSplicer_generated_lowp.h
new file mode 100644
index 0000000..8e2796e
--- /dev/null
+++ b/src/splicer/SkSplicer_generated_lowp.h
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkSplicer_generated_lowp_DEFINED
+#define SkSplicer_generated_lowp_DEFINED
+
+// This file is generated semi-automatically with this command:
+//   $ src/splicer/build_stages.py
+
+#if defined(__aarch64__)
+
+static const unsigned int kSplice_inc_x_lowp[] = {
+    0x91002000,                                 //  add           x0, x0, #0x8
+};
+static const unsigned int kSplice_clear_lowp[] = {
+    0x6f00e400,                                 //  movi          v0.2d, #0x0
+    0x6f00e401,                                 //  movi          v1.2d, #0x0
+    0x6f00e402,                                 //  movi          v2.2d, #0x0
+    0x6f00e403,                                 //  movi          v3.2d, #0x0
+};
+static const unsigned int kSplice_plus__lowp[] = {
+    0x6e640c00,                                 //  uqadd         v0.8h, v0.8h, v4.8h
+    0x6e650c21,                                 //  uqadd         v1.8h, v1.8h, v5.8h
+    0x6e660c42,                                 //  uqadd         v2.8h, v2.8h, v6.8h
+    0x6e670c63,                                 //  uqadd         v3.8h, v3.8h, v7.8h
+};
+static const unsigned int kSplice_srcover_lowp[] = {
+    0x91000868,                                 //  add           x8, x3, #0x2
+    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x6e632e10,                                 //  uqsub         v16.8h, v16.8h, v3.8h
+    0x6e70b491,                                 //  sqrdmulh      v17.8h, v4.8h, v16.8h
+    0x4e241e12,                                 //  and           v18.16b, v16.16b, v4.16b
+    0x6e70b4b3,                                 //  sqrdmulh      v19.8h, v5.8h, v16.8h
+    0x4e60ba31,                                 //  abs           v17.8h, v17.8h
+    0x4e251e14,                                 //  and           v20.16b, v16.16b, v5.16b
+    0x6f111651,                                 //  usra          v17.8h, v18.8h, #15
+    0x6e70b4d2,                                 //  sqrdmulh      v18.8h, v6.8h, v16.8h
+    0x4e60ba73,                                 //  abs           v19.8h, v19.8h
+    0x6f111693,                                 //  usra          v19.8h, v20.8h, #15
+    0x4e261e14,                                 //  and           v20.16b, v16.16b, v6.16b
+    0x4e60ba52,                                 //  abs           v18.8h, v18.8h
+    0x6f111692,                                 //  usra          v18.8h, v20.8h, #15
+    0x6e70b4f4,                                 //  sqrdmulh      v20.8h, v7.8h, v16.8h
+    0x4e271e10,                                 //  and           v16.16b, v16.16b, v7.16b
+    0x4e60ba94,                                 //  abs           v20.8h, v20.8h
+    0x6f111614,                                 //  usra          v20.8h, v16.8h, #15
+    0x6e600e20,                                 //  uqadd         v0.8h, v17.8h, v0.8h
+    0x6e610e61,                                 //  uqadd         v1.8h, v19.8h, v1.8h
+    0x6e620e42,                                 //  uqadd         v2.8h, v18.8h, v2.8h
+    0x6e630e83,                                 //  uqadd         v3.8h, v20.8h, v3.8h
+};
+static const unsigned int kSplice_dstover_lowp[] = {
+    0x91000868,                                 //  add           x8, x3, #0x2
+    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x6e672e10,                                 //  uqsub         v16.8h, v16.8h, v7.8h
+    0x6e70b411,                                 //  sqrdmulh      v17.8h, v0.8h, v16.8h
+    0x4e201e12,                                 //  and           v18.16b, v16.16b, v0.16b
+    0x6e70b433,                                 //  sqrdmulh      v19.8h, v1.8h, v16.8h
+    0x4e60ba31,                                 //  abs           v17.8h, v17.8h
+    0x4e211e14,                                 //  and           v20.16b, v16.16b, v1.16b
+    0x6f111651,                                 //  usra          v17.8h, v18.8h, #15
+    0x6e70b452,                                 //  sqrdmulh      v18.8h, v2.8h, v16.8h
+    0x4e60ba73,                                 //  abs           v19.8h, v19.8h
+    0x6f111693,                                 //  usra          v19.8h, v20.8h, #15
+    0x4e221e14,                                 //  and           v20.16b, v16.16b, v2.16b
+    0x4e60ba52,                                 //  abs           v18.8h, v18.8h
+    0x6f111692,                                 //  usra          v18.8h, v20.8h, #15
+    0x6e70b474,                                 //  sqrdmulh      v20.8h, v3.8h, v16.8h
+    0x4e231e10,                                 //  and           v16.16b, v16.16b, v3.16b
+    0x4e60ba94,                                 //  abs           v20.8h, v20.8h
+    0x6f111614,                                 //  usra          v20.8h, v16.8h, #15
+    0x6e640e24,                                 //  uqadd         v4.8h, v17.8h, v4.8h
+    0x6e650e65,                                 //  uqadd         v5.8h, v19.8h, v5.8h
+    0x6e660e46,                                 //  uqadd         v6.8h, v18.8h, v6.8h
+    0x6e670e87,                                 //  uqadd         v7.8h, v20.8h, v7.8h
+};
+static const unsigned int kSplice_clamp_1_lowp[] = {
+    0x91000868,                                 //  add           x8, x3, #0x2
+    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x6e706c00,                                 //  umin          v0.8h, v0.8h, v16.8h
+    0x6e706c21,                                 //  umin          v1.8h, v1.8h, v16.8h
+    0x6e706c42,                                 //  umin          v2.8h, v2.8h, v16.8h
+    0x6e706c63,                                 //  umin          v3.8h, v3.8h, v16.8h
+};
+static const unsigned int kSplice_clamp_a_lowp[] = {
+    0x91000868,                                 //  add           x8, x3, #0x2
+    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x6e706c63,                                 //  umin          v3.8h, v3.8h, v16.8h
+    0x6e636c00,                                 //  umin          v0.8h, v0.8h, v3.8h
+    0x6e636c21,                                 //  umin          v1.8h, v1.8h, v3.8h
+    0x6e636c42,                                 //  umin          v2.8h, v2.8h, v3.8h
+};
+static const unsigned int kSplice_swap_lowp[] = {
+    0x4ea31c70,                                 //  mov           v16.16b, v3.16b
+    0x4ea21c51,                                 //  mov           v17.16b, v2.16b
+    0x4ea11c32,                                 //  mov           v18.16b, v1.16b
+    0x4ea01c13,                                 //  mov           v19.16b, v0.16b
+    0x4ea41c80,                                 //  mov           v0.16b, v4.16b
+    0x4ea51ca1,                                 //  mov           v1.16b, v5.16b
+    0x4ea61cc2,                                 //  mov           v2.16b, v6.16b
+    0x4ea71ce3,                                 //  mov           v3.16b, v7.16b
+    0x4eb31e64,                                 //  mov           v4.16b, v19.16b
+    0x4eb21e45,                                 //  mov           v5.16b, v18.16b
+    0x4eb11e26,                                 //  mov           v6.16b, v17.16b
+    0x4eb01e07,                                 //  mov           v7.16b, v16.16b
+};
+static const unsigned int kSplice_move_src_dst_lowp[] = {
+    0x4ea01c04,                                 //  mov           v4.16b, v0.16b
+    0x4ea11c25,                                 //  mov           v5.16b, v1.16b
+    0x4ea21c46,                                 //  mov           v6.16b, v2.16b
+    0x4ea31c67,                                 //  mov           v7.16b, v3.16b
+};
+static const unsigned int kSplice_move_dst_src_lowp[] = {
+    0x4ea41c80,                                 //  mov           v0.16b, v4.16b
+    0x4ea51ca1,                                 //  mov           v1.16b, v5.16b
+    0x4ea61cc2,                                 //  mov           v2.16b, v6.16b
+    0x4ea71ce3,                                 //  mov           v3.16b, v7.16b
+};
+static const unsigned int kSplice_premul_lowp[] = {
+    0x6e63b410,                                 //  sqrdmulh      v16.8h, v0.8h, v3.8h
+    0x4e201c71,                                 //  and           v17.16b, v3.16b, v0.16b
+    0x4e60ba00,                                 //  abs           v0.8h, v16.8h
+    0x6e63b430,                                 //  sqrdmulh      v16.8h, v1.8h, v3.8h
+    0x6f111620,                                 //  usra          v0.8h, v17.8h, #15
+    0x4e211c71,                                 //  and           v17.16b, v3.16b, v1.16b
+    0x4e60ba01,                                 //  abs           v1.8h, v16.8h
+    0x6e63b450,                                 //  sqrdmulh      v16.8h, v2.8h, v3.8h
+    0x6f111621,                                 //  usra          v1.8h, v17.8h, #15
+    0x4e221c71,                                 //  and           v17.16b, v3.16b, v2.16b
+    0x4e60ba02,                                 //  abs           v2.8h, v16.8h
+    0x6f111622,                                 //  usra          v2.8h, v17.8h, #15
+};
+static const unsigned int kSplice_load_8888_lowp[] = {
+    0xf9400048,                                 //  ldr           x8, [x2]
+    0x8b000908,                                 //  add           x8, x8, x0, lsl #2
+    0x0c400110,                                 //  ld4           {v16.8b-v19.8b}, [x8]
+    0x2f0fa600,                                 //  ushll         v0.8h, v16.8b, #7
+    0x2f0fa621,                                 //  ushll         v1.8h, v17.8b, #7
+    0x2f0fa642,                                 //  ushll         v2.8h, v18.8b, #7
+    0x2f0fa663,                                 //  ushll         v3.8h, v19.8b, #7
+    0x6f183400,                                 //  ursra         v0.8h, v0.8h, #8
+    0x6f183421,                                 //  ursra         v1.8h, v1.8h, #8
+    0x6f183442,                                 //  ursra         v2.8h, v2.8h, #8
+    0x6f183463,                                 //  ursra         v3.8h, v3.8h, #8
+};
+static const unsigned int kSplice_store_8888_lowp[] = {
+    0xf9400048,                                 //  ldr           x8, [x2]
+    0x2f099410,                                 //  uqshrn        v16.8b, v0.8h, #7
+    0x2f099431,                                 //  uqshrn        v17.8b, v1.8h, #7
+    0x2f099452,                                 //  uqshrn        v18.8b, v2.8h, #7
+    0x8b000908,                                 //  add           x8, x8, x0, lsl #2
+    0x2f099473,                                 //  uqshrn        v19.8b, v3.8h, #7
+    0x0c000110,                                 //  st4           {v16.8b-v19.8b}, [x8]
+};
+
+#elif defined(__ARM_NEON__)
+
+static const unsigned int kSplice_inc_x_lowp[] = {
+    0xe2800004,                                 //  add           r0, r0, #4
+};
+static const unsigned int kSplice_clear_lowp[] = {
+    0xf2800010,                                 //  vmov.i32      d0, #0
+    0xf2801010,                                 //  vmov.i32      d1, #0
+    0xf2802010,                                 //  vmov.i32      d2, #0
+    0xf2803010,                                 //  vmov.i32      d3, #0
+};
+static const unsigned int kSplice_plus__lowp[] = {
+    0xf3100014,                                 //  vqadd.u16     d0, d0, d4
+    0xf3111015,                                 //  vqadd.u16     d1, d1, d5
+    0xf3122016,                                 //  vqadd.u16     d2, d2, d6
+    0xf3133017,                                 //  vqadd.u16     d3, d3, d7
+};
+static const unsigned int kSplice_srcover_lowp[] = {
+    0xe283c002,                                 //  add           ip, r3, #2
+    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf3500293,                                 //  vqsub.u16     d16, d16, d3
+    0xf3541b20,                                 //  vqrdmulh.s16  d17, d4, d16
+    0xf3552b20,                                 //  vqrdmulh.s16  d18, d5, d16
+    0xf3563b20,                                 //  vqrdmulh.s16  d19, d6, d16
+    0xf3574b20,                                 //  vqrdmulh.s16  d20, d7, d16
+    0xf2405194,                                 //  vand          d21, d16, d4
+    0xf2406195,                                 //  vand          d22, d16, d5
+    0xf2407196,                                 //  vand          d23, d16, d6
+    0xf2400197,                                 //  vand          d16, d16, d7
+    0xf3f51321,                                 //  vabs.s16      d17, d17
+    0xf3f52322,                                 //  vabs.s16      d18, d18
+    0xf3f53323,                                 //  vabs.s16      d19, d19
+    0xf3f54324,                                 //  vabs.s16      d20, d20
+    0xf3d11135,                                 //  vsra.u16      d17, d21, #15
+    0xf3d12136,                                 //  vsra.u16      d18, d22, #15
+    0xf3d13137,                                 //  vsra.u16      d19, d23, #15
+    0xf3d14130,                                 //  vsra.u16      d20, d16, #15
+    0xf3110090,                                 //  vqadd.u16     d0, d17, d0
+    0xf3121091,                                 //  vqadd.u16     d1, d18, d1
+    0xf3132092,                                 //  vqadd.u16     d2, d19, d2
+    0xf3143093,                                 //  vqadd.u16     d3, d20, d3
+};
+static const unsigned int kSplice_dstover_lowp[] = {
+    0xe283c002,                                 //  add           ip, r3, #2
+    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf3500297,                                 //  vqsub.u16     d16, d16, d7
+    0xf3501b20,                                 //  vqrdmulh.s16  d17, d0, d16
+    0xf3512b20,                                 //  vqrdmulh.s16  d18, d1, d16
+    0xf3523b20,                                 //  vqrdmulh.s16  d19, d2, d16
+    0xf3534b20,                                 //  vqrdmulh.s16  d20, d3, d16
+    0xf2405190,                                 //  vand          d21, d16, d0
+    0xf2406191,                                 //  vand          d22, d16, d1
+    0xf2407192,                                 //  vand          d23, d16, d2
+    0xf2400193,                                 //  vand          d16, d16, d3
+    0xf3f51321,                                 //  vabs.s16      d17, d17
+    0xf3f52322,                                 //  vabs.s16      d18, d18
+    0xf3f53323,                                 //  vabs.s16      d19, d19
+    0xf3f54324,                                 //  vabs.s16      d20, d20
+    0xf3d11135,                                 //  vsra.u16      d17, d21, #15
+    0xf3d12136,                                 //  vsra.u16      d18, d22, #15
+    0xf3d13137,                                 //  vsra.u16      d19, d23, #15
+    0xf3d14130,                                 //  vsra.u16      d20, d16, #15
+    0xf3114094,                                 //  vqadd.u16     d4, d17, d4
+    0xf3125095,                                 //  vqadd.u16     d5, d18, d5
+    0xf3136096,                                 //  vqadd.u16     d6, d19, d6
+    0xf3147097,                                 //  vqadd.u16     d7, d20, d7
+};
+static const unsigned int kSplice_clamp_1_lowp[] = {
+    0xe283c002,                                 //  add           ip, r3, #2
+    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf3100630,                                 //  vmin.u16      d0, d0, d16
+    0xf3111630,                                 //  vmin.u16      d1, d1, d16
+    0xf3122630,                                 //  vmin.u16      d2, d2, d16
+    0xf3133630,                                 //  vmin.u16      d3, d3, d16
+};
+static const unsigned int kSplice_clamp_a_lowp[] = {
+    0xe283c002,                                 //  add           ip, r3, #2
+    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf3133630,                                 //  vmin.u16      d3, d3, d16
+    0xf3100613,                                 //  vmin.u16      d0, d0, d3
+    0xf3111613,                                 //  vmin.u16      d1, d1, d3
+    0xf3122613,                                 //  vmin.u16      d2, d2, d3
+};
+static const unsigned int kSplice_swap_lowp[] = {
+    0xeef00b43,                                 //  vmov.f64      d16, d3
+    0xeef01b42,                                 //  vmov.f64      d17, d2
+    0xeef02b41,                                 //  vmov.f64      d18, d1
+    0xeef03b40,                                 //  vmov.f64      d19, d0
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+    0xeeb04b63,                                 //  vmov.f64      d4, d19
+    0xeeb05b62,                                 //  vmov.f64      d5, d18
+    0xeeb06b61,                                 //  vmov.f64      d6, d17
+    0xeeb07b60,                                 //  vmov.f64      d7, d16
+};
+static const unsigned int kSplice_move_src_dst_lowp[] = {
+    0xeeb04b40,                                 //  vmov.f64      d4, d0
+    0xeeb05b41,                                 //  vmov.f64      d5, d1
+    0xeeb06b42,                                 //  vmov.f64      d6, d2
+    0xeeb07b43,                                 //  vmov.f64      d7, d3
+};
+static const unsigned int kSplice_move_dst_src_lowp[] = {
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+};
+static const unsigned int kSplice_premul_lowp[] = {
+    0xf3500b03,                                 //  vqrdmulh.s16  d16, d0, d3
+    0xf3511b03,                                 //  vqrdmulh.s16  d17, d1, d3
+    0xf3522b03,                                 //  vqrdmulh.s16  d18, d2, d3
+    0xf2433110,                                 //  vand          d19, d3, d0
+    0xf2434111,                                 //  vand          d20, d3, d1
+    0xf3b50320,                                 //  vabs.s16      d0, d16
+    0xf2430112,                                 //  vand          d16, d3, d2
+    0xf3b51321,                                 //  vabs.s16      d1, d17
+    0xf3b52322,                                 //  vabs.s16      d2, d18
+    0xf3910133,                                 //  vsra.u16      d0, d19, #15
+    0xf3911134,                                 //  vsra.u16      d1, d20, #15
+    0xf3912130,                                 //  vsra.u16      d2, d16, #15
+};
+static const unsigned int kSplice_load_8888_lowp[] = {
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xe08cc100,                                 //  add           ip, ip, r0, lsl #2
+    0xf4ec030d,                                 //  vld4.8        {d16[0],d17[0],d18[0],d19[0]}, [ip]!
+    0xf4ec032d,                                 //  vld4.8        {d16[1],d17[1],d18[1],d19[1]}, [ip]!
+    0xf4ec034d,                                 //  vld4.8        {d16[2],d17[2],d18[2],d19[2]}, [ip]!
+    0xf4ec036d,                                 //  vld4.8        {d16[3],d17[3],d18[3],d19[3]}, [ip]!
+    0xf38f0a30,                                 //  vshll.u8      q0, d16, #7
+    0xf38f2a32,                                 //  vshll.u8      q1, d18, #7
+    0xf3cf0a31,                                 //  vshll.u8      q8, d17, #7
+    0xf3cf2a33,                                 //  vshll.u8      q9, d19, #7
+    0xf3980350,                                 //  vrsra.u16     q0, q0, #8
+    0xf3d80370,                                 //  vrsra.u16     q8, q8, #8
+    0xf3d82372,                                 //  vrsra.u16     q9, q9, #8
+    0xf3982352,                                 //  vrsra.u16     q1, q1, #8
+    0xf22011b0,                                 //  vorr          d1, d16, d16
+    0xf22231b2,                                 //  vorr          d3, d18, d18
+};
+static const unsigned int kSplice_store_8888_lowp[] = {
+    0xf2630113,                                 //  vorr          d16, d3, d3
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xf2612111,                                 //  vorr          d18, d1, d1
+    0xf3c94910,                                 //  vqshrn.u16    d20, q0, #7
+    0xe08cc100,                                 //  add           ip, ip, r0, lsl #2
+    0xf3c96912,                                 //  vqshrn.u16    d22, q1, #7
+    0xf3c95932,                                 //  vqshrn.u16    d21, q9, #7
+    0xf3c97930,                                 //  vqshrn.u16    d23, q8, #7
+    0xf4cc430d,                                 //  vst4.8        {d20[0],d21[0],d22[0],d23[0]}, [ip]!
+    0xf4cc432d,                                 //  vst4.8        {d20[1],d21[1],d22[1],d23[1]}, [ip]!
+    0xf4cc434d,                                 //  vst4.8        {d20[2],d21[2],d22[2],d23[2]}, [ip]!
+    0xf4cc436d,                                 //  vst4.8        {d20[3],d21[3],d22[3],d23[3]}, [ip]!
+};
+
+#else
+
+static const unsigned char kSplice_inc_x_lowp[] = {
+    0x48,0x83,0xc7,0x10,                        //  add           $0x10,%rdi
+};
+static const unsigned char kSplice_clear_lowp[] = {
+    0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                        //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                        //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                        //  vxorps        %ymm3,%ymm3,%ymm3
+};
+static const unsigned char kSplice_plus__lowp[] = {
+    0xc5,0xfd,0xdd,0xc4,                        //  vpaddusw      %ymm4,%ymm0,%ymm0
+    0xc5,0xf5,0xdd,0xcd,                        //  vpaddusw      %ymm5,%ymm1,%ymm1
+    0xc5,0xed,0xdd,0xd6,                        //  vpaddusw      %ymm6,%ymm2,%ymm2
+    0xc5,0xe5,0xdd,0xdf,                        //  vpaddusw      %ymm7,%ymm3,%ymm3
+};
+static const unsigned char kSplice_srcover_lowp[] = {
+    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc5,0x3d,0xd9,0xc3,                        //  vpsubusw      %ymm3,%ymm8,%ymm8
+    0xc4,0x42,0x5d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm4,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xc0,                        //  vpaddusw      %ymm0,%ymm9,%ymm0
+    0xc4,0x42,0x55,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm5,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xc9,                        //  vpaddusw      %ymm1,%ymm9,%ymm1
+    0xc4,0x42,0x4d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm6,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xd2,                        //  vpaddusw      %ymm2,%ymm9,%ymm2
+    0xc4,0x42,0x45,0x0b,0xc0,                   //  vpmulhrsw     %ymm8,%ymm7,%ymm8
+    0xc4,0x42,0x7d,0x1d,0xc0,                   //  vpabsw        %ymm8,%ymm8
+    0xc5,0xbd,0xdd,0xdb,                        //  vpaddusw      %ymm3,%ymm8,%ymm3
+};
+static const unsigned char kSplice_dstover_lowp[] = {
+    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc5,0x3d,0xd9,0xc7,                        //  vpsubusw      %ymm7,%ymm8,%ymm8
+    0xc4,0x42,0x7d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm0,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xe4,                        //  vpaddusw      %ymm4,%ymm9,%ymm4
+    0xc4,0x42,0x75,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm1,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xed,                        //  vpaddusw      %ymm5,%ymm9,%ymm5
+    0xc4,0x42,0x6d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm2,%ymm9
+    0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
+    0xc5,0xb5,0xdd,0xf6,                        //  vpaddusw      %ymm6,%ymm9,%ymm6
+    0xc4,0x42,0x65,0x0b,0xc0,                   //  vpmulhrsw     %ymm8,%ymm3,%ymm8
+    0xc4,0x42,0x7d,0x1d,0xc0,                   //  vpabsw        %ymm8,%ymm8
+    0xc5,0xbd,0xdd,0xff,                        //  vpaddusw      %ymm7,%ymm8,%ymm7
+};
+static const unsigned char kSplice_clamp_1_lowp[] = {
+    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0xc2,0x7d,0x3a,0xc0,                   //  vpminuw       %ymm8,%ymm0,%ymm0
+    0xc4,0xc2,0x75,0x3a,0xc8,                   //  vpminuw       %ymm8,%ymm1,%ymm1
+    0xc4,0xc2,0x6d,0x3a,0xd0,                   //  vpminuw       %ymm8,%ymm2,%ymm2
+    0xc4,0xc2,0x65,0x3a,0xd8,                   //  vpminuw       %ymm8,%ymm3,%ymm3
+};
+static const unsigned char kSplice_clamp_a_lowp[] = {
+    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0xc2,0x65,0x3a,0xd8,                   //  vpminuw       %ymm8,%ymm3,%ymm3
+    0xc4,0xe2,0x7d,0x3a,0xc3,                   //  vpminuw       %ymm3,%ymm0,%ymm0
+    0xc4,0xe2,0x75,0x3a,0xcb,                   //  vpminuw       %ymm3,%ymm1,%ymm1
+    0xc4,0xe2,0x6d,0x3a,0xd3,                   //  vpminuw       %ymm3,%ymm2,%ymm2
+};
+static const unsigned char kSplice_swap_lowp[] = {
+    0xc5,0x7c,0x28,0xc3,                        //  vmovaps       %ymm3,%ymm8
+    0xc5,0x7c,0x28,0xca,                        //  vmovaps       %ymm2,%ymm9
+    0xc5,0x7c,0x28,0xd1,                        //  vmovaps       %ymm1,%ymm10
+    0xc5,0x7c,0x28,0xd8,                        //  vmovaps       %ymm0,%ymm11
+    0xc5,0xfc,0x28,0xc4,                        //  vmovaps       %ymm4,%ymm0
+    0xc5,0xfc,0x28,0xcd,                        //  vmovaps       %ymm5,%ymm1
+    0xc5,0xfc,0x28,0xd6,                        //  vmovaps       %ymm6,%ymm2
+    0xc5,0xfc,0x28,0xdf,                        //  vmovaps       %ymm7,%ymm3
+    0xc5,0x7c,0x29,0xdc,                        //  vmovaps       %ymm11,%ymm4
+    0xc5,0x7c,0x29,0xd5,                        //  vmovaps       %ymm10,%ymm5
+    0xc5,0x7c,0x29,0xce,                        //  vmovaps       %ymm9,%ymm6
+    0xc5,0x7c,0x29,0xc7,                        //  vmovaps       %ymm8,%ymm7
+};
+static const unsigned char kSplice_move_src_dst_lowp[] = {
+    0xc5,0xfc,0x28,0xe0,                        //  vmovaps       %ymm0,%ymm4
+    0xc5,0xfc,0x28,0xe9,                        //  vmovaps       %ymm1,%ymm5
+    0xc5,0xfc,0x28,0xf2,                        //  vmovaps       %ymm2,%ymm6
+    0xc5,0xfc,0x28,0xfb,                        //  vmovaps       %ymm3,%ymm7
+};
+static const unsigned char kSplice_move_dst_src_lowp[] = {
+    0xc5,0xfc,0x28,0xc4,                        //  vmovaps       %ymm4,%ymm0
+    0xc5,0xfc,0x28,0xcd,                        //  vmovaps       %ymm5,%ymm1
+    0xc5,0xfc,0x28,0xd6,                        //  vmovaps       %ymm6,%ymm2
+    0xc5,0xfc,0x28,0xdf,                        //  vmovaps       %ymm7,%ymm3
+};
+static const unsigned char kSplice_premul_lowp[] = {
+    0xc4,0xe2,0x7d,0x0b,0xc3,                   //  vpmulhrsw     %ymm3,%ymm0,%ymm0
+    0xc4,0xe2,0x7d,0x1d,0xc0,                   //  vpabsw        %ymm0,%ymm0
+    0xc4,0xe2,0x75,0x0b,0xcb,                   //  vpmulhrsw     %ymm3,%ymm1,%ymm1
+    0xc4,0xe2,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm1,%ymm1
+    0xc4,0xe2,0x6d,0x0b,0xd3,                   //  vpmulhrsw     %ymm3,%ymm2,%ymm2
+    0xc4,0xe2,0x7d,0x1d,0xd2,                   //  vpabsw        %ymm2,%ymm2
+};
+static const unsigned char kSplice_load_8888_lowp[] = {
+    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
+    0xc5,0xfa,0x6f,0x04,0xb8,                   //  vmovdqu       (%rax,%rdi,4),%xmm0
+    0xc5,0xfa,0x6f,0x4c,0xb8,0x10,              //  vmovdqu       0x10(%rax,%rdi,4),%xmm1
+    0xc5,0xfa,0x6f,0x54,0xb8,0x20,              //  vmovdqu       0x20(%rax,%rdi,4),%xmm2
+    0xc5,0xfa,0x6f,0x5c,0xb8,0x30,              //  vmovdqu       0x30(%rax,%rdi,4),%xmm3
+    0xc5,0x79,0x60,0xc1,                        //  vpunpcklbw    %xmm1,%xmm0,%xmm8
+    0xc5,0xf9,0x68,0xc1,                        //  vpunpckhbw    %xmm1,%xmm0,%xmm0
+    0xc5,0xe9,0x60,0xcb,                        //  vpunpcklbw    %xmm3,%xmm2,%xmm1
+    0xc5,0xe9,0x68,0xd3,                        //  vpunpckhbw    %xmm3,%xmm2,%xmm2
+    0xc5,0xb9,0x60,0xd8,                        //  vpunpcklbw    %xmm0,%xmm8,%xmm3
+    0xc5,0xb9,0x68,0xc0,                        //  vpunpckhbw    %xmm0,%xmm8,%xmm0
+    0xc5,0x71,0x60,0xc2,                        //  vpunpcklbw    %xmm2,%xmm1,%xmm8
+    0xc5,0xf1,0x68,0xca,                        //  vpunpckhbw    %xmm2,%xmm1,%xmm1
+    0xc5,0xe1,0x60,0xd0,                        //  vpunpcklbw    %xmm0,%xmm3,%xmm2
+    0xc5,0x61,0x68,0xc8,                        //  vpunpckhbw    %xmm0,%xmm3,%xmm9
+    0xc5,0xb9,0x60,0xd9,                        //  vpunpcklbw    %xmm1,%xmm8,%xmm3
+    0xc5,0x39,0x68,0xc1,                        //  vpunpckhbw    %xmm1,%xmm8,%xmm8
+    0xc5,0xe9,0x6c,0xc3,                        //  vpunpcklqdq   %xmm3,%xmm2,%xmm0
+    0xc4,0xe2,0x7d,0x30,0xc0,                   //  vpmovzxbw     %xmm0,%ymm0
+    0xc5,0xf5,0x71,0xf0,0x07,                   //  vpsllw        $0x7,%ymm0,%ymm1
+    0xc5,0xad,0x71,0xd0,0x01,                   //  vpsrlw        $0x1,%ymm0,%ymm10
+    0xc4,0xc1,0x75,0xdd,0xca,                   //  vpaddusw      %ymm10,%ymm1,%ymm1
+    0xc4,0x62,0x7d,0x79,0x11,                   //  vpbroadcastw  (%rcx),%ymm10
+    0xc4,0xc1,0x7d,0xdd,0xc2,                   //  vpaddusw      %ymm10,%ymm0,%ymm0
+    0xc5,0xfd,0x71,0xd0,0x08,                   //  vpsrlw        $0x8,%ymm0,%ymm0
+    0xc5,0xf5,0xdd,0xc0,                        //  vpaddusw      %ymm0,%ymm1,%ymm0
+    0xc5,0xe9,0x6d,0xcb,                        //  vpunpckhqdq   %xmm3,%xmm2,%xmm1
+    0xc4,0xe2,0x7d,0x30,0xc9,                   //  vpmovzxbw     %xmm1,%ymm1
+    0xc5,0xed,0x71,0xf1,0x07,                   //  vpsllw        $0x7,%ymm1,%ymm2
+    0xc5,0xe5,0x71,0xd1,0x01,                   //  vpsrlw        $0x1,%ymm1,%ymm3
+    0xc5,0xed,0xdd,0xd3,                        //  vpaddusw      %ymm3,%ymm2,%ymm2
+    0xc4,0xc1,0x75,0xdd,0xca,                   //  vpaddusw      %ymm10,%ymm1,%ymm1
+    0xc5,0xf5,0x71,0xd1,0x08,                   //  vpsrlw        $0x8,%ymm1,%ymm1
+    0xc5,0xed,0xdd,0xc9,                        //  vpaddusw      %ymm1,%ymm2,%ymm1
+    0xc4,0xc1,0x31,0x6c,0xd0,                   //  vpunpcklqdq   %xmm8,%xmm9,%xmm2
+    0xc4,0xe2,0x7d,0x30,0xd2,                   //  vpmovzxbw     %xmm2,%ymm2
+    0xc5,0xe5,0x71,0xf2,0x07,                   //  vpsllw        $0x7,%ymm2,%ymm3
+    0xc5,0xa5,0x71,0xd2,0x01,                   //  vpsrlw        $0x1,%ymm2,%ymm11
+    0xc4,0xc1,0x65,0xdd,0xdb,                   //  vpaddusw      %ymm11,%ymm3,%ymm3
+    0xc4,0xc1,0x6d,0xdd,0xd2,                   //  vpaddusw      %ymm10,%ymm2,%ymm2
+    0xc5,0xed,0x71,0xd2,0x08,                   //  vpsrlw        $0x8,%ymm2,%ymm2
+    0xc5,0xe5,0xdd,0xd2,                        //  vpaddusw      %ymm2,%ymm3,%ymm2
+    0xc4,0xc1,0x31,0x6d,0xd8,                   //  vpunpckhqdq   %xmm8,%xmm9,%xmm3
+    0xc4,0xe2,0x7d,0x30,0xdb,                   //  vpmovzxbw     %xmm3,%ymm3
+    0xc5,0xbd,0x71,0xf3,0x07,                   //  vpsllw        $0x7,%ymm3,%ymm8
+    0xc5,0xb5,0x71,0xd3,0x01,                   //  vpsrlw        $0x1,%ymm3,%ymm9
+    0xc4,0x41,0x3d,0xdd,0xc1,                   //  vpaddusw      %ymm9,%ymm8,%ymm8
+    0xc4,0xc1,0x65,0xdd,0xda,                   //  vpaddusw      %ymm10,%ymm3,%ymm3
+    0xc5,0xe5,0x71,0xd3,0x08,                   //  vpsrlw        $0x8,%ymm3,%ymm3
+    0xc5,0xbd,0xdd,0xdb,                        //  vpaddusw      %ymm3,%ymm8,%ymm3
+};
+static const unsigned char kSplice_store_8888_lowp[] = {
+    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
+    0xc5,0x7d,0xdd,0xc0,                        //  vpaddusw      %ymm0,%ymm0,%ymm8
+    0xc4,0xc1,0x3d,0x71,0xd0,0x08,              //  vpsrlw        $0x8,%ymm8,%ymm8
+    0xc4,0x43,0x7d,0x39,0xc1,0x01,              //  vextracti128  $0x1,%ymm8,%xmm9
+    0xc4,0x41,0x39,0x67,0xc1,                   //  vpackuswb     %xmm9,%xmm8,%xmm8
+    0xc5,0x75,0xdd,0xc9,                        //  vpaddusw      %ymm1,%ymm1,%ymm9
+    0xc4,0xc1,0x35,0x71,0xd1,0x08,              //  vpsrlw        $0x8,%ymm9,%ymm9
+    0xc4,0x43,0x7d,0x39,0xca,0x01,              //  vextracti128  $0x1,%ymm9,%xmm10
+    0xc4,0x41,0x31,0x67,0xca,                   //  vpackuswb     %xmm10,%xmm9,%xmm9
+    0xc5,0x6d,0xdd,0xd2,                        //  vpaddusw      %ymm2,%ymm2,%ymm10
+    0xc4,0xc1,0x2d,0x71,0xd2,0x08,              //  vpsrlw        $0x8,%ymm10,%ymm10
+    0xc4,0x43,0x7d,0x39,0xd3,0x01,              //  vextracti128  $0x1,%ymm10,%xmm11
+    0xc4,0x41,0x29,0x67,0xd3,                   //  vpackuswb     %xmm11,%xmm10,%xmm10
+    0xc5,0x65,0xdd,0xdb,                        //  vpaddusw      %ymm3,%ymm3,%ymm11
+    0xc4,0xc1,0x25,0x71,0xd3,0x08,              //  vpsrlw        $0x8,%ymm11,%ymm11
+    0xc4,0x43,0x7d,0x39,0xdc,0x01,              //  vextracti128  $0x1,%ymm11,%xmm12
+    0xc4,0x41,0x21,0x67,0xdc,                   //  vpackuswb     %xmm12,%xmm11,%xmm11
+    0xc4,0x41,0x39,0x60,0xe1,                   //  vpunpcklbw    %xmm9,%xmm8,%xmm12
+    0xc4,0x41,0x39,0x68,0xc1,                   //  vpunpckhbw    %xmm9,%xmm8,%xmm8
+    0xc4,0x41,0x29,0x60,0xcb,                   //  vpunpcklbw    %xmm11,%xmm10,%xmm9
+    0xc4,0x41,0x29,0x68,0xd3,                   //  vpunpckhbw    %xmm11,%xmm10,%xmm10
+    0xc4,0x41,0x19,0x61,0xd9,                   //  vpunpcklwd    %xmm9,%xmm12,%xmm11
+    0xc5,0x7a,0x7f,0x1c,0xb8,                   //  vmovdqu       %xmm11,(%rax,%rdi,4)
+    0xc4,0x41,0x19,0x69,0xc9,                   //  vpunpckhwd    %xmm9,%xmm12,%xmm9
+    0xc5,0x7a,0x7f,0x4c,0xb8,0x10,              //  vmovdqu       %xmm9,0x10(%rax,%rdi,4)
+    0xc4,0x41,0x39,0x61,0xca,                   //  vpunpcklwd    %xmm10,%xmm8,%xmm9
+    0xc5,0x7a,0x7f,0x4c,0xb8,0x20,              //  vmovdqu       %xmm9,0x20(%rax,%rdi,4)
+    0xc4,0x41,0x39,0x69,0xc2,                   //  vpunpckhwd    %xmm10,%xmm8,%xmm8
+    0xc5,0x7a,0x7f,0x44,0xb8,0x30,              //  vmovdqu       %xmm8,0x30(%rax,%rdi,4)
+};
+
+#endif
+
+#endif//SkSplicer_generated_lowp_DEFINED
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h
index 1c530c4..7f4db14 100644
--- a/src/splicer/SkSplicer_shared.h
+++ b/src/splicer/SkSplicer_shared.h
@@ -40,4 +40,9 @@
     float    _00043;       //  0.0043f
 };
 
+struct SkSplicer_constants_lowp {
+    uint16_t _0x0001;      // 0x0001 ==     1 == epsilon
+    uint16_t _1;           // 0x8000 == 32768 == 1.0
+};
+
 #endif//SkSplicer_shared_DEFINED
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index 755bb5d..c61a267 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -119,14 +119,14 @@
 // This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
 // It's just a convenience to make a valid, spliceable Stage, nothing magic.
 #define STAGE(name)                                                           \
-    static void name##_k(size_t x, size_t limit, void* ctx, K* k,             \
+    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
     C void name(size_t x, size_t limit, void* ctx, K* k,                      \
                 F r, F g, F b, F a, F dr, F dg, F db, F da) {                 \
         name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
         done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
     }                                                                         \
-    static void name##_k(size_t x, size_t limit, void* ctx, K* k,             \
+    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 
 // We can now define Stages!
@@ -144,11 +144,15 @@
 //   - lambdas;
 //   - memcpy() with a compile-time constant size argument.
 
+STAGE(inc_x) {
+    x += sizeof(F) / sizeof(float);
+}
+
 STAGE(clear) {
     r = g = b = a = 0;
 }
 
-STAGE(plus) {
+STAGE(plus_) {
     r = r + dr;
     g = g + dg;
     b = b + db;
diff --git a/src/splicer/SkSplicer_stages_lowp.cpp b/src/splicer/SkSplicer_stages_lowp.cpp
new file mode 100644
index 0000000..ef3ab40
--- /dev/null
+++ b/src/splicer/SkSplicer_stages_lowp.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
+// first before trying to understand this one.  We'll note only key differences here.
+
+#include "SkSplicer_shared.h"
+#include <string.h>
+
+#if !defined(__clang__)
+    #error This file is not like the rest of Skia.  It must be compiled with clang.
+#endif
+
+#if defined(__aarch64__)
+    #include <arm_neon.h>
+
+    // In this file, F is a vector of SkFixed15.
+    // See SkFixed15.h for notes on its various operations.
+    struct F {
+        using V = uint16_t __attribute__((ext_vector_type(8)));
+
+        V vec;
+
+        F(uint16x8_t v) : vec(v) {}
+        operator V() const { return vec; }
+
+        F() = default;
+        F(uint16_t v) : vec(v) {}
+
+        F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
+        F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
+        F operator*(F o) const {
+            return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
+                               vandq_s16(vec, o.vec), 15);
+        }
+        F operator>>(int k) const { return vec >> k; }
+        F operator<<(int k) const { return vec << k; }
+    };
+    static F min(F a, F b) { return vminq_u16(a,b); }
+    static F max(F a, F b) { return vmaxq_u16(a,b); }
+
+#elif defined(__ARM_NEON__)
+    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
+        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
+    #endif
+    #include <arm_neon.h>
+
+    struct F {
+        using V = uint16_t __attribute__((ext_vector_type(4)));
+
+        V vec;
+
+        F(uint16x4_t v) : vec(v) {}
+        operator V() const { return vec; }
+
+        F() = default;
+        F(uint16_t v) : vec(v) {}
+
+        F operator+(F o) const { return vqadd_u16(vec, o.vec); }
+        F operator-(F o) const { return vqsub_u16(vec, o.vec); }
+        F operator*(F o) const {
+            return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
+                              vand_s16(vec, o.vec), 15);
+        }
+        F operator>>(int k) const { return vec >> k; }
+        F operator<<(int k) const { return vec << k; }
+    };
+    static F min(F a, F b) { return vmin_u16(a,b); }
+    static F max(F a, F b) { return vmax_u16(a,b); }
+
+#else
+    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
+        #error On x86, compile with -mavx2 -mfma -mf16c.
+    #endif
+    #include <immintrin.h>
+
+    struct F {
+        using V = uint16_t __attribute__((ext_vector_type(16)));
+
+        V vec;
+
+        F(__m256 v) : vec(v) {}
+        operator V() const { return vec; }
+
+        F() = default;
+        F(uint16_t v) : vec(v) {}
+
+        F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
+        F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
+        F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
+        F operator>>(int k) const { return vec >> k; }
+        F operator<<(int k) const { return vec << k; }
+    };
+    static F min(F a, F b) { return _mm256_min_epu16(a,b); }
+    static F max(F a, F b) { return _mm256_max_epu16(a,b); }
+#endif
+
+// No platform actually supports FMA for SkFixed15.
+// This fma() helper just makes it easier to port stages from float to lowp.
+static F fma(F f, F m, F a) { return f*m+a; }
+
+#if defined(__ARM_NEON__)
+    #define C extern "C" __attribute__((pcs("aapcs-vfp")))
+#else
+    #define C extern "C"
+#endif
+
+// We use a set of constants suitable for SkFixed15 math.
+using K = const SkSplicer_constants_lowp;
+using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
+
+// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
+// registers.  This shouldn't affect performance or how you write STAGEs in any way.
+C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);
+
+#define STAGE(name)                                                           \
+    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+    C void name##_lowp(size_t x, size_t limit, void* ctx, K* k,               \
+                       F::V  R, F::V  G, F::V  B, F::V  A,                    \
+                       F::V DR, F::V DG, F::V DB, F::V DA) {                  \
+        F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA;     \
+        name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
+        done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
+    }                                                                         \
+    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+STAGE(inc_x) {
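+    // One F covers 16 pixels on HSW, 8 on aarch64, and 4 on armv7.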
+    x += sizeof(F) / sizeof(uint16_t);
+}
+
+STAGE(clear) {
+    r = g = b = a = 0;
+}
+
+STAGE(plus_) {
+    r = r + dr;
+    g = g + dg;
+    b = b + db;
+    a = a + da;
+}
+
+STAGE(srcover) {
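+    // Classic srcover: src + dst*(1 - src alpha), with 1.0 spelled k->_1.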
+    auto A = F(k->_1) - a;
+    r = fma(dr, A, r);
+    g = fma(dg, A, g);
+    b = fma(db, A, b);
+    a = fma(da, A, a);
+}
+STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }
+
+STAGE(clamp_1) {
+    r = min(r, k->_1);
+    g = min(g, k->_1);
+    b = min(b, k->_1);
+    a = min(a, k->_1);
+}
+
+STAGE(clamp_a) {
+    a = min(a, k->_1);
+    r = min(r, a);
+    g = min(g, a);
+    b = min(b, a);
+}
+
+STAGE(swap) {
+    auto swap = [](F& v, F& dv) {
+        auto tmp = v;
+        v = dv;
+        dv = tmp;
+    };
+    swap(r, dr);
+    swap(g, dg);
+    swap(b, db);
+    swap(a, da);
+}
+STAGE(move_src_dst) {
+    dr = r;
+    dg = g;
+    db = b;
+    da = a;
+}
+STAGE(move_dst_src) {
+    r = dr;
+    g = dg;
+    b = db;
+    a = da;
+}
+
+STAGE(premul) {
+    r = r * a;
+    g = g * a;
+    b = b * a;
+}
+
+STAGE(load_8888) {
+    auto ptr = *(const uint32_t**)ctx + x;
+
+#if defined(__aarch64__)
+    auto to_fixed15 = [](uint8x8_t u8) {
+        // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8  (see SkFixed15.h)
+        //
+        // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
+        // and 255, and never off by more than 1 in between.  Thanks to NEON, it's 2 instructions!
+        auto u16 = vshll_n_u8(u8, 7);      // u16 =  u8*128
+        return vrsraq_n_u16(u16, u16, 8);  // u16 + u16/256, with rounding
+    };
+
+    uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
+    r = to_fixed15(rgba.val[0]);
+    g = to_fixed15(rgba.val[1]);
+    b = to_fixed15(rgba.val[2]);
+    a = to_fixed15(rgba.val[3]);
+
+#elif defined(__ARM_NEON__)
+    auto to_fixed15 = [](uint8x8_t u8) {
+        // Same as aarch64, but only keeping the bottom 4 lanes.
+        auto u16 = vshll_n_u8(u8, 7);
+        return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
+    };
+
+    // I can't get quite the code generation I want using vld4_lane_u8(),
+    // so we're going to drop into assembly to do the loads.  :/
+
+    uint8x8_t R,G,B,A;
+    asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
+        "vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
+        "vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
+        "vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
+        : "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
+    r = to_fixed15(R);
+    g = to_fixed15(G);
+    b = to_fixed15(B);
+    a = to_fixed15(A);
+
+#else
+    auto to_fixed15 = [k](__m128i u8) {
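+        // The canonical u8*128 + u8/2 + (u8+1)>>8 math from SkFixed15.h, done in 16 bits.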
+        F u16 = _mm256_cvtepu8_epi16(u8);
+        return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
+    };
+
+    // TODO: shorter, more confusing, and faster with 256-bit loads and shuffles
+
+    // Load 16 interleaved RGBA pixels.
+    auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
+         _4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
+         _89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
+         _CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);
+
+    // We've got an awful lot of unpacking to do to transpose this...
+    auto _0415 = _mm_unpacklo_epi8(_0123, _4567),  // r04 g04 b04 a04  r15 g15 b15 a15
+         _2637 = _mm_unpackhi_epi8(_0123, _4567),  // r26 g26 b26 a26  r37 g37 b37 a37
+         _8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
+         _AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);
+
+    auto _0246 = _mm_unpacklo_epi8(_0415, _2637),  // r0246 g0246  b0246 a0246
+         _1357 = _mm_unpackhi_epi8(_0415, _2637),  // r1357 g1357  b1357 a1357
+         _8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
+         _9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);
+
+    auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357),  // r01234567 g01234567
+         ba_01234567 = _mm_unpackhi_epi8(_0246, _1357),  // b01234567 a01234567
+         rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF),  // r89ABCDEF g89ABCDEF
+         ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF);  // b89ABCDEF a89ABCDEF
+
+    r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
+    g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
+    b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
+    a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
+#endif
+}
+
+STAGE(store_8888) {
+    auto ptr = *(uint32_t**)ctx + x;
+
+#if defined(__aarch64__)
+    auto from_fixed15 = [](F v) {
+        // The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
+        // But what's really most important is that all bytes round trip.
+
+        // We can do this in NEON in one instruction, a saturating narrowing right shift:
+        return vqshrn_n_u16(v, 7);
+    };
+
+    uint8x8x4_t rgba = {{
+        from_fixed15(r),
+        from_fixed15(g),
+        from_fixed15(b),
+        from_fixed15(a),
+    }};
+    vst4_u8((uint8_t*)ptr, rgba);
+#elif defined(__ARM_NEON__)
+    auto from_fixed15 = [](F v) {
+        // Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
+        F whatever;
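+        // Only lanes 0-3 of the result get stored below, so the high half can hold anything.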
+        return vqshrn_n_u16(vcombine_u16(v, whatever), 7);
+    };
+
+    // As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
+    asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
+        "vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
+        "vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
+        "vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
+        : "+r"(ptr)
+        : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
+        : "memory");
+
+#else
+    auto from_fixed15 = [](F v) {
+        // See the note in aarch64's from_fixed15().  The same roundtrip goal applies here.
+        // Here we take a different approach: (v saturated+ v) >> 8.
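+        // e.g. 0x8000 (1.0) saturates to 0xffff and lands on 0xff; 0x0000 stays 0x00.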
+        v = (v+v) >> 8;
+        return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
+                                _mm256_extracti128_si256(v, 1));
+    };
+
+    auto R = from_fixed15(r),
+         G = from_fixed15(g),
+         B = from_fixed15(b),
+         A = from_fixed15(a);
+
+    auto rg_01234567 = _mm_unpacklo_epi8(R,G),  // rg0 rg1 rg2 ... rg7
+         rg_89ABCDEF = _mm_unpackhi_epi8(R,G),  // rg8 rg9 rgA ... rgF
+         ba_01234567 = _mm_unpacklo_epi8(B,A),
+         ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
+    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
+    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
+    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
+    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
+#endif
+}
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index f124c64..f0cedf2 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -15,6 +15,9 @@
 subprocess.check_call(['clang++'] + cflags + hsw +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'hsw.o'])
+subprocess.check_call(['clang++'] + cflags + hsw +
+                      ['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
+                      ['-o', 'hsw_lowp.o'])
 
 aarch64 = [
     '--target=aarch64-linux-android',
@@ -24,6 +27,9 @@
 subprocess.check_call(['clang++'] + cflags + aarch64 +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'aarch64.o'])
+subprocess.check_call(['clang++'] + cflags + aarch64 +
+                      ['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
+                      ['-o', 'aarch64_lowp.o'])
 
 armv7 = [
     '--target=arm-linux-androideabi',
@@ -35,8 +41,11 @@
 subprocess.check_call(['clang++'] + cflags + armv7 +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'armv7.o'])
+subprocess.check_call(['clang++'] + cflags + armv7 +
+                      ['-c', 'src/splicer/SkSplicer_stages_lowp.cpp'] +
+                      ['-o', 'armv7_lowp.o'])
 
-def parse_object_file(dot_o, array_type, done, target=None):
+def parse_object_file(dst, dot_o, array_type, done, target=None):
   cmd = ['gobjdump', '-d', dot_o]
   if target:
     cmd += ['--target', target]
@@ -48,7 +57,7 @@
     # E.g. 00000000000003a4 <_load_f16>:
     m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
     if m:
-      print 'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
+      print >>dst,'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
       continue
 
     columns = line.split('\t')
@@ -65,33 +74,35 @@
       assert 'rip' not in arg  # TODO: detect on aarch64 too
 
     if code == done:
-      print '};'
+      print >>dst,'};'
       continue
 
     hexed = ''.join('0x'+x+',' for x in code.split(' '))
-    print '    ' + hexed + ' '*(44-len(hexed)) + \
-          '//  ' + inst  + ' '*(14-len(inst))  + args
+    print >>dst,'    ' + hexed + ' '*(44-len(hexed)) + \
+                '//  ' + inst  + ' '*(14-len(inst))  + args
 
-print '''/*
+for suffix in ['', '_lowp']:
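+  # One pass per flavor: write SkSplicer_generated.h and SkSplicer_generated_lowp.h
+  # directly instead of printing to stdout.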
+  with open('src/splicer/SkSplicer_generated%s.h' % suffix, 'w') as f:
+    print >>f,'''/*
  * Copyright 2017 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
 
-#ifndef SkSplicer_generated_DEFINED
-#define SkSplicer_generated_DEFINED
+#ifndef SkSplicer_generated%s_DEFINED
+#define SkSplicer_generated%s_DEFINED
 
 // This file is generated semi-automatically with this command:
-//   $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+//   $ src/splicer/build_stages.py
 
 #if defined(__aarch64__)
-'''
-parse_object_file('aarch64.o', 'unsigned int', '14000000')
-print '\n#elif defined(__ARM_NEON__)\n'
-parse_object_file('armv7.o', 'unsigned int', 'eafffffe',
-                  target='elf32-littlearm')
-print '\n#else\n'
-parse_object_file('hsw.o', 'unsigned char', 'e9 00 00 00 00')
-print '\n#endif\n'
-print '#endif//SkSplicer_generated_DEFINED'
+''' % (suffix, suffix)
+    parse_object_file(f, 'aarch64%s.o' % suffix, 'unsigned int', '14000000')
+    print >>f,'\n#elif defined(__ARM_NEON__)\n'
+    parse_object_file(f, 'armv7%s.o' % suffix, 'unsigned int', 'eafffffe',
+                      target='elf32-littlearm')
+    print >>f,'\n#else\n'
+    parse_object_file(f, 'hsw%s.o' % suffix, 'unsigned char', 'e9 00 00 00 00')
+    print >>f,'\n#endif\n'
+    print >>f,'#endif//SkSplicer_generated%s_DEFINED' % suffix