Merge changes I683078ff,I426fba9f,I2fcc57ac

* changes:
  Optimisations to 3DLUT assembly.
  Make Blur AArch64 assembly position-independent.
  Add AArch64 assembly for ColorMatrix.
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 9cb4847..d44f872 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -40,8 +40,10 @@
 LOCAL_ASFLAGS_arm64 += -no-integrated-as
 
 #LOCAL_SRC_FILES_arm64 += \
+#    rsCpuIntrinsics_advsimd_3DLUT.S \
 #    rsCpuIntrinsics_advsimd_Blend.S \
 #    rsCpuIntrinsics_advsimd_Blur.S \
+#    rsCpuIntrinsics_advsimd_ColorMatrix.S \
 #    rsCpuIntrinsics_advsimd_YuvToRGB.S
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
@@ -52,9 +54,10 @@
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP
     LOCAL_SRC_FILES_arm += \
     rsCpuIntrinsics_neon.S \
-    rsCpuIntrinsics_neon_ColorMatrix.S \
+    rsCpuIntrinsics_neon_3DLUT.S \
     rsCpuIntrinsics_neon_Blend.S \
     rsCpuIntrinsics_neon_Blur.S \
+    rsCpuIntrinsics_neon_ColorMatrix.S \
     rsCpuIntrinsics_neon_YuvToRGB.S \
     convolve/convolve_copy_neon.s \
     convolve/convolve_avg_neon.s \
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index bfa3f73..c19eca3 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -52,9 +52,10 @@
     mLUT.set(static_cast<Allocation *>(data));
 }
 
-extern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut,
-                                    size_t lut_stride_y, size_t lut_stride_z,
-                                    uint32_t count, const void *constants);
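+/* The assembly returns the number of trailing pixels it did not process, so
+ * the caller advances by count minus the return value.
+ */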
+extern "C" size_t rsdIntrinsic3DLUT_K(void *dst, void const *in, size_t count,
+                                      void const *lut,
+                                      int32_t pitchy, int32_t pitchz,
+                                      int dimx, int dimy, int dimz);
 
 
 void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
@@ -85,21 +86,18 @@
     while (x1 < x2) {
 #if defined(ARCH_ARM_HAVE_VFP)
         if (gArchUseSIMD) {
-            int32_t len = (x2 - x1 - 1) >> 1;
-            if(len > 0) {
-                const short neon_constants[] = {
-                    static_cast<short>(coordMul.x), static_cast<short>(coordMul.y),
-                    static_cast<short>(coordMul.z), 0, 0, 0, 0, static_cast<short>(0xffff),
+            int32_t len = x2 - x1;
+            if(len >= 8) {
+                size_t done;
+                done = len - rsdIntrinsic3DLUT_K(out, in, len,
+                                                 bp, stride_y, stride_z,
+                                                 dims.x, dims.y, dims.z);
 
-                };
-
-                rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
-                x1 += len << 1;
-                out += len << 1;
-                in += len << 1;
+                x1 += done;
+                out += done;
+                in += done;
             }
         }
-
 #endif
 
         int4 baseCoord = convert_int4(*in) * coordMul;
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 87db9ba..5d4241a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,6 +125,32 @@
     } u;
 } Key_t;
 
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
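+/* Table of code fragments used by the AArch64 assembly path.  The
+ * Setup_*_K functions populate it for a given kernel configuration, and the
+ * *_K functions chain through the load, column and store fragments via
+ * indirect branches.
+ */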
+typedef struct {
+    void (*column[4])(void);
+    void (*store)(void);
+    void (*load)(void);
+} FunctionTab_t;
+
+extern "C" size_t rsdIntrinsicColorMatrix_int_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             int16_t const *mult, int32_t const *add);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+             FunctionTab_t const *fns,
+             uint32_t mask, int dt, int st);
+
+extern "C" size_t rsdIntrinsicColorMatrix_float_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             float const *mult, float const *add);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
+             FunctionTab_t const *fns,
+             uint32_t mask, int dt, int st);
+#endif
+
 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
 public:
     virtual void populateScript(Script *);
@@ -146,9 +172,12 @@
     // The following four fields are read as constants
     // by the SIMD assembly code.
     short ip[16];
-    int ipa[16];
+    int ipa[4];
     float tmpFp[16];
-    float tmpFpa[16];
+    float tmpFpa[4];
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+    FunctionTab_t mFnTab;
+#endif
 
     static void kernel(const RsForEachStubParamStruct *p,
                        uint32_t xstart, uint32_t xend,
@@ -212,9 +241,9 @@
             }
         }
         if (ipa[0] != 0) key.u.addMask |= 0x1;
-        if (ipa[4] != 0) key.u.addMask |= 0x2;
-        if (ipa[8] != 0) key.u.addMask |= 0x4;
-        if (ipa[12] != 0) key.u.addMask |= 0x8;
+        if (ipa[1] != 0) key.u.addMask |= 0x2;
+        if (ipa[2] != 0) key.u.addMask |= 0x4;
+        if (ipa[3] != 0) key.u.addMask |= 0x8;
     }
 
     // Look for a dot product where the r,g,b colums are the same
@@ -257,13 +286,16 @@
     case 3:
         key.u.outVecSize = 2;
         key.u.coeffMask &= ~0x8888;
+        key.u.addMask &= 7;
         break;
     case 2:
         key.u.outVecSize = 1;
         key.u.coeffMask &= ~0xCCCC;
+        key.u.addMask &= 3;
         break;
     default:
         key.u.coeffMask &= ~0xEEEE;
+        key.u.addMask &= 1;
         break;
     }
 
@@ -278,7 +310,7 @@
     return key;
 }
 
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
 
 #define DEF_SYM(x)                                  \
     extern "C" uint32_t _N_ColorMatrix_##x;      \
@@ -408,7 +440,7 @@
 
 
 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
     mBufSize = 4096;
     //StopWatch build_time("rs cm: build time");
     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -676,18 +708,12 @@
     float add = 0.f;
     if (fpMul > 254.f) add = 0.5f;
     for(int ct=0; ct < 4; ct++) {
-        tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
+        tmpFpa[ct] = fpa[ct] * addMul + add;
         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
-        tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
-        tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
-        tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
     }
 
     for(int ct=0; ct < 4; ct++) {
-        ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
-        ipa[ct * 4 + 1] = ipa[ct * 4];
-        ipa[ct * 4 + 2] = ipa[ct * 4];
-        ipa[ct * 4 + 3] = ipa[ct * 4];
+        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
     }
 }
 
@@ -768,9 +794,9 @@
     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
 
     sum.x += add[0];
-    sum.y += add[4];
-    sum.z += add[8];
-    sum.w += add[12];
+    sum.y += add[1];
+    sum.z += add[2];
+    sum.w += add[3];
 
 
     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
@@ -826,12 +852,27 @@
     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
 
     if(x2 > x1) {
-        int32_t len = (x2 - x1) >> 2;
-        if((cp->mOptKernel != NULL) && (len > 0)) {
-            cp->mOptKernel(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += outstep * (len << 2);
-            in += instep * (len << 2);
+        int32_t len = x2 - x1;
+        if (gArchUseSIMD) {
+            if((cp->mOptKernel != NULL) && (len >= 4)) {
+                cp->mOptKernel(out, in, cp->ip, len >> 2);
+                x1 += len;
+                out += outstep * len;
+                in += instep * len;
+            }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+            else {
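+                /* Fall back to the generic assembly kernels: the float
+                 * variant is used when either the input or the output is
+                 * floating point.  Both variants return the number of
+                 * trailing pixels left unprocessed. */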
+                size_t done;
+                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
+                    done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                } else {
+                    done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                }
+                x1 += done;
+                out += outstep * done;
+                in += instep * done;
+            }
+#endif
         }
 
         while(x1 != x2) {
@@ -872,8 +913,29 @@
         mOptKernel = NULL;
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
-            mLastKey = key;
         }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+        else {
+            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            uint32_t mm = 0;
+            int i;
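+            /* Pack a five-bit descriptor for each matrix column: bits 0-3
+             * are the coefficient mask for that column (the multiply by
+             * 0x249 gathers coeffMask bits i, i+4, i+8 and i+12 into
+             * adjacent bits), and bit 4 is set if the column has an
+             * additive term. */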
+            for (i = 0; i < 4; i++)
+            {
+                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
+                m = ((m * 0x249) >> 9) & 15;
+                m |= ((key.u.addMask >> i) & 1) << 4;
+                mm |= m << (i * 5);
+            }
+
+            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
+                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+            } else {
+                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+            }
+        }
+#endif
+        mLastKey = key;
     }
 }
 
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
new file mode 100644
index 0000000..ebceb24
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
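+/* Interpolate a pair of output lanes: fetch the 2x2x2 texel neighbourhood at
+ * table offsets \src0 and \src1, blend along Y using \yr0/\yr1, then along Z
+ * using \zr0/\zr1, then along X using \xr0/\xr1, and pack the result into
+ * \dst.
+ */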
+.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
+
+            smov        x6, \src0
+            smov        x7, \src1
+
+            add         x6, x6, x3
+            add         x7, x7, x3
+
+            ld1         {v16.2s}, [x6], x4
+            ld1         {v17.2s}, [x7], x4
+
+            ld1         {v18.2s}, [x6], x5
+            ld1         {v19.2s}, [x7], x5
+
+            dup         v8.8b, \yr0
+            dup         v9.8b, \yr1
+            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
+            zip1        v12.16b, v5.16b, v16.16b
+            zip1        v13.16b, v5.16b, v17.16b
+            umlsl       v12.8h, v16.8b, v8.8b
+            umlsl       v13.8h, v17.8b, v9.8b
+            umlal       v12.8h, v18.8b, v8.8b
+            umlal       v13.8h, v19.8b, v9.8b
+
+            ld1         {v18.2s}, [x6]
+            ld1         {v19.2s}, [x7]
+
+            sub         x6, x6, x4
+            sub         x7, x7, x4
+
+            ld1         {v16.2s}, [x6]
+            ld1         {v17.2s}, [x7]
+
+            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
+            zip1        v14.16b, v5.16b, v16.16b
+            zip1        v15.16b, v5.16b, v17.16b
+            umlsl       v14.8h, v16.8b, v8.8b
+            umlsl       v15.8h, v17.8b, v9.8b
+            umlal       v14.8h, v18.8b, v8.8b
+            umlal       v15.8h, v19.8b, v9.8b
+
+            /* Z interpolate, lane 0 v12/v14 -> v10 */
+            ushll       v8.4s, v12.4h, #8
+            ushll2      v9.4s, v12.8h, #8
+            umlsl       v8.4s, v12.4h, \zr0
+            umlsl2      v9.4s, v12.8h, \zr0
+            umlal       v8.4s, v14.4h, \zr0
+            umlal2      v9.4s, v14.8h, \zr0
+            rshrn       v10.4h, v8.4s, #8
+            rshrn2      v10.8h, v9.4s, #8
+
+            /* Z interpolate, lane 1 v13/v15 -> v11 */
+            ushll       v8.4s, v13.4h, #8
+            ushll2      v9.4s, v13.8h, #8
+            umlsl       v8.4s, v13.4h, \zr1
+            umlsl2      v9.4s, v13.8h, \zr1
+            umlal       v8.4s, v15.4h, \zr1
+            umlal2      v9.4s, v15.8h, \zr1
+            rshrn       v11.4h, v8.4s, #8
+            rshrn2      v11.8h, v9.4s, #8
+
+            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
+            ushll       v8.4s, v10.4h, #8
+            ushll       v9.4s, v11.4h, #8
+            umlsl       v8.4s, v10.4h, \xr0
+            umlsl       v9.4s, v11.4h, \xr1
+            umlal2      v8.4s, v10.8h, \xr0
+            umlal2      v9.4s, v11.8h, \xr1
+            shrn        v14.4h, v8.4s, #8
+            shrn2       v14.8h, v9.4s, #8
+
+            /* pack lanes 0 and 1 -> \dst */
+.ifc \dst, v20.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else ; .ifc \dst, v21.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else
+            uqrshrn     \dst, v14.8h, #8
+.endif ; .endif
+.endm
+
+/* size_t rsdIntrinsic3DLUT_K(
+ *          void *dst,          // x0
+ *          void const *in,     // x1
+ *          size_t count,       // x2
+ *          void const *lut,    // x3
+ *          int32_t pitchy,     // w4
+ *          int32_t pitchz,     // w5
+ *          int dimx,           // w6
+ *          int dimy,           // w7
+ *          int dimz);          // [sp]
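+ *
+ * Pixels are consumed eight at a time; the return value is the number of
+ * trailing pixels left unprocessed.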
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+            ldr         w8, [sp]
+            stp         d8, d9, [sp, #-64]!
+            stp         d10, d11, [sp, #16]
+            stp         d12, d13, [sp, #32]
+            stp         d14, d15, [sp, #48]
+            movi        v4.8b, #1
+            ins         v4.h[0], w6
+            ins         v4.h[1], w7
+            ins         v4.h[2], w8
+            ins         v4.s[2], w4
+            ins         v4.s[3], w5
+            movi        v5.16b, #0
+
+            b           2f
+
+            .align 6
+/* x0  = dst
+ * x1  = src
+ * x2  = count
+ * x3  = lut
+ * x4  = pitchy
+ * x5  = pitchz
+ * x6 = offset0
+ * x7 = offset1
+ */
+
+1:          ld4         {v0.8b-v3.8b}, [x1], #32
+/* v0,v1,v2,v3 source data
+ * v4 dimensions and pitches
+ */
+            uxtl        v0.8h, v0.8b
+            uxtl        v1.8h, v1.8b
+            uxtl        v2.8h, v2.8b
+            mul         v0.8h, v0.8h, v4.h[0]
+            mul         v1.8h, v1.8h, v4.h[1]
+            mul         v2.8h, v2.8h, v4.h[2]
+
+/* ursra below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero.  Strictly this is
+ * correct, except for the illegal access problem.
+ */
+            usra        v0.8h, v0.8h, #8
+            usra        v1.8h, v1.8h, #8
+            usra        v2.8h, v2.8h, #8
+
+            ushr        v12.8h, v0.8h, #8
+            ushr        v13.8h, v1.8h, #8
+            ushr        v14.8h, v2.8h, #8
+            bic         v0.8h, #0xff, LSL #8
+            xtn         v1.8b, v1.8h
+            bic         v2.8h, #0xff, LSL #8
+
+/* v0.8h,v1.8b,v2.8h fractional offset
+ * v12.8h,v13.8h,v14.8h integer offset
+ */
+
+            ushll       v6.4s, v12.4h, #2
+            ushll2      v7.4s, v12.8h, #2
+            uxtl        v8.4s, v13.4h
+            uxtl2       v9.4s, v13.8h
+            uxtl        v10.4s, v14.4h
+            uxtl2       v11.4s, v14.8h
+            mla         v6.4s, v8.4s,  v4.s[2]
+            mla         v7.4s, v9.4s,  v4.s[2]
+            mla         v6.4s, v10.4s, v4.s[3]
+            mla         v7.4s, v11.4s, v4.s[3]
+
+/* v6,v7 list of table offsets */
+
+        /* lanes 0 and 1 */
+            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
+
+        /* lanes 2 and 3 */
+            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
+
+        /* lanes 4 and 5 */
+            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
+
+        /* lanes 6 and 7 */
+            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
+
+            uzp1        v6.16b, v20.16b, v21.16b
+            uzp2        v7.16b, v20.16b, v21.16b
+            uzp1        v20.16b, v6.16b, v7.16b
+            uzp2        v22.16b, v6.16b, v7.16b
+            mov         v21.d[0], v20.d[1]
+            mov         v23.8b, v3.8b
+
+            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+
+2:          subs        x2, x2, #8
+            bhs         1b
+            add         x0, x2, #8
+            ldp         d14, d15, [sp, #48]
+            ldp         d12, d13, [sp, #32]
+            ldp         d10, d11, [sp, #16]
+            ldp         d8, d9, [sp], #64
+            ret
+END(rsdIntrinsic3DLUT_K)
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 202f903..c4a85c2 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -15,6 +15,7 @@
  */
 
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define PRIVATE(f) .text; .align 4; .type f,#function; f:
 #define END(f) .size f, .-f;
 
 .set FRACTION_BITS, 7
@@ -54,7 +55,7 @@
  *      q0-q3 -- coefficient table
  *      x13 = -pitch
  *      x15 = top-row in
- *      x16 = bottom-row in
+ *      x19 = bottom-row in
  * Output:
  *      x1 += 16
  *      q10,q11 -- 16 convolved columns
@@ -82,7 +83,7 @@
             umull       v12.4s, v14.4h, v0.h[0]
     ifcc    sub         \reg, \reg, x5, LSL #6
             umull2      v13.4s, v14.8h, v0.h[0]
-            mov         x11, x16
+            mov         x11, x19
             umull       v14.4s, v15.4h, v0.h[0]
     ifcc    add         \reg, \reg, x5, LSL #3
             umull2      v15.4s, v15.8h, v0.h[0]
@@ -101,7 +102,7 @@
             uaddl       v16.8h, v10.8b, v11.8b
     ifcc    cmp         x7, #i
             uaddl2      v11.8h, v10.16b, v11.16b
-    ifcc    csel        x11, x16, x11, lo
+    ifcc    csel        x11, x19, x11, lo
             umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
             umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
 //            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
@@ -123,7 +124,7 @@
             uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
             add         x15, x15, #16
             uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
-            add         x16, x16, #16
+            add         x19, x19, #16
             uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
             uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
 .endm /*}}}*/
@@ -142,16 +143,16 @@
  * more data that won't be used and it means that rotating the window involves
  * more mov operations.
  *
- * When the buffer gets too big the buffer at [r9] is used.
+ * When the buffer gets too big the buffer at [x9] is used.
  *
  * Input:
 *      q4-q11 -- convolution window
- *      r9 -- pointer to additional convolution window data
+ *      x9 -- pointer to additional convolution window data
  * Output:
- *      r9 -- updated buffer pointer (if used)
+ *      x9 -- updated buffer pointer (if used)
  *      d31 -- result to be stored
  * Modifies:
- *      r12 -- temp buffer pointer
+ *      x12 -- temp buffer pointer
  *      q12-q13 -- temporaries for load and vext operations.
  *      q14-q15 -- intermediate sums
  */
@@ -160,17 +161,19 @@
             umull       v14.4s, v9.4h, v0.h[0]
             umull2      v15.4s, v9.8h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
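+            /* Branch via a table of 16-bit offsets relative to the table
+             * itself rather than absolute addresses, so that the code needs
+             * no load-time relocation.  The leading entry keeps the lookup
+             * one-based, as the old 199f-8 base address did. */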
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
-            .xword 107f
-            .xword 108f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
+            .hword 107f-100b
+            .hword 108f-100b
             .align      4
     108:    umlal       v14.4s, v8.4h, v1.h[0]
             umlal2      v15.4s, v8.8h, v1.h[0]
@@ -232,25 +235,27 @@
             umull       v14.4s, v8.4h, v0.h[0]
             umull2      v15.4s, v8.8h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
-            .xword 107f
-            .xword 108f
-            .xword 109f
-            .xword 110f
-            .xword 111f
-            .xword 112f
-            .xword 113f
-            .xword 114f
-            .xword 115f
-            .xword 116f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
+            .hword 107f-100b
+            .hword 108f-100b
+            .hword 109f-100b
+            .hword 110f-100b
+            .hword 111f-100b
+            .hword 112f-100b
+            .hword 113f-100b
+            .hword 114f-100b
+            .hword 115f-100b
+            .hword 116f-100b
             .align 4
     116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
             //ext         v13.16b, v10.16b, v11.16b, #0*2
@@ -365,34 +370,36 @@
             umull       v14.4s, v12.4h, v0.h[0]
             umull2      v15.4s, v12.8h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
-            .xword 107f
-            .xword 108f
-            .xword 109f
-            .xword 110f
-            .xword 111f
-            .xword 112f
-            .xword 113f
-            .xword 114f
-            .xword 115f
-            .xword 116f
-            .xword 117f
-            .xword 118f
-            .xword 119f
-            .xword 120f
-            .xword 121f
-            .xword 122f
-            .xword 123f
-            .xword 124f
-            .xword 125f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
+            .hword 107f-100b
+            .hword 108f-100b
+            .hword 109f-100b
+            .hword 110f-100b
+            .hword 111f-100b
+            .hword 112f-100b
+            .hword 113f-100b
+            .hword 114f-100b
+            .hword 115f-100b
+            .hword 116f-100b
+            .hword 117f-100b
+            .hword 118f-100b
+            .hword 119f-100b
+            .hword 120f-100b
+            .hword 121f-100b
+            .hword 122f-100b
+            .hword 123f-100b
+            .hword 124f-100b
+            .hword 125f-100b
             .align 4
     125:    ext         v12.16b, v3.16b, v4.16b, #6*2
             ext         v13.16b, v10.16b, v11.16b, #0*2
@@ -564,15 +571,17 @@
             umull       v14.4s, v7.4h, v0.h[0]
             umull2      v15.4s, v7.8h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
             .align      4
     106:    umlal       v14.4s, v4.4h,  v0.h[6]
             umlal2      v15.4s, v4.8h,  v0.h[6]
@@ -616,21 +625,23 @@
             umull       v14.4s, v4.4h, v0.h[0]
             umull2      v15.4s, v4.8h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
-            .xword 107f
-            .xword 108f
-            .xword 109f
-            .xword 110f
-            .xword 111f
-            .xword 112f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
+            .hword 107f-100b
+            .hword 108f-100b
+            .hword 109f-100b
+            .hword 110f-100b
+            .hword 111f-100b
+            .hword 112f-100b
             .align 4
     112:    add         x12, x9, #0x1a0
             bic         x12, x12, #0x200
@@ -751,34 +762,36 @@
             umull       v14.4s, v12.4h, v0.h[0]
             umull       v15.4s, v13.4h, v0.h[0]
 
-            adr         x12, 199f-8
-            ldr         x12, [x12, x5, LSL #3]
+            adr         x16, 100f
+            ldrsh       x12, [x16, x5, LSL #1]
+            add         x12, x12, x16
             br          x12
-   199:     .xword 101f
-            .xword 102f
-            .xword 103f
-            .xword 104f
-            .xword 105f
-            .xword 106f
-            .xword 107f
-            .xword 108f
-            .xword 109f
-            .xword 110f
-            .xword 111f
-            .xword 112f
-            .xword 113f
-            .xword 114f
-            .xword 115f
-            .xword 116f
-            .xword 117f
-            .xword 118f
-            .xword 119f
-            .xword 120f
-            .xword 121f
-            .xword 122f
-            .xword 123f
-            .xword 124f
-            .xword 125f
+   100:     .hword -4
+            .hword 101f-100b
+            .hword 102f-100b
+            .hword 103f-100b
+            .hword 104f-100b
+            .hword 105f-100b
+            .hword 106f-100b
+            .hword 107f-100b
+            .hword 108f-100b
+            .hword 109f-100b
+            .hword 110f-100b
+            .hword 111f-100b
+            .hword 112f-100b
+            .hword 113f-100b
+            .hword 114f-100b
+            .hword 115f-100b
+            .hword 116f-100b
+            .hword 117f-100b
+            .hword 118f-100b
+            .hword 119f-100b
+            .hword 120f-100b
+            .hword 121f-100b
+            .hword 122f-100b
+            .hword 123f-100b
+            .hword 124f-100b
+            .hword 125f-100b
             .align 4
     125:    add         x12, x9, #0x0d0
             bic         x12, x12, #0x200
@@ -1043,7 +1056,7 @@
 /* Dedicated function wrapper for the fetch macro, for the cases where
  * performance isn't that important, to keep code size down.
  */
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
             stp         x10, x11, [sp, #-16]!
             fetch
             ldp         x10, x11, [sp], #16
@@ -1055,10 +1068,10 @@
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
  */
-ENTRY(prefetch_clamp1)
+PRIVATE(prefetch_clamp1)
             sub         x11, xzr, x11
             sub         x15, x15, x1
-            sub         x16, x16, x1
+            sub         x19, x19, x1
             tbz         x11, #3, 1f
             mov         v11.16b, v10.16b
             sub         x1, x1, #16
@@ -1084,14 +1097,14 @@
             mov         v11.16b, v12.16b
 1:          sub         x11, xzr, x11
             add         x15, x15, x1
-            add         x16, x16, x1
+            add         x19, x19, x1
             ret
 END(prefetch_clamp1)
 
-ENTRY(prefetch_clamp4)
+PRIVATE(prefetch_clamp4)
             sub         x11, xzr, x11
             sub         x15, x15, x1
-            sub         x16, x16, x1
+            sub         x19, x19, x1
             tbz         x11, #3, 1f
             sub         x1, x1, #16     // what's this?
             mov         v11.16b, v10.16b
@@ -1105,7 +1118,7 @@
             mov         v11.16b, v12.16b
 1:          sub         x11, xzr, x11
             add         x15, x15, x1
-            add         x16, x16, x1
+            add         x19, x19, x1
             ret
 END(prefetch_clamp4)
 
@@ -1174,7 +1187,7 @@
  *      x9 -- buffer (if needed)
  *      x13 = -pitch
  *      x15 = top-row in
- *      x16 = bottom-row in
+ *      x19 = bottom-row in
  * Output:
  *      x1 += rlf + min(count, rrt)
  * Modifies:
@@ -1221,11 +1234,11 @@
   .endif
 1:          sub         x1, x1, x10
             sub         x15, x15, x10
-            sub         x16, x16, x10
+            sub         x19, x19, x10
             bic         x10, x10, #15
             add         x1, x1, x10
             add         x15, x15, x10
-            add         x16, x16, x10
+            add         x19, x19, x10
 2:
   .if \step > 1
             /* it's only in the uchar2 and uchar4 cases where the register file
@@ -1276,7 +1289,7 @@
  *      x9 = buffer
  *      x13 = -pitch
  *      x15 = top-row in
- *      x16 = bottom-row in
+ *      x19 = bottom-row in
  * Modifies
  *      x8 = fetch code pointer
  */
@@ -1324,10 +1337,10 @@
 
 1:          sub         x1, x1, #16
             sub         x15, x15, #16
-            sub         x16, x16, #16
+            sub         x19, x19, #16
             add         x1, x1, x4
             add         x15, x15, x4
-            add         x16, x16, x4
+            add         x19, x19, x4
             bl          fetch_generic_asm
 
   .if \step==1
@@ -1373,7 +1386,7 @@
 .endm
 
 .irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
             stp         x29,x30, [sp, #-16]!
 
             prefetch    step=1, max_r=\r
@@ -1386,7 +1399,7 @@
 .endr
 
 .irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
             sub         x12, sp, #0x200
             bic         x9, x12, #0x3fc
             mov         sp, x9
@@ -1421,17 +1434,13 @@
  *                  uint16_t *tab); // [sp,#8]
  */
 ENTRY(rsdIntrinsicBlurU1_K)
-            stp         x16,x30, [sp, #-80]!
-            stp         x14,x15, [sp, #16]
-            stp         x12,x13, [sp, #32]
-            stp         x10,x11, [sp, #48]
-            stp         x8,x9, [sp, #64]
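+            /* Only x19 and the link register need to be preserved here: x19
+             * is callee-saved and replaces x16 as the bottom-row pointer,
+             * since x16 is now used as a scratch register by the
+             * position-independent jump tables. */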
+            stp         x19,x30, [sp, #-16]!
             sub         x8, sp, #32
             sub         sp, sp, #64
             st1         {v8.1d - v11.1d}, [sp]
             st1         {v12.1d - v15.1d}, [x8]
             mov         x8, x5        // x
-            ldr         w5, [sp,#144] // r
+            ldr         w5, [sp,#80]  // r
             sub         x9, x2, x8
             sub         x10, x3, x6
             mov         x2, x4        // pitch
@@ -1439,7 +1448,7 @@
             sub         x7, x10, #1
             sub         x9, x9, x3
 
-            ldr         x12, [sp, #152] // tab
+            ldr         x12, [sp, #88] // tab
 
             add         x0, x0, x8
             add         x1, x1, x8
@@ -1460,7 +1469,7 @@
 
             sub         x13, xzr, x2
             msub        x15, x2, x6, x1
-            madd        x16, x2, x7, x1
+            madd        x19, x2, x7, x1
 
             ld1         {v0.8h,v1.8h}, [x12], #32
             ld1         {v2.8h,v3.8h}, [x12], #32
@@ -1474,11 +1483,7 @@
 
 1:          ld1         {v8.1d - v11.1d}, [sp], #32
             ld1         {v12.1d - v15.1d}, [sp], #32
-            ldp         x8,x9, [sp, #64]
-            ldp         x10,x11, [sp, #48]
-            ldp         x12,x13, [sp, #32]
-            ldp         x14,x15, [sp, #16]
-            ldp         x12,x30, [sp], #80
+            ldp         x19,x30, [sp], #16
             ret
 END(rsdIntrinsicBlurU1_K)
 
@@ -1495,17 +1500,13 @@
  *                  uint16_t *tab); // [sp,#8]
  */
 ENTRY(rsdIntrinsicBlurU4_K)
-            stp         x16,x30, [sp, #-80]!
-            stp         x14,x15, [sp, #16]
-            stp         x12,x13, [sp, #32]
-            stp         x10,x11, [sp, #48]
-            stp         x8,x9, [sp, #64]
+            stp         x19,x30, [sp, #-16]!
             sub         x8, sp, #32
             sub         sp, sp, #64
             st1         {v8.1d - v11.1d}, [sp]
             st1         {v12.1d - v15.1d}, [x8]
             mov         x8, x5        // x
-            ldr         w5, [sp,#144] // r
+            ldr         w5, [sp,#80]  // r
             sub         x9, x2, x8
             sub         x10, x3, x6
             mov         x2, x4        // pitch
@@ -1513,7 +1514,7 @@
             sub         x7, x10, #1
             sub         x9, x9, x3
 
-            ldr         x12, [sp, #152]
+            ldr         x12, [sp, #88]
 
             add         x0, x0, x8, LSL #2
             add         x1, x1, x8, LSL #2
@@ -1535,7 +1536,7 @@
 
             sub         x13, xzr, x2
             msub        x15, x2, x6, x1
-            madd        x16, x2, x7, x1
+            madd        x19, x2, x7, x1
 
             ld1         {v0.8h,v1.8h}, [x12], #32
             ld1         {v2.8h,v3.8h}, [x12], #32
@@ -1549,10 +1550,6 @@
 
 1:          ld1         {v8.1d - v11.1d}, [sp], #32
             ld1         {v12.1d - v15.1d}, [sp], #32
-            ldp         x8,x9, [sp, #64]
-            ldp         x10,x11, [sp, #48]
-            ldp         x12,x13, [sp, #32]
-            ldp         x14,x15, [sp, #16]
-            ldp         x12,x30, [sp], #80
+            ldp         x19,x30, [sp], #16
             ret
 END(rsdIntrinsicBlurU4_K)
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
new file mode 100644
index 0000000..7a6d4c5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -0,0 +1,838 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
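+/* Emit one term of a column sum only when the \mask bit is set in \i.  The
+ * first live term of a column becomes a plain multiply (or a move, for the
+ * additive term) and later terms become multiply-accumulates, so unused
+ * coefficients cost nothing.
+ */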
+.macro vmxx_f32 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fmla            \opd, \opa, \opb
+    .else
+        fmul            \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fadd            \opd, \opa, \opb
+    .else
+        mov             \stupidsyntax1, \stupidsyntax2
+    .endif
+  .endif
+.endm
+
+.macro vmxx_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal           \opd, \opa, \opb
+    .else
+        smull           \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+.macro vmxx2_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal2          \opd, \opa, \opb
+    .else
+        smull2          \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
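+/* Generate a specialised routine for each column and each of the 32 possible
+ * combinations of the four coefficient bits plus the bias bit.  The \i loop
+ * covers masks 0-15 directly and masks 16-31 via the complemented (\i^31)
+ * variants.
+ */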
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.align 6
+colormatrix_int_col0_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+colormatrix_int_col0_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+.align 6
+colormatrix_int_col1_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+colormatrix_int_col1_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+.align 6
+colormatrix_int_col2_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+colormatrix_int_col2_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+.align 6
+colormatrix_int_col3_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+colormatrix_int_col3_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+.align 5
+colormatrix_float_col0_\i:
+            vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
+            vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
+            br          x5
+
+.align 4
+colormatrix_float_col0_n\i:
+            vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
+            vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
+            br          x5
+
+.align 5
+colormatrix_float_col1_\i:
+            vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
+            vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
+            br          x6
+
+.align 4
+colormatrix_float_col1_n\i:
+            vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
+            vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
+            br          x6
+
+.align 5
+colormatrix_float_col2_\i:
+            vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
+            vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
+            br          x7
+
+.align 4
+colormatrix_float_col2_n\i:
+            vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
+            vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
+            br          x7
+
+.align 5
+colormatrix_float_col3_\i:
+            vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
+            vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
+            br          x8
+
+.align 4
+colormatrix_float_col3_n\i:
+            vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
+            vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
+            br          x8
+
+.endr
+
+.align 6
+colormatrix_float_ldu4:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v23.8h, v23.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl        v15.4s, v23.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            uxtl2       v23.4s, v23.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v15.4s, v15.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            ucvtf       v23.4s, v23.4s
+            br          x4
+
+.align 5
+colormatrix_int_ldu4:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            uxtl        v15.8h, v15.8b
+            br          x4
+
+.align 6
+colormatrix_float_ldu3:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            br          x4
+
+colormatrix_int_ldu3:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            br          x4
+
+.align 5
+colormatrix_float_ldu1:
+            ld1         {v20.8b}, [x1], #8
+            uxtl        v20.8h, v20.8b
+            uxtl        v12.4s, v20.4h
+            uxtl2       v20.4s, v20.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v20.4s, v20.4s
+            br          x4
+
+.align 6
+colormatrix_float_ldu2:
+            ld2         {v20.8b,v21.8b}, [x1], #16
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            br          x4
+
+.align 4
+colormatrix_int_ldu2:
+            ld2         {v12.8b,v13.8b}, [x1], #16
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            br          x4
+
+.align 6
+colormatrix_float_stu4:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v27.4s, v11.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            fcvtzs      v31.4s, v19.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun    v27.4h, v27.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            sqrshrun2   v27.8h, v31.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            uqxtn       v27.8b, v27.8h
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_int_stu4:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            uqxtn       v15.8b, v11.8h
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+.align 6
+colormatrix_float_stu3:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            movi        v27.8b, #0
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+.align 4
+colormatrix_int_ldu1:
+            ld1         {v12.8b}, [x1], #8
+            uxtl        v12.8h, v12.8b
+            br          x4
+
+.align 5
+colormatrix_int_stu3:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            movi        v15.8b, #0
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+.align 6
+colormatrix_float_stu2:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            subs        x2, x2, #8
+            st2         {v24.8b,v25.8b}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_int_stu2:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            subs        x2, x2, #8
+            st2         {v12.8b,v13.8b}, [x0], #16
+            blo         colormatrix_int_end
+            br          x9
+
+.align 5
+colormatrix_int_stu1:
+            uqxtn       v12.8b, v8.8h
+            subs        x2, x2, #8
+            st1         {v12.8b}, [x0], #8
+            blo         colormatrix_int_end
+            br          x9
+
+colormatrix_float_ldf3:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+.align 6
+colormatrix_float_stu1:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            uqxtn       v24.8b, v24.8h
+            subs        x2, x2, #8
+            st1         {v24.8b}, [x0], #8
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_stf3:
+            movi        v11.16b, #0
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            movi        v19.16b, #0
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_float_stf4:
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf4:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+.align 5
+colormatrix_float_stf2:
+            st2         {v8.4s, v9.4s}, [x0], #32
+            subs        x2, x2, #8
+            st2         {v16.4s, v17.4s}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf2:
+            ld2         {v12.4s,v13.4s}, [x1], #32
+            ld2         {v20.4s,v21.4s}, [x1], #32
+            br          x4
+
+.align 5
+colormatrix_float_stf1:
+            st1         {v8.4s}, [x0], #16
+            subs        x2, x2, #8
+            st1         {v16.4s}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf1:
+            ld1         {v12.4s}, [x1], #16
+            ld1         {v20.4s}, [x1], #16
+            br          x4
+
+
+/* size_t rsdIntrinsicColorMatrix_int_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          int16_t const *mult,    // x4
+ *          int32_t const *add);    // x5
+ */
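+/* The fns table holds six function pointers: the four column routines, then
+ * the store routine, then the load routine; they are read into x4-x9 below.
+ * Pixels are processed in blocks of eight, and on exit x0 holds the number of
+ * pixels left over (fewer than eight), which the caller is expected to deal
+ * with.
+ */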
+ENTRY(rsdIntrinsicColorMatrix_int_K)
+            stp         x8,x9, [sp, #-16]!
+            sub         x7, sp, #32
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]
+            st1         {v12.1d-v15.1d}, [x7]
+
+            ld1         {v0.8h,v1.8h}, [x4], #32
+            ld1         {v4.4s}, [x5], #16
+
+            ldp         x4,x5, [x3],#16
+            ldp         x6,x7, [x3],#16
+            ldp         x8,x9, [x3],#16
+
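+/* Broadcast each 32-bit bias across a vector of halfwords, scaled down by 8
+ * bits with unsigned saturation, presumably to match the fixed-point format
+ * expected by the column routines defined earlier in this file. */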
+            dup         v12.4s, v4.s[0]
+            dup         v13.4s, v4.s[1]
+            dup         v14.4s, v4.s[2]
+            dup         v15.4s, v4.s[3]
+            sqshrun     v8.4h, v12.4s, #8
+            sqshrun2    v8.8h, v12.4s, #8
+            sqshrun     v9.4h, v13.4s, #8
+            sqshrun2    v9.8h, v13.4s, #8
+            sqshrun     v10.4h, v14.4s, #8
+            sqshrun2    v10.8h, v14.4s, #8
+            sqshrun     v11.4h, v15.4s, #8
+            sqshrun2    v11.8h, v15.4s, #8
+
+            subs        x2, x2, #8
+            blo         colormatrix_int_end
+            br          x9
+
+colormatrix_int_end:
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ldp         x8,x9, [sp], #16
+            add         x0, x2, #8
+            ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ *          fntab_t const *fns, // x0
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
+            adr         x4, 2f
+            ldrsh       x2, [x4, x2, LSL #1]
+            add         x2, x2, x4
+            adr         x4, 3f
+            ldrsh       x3, [x4, x3, LSL #1]
+            add         x3, x3, x4
+            stp         x2, x3, [x0, #32]
+
+/* For each column function, if that column of the matrix is all zeroes then
+ * write NULL, otherwise look up the appropriate function and store that. */
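+/* Roughly, in C (illustrative sketch only; column_fn() stands for the
+ * PC-relative lookup in the .hword table at 4f below):
+ *
+ *   for (int col = 0; col < 4; col++, mask >>= 5) {
+ *       fns[col] = (mask & 15) ? column_fn(col, mask & 31) : NULL;
+ *   }
+ */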
+
+            mov         x3, #4
+            adr         x4, 4f
+1:          ands        x2, x1, #15
+            beq         9f
+            and         x2, x1, #31
+            lsl         x2, x2, #3
+            ldrsh       x2, [x4, x2]
+            add         x2, x2, x4
+9:          str         x2, [x0], #8
+            lsr         x1, x1, #5
+            add         x4, x4, #2
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy in the first non-NULL entry that follows it,
+ * falling back to the store function. */
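+/* Equivalently (illustrative sketch; 'store' is the store routine selected
+ * above):
+ *
+ *   void (*next)(void) = store;
+ *   for (int col = 3; col >= 0; col--) {
+ *       if (fns[col]) next = fns[col];
+ *       else          fns[col] = next;
+ *   }
+ */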
+
+            ldr         x2, [x0]
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!
+            cmp         x1, #0
+            csel        x2, x1, x2, ne
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
+            .align 4
+2:          .hword      colormatrix_int_stu1-2b
+            .hword      colormatrix_int_stu2-2b
+            .hword      colormatrix_int_stu3-2b
+            .hword      colormatrix_int_stu4-2b
+3:          .hword      colormatrix_int_ldu1-3b
+            .hword      colormatrix_int_ldu2-3b
+            .hword      colormatrix_int_ldu3-3b
+            .hword      colormatrix_int_ldu4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .hword      colormatrix_int_col0_\i-4b
+            .hword      colormatrix_int_col1_\i-4b-2
+            .hword      colormatrix_int_col2_\i-4b-4
+            .hword      colormatrix_int_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+            .hword      colormatrix_int_col0_n\i-4b
+            .hword      colormatrix_int_col1_n\i-4b-2
+            .hword      colormatrix_int_col2_n\i-4b-4
+            .hword      colormatrix_int_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_int_K)
+
+
+/* size_t rsdIntrinsicColorMatrix_float_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          float const *mult,      // x4
+ *          float const *add);      // x5
+ */
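+/* Table layout and remainder-returning behaviour match
+ * rsdIntrinsicColorMatrix_int_K above. */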
+ENTRY(rsdIntrinsicColorMatrix_float_K)
+            stp         x8,x9, [sp, #-16]!
+            sub         x7, sp, #32
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]
+            st1         {v12.1d-v15.1d}, [x7]
+
+            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
+            ld1r        {v4.4s}, [x5], #4
+            ld1r        {v5.4s}, [x5], #4
+            ld1r        {v6.4s}, [x5], #4
+            ld1r        {v7.4s}, [x5], #4
+
+            ldp         x4,x5, [x3], #16
+            ldp         x6,x7, [x3], #16
+            ldp         x8,x9, [x3], #16
+
+            mov         v8.16b, v4.16b
+            mov         v9.16b, v5.16b
+            mov         v10.16b, v6.16b
+            mov         v11.16b, v7.16b
+
+            mov         v16.16b, v4.16b
+            mov         v17.16b, v5.16b
+            mov         v18.16b, v6.16b
+            mov         v19.16b, v7.16b
+
+            subs        x2, x2, #8
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_end:
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ldp         x8,x9, [sp], #16
+            add         x0, x2, #8
+            ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ *          fntab_t const *fns, // x0
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
+            adr         x4, 2f
+            ldrsh       x2, [x4, x2, LSL #1]
+            add         x2, x2, x4
+            adr         x4, 3f
+            ldrsh       x3, [x4, x3, LSL #1]
+            add         x3, x3, x4
+            stp         x2, x3, [x0, #32]
+
+/* For each column function, if that column of the matrix is all zeroes then
+ * write NULL, otherwise look up the appropriate function and store that. */
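+/* (Populated in the same way as rsdIntrinsicColorMatrixSetup_int_K above.) */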
+
+            mov         x3, #4
+            adr         x4, 4f
+1:          ands        x2, x1, #15
+            beq         9f
+            and         x2, x1, #31
+            lsl         x2, x2, #3
+            ldrsh       x2, [x4, x2]
+            add         x2, x2, x4
+9:          str         x2, [x0], #8
+            lsr         x1, x1, #5
+            add         x4, x4, #2
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy in the first non-NULL entry that follows it,
+ * falling back to the store function. */
+
+            ldr         x2, [x0]
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!
+            cmp         x1, #0
+            csel        x2, x1, x2, ne
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
+            .align 4
+2:          .hword      colormatrix_float_stu1-2b
+            .hword      colormatrix_float_stu2-2b
+            .hword      colormatrix_float_stu3-2b
+            .hword      colormatrix_float_stu4-2b
+            .hword      colormatrix_float_stf1-2b
+            .hword      colormatrix_float_stf2-2b
+            .hword      colormatrix_float_stf3-2b
+            .hword      colormatrix_float_stf4-2b
+3:          .hword      colormatrix_float_ldu1-3b
+            .hword      colormatrix_float_ldu2-3b
+            .hword      colormatrix_float_ldu3-3b
+            .hword      colormatrix_float_ldu4-3b
+            .hword      colormatrix_float_ldf1-3b
+            .hword      colormatrix_float_ldf2-3b
+            .hword      colormatrix_float_ldf3-3b
+            .hword      colormatrix_float_ldf4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .hword      colormatrix_float_col0_\i-4b
+            .hword      colormatrix_float_col1_\i-4b-2
+            .hword      colormatrix_float_col2_\i-4b-4
+            .hword      colormatrix_float_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+            .hword      colormatrix_float_col0_n\i-4b
+            .hword      colormatrix_float_col1_n\i-4b-2
+            .hword      colormatrix_float_col2_n\i-4b-4
+            .hword      colormatrix_float_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_float_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 0092d0e..ee10884 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -285,211 +285,3 @@
         bx          lr
 
 END(rsdIntrinsicConvolve5x5_K)
-
-
-/* 3D LUT */
-
-/*
-        r0 = dst
-        r1 = src
-        r2 = cube base pointer
-        r3 = cube Y stride
-        r4 = cube Z stride
-        r5 = count
-        xr10 = * constants
-
-        d0  / q0  = weight 1 p1
-        d1        = weight 2 p1
-
-        d2  / q1  = weight 1 p2
-        d3        = weight 2 p2
-
-        d4  / q2  = src1
-        d5        = src2
-
-        d6  / q3  = baseCoord
-        d7        = baseCoord
-
-        d8  / q4  = coord1 p1
-        d9        =
-
-        d10 / q5  = coord1 p2
-        d11       =
-
-        d12 / q6  =
-        d13       =
-
-        d14 / q7  =
-        d15       =
-
-
-        d16 / q8  = x0 y0 z0
-        d17       = x1 y0 z0
-        d18 / q9  = x0 y1 z0
-        d19       = x1 y1 z0
-        d20 / q10 = x0 y0 z1
-        d21       = x1 y0 z1
-        d22 / q11 = x0 y1 z1
-        d23       = x1 y1 z1
-
-        d24 / q12 = alpha mash
-        d25       = current pixel alpha
-        d26 / q13 = 4, y stride
-        d27       = z stride, 0
-        d28 / q14 = 0x8000
-        d29       = 0x7fff
-        d30 / q15 = 0, 0, 0, 0xffff
-
-
-        d31 = coordMult
-*/
-
-ENTRY(rsdIntrinsic3DLUT_K)
-        push        {r4-r8, r10, r11, lr}
-        vpush       {q4-q7}
-
-        /* load Z stride in r4 */
-        ldr     r4, [sp, #32 + 64]
-
-        /* Load count */
-        ldr     r5, [sp, #36 + 64]
-
-        vmov.u16 d28, #0x8000
-        vmov.u16 d29, #0x7fff
-        vmov.u32 d24, #0xff000000
-
-        /* load constants using r10 */
-        ldr     r10, [sp, #40 + 64]
-        vld1.32 {d31}, [r10]!
-        vld1.32 {d30}, [r10]!
-
-        mov r6, #4
-        vmov d26, r6, r3
-        mov r6, #0
-        vmov d27, r4, r6
-
-        add r8, r3, r4
-
-
-
-1:
-        vld1.8 {d4}, [r1]!
-        vand.u8 d25, d4, d24
-        vmovl.u8 q2, d4
-
-
-        vmull.u16 q3, d4, d31
-        vshr.u32 q4, q3, #15       // coord1 p1
-        vmovn.u32 d1, q3
-        vand.u16 d1, d29           // weight 2
-        vsub.u16 d0, d28, d1       // weight 1
-        vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
-
-        vmull.u16 q3, d5, d31
-        vshr.u32 q5, q3, #15       // coord1 p2
-        vmovn.u32 d3, q3
-        vand.u16 d3, d29           // weight 2
-        vsub.u16 d2, d28, d3       // weight 1
-        vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
-
-        vpadd.u32 d8, d8, d9
-        vpadd.u32 d9, d10, d11
-        vpadd.u32 d8, d8, d9
-        vmov r6, r7, d8            // base pointers
-
-        add  r6, r6, r2
-        add  r7, r7, r2
-
-        vld1.8 {d16}, [r6]
-        add r11, r6, r3
-        vld1.8 {d18}, [r11]
-        add r11, r6, r4
-        vld1.8 {d20}, [r11]
-        add r11, r6, r8
-        vld1.8 {d22}, [r11]
-
-        vmovl.u8 q8, d16
-        vmovl.u8 q9, d18
-        vmovl.u8 q10, d20
-        vmovl.u8 q11, d22
-
-        vmull.u16 q6, d16, d0[0]
-        vmlal.u16 q6, d17, d1[0]
-        vshrn.u32 d16, q6, #7
-        vmull.u16 q6, d18, d0[0]
-        vmlal.u16 q6, d19, d1[0]
-        vshrn.u32 d18, q6, #7
-        vmull.u16 q6, d20, d0[0]
-        vmlal.u16 q6, d21, d1[0]
-        vshrn.u32 d20, q6, #7
-        vmull.u16 q6, d22, d0[0]
-        vmlal.u16 q6, d23, d1[0]
-        vshrn.u32 d22, q6, #7
-
-        vmull.u16 q6, d16, d0[1]
-        vmlal.u16 q6, d18, d1[1]
-        vshrn.u32 d16, q6, #15
-        vmull.u16 q6, d20, d0[1]
-        vmlal.u16 q6, d22, d1[1]
-        vshrn.u32 d18, q6, #15
-
-        vmull.u16 q6, d16, d0[2]
-        vmlal.u16 q6, d18, d1[2]
-        vshrn.u32 d14, q6, #15
-
-
-        vld1.8 {d16}, [r7]
-        add r11, r7, r3
-        vld1.8 {d18}, [r11]
-        add r11, r7, r4
-        vld1.8 {d20}, [r11]
-        add r11, r7, r8
-        vld1.8 {d22}, [r11]
-        vmovl.u8 q8, d16
-        vmovl.u8 q9, d18
-        vmovl.u8 q10, d20
-        vmovl.u8 q11, d22
-
-        vmull.u16 q6, d16, d2[0]
-        vmlal.u16 q6, d17, d3[0]
-        vshrn.u32 d16, q6, #7
-        vmull.u16 q6, d18, d2[0]
-        vmlal.u16 q6, d19, d3[0]
-        vshrn.u32 d18, q6, #7
-        vmull.u16 q6, d20, d2[0]
-        vmlal.u16 q6, d21, d3[0]
-        vshrn.u32 d20, q6, #7
-        vmull.u16 q6, d22, d2[0]
-        vmlal.u16 q6, d23, d3[0]
-        vshrn.u32 d22, q6, #7
-
-        vmull.u16 q6, d16, d2[1]
-        vmlal.u16 q6, d18, d3[1]
-        vshrn.u32 d16, q6, #15
-        vmull.u16 q6, d20, d2[1]
-        vmlal.u16 q6, d22, d3[1]
-        vshrn.u32 d18, q6, #15
-
-        vmull.u16 q6, d16, d2[2]
-        vmlal.u16 q6, d18, d3[2]
-        vshrn.u32 d15, q6, #15
-
-        vrshrn.u16 d14, q7, #8
-
-        vbic.u8 d14, d14, d24  // mix in alpha
-        vorr.u8 d14, d14, d25
-        vst1.32 {d14}, [r0]!
-
-
-        /* Are we done? */
-        subs r5, r5, #1
-        bne 1b
-
-        /* Yup, bye */
-        vpop            {q4-q7}
-        pop         {r4-r8, r10, r11, lr}
-        bx          lr
-
-END(rsdIntrinsic3DLUT_K)
-
-
diff --git a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
new file mode 100644
index 0000000..597154b
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
+
+            vmov.s32    r6, r7, \src
+
+            add         r6, r6, r3
+            add         r7, r7, r3
+
+            vld1.u8     d16, [r6], r4
+            vld1.u8     d17, [r7], r4
+
+            vld1.u8     d18, [r6], r5
+            vld1.u8     d19, [r7], r5
+
+            vdup.u8     d6, \yr0
+            vdup.u8     d7, \yr1
+            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
+            vshll.u8    q12, d16, #8
+            vshll.u8    q13, d17, #8
+            vmlsl.u8    q12, d16, d6
+            vmlsl.u8    q13, d17, d7
+            vmlal.u8    q12, d18, d6
+            vmlal.u8    q13, d19, d7
+
+            vld1.u8     d18, [r6]
+            vld1.u8     d19, [r7]
+
+            sub         r6, r6, r4
+            sub         r7, r7, r4
+
+            vld1.u8     d16, [r6]
+            vld1.u8     d17, [r7]
+
+            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
+            vshll.u8    q14, d16, #8
+            vshll.u8    q15, d17, #8
+            vmlsl.u8    q14, d16, d6
+            vmlsl.u8    q15, d17, d7
+            vmlal.u8    q14, d18, d6
+            vmlal.u8    q15, d19, d7
+
+            /* Z interpolate, lane 0 q12/q14 -> q10 */
+            vshll.u16   q8, d24, #8
+            vshll.u16   q9, d25, #8
+            vmlsl.u16   q8, d24, \zr0
+            vmlsl.u16   q9, d25, \zr0
+            vmlal.u16   q8, d28, \zr0
+            vmlal.u16   q9, d29, \zr0
+            vrshrn.u32  d20, q8, #8
+            vrshrn.u32  d21, q9, #8
+
+            /* Z interpolate, lane 1 q13/q15 -> q11 */
+            vshll.u16   q8, d26, #8
+            vshll.u16   q9, d27, #8
+            vmlsl.u16   q8, d26, \zr1
+            vmlsl.u16   q9, d27, \zr1
+            vmlal.u16   q8, d30, \zr1
+            vmlal.u16   q9, d31, \zr1
+            vrshrn.u32  d22, q8, #8
+            vrshrn.u32  d23, q9, #8
+
+            /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
+            vshll.u16   q8, d20, #8
+            vshll.u16   q9, d22, #8
+            vmlsl.u16   q8, d20, \xr0
+            vmlsl.u16   q9, d22, \xr1
+            vmlal.u16   q8, d21, \xr0
+            vmlal.u16   q9, d23, \xr1
+            vshrn.u32   d28, q8, #8
+            vshrn.u32   d29, q9, #8
+
+            /* pack lanes 0-1 -> \dst */
+            vqrshrn.u16  \dst, q14, #8
+.endm
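+
+/* Each interpolation step in the macro above is a fixed-point lerp of the
+ * form
+ *
+ *   result = a * (256 - w) + b * w
+ *
+ * built from vshll #8 (a * 256), vmlsl (- a * w) and vmlal (+ b * w); the
+ * accumulated factors of 256 are removed by the narrowing shifts of 8.
+ */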
+
+/* size_t rsdIntrinsic3DLUT_K(
+ *          void *dst,          // r0
+ *          void const *in,     // r1
+ *          size_t count,       // r2
+ *          void const *lut,    // r3
+ *          int32_t pitchy,     // [sp]
+ *          int32_t pitchz,     // [sp+#4]
+ *          int dimx,           // [sp+#8]
+ *          int dimy,           // [sp+#12]
+ *          int dimz);          // [sp+#16]
+ */
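+/* Works on blocks of eight pixels; returns the number of pixels left
+ * unprocessed (fewer than eight) for the caller to finish. */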
+ENTRY(rsdIntrinsic3DLUT_K)
+            push        {r4,r5,r6,r7}
+            ldr         r4, [sp, #16]
+            ldr         r5, [sp, #20]
+            ldr         r6, [sp, #24]
+            ldr         r7, [sp, #28]
+            ldr         r12, [sp, #32]
+            vpush       {d8-d15}
+
+            vmov.u8     d8, #1
+            vmov.u16    d8[0], r6
+            vmov.u16    d8[1], r7
+            vmov.u16    d8[2], r12
+            vmov.s32    d9, r4, r5
+
+            b           2f
+
+            .align 6
+/* r0  = dst
+ * r1  = src
+ * r2  = count
+ * r3  = lut
+ * r4  = pitchy
+ * r5  = pitchz
+ * r6  = offset0
+ * r7  = offset1
+ */
+
+1:          vld4.u8     {d0,d2,d4,d6}, [r1]!
+            vmov        d10, d6
+/* q0,q1,q2,q5 source data
+ * q4 dimensions and pitches
+ * q3 scratch register for scalar access
+ */
+            vmov        q3, q4
+            vmovl.u8    q0, d0
+            vmovl.u8    q1, d2
+            vmovl.u8    q2, d4
+            vmul.u16    q0, q0, d6[0]
+            vmul.u16    q1, q1, d6[1]
+            vmul.u16    q2, q2, d6[2]
+
+/* vrsra.u16 below would be more accurate, but it can result in a case where
+ * the integer part of a coordinate lands on the last element of a dimension
+ * with a fractional part of zero, so we would read from the limit of the
+ * array and from the limit + 1 in order to interpolate, even though the zero
+ * fraction gives the second element no weight.  Strictly the result would be
+ * correct, except for the illegal access problem.
+ */
+            vsra.u16    q0, q0, #8
+            vsra.u16    q1, q1, #8
+            vsra.u16    q2, q2, #8
+
+            vshr.u16    q12, q0, #8
+            vshr.u16    q13, q1, #8
+            vshr.u16    q14, q2, #8
+
+            vbic.u16    q0, #0xff00
+            vmovn.u16   d2, q1
+            vbic.u16    q2, #0xff00
+
+/* q0,d2,q2 fractional offset
+ * q12,q13,q14 integer offset
+ */
+
+            vshll.u16   q6, d24, #2
+            vshll.u16   q7, d25, #2
+            vmovl.u16   q8, d26
+            vmovl.u16   q9, d27
+            vmovl.u16   q10, d28
+            vmovl.u16   q11, d29
+            vmla.s32    q6, q8,  d9[0]
+            vmla.s32    q7, q9,  d9[0]
+            vmla.s32    q6, q10, d9[1]
+            vmla.s32    q7, q11, d9[1]
+
+/* q6,q7 list of table offsets */
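+/* i.e. offset = x * 4 + y * pitchy + z * pitchz; x is scaled by four because
+ * each LUT entry occupies four bytes. */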
+
+        /* lanes 0 and 1 */
+            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
+
+        /* lanes 2 and 3 */
+            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
+
+        /* lanes 4 and 5 */
+            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
+
+        /* lanes 6 and 7 */
+            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
+
+            vuzp.u8     d12, d13
+            vuzp.u8     d14, d15
+            vuzp.u8     d12, d14
+            vuzp.u8     d13, d15
+
+            vmov.u8     d15, d10
+
+            vst4.u8     {d12,d13,d14,d15}, [r0]!
+
+2:          subs        r2, #8
+            bhs         1b
+            add         r0, r2, #8
+            vpop        {d8-d15}
+            pop         {r4,r5,r6,r7}
+            bx lr
+END(rsdIntrinsic3DLUT_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 249ac58..34162ee 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -29,10 +29,10 @@
     vpush           {q4-q7}
     vld1.16 {q2}, [r2]!
     vld1.16 {q3}, [r2]!
-    vld1.32 {q4}, [r2]!
-    vld1.32 {q5}, [r2]!
-    vld1.32 {q6}, [r2]!
-    vld1.32 {q7}, [r2]!
+    vld1.32 {d8[],d9[]}, [r2]!
+    vld1.32 {d10[],d11[]}, [r2]!
+    vld1.32 {d12[],d13[]}, [r2]!
+    vld1.32 {d14[],d15[]}, [r2]!
     veor q0, q0
     veor q1, q1
     veor q9, q9
@@ -43,15 +43,15 @@
 SNIP_START(_N_ColorMatrix_prefix_f)
     stmfd           sp!, {r4, lr}
     vpush           {q4-q7}
-    add r2, #96
+    add r2, #48
     vld1.32 {q4}, [r2]!
     vld1.32 {q5}, [r2]!
     vld1.32 {q6}, [r2]!
     vld1.32 {q7}, [r2]!
-    vld1.32 {q8}, [r2]!
-    vld1.32 {q9}, [r2]!
-    vld1.32 {q10}, [r2]!
-    vld1.32 {q11}, [r2]!
+    vld1.32 {d16[],d17[]}, [r2]!
+    vld1.32 {d18[],d19[]}, [r2]!
+    vld1.32 {d20[],d21[]}, [r2]!
+    vld1.32 {d22[],d23[]}, [r2]!
     veor q1, q1
     veor q2, q2
     veor q3, q3