Add AArch64 assembly for ColorMatrix.

Implement the ColorMatrix kernels in AArch64 advanced SIMD, dispatched
through a function table selected from the coefficient mask and the
input/output formats.  The add-constant arrays (ipa, tmpFpa) shrink
from 16 splatted entries to 4, and the 32-bit NEON prefix loads switch
to replicating single values to match.

Change-Id: I2fcc57aceea08243d5132287f6de053b846c5fe7
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 9cb4847..5699777 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -42,6 +42,7 @@
 #LOCAL_SRC_FILES_arm64 += \
 #    rsCpuIntrinsics_advsimd_Blend.S \
 #    rsCpuIntrinsics_advsimd_Blur.S \
+#    rsCpuIntrinsics_advsimd_ColorMatrix.S \
 #    rsCpuIntrinsics_advsimd_YuvToRGB.S
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 87db9ba..5d4241a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,6 +125,32 @@
     } u;
 } Key_t;
 
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
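+/* Function table used by the AArch64 assembly path.  The assembly consumes
+ * six code pointers: one kernel per matrix column, plus store and load
+ * routines selected for the output and input formats. */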
+typedef struct {
+    void (*column[4])(void);
+    void (*store)(void);
+    void (*load)(void);
+} FunctionTab_t;
+
+extern "C" size_t rsdIntrinsicColorMatrix_int_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             int16_t const *mult, int32_t const *add);
+
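+/* The Setup functions populate the table above from the packed coefficient
+ * mask and the destination/source format codes, so the kernel entry points
+ * can then be invoked without re-deriving the dispatch on every call. */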
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+             FunctionTab_t *fns,
+             uint32_t mask, int dt, int st);
+
+extern "C" size_t rsdIntrinsicColorMatrix_float_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             float const *mult, float const *add);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
+             FunctionTab_t *fns,
+             uint32_t mask, int dt, int st);
+#endif
+
 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
 public:
     virtual void populateScript(Script *);
@@ -146,9 +172,12 @@
     // The following four fields are read as constants
     // by the SIMD assembly code.
     short ip[16];
-    int ipa[16];
+    int ipa[4];
     float tmpFp[16];
-    float tmpFpa[16];
+    float tmpFpa[4];
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+    FunctionTab_t mFnTab;
+#endif
 
     static void kernel(const RsForEachStubParamStruct *p,
                        uint32_t xstart, uint32_t xend,
@@ -212,9 +241,9 @@
             }
         }
         if (ipa[0] != 0) key.u.addMask |= 0x1;
-        if (ipa[4] != 0) key.u.addMask |= 0x2;
-        if (ipa[8] != 0) key.u.addMask |= 0x4;
-        if (ipa[12] != 0) key.u.addMask |= 0x8;
+        if (ipa[1] != 0) key.u.addMask |= 0x2;
+        if (ipa[2] != 0) key.u.addMask |= 0x4;
+        if (ipa[3] != 0) key.u.addMask |= 0x8;
     }
 
  // Look for a dot product where the r,g,b columns are the same
@@ -257,13 +286,16 @@
     case 3:
         key.u.outVecSize = 2;
         key.u.coeffMask &= ~0x8888;
+        key.u.addMask &= 7;
         break;
     case 2:
         key.u.outVecSize = 1;
         key.u.coeffMask &= ~0xCCCC;
+        key.u.addMask &= 3;
         break;
     default:
         key.u.coeffMask &= ~0xEEEE;
+        key.u.addMask &= 1;
         break;
     }
 
@@ -278,7 +310,7 @@
     return key;
 }
 
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
 
 #define DEF_SYM(x)                                  \
     extern "C" uint32_t _N_ColorMatrix_##x;      \
@@ -408,7 +440,7 @@
 
 
 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
     mBufSize = 4096;
     //StopWatch build_time("rs cm: build time");
     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -676,18 +708,12 @@
     float add = 0.f;
     if (fpMul > 254.f) add = 0.5f;
     for(int ct=0; ct < 4; ct++) {
-        tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
+        tmpFpa[ct] = fpa[ct] * addMul + add;
         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
-        tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
-        tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
-        tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
     }
 
     for(int ct=0; ct < 4; ct++) {
-        ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
-        ipa[ct * 4 + 1] = ipa[ct * 4];
-        ipa[ct * 4 + 2] = ipa[ct * 4];
-        ipa[ct * 4 + 3] = ipa[ct * 4];
+        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
     }
 }
 
@@ -768,9 +794,9 @@
     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
 
     sum.x += add[0];
-    sum.y += add[4];
-    sum.z += add[8];
-    sum.w += add[12];
+    sum.y += add[1];
+    sum.z += add[2];
+    sum.w += add[3];
 
 
     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
@@ -826,12 +852,27 @@
     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
 
     if(x2 > x1) {
-        int32_t len = (x2 - x1) >> 2;
-        if((cp->mOptKernel != NULL) && (len > 0)) {
-            cp->mOptKernel(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += outstep * (len << 2);
-            in += instep * (len << 2);
+        int32_t len = x2 - x1;
+        if (gArchUseSIMD) {
+            if((cp->mOptKernel != NULL) && (len >= 4)) {
+                cp->mOptKernel(out, in, cp->ip, len >> 2);
+                x1 += len;
+                out += outstep * len;
+                in += instep * len;
+            }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
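+            // Fallback when no specialised kernel could be built: the
+            // prebuilt AArch64 routines consume pixels in groups of eight
+            // and return how many were left over.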
+            else {
+                size_t done;
+                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
+                    done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                } else {
+                    done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                }
+                x1 += done;
+                out += outstep * done;
+                in += instep * done;
+            }
+#endif
         }
 
         while(x1 != x2) {
@@ -872,8 +913,29 @@
         mOptKernel = NULL;
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
-            mLastKey = key;
         }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+        else {
+            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            uint32_t mm = 0;
+            int i;
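+            // Repack the key in the form the assembly setup expects:
+            // five bits per column, the low four gathered from every
+            // fourth bit of coeffMask and the fifth taken from addMask.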
+            for (i = 0; i < 4; i++)
+            {
+                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
+                m = ((m * 0x249) >> 9) & 15;
+                m |= ((key.u.addMask >> i) & 1) << 4;
+                mm |= m << (i * 5);
+            }
+
+            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
+                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+            } else {
+                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+            }
+        }
+#endif
+        mLastKey = key;
     }
 }
 
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
new file mode 100644
index 0000000..7a6d4c5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -0,0 +1,838 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
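+/* The vmxx macros emit one term of a column dot-product only when the
+ * corresponding bit of the coefficient mask \i is set.  The first term
+ * emitted uses a plain multiply to initialise the accumulator; later terms
+ * multiply-accumulate.  In the _s16 variants the bias bit (16) also forces
+ * accumulation, because the integer path preloads the bias. */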
+.macro vmxx_f32 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fmla            \opd, \opa, \opb
+    .else
+        fmul            \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
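+/* fadd takes .4s arrangements but mov needs .16b, so the final two operands
+ * repeat the destination and bias registers in the .16b form for the
+ * move-only (no coefficients) case. */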
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fadd            \opd, \opa, \opb
+    .else
+        mov             \stupidsyntax1, \stupidsyntax2
+    .endif
+  .endif
+.endm
+
+.macro vmxx_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal           \opd, \opa, \opb
+    .else
+        smull           \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+.macro vmxx2_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal2          \opd, \opa, \opb
+    .else
+        smull2          \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
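+/* Generate 32 specialisations of each column function: masks 0-15 select
+ * which of the four coefficients contribute, and the complemented n-variants
+ * (\i^31, giving masks 16-31) additionally apply the bias. */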
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.align 6
+colormatrix_int_col0_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+colormatrix_int_col0_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+.align 6
+colormatrix_int_col1_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+colormatrix_int_col1_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+.align 6
+colormatrix_int_col2_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+colormatrix_int_col2_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+.align 6
+colormatrix_int_col3_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+colormatrix_int_col3_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+.align 5
+colormatrix_float_col0_\i:
+            vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
+            vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
+            br          x5
+
+.align 4
+colormatrix_float_col0_n\i:
+            vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
+            vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
+            br          x5
+
+.align 5
+colormatrix_float_col1_\i:
+            vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
+            vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
+            br          x6
+
+.align 4
+colormatrix_float_col1_n\i:
+            vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
+            vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
+            br          x6
+
+.align 5
+colormatrix_float_col2_\i:
+            vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
+            vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
+            br          x7
+
+.align 4
+colormatrix_float_col2_n\i:
+            vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
+            vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
+            br          x7
+
+.align 5
+colormatrix_float_col3_\i:
+            vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
+            vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
+            br          x8
+
+.align 4
+colormatrix_float_col3_n\i:
+            vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
+            vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
+            br          x8
+
+.endr
+
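+/* The load routines fetch eight pixels and deinterleave them one channel
+ * per vector; the float path additionally widens each channel into two
+ * banks of four f32 lanes. */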
+.align 6
+colormatrix_float_ldu4:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v23.8h, v23.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl        v15.4s, v23.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            uxtl2       v23.4s, v23.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v15.4s, v15.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            ucvtf       v23.4s, v23.4s
+            br          x4
+
+.align 5
+colormatrix_int_ldu4:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            uxtl        v15.8h, v15.8b
+            br          x4
+
+.align 6
+colormatrix_float_ldu3:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            br          x4
+
+colormatrix_int_ldu3:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            br          x4
+
+.align 5
+colormatrix_float_ldu1:
+            ld1         {v20.8b}, [x1], #8
+            uxtl        v20.8h, v20.8b
+            uxtl        v12.4s, v20.4h
+            uxtl2       v20.4s, v20.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v20.4s, v20.4s
+            br          x4
+
+.align 6
+colormatrix_float_ldu2:
+            ld2         {v20.8b,v21.8b}, [x1], #16
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            br          x4
+
+.align 4
+colormatrix_int_ldu2:
+            ld2         {v12.8b,v13.8b}, [x1], #16
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            br          x4
+
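+/* The store routines narrow eight pixels of results back to the output
+ * format with saturation, store them, then branch to the load routine (x9)
+ * for the next group, or exit to the epilogue when fewer than eight pixels
+ * remain. */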
+.align 6
+colormatrix_float_stu4:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v27.4s, v11.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            fcvtzs      v31.4s, v19.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun    v27.4h, v27.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            sqrshrun2   v27.8h, v31.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            uqxtn       v27.8b, v27.8h
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_int_stu4:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            uqxtn       v15.8b, v11.8h
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+.align 6
+colormatrix_float_stu3:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            movi        v27.8b, #0
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+.align 4
+colormatrix_int_ldu1:
+            ld1         {v12.8b}, [x1], #8
+            uxtl        v12.8h, v12.8b
+            br          x4
+
+.align 5
+colormatrix_int_stu3:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            movi        v15.8b, #0
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+.align 6
+colormatrix_float_stu2:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            subs        x2, x2, #8
+            st2         {v24.8b,v25.8b}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_int_stu2:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            subs        x2, x2, #8
+            st2         {v12.8b,v13.8b}, [x0], #16
+            blo         colormatrix_int_end
+            br          x9
+
+.align 5
+colormatrix_int_stu1:
+            uqxtn       v12.8b, v8.8h
+            subs        x2, x2, #8
+            st1         {v12.8b}, [x0], #8
+            blo         colormatrix_int_end
+            br          x9
+
+colormatrix_float_ldf3:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+.align 6
+colormatrix_float_stu1:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            uqxtn       v24.8b, v24.8h
+            subs        x2, x2, #8
+            st1         {v24.8b}, [x0], #8
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_stf3:
+            movi        v11.16b, #0
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            movi        v19.16b, #0
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+.align 5
+colormatrix_float_stf4:
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf4:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+.align 5
+colormatrix_float_stf2:
+            st2         {v8.4s, v9.4s}, [x0], #32
+            subs        x2, x2, #8
+            st2         {v16.4s, v17.4s}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf2:
+            ld2         {v12.4s,v13.4s}, [x1], #32
+            ld2         {v20.4s,v21.4s}, [x1], #32
+            br          x4
+
+.align 5
+colormatrix_float_stf1:
+            st1         {v8.4s}, [x0], #16
+            subs        x2, x2, #8
+            st1         {v16.4s}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_ldf1:
+            ld1         {v12.4s}, [x1], #16
+            ld1         {v20.4s}, [x1], #16
+            br          x4
+
+
+/* size_t rsdIntrinsicColorMatrix_int_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          int16_t const *mult,    // x4
+ *          int32_t const *add);    // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_int_K)
+            stp         x8,x9, [sp, #-16]!
+            sub         x7, sp, #32
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]
+            st1         {v12.1d-v15.1d}, [x7]
+
+            ld1         {v0.8h,v1.8h}, [x4], #32
+            ld1         {v4.4s}, [x5], #16
+
+            ldp         x4,x5, [x3],#16
+            ldp         x6,x7, [x3],#16
+            ldp         x8,x9, [x3],#16
+
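+/* Preload the result registers with the narrowed bias values so that
+ * columns with no coefficients, whose table entries were redirected during
+ * setup, still produce the correct constant output. */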
+            dup         v12.4s, v4.s[0]
+            dup         v13.4s, v4.s[1]
+            dup         v14.4s, v4.s[2]
+            dup         v15.4s, v4.s[3]
+            sqshrun     v8.4h, v12.4s, #8
+            sqshrun2    v8.8h, v12.4s, #8
+            sqshrun     v9.4h, v13.4s, #8
+            sqshrun2    v9.8h, v13.4s, #8
+            sqshrun     v10.4h, v14.4s, #8
+            sqshrun2    v10.8h, v14.4s, #8
+            sqshrun     v11.4h, v15.4s, #8
+            sqshrun2    v11.8h, v15.4s, #8
+
+            subs        x2, x2, #8
+            blo         colormatrix_int_end
+            br          x9
+
+colormatrix_int_end:
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ldp         x8,x9, [sp], #16
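+            /* Return the number of pixels left unprocessed (0..7). */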
+            add         x0, x2, #8
+            ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ *          fntab_t const *fns, // x0
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
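+/* The mask argument packs five bits per column: four coefficient-presence
+ * bits in the low positions and the bias bit above them, matching the
+ * packing performed on the C++ side. */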
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
+            adr         x4, 2f
+            ldrsh       x2, [x4, x2, LSL #1]
+            add         x2, x2, x4
+            adr         x4, 3f
+            ldrsh       x3, [x4, x3, LSL #1]
+            add         x3, x3, x4
+            stp         x2, x3, [x0, #32]
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+            mov         x3, #4
+            adr         x4, 4f
+1:          ands        x2, x1, #15
+            beq         9f
+            and         x2, x1, #31
+            lsl         x2, x2, #3
+            ldrsh       x2, [x4, x2]
+            add         x2, x2, x4
+9:          str         x2, [x0], #8
+            lsr         x1, x1, #5
+            add         x4, x4, #2
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function, so that all-zero columns are skipped; their output registers
+ * already hold the preloaded bias. */
+
+            ldr         x2, [x0]
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!
+            cmp         x1, #0
+            csel        x2, x1, x2, ne
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
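+/* Relative-offset tables translating the dt/st format codes and the packed
+ * mask into store, load, and column function addresses. */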
+            .align 4
+2:          .hword      colormatrix_int_stu1-2b
+            .hword      colormatrix_int_stu2-2b
+            .hword      colormatrix_int_stu3-2b
+            .hword      colormatrix_int_stu4-2b
+3:          .hword      colormatrix_int_ldu1-3b
+            .hword      colormatrix_int_ldu2-3b
+            .hword      colormatrix_int_ldu3-3b
+            .hword      colormatrix_int_ldu4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .hword      colormatrix_int_col0_\i-4b
+            .hword      colormatrix_int_col1_\i-4b-2
+            .hword      colormatrix_int_col2_\i-4b-4
+            .hword      colormatrix_int_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+            .hword      colormatrix_int_col0_n\i-4b
+            .hword      colormatrix_int_col1_n\i-4b-2
+            .hword      colormatrix_int_col2_n\i-4b-4
+            .hword      colormatrix_int_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_int_K)
+
+
+/* size_t rsdIntrinsicColorMatrix_float_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          float const *mult,      // x4
+ *          float const *add);      // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_float_K)
+            stp         x8,x9, [sp, #-16]!
+            sub         x7, sp, #32
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]
+            st1         {v12.1d-v15.1d}, [x7]
+
+            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
+            ld1r        {v4.4s}, [x5], #4
+            ld1r        {v5.4s}, [x5], #4
+            ld1r        {v6.4s}, [x5], #4
+            ld1r        {v7.4s}, [x5], #4
+
+            ldp         x4,x5, [x3], #16
+            ldp         x6,x7, [x3], #16
+            ldp         x8,x9, [x3], #16
+
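+/* Preload both banks of result registers with the bias so that skipped
+ * (all-zero) columns still output the right constant. */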
+            mov         v8.16b, v4.16b
+            mov         v9.16b, v5.16b
+            mov         v10.16b, v6.16b
+            mov         v11.16b, v7.16b
+
+            mov         v16.16b, v4.16b
+            mov         v17.16b, v5.16b
+            mov         v18.16b, v6.16b
+            mov         v19.16b, v7.16b
+
+            subs        x2, x2, #8
+            blo         colormatrix_float_end
+            br          x9
+
+colormatrix_float_end:
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ldp         x8,x9, [sp], #16
+            add         x0, x2, #8
+            ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ *          fntab_t const *fns, // x0
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
+            adr         x4, 2f
+            ldrsh       x2, [x4, x2, LSL #1]
+            add         x2, x2, x4
+            adr         x4, 3f
+            ldrsh       x3, [x4, x3, LSL #1]
+            add         x3, x3, x4
+            stp         x2, x3, [x0, #32]
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+            mov         x3, #4
+            adr         x4, 4f
+1:          ands        x2, x1, #15
+            beq         9f
+            and         x2, x1, #31
+            lsl         x2, x2, #3
+            ldrsh       x2, [x4, x2]
+            add         x2, x2, x4
+9:          str         x2, [x0], #8
+            lsr         x1, x1, #5
+            add         x4, x4, #2
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+            ldr         x2, [x0]
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!
+            cmp         x1, #0
+            csel        x2, x1, x2, ne
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
+            .align 4
+2:          .hword      colormatrix_float_stu1-2b
+            .hword      colormatrix_float_stu2-2b
+            .hword      colormatrix_float_stu3-2b
+            .hword      colormatrix_float_stu4-2b
+            .hword      colormatrix_float_stf1-2b
+            .hword      colormatrix_float_stf2-2b
+            .hword      colormatrix_float_stf3-2b
+            .hword      colormatrix_float_stf4-2b
+3:          .hword      colormatrix_float_ldu1-3b
+            .hword      colormatrix_float_ldu2-3b
+            .hword      colormatrix_float_ldu3-3b
+            .hword      colormatrix_float_ldu4-3b
+            .hword      colormatrix_float_ldf1-3b
+            .hword      colormatrix_float_ldf2-3b
+            .hword      colormatrix_float_ldf3-3b
+            .hword      colormatrix_float_ldf4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .hword      colormatrix_float_col0_\i-4b
+            .hword      colormatrix_float_col1_\i-4b-2
+            .hword      colormatrix_float_col2_\i-4b-4
+            .hword      colormatrix_float_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+            .hword      colormatrix_float_col0_n\i-4b
+            .hword      colormatrix_float_col1_n\i-4b-2
+            .hword      colormatrix_float_col2_n\i-4b-4
+            .hword      colormatrix_float_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_float_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 249ac58..34162ee 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -29,10 +29,10 @@
     vpush           {q4-q7}
     vld1.16 {q2}, [r2]!
     vld1.16 {q3}, [r2]!
-    vld1.32 {q4}, [r2]!
-    vld1.32 {q5}, [r2]!
-    vld1.32 {q6}, [r2]!
-    vld1.32 {q7}, [r2]!
+    vld1.32 {d8[],d9[]}, [r2]!
+    vld1.32 {d10[],d11[]}, [r2]!
+    vld1.32 {d12[],d13[]}, [r2]!
+    vld1.32 {d14[],d15[]}, [r2]!
     veor q0, q0
     veor q1, q1
     veor q9, q9
@@ -43,15 +43,15 @@
 SNIP_START(_N_ColorMatrix_prefix_f)
     stmfd           sp!, {r4, lr}
     vpush           {q4-q7}
-    add r2, #96
+    add r2, #48
     vld1.32 {q4}, [r2]!
     vld1.32 {q5}, [r2]!
     vld1.32 {q6}, [r2]!
     vld1.32 {q7}, [r2]!
-    vld1.32 {q8}, [r2]!
-    vld1.32 {q9}, [r2]!
-    vld1.32 {q10}, [r2]!
-    vld1.32 {q11}, [r2]!
+    vld1.32 {d16[],d17[]}, [r2]!
+    vld1.32 {d18[],d19[]}, [r2]!
+    vld1.32 {d20[],d21[]}, [r2]!
+    vld1.32 {d22[],d23[]}, [r2]!
     veor q1, q1
     veor q2, q2
     veor q3, q3