Color Matrix improvements.

Change-Id: I4594ea43a0a2b298a9ad66bd5e63d8b829d4f620
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 5becdf1..b3de4f4 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -34,7 +34,8 @@
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
     LOCAL_SRC_FILES+= \
-        rsCpuIntrinsics_neon.S
+        rsCpuIntrinsics_neon.S \
+        rsCpuIntrinsics_neon_ColorMatrix.S
 endif
 
 ifeq ($(ARCH_ARM_HAVE_VFP),true)
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 3fc322c..4a57c89 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -14,9 +14,19 @@
  * limitations under the License.
  */
 
+#include <sys/mman.h>
+#include <unistd.h>
 
 #include "rsCpuIntrinsic.h"
 #include "rsCpuIntrinsicInlines.h"
+#include "linkloader/include/MemChunk.h"
+
+#include <sys/mman.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+//#include <utils/StopWatch.h>
+
 
 using namespace android;
 using namespace android::renderscript;
@@ -34,49 +44,425 @@
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
+    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+
 protected:
     float fp[16];
+    float fpa[4];
     short ip[16];
+    int ipa[4];
 
-    static void kernel4x4(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernel3x3(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelDot(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+
+    int64_t mLastKey;
+    unsigned char *mBuf;
+    size_t mBufSize;
+
+    int64_t computeKey(const Element *ein, const Element *eout);
+
+    bool build(int64_t key);
+
+    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
+
 };
 
 }
 }
 
 
-void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
-                                                    size_t dataLength) {
-    rsAssert(slot == 0);
-    memcpy (fp, data, dataLength);
-    for(int ct=0; ct < 16; ct++) {
-        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+#define CM_IN_VEC_SIZE_MASK         0x00000003
+#define CM_OUT_VEC_SIZE_MASK        0x0000000c
+#define CM_IN_TYPE_SIZE_MASK        0x000000F0
+#define CM_OUT_TYPE_SIZE_MASK       0x00000F00
+#define CM_DOT_MASK                 0x00001000
+#define CM_ADD_MASK                 0x00002000
+#define CM_COPY_ALPHA               0x00004000
+#define CM_MATRIX_MASK              0xFFFF0000
+
+// Build the bit-packed key (laid out per the CM_* masks) that selects the kernel to generate.
+int64_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
+        const Element *ein, const Element *eout) {
+
+    int64_t key = 0;
+
+    // Compute a unique code key for this operation
+
+    // Add to the key the input and output types
+    key |= (ein->getVectorSize() - 1);
+    key |= (eout->getVectorSize() - 1) << 2;
+
+    bool hasFloat = false;
+    if (ein->getType() == RS_TYPE_FLOAT_32) {
+        hasFloat = true;
+        key |= 1 << 4;
+    }
+    if (eout->getType() == RS_TYPE_FLOAT_32) {
+        hasFloat = true;
+        key |= 1 << 8;
     }
 
-    mRootPtr = &kernel4x4;
-    if ((ip[3] == 0) && (ip[7] == 0) && (ip[11] == 0) &&
-        (ip[12] == 0) && (ip[13] == 0) && (ip[14] == 0) && (ip[15] == 255)) {
-        mRootPtr = &kernel3x3;
-
-        if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
-            (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
-            (ip[8] == ip[9]) && (ip[8] == ip[10])) {
-            mRootPtr = &kernelDot;
+    // Mask in the bits indicating which coefficients in the
+    // color matrix are needed (coefficient i -> bit i + 16; shift is unsigned to reach bit 31).
+    if (hasFloat) {
+        for (uint32_t i=0; i < 16; i++) {
+            if (fp[i] != 0.f) {
+                key |= ((uint32_t)1 << (i + 16));
+            }
+        }
+    } else {
+        for (uint32_t i=0; i < 16; i++) {
+            if (ip[i] != 0) {
+                key |= ((uint32_t)1 << (i + 16));
+            }
         }
     }
+
+    // Look for a dot product where the r,g,b columns are the same
+    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
+        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
+        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
+        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
+
+        key |= CM_DOT_MASK;
+    }
+
+    // Alpha is a simple copy when coefficients 3, 7 and 11 are zero and ip[15] is 256 (1.0)
+    if (!(key & 0x08880000) && (ip[15] == 256)) {
+        key |= CM_COPY_ALPHA;
+    }
+
+    //ALOGE("build key %08x, %08x", (int32_t)(key >> 32), (int32_t)key);
+    return key;
 }
 
-extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
+#if defined(ARCH_ARM_HAVE_NEON)
+
+#define DEF_SYM(x)                                  \
+    extern "C" uint32_t _N_ColorMatrix_##x;      \
+    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
+    extern "C" uint32_t _N_ColorMatrix_##x##_len;
+
+DEF_SYM(prefix)
+DEF_SYM(postfix1)
+DEF_SYM(postfix2)
+DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_2)
+DEF_SYM(load_u8_1)
+DEF_SYM(store_u8_4)
+DEF_SYM(store_u8_2)
+DEF_SYM(store_u8_1)
+DEF_SYM(unpack_u8_4)
+DEF_SYM(unpack_u8_3)
+DEF_SYM(unpack_u8_2)
+DEF_SYM(unpack_u8_1)
+DEF_SYM(pack_u8_4)
+DEF_SYM(pack_u8_3)
+DEF_SYM(pack_u8_2)
+DEF_SYM(pack_u8_1)
+DEF_SYM(dot)
+DEF_SYM(add_0_u8)
+DEF_SYM(add_1_u8)
+DEF_SYM(add_2_u8)
+DEF_SYM(add_3_u8)
+
+#define ADD_CHUNK(x) \
+    memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
+    buf += _N_ColorMatrix_##x##_len
+
+
+static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
+    // Offset to target in 4-byte words, measured from 8 bytes past the branch itself.
+    const size_t wordOffset = (target - buf - 8) >> 2;
+    const size_t topByte = wordOffset & 0xff000000;
+    rsAssert((topByte == 0) || (topByte == 0xff000000));
+
+    // condition in bits 31:28, 0xa (branch) in bits 27:24, low 24 bits of the offset below.
+    const uint32_t insn = (condition << 28) | (0xa << 24) | (0xffffff & wordOffset);
+    *(uint32_t *)buf = insn;
+    return buf + 4;
+}
+
+static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm, bool q, bool sz) {
+    // Register numbers are 5 bits wide: the low 4 bits and bit 4 land in separate fields.
+    rsAssert(vd < 32);
+    rsAssert(vm < 32);
+    rsAssert(vn < 32);
+    const uint32_t dBits = ((vd & 0xf) << 12) | (((vd >> 4) & 1) << 22);
+    const uint32_t mBits = (vm & 0xf) | (((vm >> 4) & 1) << 5);
+    const uint32_t nBits = ((vn & 0xf) << 16) | (((vn >> 4) & 1) << 7);
+    const uint32_t qBit = q ? (1 << 6) : 0;
+    const uint32_t szBit = sz ? (1 << 8) : 0;
+    return dBits | mBits | nBits | qBit | szBit;
+}
+
+static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
+    // Emits: vmlal.s16 Q<dest_q>, D<src_d1>, D<src_d2>[<src_d2_s>]
+    const uint32_t scalarReg = src_d2 | (src_d2_s << 3);  // scalar index sits above the register bits
+    uint32_t *slot = (uint32_t *)buf;
+    *slot = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, scalarReg, false, false);
+    return buf + 4;
+}
+
+static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
+    // Emits: vmull.s16 Q<dest_q>, D<src_d1>, D<src_d2>[<src_d2_s>]
+    const uint32_t regFields = encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3), false, false);
+    *(uint32_t *)buf = 0xf2900A40 | regFields;
+    return buf + 4;
+}
+#endif
+
+// Assemble a NEON kernel specialised for |key| into a fresh mapping (mBuf); false on failure.
+bool RsdCpuScriptIntrinsicColorMatrix::build(int64_t key) {
+#if defined(ARCH_ARM_HAVE_NEON)
+    mBufSize = 4096;
+    void *mem = mmap(0, mBufSize, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (mem == MAP_FAILED) {  // mmap() signals failure with MAP_FAILED, never NULL
+        mBuf = NULL;          // keep the "if (mBuf) munmap(...)" guards safe
+        return false;
+    }
+    mBuf = (uint8_t *)mem;
+    uint8_t *buf = mBuf;
+
+    // Add the function prefix
+    // Store the address for the loop return
+    ADD_CHUNK(prefix);
+    uint8_t *buf2 = buf;
+
+    // Load the incoming r,g,b,a as needed
+    switch(key & CM_IN_VEC_SIZE_MASK) {
+    case 3:
+        ADD_CHUNK(load_u8_4);
+        if (key & CM_COPY_ALPHA) {
+            ADD_CHUNK(unpack_u8_3);
+        } else {
+            ADD_CHUNK(unpack_u8_4);
+        }
+        break;
+    case 2:
+        ADD_CHUNK(load_u8_4);
+        ADD_CHUNK(unpack_u8_3);
+        break;
+    case 1:
+        ADD_CHUNK(load_u8_2);
+        ADD_CHUNK(unpack_u8_2);
+        break;
+    case 0:
+        ADD_CHUNK(load_u8_1);
+        ADD_CHUNK(unpack_u8_1);
+        break;
+    }
+
+    // Add multiply and accumulate
+    // use MULL to init the output register,
+    // use MLAL from there
+    bool linit[4] = {false, false, false, false};
+    if (key & (1 << 16)) {
+        buf = addVMULL_S16(buf, 8, 24, 4, 0);
+        linit[0] = true;
+    }
+    if (!(key & CM_DOT_MASK)) {
+        if (key & (1 << 17)) {
+            buf = addVMULL_S16(buf, 9, 24, 4, 1);
+            linit[1] = true;
+        }
+        if (key & (1 << 18)) {
+            buf = addVMULL_S16(buf, 10, 24, 4, 2);
+            linit[2] = true;
+        }
+    }
+    if (key & (1 << 19)) {
+        buf = addVMULL_S16(buf, 11, 24, 4, 3);
+        linit[3] = true;
+    }
+
+    if (key & (1 << 20)) {
+        if (linit[0]) {
+            buf = addVMLAL_S16(buf, 8, 26, 5, 0);
+        } else {
+            buf = addVMULL_S16(buf, 8, 26, 5, 0);
+            linit[0] = true;
+        }
+    }
+    if (!(key & CM_DOT_MASK)) {
+        if (key & (1 << 21)) {
+            if (linit[1]) {
+                buf = addVMLAL_S16(buf, 9, 26, 5, 1);
+            } else {
+                buf = addVMULL_S16(buf, 9, 26, 5, 1);
+                linit[1] = true;
+            }
+        }
+        if (key & (1 << 22)) {
+            if (linit[2]) {
+                buf = addVMLAL_S16(buf, 10, 26, 5, 2);
+            } else {
+                buf = addVMULL_S16(buf, 10, 26, 5, 2);
+                linit[2] = true;
+            }
+        }
+    }
+    if (key & (1 << 23)) {
+        if (linit[3]) {
+            buf = addVMLAL_S16(buf, 11, 26, 5, 3);
+        } else {
+            buf = addVMULL_S16(buf, 11, 26, 5, 3);
+            linit[3] = true;
+        }
+    }
+
+    if (key & (1 << 24)) {
+        if (linit[0]) {
+            buf = addVMLAL_S16(buf, 8, 28, 6, 0);
+        } else {
+            buf = addVMULL_S16(buf, 8, 28, 6, 0);
+            linit[0] = true;
+        }
+    }
+    if (!(key & CM_DOT_MASK)) {
+        if (key & (1 << 25)) {
+            if (linit[1]) {
+                buf = addVMLAL_S16(buf, 9, 28, 6, 1);
+            } else {
+                buf = addVMULL_S16(buf, 9, 28, 6, 1);
+                linit[1] = true;
+            }
+        }
+        if (key & (1 << 26)) {
+            if (linit[2]) {
+                buf = addVMLAL_S16(buf, 10, 28, 6, 2);
+            } else {
+                buf = addVMULL_S16(buf, 10, 28, 6, 2);
+                linit[2] = true;
+            }
+        }
+    }
+    if (key & (1 << 27)) {
+        if (linit[3]) {
+            buf = addVMLAL_S16(buf, 11, 28, 6, 3);
+        } else {
+            buf = addVMULL_S16(buf, 11, 28, 6, 3);
+            linit[3] = true;
+        }
+    }
+
+    if (key & (1 << 28)) {
+        if (linit[0]) {
+            buf = addVMLAL_S16(buf, 8, 30, 7, 0);
+        } else {
+            buf = addVMULL_S16(buf, 8, 30, 7, 0);
+            linit[0] = true;
+        }
+    }
+    if (!(key & CM_DOT_MASK)) {
+        if (key & (1 << 29)) {
+            if (linit[1]) {
+                buf = addVMLAL_S16(buf, 9, 30, 7, 1);
+            } else {
+                buf = addVMULL_S16(buf, 9, 30, 7, 1);
+                linit[1] = true;
+            }
+        }
+        if (key & (1 << 30)) {
+            if (linit[2]) {
+                buf = addVMLAL_S16(buf, 10, 30, 7, 2);
+            } else {
+                buf = addVMULL_S16(buf, 10, 30, 7, 2);
+                linit[2] = true;
+            }
+        }
+    }
+    if (!(key & CM_COPY_ALPHA)) {
+        if (key & (1u << 31)) {
+            if (linit[3]) {
+                buf = addVMLAL_S16(buf, 11, 30, 7, 3);
+            } else {
+                buf = addVMULL_S16(buf, 11, 30, 7, 3);
+                linit[3] = true;
+            }
+        }
+    }
+
+    // If we have a dot product, perform the special pack.
+    if (key & CM_DOT_MASK) {
+        ADD_CHUNK(pack_u8_1);
+        ADD_CHUNK(dot);
+    } else {
+        switch(key & CM_IN_VEC_SIZE_MASK) {
+        case 3:
+            if (!(key & CM_COPY_ALPHA)) { ADD_CHUNK(pack_u8_4); break; }
+            // Copied alpha must stay untouched in d3: fall through to the 3-channel pack.
+        case 2:
+            ADD_CHUNK(pack_u8_3);
+            break;
+        case 1:
+            ADD_CHUNK(pack_u8_2);
+            break;
+        case 0:
+            ADD_CHUNK(pack_u8_1);
+            break;
+        }
+    }
+
+    // Write out result
+    switch(key & CM_IN_VEC_SIZE_MASK) {
+    case 3:
+    case 2:
+        ADD_CHUNK(store_u8_4);
+        break;
+    case 1:
+        ADD_CHUNK(store_u8_2);
+        break;
+    case 0:
+        ADD_CHUNK(store_u8_1);
+        break;
+    }
+
+    // Loop, branch, and cleanup
+    ADD_CHUNK(postfix1);
+    buf = addBranch(buf, buf2, 0x01);
+    ADD_CHUNK(postfix2);
+
+    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
+    if (ret == -1) {
+        ALOGE("mprotect error %i", ret);
+        return false;
+    }
+
+    cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
+    return true;
+#else
+    return false;
+#endif
+}
+
+void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    // Slot 0 is the 4x4 matrix, slot 1 the add vector; copies are clamped to the member size.
+    switch(slot) {
+    case 0:
+        memcpy(fp, data, dataLength < sizeof(fp) ? dataLength : sizeof(fp));
+        for(int ct=0; ct < 16; ct++) {
+            ip[ct] = (short)(fp[ct] * 256.f + 0.5f);  // integer copy, scaled by 256
+        }
+        break;
+    case 1:
+        memcpy(fpa, data, dataLength < sizeof(fpa) ? dataLength : sizeof(fpa));
+        for(int ct=0; ct < 4; ct++) {
+            ipa[ct] = (int)(fpa[ct] * 256.f + 0.5f);
+        }
+        break;
+    default:
+        rsAssert(0);
+        break;
+    }
+    mRootPtr = &kernel;
+}
+
 
 static void One(const RsForEachStubParamStruct *p, uchar4 *out,
                 const uchar4 *py, const float* coeff) {
@@ -108,9 +494,9 @@
     *out = convert_uchar4(sum);
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel4x4(const RsForEachStubParamStruct *p,
-                                                 uint32_t xstart, uint32_t xend,
-                                                 uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
+                                              uint32_t xstart, uint32_t xend,
+                                              uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
     uchar4 *out = (uchar4 *)p->out;
     uchar4 *in = (uchar4 *)p->in;
@@ -118,15 +504,13 @@
     uint32_t x2 = xend;
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
         int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+        if((cp->mOptKernel != NULL) && (len > 0)) {
+            cp->mOptKernel(out, in, cp->ip, len);
             x1 += len << 2;
             out += len << 2;
             in += len << 2;
         }
-#endif
 
         while(x1 != x2) {
             One(p, out++, in++, cp->fp);
@@ -135,79 +519,57 @@
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel3x3(const RsForEachStubParamStruct *p,
-                                                 uint32_t xstart, uint32_t xend,
-                                                 uint32_t instep, uint32_t outstep) {
-    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
+        uint32_t slot, const Allocation * ain, Allocation * aout,
+        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
 
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
-        }
-#endif
-
-        while(x1 != x2) {
-            One(p, out++, in++, cp->fp);
-            x1++;
+    int64_t key = computeKey(ain->mHal.state.type->getElement(),
+                             aout->mHal.state.type->getElement());
+    if ((mOptKernel == NULL) || (mLastKey != key)) {
+        if (mBuf) munmap(mBuf, mBufSize);
+        mBuf = NULL;
+        mOptKernel = NULL;
+        if (build(key)) {
+            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
+            mLastKey = key;
         }
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernelDot(const RsForEachStubParamStruct *p,
-                                                 uint32_t xstart, uint32_t xend,
-                                                 uint32_t instep, uint32_t outstep) {
-    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
+void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
+        uint32_t slot, const Allocation * ain, Allocation * aout,
+        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
 
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
-        }
-#endif
 
-        while(x1 != x2) {
-            One(p, out++, in++, cp->fp);
-            x1++;
-        }
-    }
 }
 
-
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
 
+    mLastKey = 0;
+    mBuf = NULL;
+    mBufSize = 0;
+    mOptKernel = NULL;
     const static float defaultMatrix[] = {
         1.f, 0.f, 0.f, 0.f,
         0.f, 1.f, 0.f, 0.f,
         0.f, 0.f, 1.f, 0.f,
         0.f, 0.f, 0.f, 1.f
     };
+    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
+    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
 }
 
 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
+    if (mBuf) munmap(mBuf, mBufSize);
+    mBuf = NULL;
+    mOptKernel = NULL;
 }
 
 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
-    s->mHal.info.exportedVariableCount = 1;
+    s->mHal.info.exportedVariableCount = 2;
 }
 
 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index c8dc9bf..52fd565 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -104,173 +104,6 @@
         bx              lr
 END(rsdIntrinsicConvolve3x3_K)
 
-/*
-        r0 = dst
-        r1 = src
-        r2 = matrix
-        r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrix4x4_K)
-        stmfd           sp!, {r4, lr}
-        vpush           {q4-q7}
-
-        vld1.16 {q2}, [r2]!
-        vld1.16 {q3}, [r2]!
-
-1:
-        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
-        vmovl.u8 q12, d0  /* R */
-        vmovl.u8 q13, d1  /* G */
-        vmovl.u8 q14, d2  /* B */
-        vmovl.u8 q15, d3  /* A */
-
-        vmull.s16 q8,  d24, d4[0]
-        vmull.s16 q9,  d24, d4[1]
-        vmull.s16 q10, d24, d4[2]
-        vmull.s16 q11, d24, d4[3]
-
-        vmlal.s16 q8,  d26, d5[0]
-        vmlal.s16 q9,  d26, d5[1]
-        vmlal.s16 q10, d26, d5[2]
-        vmlal.s16 q11, d26, d5[3]
-
-        vmlal.s16 q8,  d28, d6[0]
-        vmlal.s16 q9,  d28, d6[1]
-        vmlal.s16 q10, d28, d6[2]
-        vmlal.s16 q11, d28, d6[3]
-
-        vmlal.s16 q8,  d30, d7[0]
-        vmlal.s16 q9,  d30, d7[1]
-        vmlal.s16 q10, d30, d7[2]
-        vmlal.s16 q11, d30, d7[3]
-
-        vshrn.i32 d24, q8, #8
-        vshrn.i32 d26, q9, #8
-        vshrn.i32 d28, q10, #8
-        vshrn.i32 d30, q11, #8
-
-        vqmovun.s16 d0, q12
-        vqmovun.s16 d1, q13
-        vqmovun.s16 d2, q14
-        vqmovun.s16 d3, q15
-
-        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
-        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
-        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
-        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
-        subs r3, r3, #1
-        bne 1b
-
-        vpop            {q4-q7}
-        ldmfd           sp!, {r4, lr}
-        bx              lr
-END(rsdIntrinsicColorMatrix4x4_K)
-
-/*
-        r0 = dst
-        r1 = src
-        r2 = matrix
-        r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrix3x3_K)
-        stmfd           sp!, {r4, lr}
-        vpush           {q4-q7}
-
-        vld1.16 {q2}, [r2]!
-        vld1.16 {q3}, [r2]!
-
-1:
-        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
-        vmovl.u8 q12, d0
-        vmovl.u8 q13, d1
-        vmovl.u8 q14, d2
-
-        vmull.s16 q8,  d24, d4[0]
-        vmull.s16 q9,  d24, d4[1]
-        vmull.s16 q10, d24, d4[2]
-
-        vmlal.s16 q8,  d26, d5[0]
-        vmlal.s16 q9,  d26, d5[1]
-        vmlal.s16 q10, d26, d5[2]
-
-        vmlal.s16 q8,  d28, d6[0]
-        vmlal.s16 q9,  d28, d6[1]
-        vmlal.s16 q10, d28, d6[2]
-
-        vshrn.i32 d24, q8, #8
-        vshrn.i32 d26, q9, #8
-        vshrn.i32 d28, q10, #8
-
-        vqmovun.s16 d0, q12
-        vqmovun.s16 d1, q13
-        vqmovun.s16 d2, q14
-
-        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
-        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
-        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
-        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
-        subs r3, r3, #1
-        bne 1b
-
-        vpop            {q4-q7}
-        ldmfd           sp!, {r4, lr}
-        bx              lr
-END(rsdIntrinsicColorMatrix3x3_K)
-
-/*
-        r0 = dst
-        r1 = src
-        r2 = matrix
-        r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrixDot_K)
-        stmfd           sp!, {r4, lr}
-        vpush           {q4-q7}
-
-        vld1.16 {q2}, [r2]!
-        vld1.16 {q3}, [r2]!
-
-1:
-        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
-        vmovl.u8 q12, d0
-        vmovl.u8 q13, d1
-        vmovl.u8 q14, d2
-
-        vmull.s16 q8,  d24, d4[0]
-        vmlal.s16 q8,  d26, d5[0]
-        vmlal.s16 q8,  d28, d6[0]
-        vshrn.i32 d24, q8, #8
-        vqmovun.s16 d0, q12
-        vmov.u8 d1, d0
-        vmov.u8 d2, d0
-
-        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
-        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
-        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
-        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
-        subs r3, r3, #1
-        bne 1b
-
-        vpop            {q4-q7}
-        ldmfd           sp!, {r4, lr}
-        bx              lr
-END(rsdIntrinsicColorMatrixDot_K)
-
 
 /*
 static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
new file mode 100644
index 0000000..ce8c033
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define SNIP_START(x) \
+	.globl x; x:
+
+#define SNIP_END(x) \
+	.globl x##_end; x##_end: \
+	.globl x##_len; x##_len: \
+    .word x##_end-x
+
+SNIP_START(_N_ColorMatrix_prefix)
+    stmfd           sp!, {r4, lr}
+    vpush           {q4-q7}
+    vld1.16 {q2}, [r2]!
+    vld1.16 {q3}, [r2]!
+    vld1.16 {d8}, [r2]!
+SNIP_END(_N_ColorMatrix_prefix)
+
+SNIP_START(_N_ColorMatrix_postfix1)
+    subs r3, r3, #1
+    #bne 1b
+SNIP_END(_N_ColorMatrix_postfix1)
+
+SNIP_START(_N_ColorMatrix_postfix2)
+
+    #mov r0, #0
+    #ldr r0, [r0]
+
+    vpop            {q4-q7}
+    ldmfd           sp!, {r4, lr}
+    bx              lr
+SNIP_END(_N_ColorMatrix_postfix2)
+
+SNIP_START(_N_ColorMatrix_load_u8_4)
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_4)
+
+SNIP_START(_N_ColorMatrix_load_u8_2)
+    vld2.8 {d0[0],d1[0]}, [r1]!
+    vld2.8 {d0[1],d1[1]}, [r1]!
+    vld2.8 {d0[2],d1[2]}, [r1]!
+    vld2.8 {d0[3],d1[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_2)
+
+SNIP_START(_N_ColorMatrix_load_u8_1)
+    vld1.32 {d0}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_1)
+
+SNIP_START(_N_ColorMatrix_store_u8_4)
+    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_4)
+
+SNIP_START(_N_ColorMatrix_store_u8_2)
+    vst2.8 {d0[0],d1[0]}, [r0]!
+    vst2.8 {d0[1],d1[1]}, [r0]!
+    vst2.8 {d0[2],d1[2]}, [r0]!
+    vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_2)
+
+SNIP_START(_N_ColorMatrix_store_u8_1)
+    vst1.32 {d0}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_1)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_4)
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+    vmovl.u8 q14, d2  /* B */
+    vmovl.u8 q15, d3  /* A */
+SNIP_END(_N_ColorMatrix_unpack_u8_4)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_3)
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+    vmovl.u8 q14, d2  /* B */
+SNIP_END(_N_ColorMatrix_unpack_u8_3)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_2)
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+SNIP_END(_N_ColorMatrix_unpack_u8_2)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_1)
+    vmovl.u8 q12, d0  /* R */
+SNIP_END(_N_ColorMatrix_unpack_u8_1)
+
+SNIP_START(_N_ColorMatrix_pack_u8_4)
+    vshrn.i32 d24, q8, #8
+    vshrn.i32 d26, q9, #8
+    vshrn.i32 d28, q10, #8
+    vshrn.i32 d30, q11, #8
+    vqmovun.s16 d0, q12
+    vqmovun.s16 d1, q13
+    vqmovun.s16 d2, q14
+    vqmovun.s16 d3, q15
+SNIP_END(_N_ColorMatrix_pack_u8_4)
+
+SNIP_START(_N_ColorMatrix_pack_u8_3)
+    vshrn.i32 d24, q8, #8
+    vshrn.i32 d26, q9, #8
+    vshrn.i32 d28, q10, #8
+    vqmovun.s16 d0, q12
+    vqmovun.s16 d1, q13
+    vqmovun.s16 d2, q14
+SNIP_END(_N_ColorMatrix_pack_u8_3)
+
+SNIP_START(_N_ColorMatrix_pack_u8_2)
+    vshrn.i32 d24, q8, #8
+    vshrn.i32 d26, q9, #8
+    vqmovun.s16 d0, q12
+    vqmovun.s16 d1, q13
+SNIP_END(_N_ColorMatrix_pack_u8_2)
+
+SNIP_START(_N_ColorMatrix_pack_u8_1)
+    vshrn.i32 d24, q8, #8
+    vqmovun.s16 d0, q12
+SNIP_END(_N_ColorMatrix_pack_u8_1)
+
+SNIP_START(_N_ColorMatrix_dot)
+    vmov.u8 d1, d0
+    vmov.u8 d2, d0
+SNIP_END(_N_ColorMatrix_dot)
+