Enable 64-bit asm for int->int ColorMatrix

This brings 64-bit performance close to 32-bit for the common
color matrix paths.

bug 17923388

Change-Id: I96c2c28a07864bf71c6c4a0186fb76f4a946316f
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 64ce43f..467cc27 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -126,7 +126,7 @@
 } Key_t;
 
 //Re-enable when intrinsic is fixed
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
 typedef struct {
     void (*column[4])(void);
     void (*store)(void);
@@ -183,7 +183,7 @@
     int ipa[4];
     float tmpFp[16];
     float tmpFpa[4];
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
     FunctionTab_t mFnTab;
 #endif
 
@@ -912,16 +912,20 @@
                 out += outstep * len;
                 in += instep * len;
             }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
             else {
                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
-                    rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    // Disabled: the float kernel currently generates off-by-one errors.
+                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    //x1 += len;
+                    //out += outstep * len;
+                    //in += instep * len;
                 } else {
                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                    x1 += len;
+                    out += outstep * len;
+                    in += instep * len;
                 }
-                x1 += len;
-                out += outstep * len;
-                in += instep * len;
             }
 #endif
         }
@@ -977,7 +981,7 @@
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
         }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
         else {
             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);