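Fix ColorMatrix performance and correctness issues.

- Add dedicated 3-component NEON snippets (load_u8_3, load_u8f_3,
  load_f32_3, store_f32_3) instead of reusing the 4-component ones.
- Zero the unused channel registers after 1-, 2- and 3-component loads.
- Load only 4 bytes in load_u8f_1 (was a full 8-byte d0 load).
- Round instead of truncating when converting to u8 (vrshrn in
  pack_u8_*, +0.5 in the C reference path and in the cached add vector).
- Scale the add vector separately from the matrix in updateCoeffCache.
- Take the copy-alpha shortcut only when neither input nor output is
  float.
- Always enable the add step for float-input, u8-output keys.
- Drop a stray vst4.8 that preceded the vst2.8 stores in a u8 store
  snippet.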

bug 10427746

Change-Id: Ie42753a551badf4de3144f16fa0e407eca5b9c74
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index e09c08a..d89d5d7 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -152,7 +152,7 @@
     static void kernel(const RsForEachStubParamStruct *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
-    void updateCoeffCache(float fpMul);
+    void updateCoeffCache(float fpMul, float addMul);
 
     Key_t mLastKey;
     unsigned char *mBuf;
@@ -227,7 +227,7 @@
 
     // Is alpha a simple copy
     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
-        key.u.copyAlpha = 1;
+        key.u.copyAlpha = !(key.u.inType || key.u.outType);
     }
 
     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
@@ -266,6 +266,13 @@
         break;
     }
 
+    if (key.u.inType && !key.u.outType) {
+        key.u.addMask |= 1;
+        if (key.u.outVecSize > 0) key.u.addMask |= 2;
+        if (key.u.outVecSize > 1) key.u.addMask |= 4;
+        if (key.u.outVecSize > 2) key.u.addMask |= 8;
+    }
+
     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
     return key;
 }
@@ -283,12 +290,15 @@
 DEF_SYM(postfix2)
 
 DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_3)
 DEF_SYM(load_u8_2)
 DEF_SYM(load_u8_1)
 DEF_SYM(load_u8f_4)
+DEF_SYM(load_u8f_3)
 DEF_SYM(load_u8f_2)
 DEF_SYM(load_u8f_1)
 DEF_SYM(load_f32_4)
+DEF_SYM(load_f32_3)
 DEF_SYM(load_f32_2)
 DEF_SYM(load_f32_1)
 
@@ -296,6 +306,7 @@
 DEF_SYM(store_u8_2)
 DEF_SYM(store_u8_1)
 DEF_SYM(store_f32_4)
+DEF_SYM(store_f32_3)
 DEF_SYM(store_f32_2)
 DEF_SYM(store_f32_1)
 DEF_SYM(store_f32u_4)
@@ -434,14 +445,9 @@
             }
         }
     }
-    for (int i=0; i < 4; i++) {
-        if (key.u.addMask & (1 << i)) {
-            ops[4][i] = 0x2 | opInit[i];
-            opInit[i] = 1;
-        }
-    }
 
     if (key.u.inType || key.u.outType) {
+        key.u.copyAlpha = 0;
         ADD_CHUNK(prefix_f);
         buf2 = buf;
 
@@ -449,9 +455,11 @@
         if (key.u.inType) {
             switch(key.u.inVecSize) {
             case 3:
-            case 2:
                 ADD_CHUNK(load_f32_4);
                 break;
+            case 2:
+                ADD_CHUNK(load_f32_3);
+                break;
             case 1:
                 ADD_CHUNK(load_f32_2);
                 break;
@@ -462,9 +470,11 @@
         } else {
             switch(key.u.inVecSize) {
             case 3:
-            case 2:
                 ADD_CHUNK(load_u8f_4);
                 break;
+            case 2:
+                ADD_CHUNK(load_u8f_3);
+                break;
             case 1:
                 ADD_CHUNK(load_u8f_2);
                 break;
@@ -505,9 +515,11 @@
         if (key.u.outType) {
             switch(key.u.outVecSize) {
             case 3:
-            case 2:
                 ADD_CHUNK(store_f32_4);
                 break;
+            case 2:
+                ADD_CHUNK(store_f32_3);
+                break;
             case 1:
                 ADD_CHUNK(store_f32_2);
                 break;
@@ -548,7 +560,7 @@
             }
             break;
         case 2:
-            ADD_CHUNK(load_u8_4);
+            ADD_CHUNK(load_u8_3);
             ADD_CHUNK(unpack_u8_3);
             break;
         case 1:
@@ -647,15 +659,18 @@
 #endif
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul) {
+void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float adMul) {
     for(int ct=0; ct < 16; ct++) {
-        //ALOGE("mat %i %f", ct, fp[ct]);
         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
         tmpFp[ct] = fp[ct] * fpMul;
+        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
     }
 
+    float ad = 0.f;
+    if (fpMul > 254.f) ad = 0.5f;
     for(int ct=0; ct < 4; ct++) {
-        tmpFpa[ct * 4 + 0] = fpa[ct] * fpMul;
+        tmpFpa[ct * 4 + 0] = fpa[ct] * adMul + ad;
+        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
         tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
         tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
         tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
@@ -692,8 +707,11 @@
     if (fin) {
         switch(vsin) {
         case 3:
+            f = ((const float4 *)py)[0];
+            break;
         case 2:
             f = ((const float4 *)py)[0];
+            f.w = 0.f;
             break;
         case 1:
             f.xy = ((const float2 *)py)[0];
@@ -705,8 +723,11 @@
     } else {
         switch(vsin) {
         case 3:
+            f = convert_float4(((const uchar4 *)py)[0]);
+            break;
         case 2:
             f = convert_float4(((const uchar4 *)py)[0]);
+            f.w = 0.f;
             break;
         case 1:
             f.xy = convert_float2(((const uchar2 *)py)[0]);
@@ -758,10 +779,11 @@
             break;
         }
     } else {
-        sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
-        sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
-        sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
-        sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+        sum += 0.5f;
+        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
+        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
+        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
+        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
 
         switch(vsout) {
         case 3:
@@ -793,6 +815,8 @@
     bool floatIn = !!cp->mLastKey.u.inType;
     bool floatOut = !!cp->mLastKey.u.outType;
 
+    //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
+
     if(x2 > x1) {
         int32_t len = (x2 - x1) >> 2;
         if((cp->mOptKernel != NULL) && (len > 0)) {
@@ -819,12 +843,12 @@
     const Element *eout = aout->mHal.state.type->getElement();
 
     if (ein->getType() == eout->getType()) {
-        updateCoeffCache(1.f);
+        updateCoeffCache(1.f, 1.f);
     } else {
         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
-            updateCoeffCache(255.f);
+            updateCoeffCache(255.f, 255.f);
         } else {
-            updateCoeffCache(1.f / 255.f);
+            updateCoeffCache(1.f / 255.f, 1.f);
         }
     }
 
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 2545833..7b4d7f6 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -84,15 +84,28 @@
     vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
 SNIP_END(_N_ColorMatrix_load_u8_4)
 
+SNIP_START(_N_ColorMatrix_load_u8_3)
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!    /* 4 bytes -> lane 0 of d0..d3; r1 += 4 */
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!    /* next 4 bytes -> lane 1 */
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!    /* next 4 bytes -> lane 2 */
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!    /* next 4 bytes -> lane 3 */
+    veor d3, d3                                /* d3 = 0: drop the 4th byte read for each group */
+SNIP_END(_N_ColorMatrix_load_u8_3)
+
 SNIP_START(_N_ColorMatrix_load_u8_2)
     vld2.8 {d0[0],d1[0]}, [r1]!
     vld2.8 {d0[1],d1[1]}, [r1]!
     vld2.8 {d0[2],d1[2]}, [r1]!
     vld2.8 {d0[3],d1[3]}, [r1]!
+    veor d2, d2    /* d2 = 0: only d0/d1 were loaded above */
+    veor d3, d3    /* d3 = 0 */
 SNIP_END(_N_ColorMatrix_load_u8_2)
 
 SNIP_START(_N_ColorMatrix_load_u8_1)
     vld1.32 {d0[0]}, [r1]!
+    veor d1, d1    /* d1 = 0: only d0[0] was loaded above */
+    veor d2, d2    /* d2 = 0 */
+    veor d3, d3    /* d3 = 0 */
 SNIP_END(_N_ColorMatrix_load_u8_1)
 
 SNIP_START(_N_ColorMatrix_load_u8f_4)
@@ -114,6 +127,23 @@
     vcvt.f32.s32 q0, q0
 SNIP_END(_N_ColorMatrix_load_u8f_4)
 
+SNIP_START(_N_ColorMatrix_load_u8f_3)
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!    /* 4 bytes -> lane 0 of d0..d3; r1 += 4 */
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!    /* next 4 bytes -> lane 1 */
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!    /* next 4 bytes -> lane 2 */
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!    /* next 4 bytes -> lane 3 */
+    vmovl.u8 q2, d2                            /* u8 -> u16; d2 is widened first because the next line overwrites d2/d3 (q1) */
+    vmovl.u8 q1, d1                            /* d1 widened before q0 (d0/d1) is overwritten */
+    vmovl.u8 q0, d0
+    vmovl.u16 q2, d4                           /* u16 -> u32 on the low four lanes */
+    vmovl.u16 q1, d2
+    vmovl.u16 q0, d0
+    vcvt.f32.s32 q2, q2                        /* integer -> float */
+    vcvt.f32.s32 q1, q1
+    vcvt.f32.s32 q0, q0
+    veor q3, q3                                /* q3 = 0; the bytes loaded into d3 are never converted */
+SNIP_END(_N_ColorMatrix_load_u8f_3)
+
 SNIP_START(_N_ColorMatrix_load_u8f_2)
     vld2.8 {d0[0],d1[0]}, [r1]!
     vld2.8 {d0[1],d1[1]}, [r1]!
@@ -125,13 +155,18 @@
     vmovl.u16 q0, d0
     vcvt.f32.s32 q1, q1
     vcvt.f32.s32 q0, q0
+    veor q2, q2
+    veor q3, q3
 SNIP_END(_N_ColorMatrix_load_u8f_2)
 
 SNIP_START(_N_ColorMatrix_load_u8f_1)
-    vld1.32 {d0}, [r1]!
+    vld1.32 {d0[0]}, [r1]!    /* load exactly 4 bytes into d0[0]; r1 += 4 (the replaced form loaded all of d0) */
     vmovl.u8 q0, d0
     vmovl.u16 q0, d0
     vcvt.f32.s32 q0, q0
+    veor q1, q1    /* q1 = 0: only q0 carries loaded data */
+    veor q2, q2    /* q2 = 0 */
+    veor q3, q3    /* q3 = 0 */
 SNIP_END(_N_ColorMatrix_load_u8f_1)
 
 SNIP_START(_N_ColorMatrix_load_f32_4)
@@ -141,15 +176,32 @@
     vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]!
 SNIP_END(_N_ColorMatrix_load_f32_4)
 
+SNIP_START(_N_ColorMatrix_load_f32_3)
+    vld3.32 {d0[0],d2[0],d4[0]}, [r1]!    /* 3 floats -> lane 0 of q0,q1,q2; r1 += 12 */
+    add r1, r1, #4                        /* skip 4 more bytes: 16 bytes consumed per element */
+    vld3.32 {d0[1],d2[1],d4[1]}, [r1]!    /* lane 1 of q0,q1,q2 */
+    add r1, r1, #4
+    vld3.32 {d1[0],d3[0],d5[0]}, [r1]!    /* lane 2 of q0,q1,q2 (d1/d3/d5 are the high halves) */
+    add r1, r1, #4
+    vld3.32 {d1[1],d3[1],d5[1]}, [r1]!    /* lane 3 of q0,q1,q2 */
+    add r1, r1, #4
+    veor q3, q3                           /* q3 = 0: nothing was loaded into it */
+SNIP_END(_N_ColorMatrix_load_f32_3)
+
 SNIP_START(_N_ColorMatrix_load_f32_2)
     vld2.32 {d0[0],d2[0]}, [r1]!
     vld2.32 {d0[1],d2[1]}, [r1]!
     vld2.32 {d1[0],d3[0]}, [r1]!
     vld2.32 {d1[1],d3[1]}, [r1]!
+    veor q2, q2    /* q2 = 0: only q0/q1 were loaded above */
+    veor q3, q3    /* q3 = 0 */
 SNIP_END(_N_ColorMatrix_load_f32_2)
 
 SNIP_START(_N_ColorMatrix_load_f32_1)
     vld1.32 {q0}, [r1]!
+    veor q1, q1    /* q1 = 0: only q0 was loaded above */
+    veor q2, q2    /* q2 = 0 */
+    veor q3, q3    /* q3 = 0 */
 SNIP_END(_N_ColorMatrix_load_f32_1)
 
 
@@ -203,7 +255,6 @@
     vqmovn.s32 d2, q1
     vqmovun.s16 d0, q0
     vqmovun.s16 d1, q1
-    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
     vst2.8 {d0[0],d1[0]}, [r0]!
     vst2.8 {d0[1],d1[1]}, [r0]!
     vst2.8 {d0[2],d1[2]}, [r0]!
@@ -224,6 +275,13 @@
     vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
 SNIP_END(_N_ColorMatrix_store_f32_4)
 
+SNIP_START(_N_ColorMatrix_store_f32_3)
+    vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!    /* lane 0 of q0..q3 -> 4 consecutive floats; r0 += 16 */
+    vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!    /* lane 1 */
+    vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!    /* lane 2 */
+    vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!    /* lane 3; NOTE(review): q3 is written as a 4th float per element, presumably padding - confirm */
+SNIP_END(_N_ColorMatrix_store_f32_3)
+
 SNIP_START(_N_ColorMatrix_store_f32_2)
     vst2.32 {d0[0],d2[0]}, [r0]!
     vst2.32 {d0[1],d2[1]}, [r0]!
@@ -259,10 +317,10 @@
 SNIP_END(_N_ColorMatrix_unpack_u8_1)
 
 SNIP_START(_N_ColorMatrix_pack_u8_4)
-    vshrn.i32 d24, q8, #8
-    vshrn.i32 d26, q9, #8
-    vshrn.i32 d28, q10, #8
-    vshrn.i32 d30, q11, #8
+    vrshrn.i32 d24, q8, #8
+    vrshrn.i32 d26, q9, #8
+    vrshrn.i32 d28, q10, #8
+    vrshrn.i32 d30, q11, #8
     vqmovun.s16 d0, q12
     vqmovun.s16 d1, q13
     vqmovun.s16 d2, q14
@@ -270,23 +328,23 @@
 SNIP_END(_N_ColorMatrix_pack_u8_4)
 
 SNIP_START(_N_ColorMatrix_pack_u8_3)
-    vshrn.i32 d24, q8, #8
-    vshrn.i32 d26, q9, #8
-    vshrn.i32 d28, q10, #8
+    vrshrn.i32 d24, q8, #8    /* rounding narrow 32 -> 16 bit: (x + 128) >> 8, replaces the truncating vshrn */
+    vrshrn.i32 d26, q9, #8    /* same for q9 */
+    vrshrn.i32 d28, q10, #8   /* same for q10 */
     vqmovun.s16 d0, q12
     vqmovun.s16 d1, q13
     vqmovun.s16 d2, q14
 SNIP_END(_N_ColorMatrix_pack_u8_3)
 
 SNIP_START(_N_ColorMatrix_pack_u8_2)
-    vshrn.i32 d24, q8, #8
-    vshrn.i32 d26, q9, #8
+    vrshrn.i32 d24, q8, #8    /* rounding narrow 32 -> 16 bit: (x + 128) >> 8, replaces the truncating vshrn */
+    vrshrn.i32 d26, q9, #8    /* same for q9 */
     vqmovun.s16 d0, q12
     vqmovun.s16 d1, q13
 SNIP_END(_N_ColorMatrix_pack_u8_2)
 
 SNIP_START(_N_ColorMatrix_pack_u8_1)
-    vshrn.i32 d24, q8, #8
+    vrshrn.i32 d24, q8, #8    /* rounding narrow 32 -> 16 bit: (x + 128) >> 8, replaces the truncating vshrn */
     vqmovun.s16 d0, q12
 SNIP_END(_N_ColorMatrix_pack_u8_1)