Merge "Fix bugs in optimized colorMatrix." into jb-mr1-dev
diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp
index bbeb1ef..8f6c70c 100644
--- a/driver/rsdIntrinsicColorMatrix.cpp
+++ b/driver/rsdIntrinsicColorMatrix.cpp
@@ -27,6 +27,8 @@
 struct ConvolveParams {
     float fp[16];
     short ip[16];
+    bool use3x3;  // written by ColorMatrix_SetVar; on NEON selects rsdIntrinsicColorMatrix3x3_K over the 4x4 kernel
+    bool useDot;  // only set together with use3x3; on NEON selects rsdIntrinsicColorMatrixDot_K
 };
 
 static void ColorMatrix_SetVar(const Context *dc, const Script *script, void * intrinsicData,
@@ -38,10 +40,23 @@
     for(int ct=0; ct < 16; ct++) {
         cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
     }
+
+    /* Assign both flags on every update: a matrix set earlier must not leave
+     * a reduced kernel selected once a full 4x4 matrix is loaded. */
+    cp->use3x3 = (cp->ip[3] == 0) && (cp->ip[7] == 0) && (cp->ip[11] == 0) &&
+                 (cp->ip[12] == 0) && (cp->ip[13] == 0) && (cp->ip[14] == 0) &&
+                 (cp->ip[15] == 255);
+    /* The dot kernel reads only ip[0], ip[4] and ip[8], so each input channel
+     * must carry the same weight into all three colour outputs. */
+    cp->useDot = cp->use3x3 &&
+                 (cp->ip[0] == cp->ip[1]) && (cp->ip[0] == cp->ip[2]) &&
+                 (cp->ip[4] == cp->ip[5]) && (cp->ip[4] == cp->ip[6]) &&
+                 (cp->ip[8] == cp->ip[9]) && (cp->ip[8] == cp->ip[10]);
 }
 
 extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
 extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
 
 static void One(const RsForEachStubParamStruct *p, uchar4 *out,
                 const uchar4 *py, const float* coeff) {
@@ -89,7 +104,15 @@
 #if defined(ARCH_ARM_HAVE_NEON)
         int32_t len = (x2 - x1) >> 2;
         if(len > 0) {
-            rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+            if (cp->use3x3) {
+                if (cp->useDot) {
+                    rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
+                } else {
+                    rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
+                }
+            } else {
+                rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+            }
             x1 += len << 2;
             out += len << 2;
             in += len << 2;
diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp
index 2aa7849..02f34a6 100644
--- a/driver/rsdIntrinsicConvolve3x3.cpp
+++ b/driver/rsdIntrinsicConvolve3x3.cpp
@@ -104,7 +104,7 @@
 #if defined(ARCH_ARM_HAVE_NEON)
         int32_t len = (x2 - x1 - 1) >> 1;
         if(len > 0) {
-            rsdIntrinsicConvolve3x3_K(out, &py0[x1], &py1[x1], &py2[x1], cp->ip, len);
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
             x1 += len << 1;
             out += len << 1;
         }
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S
index a08658d..8fe80c5 100644
--- a/driver/rsdIntrinsics_Convolve.S
+++ b/driver/rsdIntrinsics_Convolve.S
@@ -109,7 +109,7 @@
 /*
         r0 = dst
         r1 = src
-        r2 = matrx
+        r2 = matrix
         r3 = length
 */
 ENTRY(rsdIntrinsicColorMatrix4x4_K)
@@ -126,29 +126,29 @@
         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
 
-        vmovl.u8 q12, d0
-        vmovl.u8 q13, d1
-        vmovl.u8 q14, d2
-        vmovl.u8 q15, d3
+        vmovl.u8 q12, d0  /* R */
+        vmovl.u8 q13, d1  /* G */
+        vmovl.u8 q14, d2  /* B */
+        vmovl.u8 q15, d3  /* A */
 
         vmull.s16 q8,  d24, d4[0]
-        vmull.s16 q9,  d26, d4[1]
-        vmull.s16 q10, d28, d4[2]
-        vmull.s16 q11, d30, d4[3]
+        vmull.s16 q9,  d24, d4[1]
+        vmull.s16 q10, d24, d4[2]
+        vmull.s16 q11, d24, d4[3]
 
-        vmlal.s16 q8,  d24, d5[0]
+        vmlal.s16 q8,  d26, d5[0]
         vmlal.s16 q9,  d26, d5[1]
-        vmlal.s16 q10, d28, d5[2]
-        vmlal.s16 q11, d30, d5[3]
+        vmlal.s16 q10, d26, d5[2]
+        vmlal.s16 q11, d26, d5[3]
 
-        vmlal.s16 q8,  d24, d6[0]
-        vmlal.s16 q9,  d26, d6[1]
+        vmlal.s16 q8,  d28, d6[0]
+        vmlal.s16 q9,  d28, d6[1]
         vmlal.s16 q10, d28, d6[2]
-        vmlal.s16 q11, d30, d6[3]
+        vmlal.s16 q11, d28, d6[3]
 
-        vmlal.s16 q8,  d24, d7[0]
-        vmlal.s16 q9,  d26, d7[1]
-        vmlal.s16 q10, d28, d7[2]
+        vmlal.s16 q8,  d30, d7[0]
+        vmlal.s16 q9,  d30, d7[1]
+        vmlal.s16 q10, d30, d7[2]
         vmlal.s16 q11, d30, d7[3]
 
         vshrn.i32 d24, q8, #8
@@ -172,15 +172,15 @@
         vpop            {q4-q7}
         ldmfd           sp!, {r4, lr}
         bx              lr
-END(rsdIntrinsicColorMatrix4x4K)
+END(rsdIntrinsicColorMatrix4x4_K)
 
 /*
         r0 = dst
         r1 = src
-        r2 = matrx
+        r2 = matrix
         r3 = length
 */
-ENTRY(rsdIntrinsicColorMatrix3x3K)
+ENTRY(rsdIntrinsicColorMatrix3x3_K)
         .save           {r4, lr}
         stmfd           sp!, {r4, lr}
         vpush           {q4-q7}
@@ -199,15 +199,15 @@
         vmovl.u8 q14, d2
 
         vmull.s16 q8,  d24, d4[0]
-        vmull.s16 q9,  d26, d4[1]
-        vmull.s16 q10, d28, d4[2]
+        vmull.s16 q9,  d24, d4[1]
+        vmull.s16 q10, d24, d4[2]
 
-        vmlal.s16 q8,  d24, d5[0]
+        vmlal.s16 q8,  d26, d5[0]
         vmlal.s16 q9,  d26, d5[1]
-        vmlal.s16 q10, d28, d5[2]
+        vmlal.s16 q10, d26, d5[2]
 
-        vmlal.s16 q8,  d24, d6[0]
-        vmlal.s16 q9,  d26, d6[1]
+        vmlal.s16 q8,  d28, d6[0]
+        vmlal.s16 q9,  d28, d6[1]
         vmlal.s16 q10, d28, d6[2]
 
         vshrn.i32 d24, q8, #8
@@ -229,5 +229,50 @@
         vpop            {q4-q7}
         ldmfd           sp!, {r4, lr}
         bx              lr
-END(rsdIntrinsicColorMatrix3x3K)
+END(rsdIntrinsicColorMatrix3x3_K)
+
+/*
+        r0 = dst    (4 bytes per pixel; 4 pixels written per loop iteration)
+        r1 = src    (same layout; 4 pixels read per loop iteration)
+        r2 = matrix (16 shorts are loaded; only elements 0, 4 and 8 are used)
+        r3 = length (loop iterations; tested after the body, so 0 is not handled)
+*/
+ENTRY(rsdIntrinsicColorMatrixDot_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+
+        vld1.16 {q2}, [r2]!  /* d4 = matrix[0..3], d5 = matrix[4..7] */
+        vld1.16 {q3}, [r2]!  /* d6 = matrix[8..11], d7 = matrix[12..15] */
+
+1:
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!  /* pixel 0: split bytes into d0..d3 lane 0 */
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!  /* pixel 1 */
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!  /* pixel 2 */
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!  /* pixel 3 */
+
+        vmovl.u8 q12, d0  /* R widened to 16 bit (low half in d24) */
+        vmovl.u8 q13, d1  /* G widened to 16 bit (low half in d26) */
+        vmovl.u8 q14, d2  /* B widened to 16 bit (low half in d28) */
+
+        vmull.s16 q8,  d24, d4[0]  /* q8  = R * matrix[0] */
+        vmlal.s16 q8,  d26, d5[0]  /* q8 += G * matrix[4] */
+        vmlal.s16 q8,  d28, d6[0]  /* q8 += B * matrix[8] */
+        vshrn.i32 d24, q8, #8      /* drop the 8 fraction bits, narrow to 16 bit */
+        vqmovun.s16 d0, q12        /* saturate to u8; only lanes 0-3 (from d24) are stored */
+        vmov.u8 d1, d0             /* second channel = same value */
+        vmov.u8 d2, d0             /* third channel = same value */
+
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!  /* pixel 0; d3 is written back as loaded */
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!  /* pixel 1 */
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!  /* pixel 2 */
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!  /* pixel 3 */
+
+        subs r3, r3, #1  /* one group of 4 pixels done */
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrixDot_K)