Fix ColorMatrix performance and correctness issues.
bug 10427746
Change-Id: Ie42753a551badf4de3144f16fa0e407eca5b9c74
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index e09c08a..d89d5d7 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -152,7 +152,7 @@
static void kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep);
- void updateCoeffCache(float fpMul);
+ void updateCoeffCache(float fpMul, float addMul);
Key_t mLastKey;
unsigned char *mBuf;
@@ -227,7 +227,7 @@
// Is alpha a simple copy
if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
- key.u.copyAlpha = 1;
+ key.u.copyAlpha = !(key.u.inType || key.u.outType);
}
//ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
@@ -266,6 +266,13 @@
break;
}
+ if (key.u.inType && !key.u.outType) {
+ key.u.addMask |= 1;
+ if (key.u.outVecSize > 0) key.u.addMask |= 2;
+ if (key.u.outVecSize > 1) key.u.addMask |= 4;
+ if (key.u.outVecSize > 2) key.u.addMask |= 8;
+ }
+
//ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
return key;
}
@@ -283,12 +290,15 @@
DEF_SYM(postfix2)
DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
+DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
+DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)
@@ -296,6 +306,7 @@
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
+DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
@@ -434,14 +445,9 @@
}
}
}
- for (int i=0; i < 4; i++) {
- if (key.u.addMask & (1 << i)) {
- ops[4][i] = 0x2 | opInit[i];
- opInit[i] = 1;
- }
- }
if (key.u.inType || key.u.outType) {
+ key.u.copyAlpha = 0;
ADD_CHUNK(prefix_f);
buf2 = buf;
@@ -449,9 +455,11 @@
if (key.u.inType) {
switch(key.u.inVecSize) {
case 3:
- case 2:
ADD_CHUNK(load_f32_4);
break;
+ case 2:
+ ADD_CHUNK(load_f32_3);
+ break;
case 1:
ADD_CHUNK(load_f32_2);
break;
@@ -462,9 +470,11 @@
} else {
switch(key.u.inVecSize) {
case 3:
- case 2:
ADD_CHUNK(load_u8f_4);
break;
+ case 2:
+ ADD_CHUNK(load_u8f_3);
+ break;
case 1:
ADD_CHUNK(load_u8f_2);
break;
@@ -505,9 +515,11 @@
if (key.u.outType) {
switch(key.u.outVecSize) {
case 3:
- case 2:
ADD_CHUNK(store_f32_4);
break;
+ case 2:
+ ADD_CHUNK(store_f32_3);
+ break;
case 1:
ADD_CHUNK(store_f32_2);
break;
@@ -548,7 +560,7 @@
}
break;
case 2:
- ADD_CHUNK(load_u8_4);
+ ADD_CHUNK(load_u8_3);
ADD_CHUNK(unpack_u8_3);
break;
case 1:
@@ -647,15 +659,18 @@
#endif
}
-void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul) {
+void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float adMul) {
for(int ct=0; ct < 16; ct++) {
- //ALOGE("mat %i %f", ct, fp[ct]);
ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
tmpFp[ct] = fp[ct] * fpMul;
+ //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
}
+ float ad = 0.f;
+ if (fpMul > 254.f) ad = 0.5f;
for(int ct=0; ct < 4; ct++) {
- tmpFpa[ct * 4 + 0] = fpa[ct] * fpMul;
+ tmpFpa[ct * 4 + 0] = fpa[ct] * adMul + ad;
+ //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
@@ -692,8 +707,11 @@
if (fin) {
switch(vsin) {
case 3:
+ f = ((const float4 *)py)[0];
+ break;
case 2:
f = ((const float4 *)py)[0];
+ f.w = 0.f;
break;
case 1:
f.xy = ((const float2 *)py)[0];
@@ -705,8 +723,11 @@
} else {
switch(vsin) {
case 3:
+ f = convert_float4(((const uchar4 *)py)[0]);
+ break;
case 2:
f = convert_float4(((const uchar4 *)py)[0]);
+ f.w = 0.f;
break;
case 1:
f.xy = convert_float2(((const uchar2 *)py)[0]);
@@ -758,10 +779,11 @@
break;
}
} else {
- sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
- sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
- sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
- sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+ sum += 0.5f;
+ sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
+ sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
+ sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
+ sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
switch(vsout) {
case 3:
@@ -793,6 +815,8 @@
bool floatIn = !!cp->mLastKey.u.inType;
bool floatOut = !!cp->mLastKey.u.outType;
+ //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
+
if(x2 > x1) {
int32_t len = (x2 - x1) >> 2;
if((cp->mOptKernel != NULL) && (len > 0)) {
@@ -819,12 +843,12 @@
const Element *eout = aout->mHal.state.type->getElement();
if (ein->getType() == eout->getType()) {
- updateCoeffCache(1.f);
+ updateCoeffCache(1.f, 1.f);
} else {
if (eout->getType() == RS_TYPE_UNSIGNED_8) {
- updateCoeffCache(255.f);
+ updateCoeffCache(255.f, 255.f);
} else {
- updateCoeffCache(1.f / 255.f);
+ updateCoeffCache(1.f / 255.f, 1.f);
}
}
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 2545833..7b4d7f6 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -84,15 +84,28 @@
vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
SNIP_END(_N_ColorMatrix_load_u8_4)
+SNIP_START(_N_ColorMatrix_load_u8_3) /* u8 load for 3-channel input: four 4-byte lane loads, then clear channel 3 */
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! /* pixel 0: de-interleave 4 bytes into lane 0 of d0-d3; r1 += 4 */
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! /* pixel 1 -> lane 1 */
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! /* pixel 2 -> lane 2 */
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! /* pixel 3 -> lane 3 */
+ veor d3, d3 /* d3 ^= d3: zero the 4th channel (the C fallback sets f.w = 0 for this vector size) */
+SNIP_END(_N_ColorMatrix_load_u8_3)
+
SNIP_START(_N_ColorMatrix_load_u8_2)
vld2.8 {d0[0],d1[0]}, [r1]!
vld2.8 {d0[1],d1[1]}, [r1]!
vld2.8 {d0[2],d1[2]}, [r1]!
vld2.8 {d0[3],d1[3]}, [r1]!
+ veor d2, d2 /* zero channel 2: the vld2.8 loads above only fill d0 and d1 */
+ veor d3, d3 /* zero channel 3 for the same reason */
SNIP_END(_N_ColorMatrix_load_u8_2)
SNIP_START(_N_ColorMatrix_load_u8_1)
vld1.32 {d0[0]}, [r1]!
+ veor d1, d1 /* zero channel 1: only d0 is loaded above */
+ veor d2, d2 /* zero channel 2 */
+ veor d3, d3 /* zero channel 3 */
SNIP_END(_N_ColorMatrix_load_u8_1)
SNIP_START(_N_ColorMatrix_load_u8f_4)
@@ -114,6 +127,23 @@
vcvt.f32.s32 q0, q0
SNIP_END(_N_ColorMatrix_load_u8f_4)
+SNIP_START(_N_ColorMatrix_load_u8f_3) /* u8 -> float load for 3-channel input: results in q0-q2, q3 cleared */
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! /* pixel 0: de-interleave 4 bytes into lane 0 of d0-d3; r1 += 4 */
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! /* pixel 1 -> lane 1 */
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! /* pixel 2 -> lane 2 */
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! /* pixel 3 -> lane 3 */
+ vmovl.u8 q2, d2 /* widen channel 2 to u16 first: the next write to q1 (d2:d3) overwrites d2 */
+ vmovl.u8 q1, d1 /* widen channel 1 before the write to q0 (d0:d1) overwrites d1 */
+ vmovl.u8 q0, d0 /* widen channel 0 */
+ vmovl.u16 q2, d4 /* widen the low four u16 values of channel 2 to 32 bits */
+ vmovl.u16 q1, d2 /* same for channel 1 */
+ vmovl.u16 q0, d0 /* same for channel 0 */
+ vcvt.f32.s32 q2, q2 /* integer -> float, channel 2 */
+ vcvt.f32.s32 q1, q1 /* integer -> float, channel 1 */
+ vcvt.f32.s32 q0, q0 /* integer -> float, channel 0 */
+ veor q3, q3 /* zero channel 3; the byte loaded into d3 is never converted */
+SNIP_END(_N_ColorMatrix_load_u8f_3)
+
SNIP_START(_N_ColorMatrix_load_u8f_2)
vld2.8 {d0[0],d1[0]}, [r1]!
vld2.8 {d0[1],d1[1]}, [r1]!
@@ -125,13 +155,18 @@
vmovl.u16 q0, d0
vcvt.f32.s32 q1, q1
vcvt.f32.s32 q0, q0
+ veor q2, q2
+ veor q3, q3
SNIP_END(_N_ColorMatrix_load_u8f_2)
SNIP_START(_N_ColorMatrix_load_u8f_1)
- vld1.32 {d0}, [r1]!
+ vld1.32 {d0[0]}, [r1]! /* load one 32-bit lane (4 bytes) instead of all of d0, so r1 advances by 4, not 8 */
vmovl.u8 q0, d0
vmovl.u16 q0, d0
vcvt.f32.s32 q0, q0
+ veor q1, q1 /* zero channel 1: only q0 carries input data */
+ veor q2, q2 /* zero channel 2 */
+ veor q3, q3 /* zero channel 3 */
SNIP_END(_N_ColorMatrix_load_u8f_1)
SNIP_START(_N_ColorMatrix_load_f32_4)
@@ -141,15 +176,32 @@
vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]!
SNIP_END(_N_ColorMatrix_load_f32_4)
+SNIP_START(_N_ColorMatrix_load_f32_3) /* float load for 3-channel input: channels in q0-q2, q3 cleared */
+ vld3.32 {d0[0],d2[0],d4[0]}, [r1]! /* pixel 0: 3 floats into lane 0 of q0/q1/q2; r1 += 12 */
+ add r1, r1, #4 /* skip 4 more bytes: 16-byte stride per pixel (presumably float3 padding - TODO confirm) */
+ vld3.32 {d0[1],d2[1],d4[1]}, [r1]! /* pixel 1 -> lane 1 */
+ add r1, r1, #4 /* skip the 4th word */
+ vld3.32 {d1[0],d3[0],d5[0]}, [r1]! /* pixel 2 -> lane 2 (low lane of the upper d register) */
+ add r1, r1, #4 /* skip the 4th word */
+ vld3.32 {d1[1],d3[1],d5[1]}, [r1]! /* pixel 3 -> lane 3 */
+ add r1, r1, #4 /* skip the 4th word */
+ veor q3, q3 /* zero channel 3 (the C fallback sets f.w = 0 for this vector size) */
+SNIP_END(_N_ColorMatrix_load_f32_3)
+
SNIP_START(_N_ColorMatrix_load_f32_2)
vld2.32 {d0[0],d2[0]}, [r1]!
vld2.32 {d0[1],d2[1]}, [r1]!
vld2.32 {d1[0],d3[0]}, [r1]!
vld2.32 {d1[1],d3[1]}, [r1]!
+ veor q2, q2 /* zero channel 2: the vld2.32 loads above only fill q0 and q1 */
+ veor q3, q3 /* zero channel 3 for the same reason */
SNIP_END(_N_ColorMatrix_load_f32_2)
SNIP_START(_N_ColorMatrix_load_f32_1)
vld1.32 {q0}, [r1]!
+ veor q1, q1 /* zero channel 1: only q0 is loaded above */
+ veor q2, q2 /* zero channel 2 */
+ veor q3, q3 /* zero channel 3 */
SNIP_END(_N_ColorMatrix_load_f32_1)
@@ -203,7 +255,6 @@
vqmovn.s32 d2, q1
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
- vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
vst2.8 {d0[0],d1[0]}, [r0]!
vst2.8 {d0[1],d1[1]}, [r0]!
vst2.8 {d0[2],d1[2]}, [r0]!
@@ -224,6 +275,13 @@
vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
SNIP_END(_N_ColorMatrix_store_f32_4)
+SNIP_START(_N_ColorMatrix_store_f32_3) /* float store for 3-channel output; writes 4 words (16 bytes) per pixel */
+ vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]! /* pixel 0: interleave lane 0 of q0-q3; r0 += 16 */
+ vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]! /* pixel 1 */
+ vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]! /* pixel 2 */
+ vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]! /* pixel 3. NOTE(review): q3 is stored too - presumably into float3 padding; confirm */
+SNIP_END(_N_ColorMatrix_store_f32_3)
+
SNIP_START(_N_ColorMatrix_store_f32_2)
vst2.32 {d0[0],d2[0]}, [r0]!
vst2.32 {d0[1],d2[1]}, [r0]!
@@ -259,10 +317,10 @@
SNIP_END(_N_ColorMatrix_unpack_u8_1)
SNIP_START(_N_ColorMatrix_pack_u8_4)
- vshrn.i32 d24, q8, #8
- vshrn.i32 d26, q9, #8
- vshrn.i32 d28, q10, #8
- vshrn.i32 d30, q11, #8
+ vrshrn.i32 d24, q8, #8
+ vrshrn.i32 d26, q9, #8
+ vrshrn.i32 d28, q10, #8
+ vrshrn.i32 d30, q11, #8
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vqmovun.s16 d2, q14
@@ -270,23 +328,23 @@
SNIP_END(_N_ColorMatrix_pack_u8_4)
SNIP_START(_N_ColorMatrix_pack_u8_3)
- vshrn.i32 d24, q8, #8
- vshrn.i32 d26, q9, #8
- vshrn.i32 d28, q10, #8
+ vrshrn.i32 d24, q8, #8 /* rounding narrow shift: >>8 now rounds where vshrn truncated */
+ vrshrn.i32 d26, q9, #8 /* same rounding shift for the second accumulator */
+ vrshrn.i32 d28, q10, #8 /* same rounding shift for the third accumulator */
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vqmovun.s16 d2, q14
SNIP_END(_N_ColorMatrix_pack_u8_3)
SNIP_START(_N_ColorMatrix_pack_u8_2)
- vshrn.i32 d24, q8, #8
- vshrn.i32 d26, q9, #8
+ vrshrn.i32 d24, q8, #8 /* rounding narrow shift: >>8 now rounds where vshrn truncated */
+ vrshrn.i32 d26, q9, #8 /* same rounding shift for the second accumulator */
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
SNIP_END(_N_ColorMatrix_pack_u8_2)
SNIP_START(_N_ColorMatrix_pack_u8_1)
- vshrn.i32 d24, q8, #8
+ vrshrn.i32 d24, q8, #8 /* rounding narrow shift: >>8 now rounds where vshrn truncated */
vqmovun.s16 d0, q12
SNIP_END(_N_ColorMatrix_pack_u8_1)