Fix ScriptGroup performance regression

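Run every kernel's preLaunch()/postLaunch() hook around a ScriptGroup
launch, and step each kernel through its own allocations using their
element sizes. RsdCpuScriptImpl gains empty virtual preLaunch() and
postLaunch() hooks for this. The color matrix intrinsic now calls
updateCoeffCache() based on the input/output element types before the
key is computed, applies the add vector in One(), and clamps to
[0, 255] only on the non-float output path.

bug 10151545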

Change-Id: Ica4a30c6fe8718f7fdbff0b446885d9ac7083769
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index d24f9dc..e09c08a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -591,7 +591,11 @@
         } else {
             switch(key.u.outVecSize) {
             case 3:
-                ADD_CHUNK(pack_u8_4);
+                if (key.u.copyAlpha) {
+                    ADD_CHUNK(pack_u8_3);
+                } else {
+                    ADD_CHUNK(pack_u8_4);
+                }
                 break;
             case 2:
                 ADD_CHUNK(pack_u8_3);
@@ -676,14 +680,12 @@
         rsAssert(0);
         break;
     }
-    updateCoeffCache(1.f);
-
     mRootPtr = &kernel;
 }
 
 
 static void One(const RsForEachStubParamStruct *p, void *out,
-                const void *py, const float* coeff,
+                const void *py, const float* coeff, const float *add,
                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
 
     float4 f = 0.f;
@@ -735,10 +737,11 @@
             f.w * coeff[15];
     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
 
-    sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
-    sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
-    sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
-    sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+    sum.x += add[0];
+    sum.y += add[1];
+    sum.z += add[2];
+    sum.w += add[3];
+
 
     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
     if (fout) {
@@ -755,6 +758,11 @@
             break;
         }
     } else {
+        sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
+        sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
+        sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
+        sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+
         switch(vsout) {
         case 3:
         case 2:
@@ -780,18 +788,14 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-   // if (p->y > 2) return;
-
     uint32_t vsin = cp->mLastKey.u.inVecSize;
     uint32_t vsout = cp->mLastKey.u.outVecSize;
     bool floatIn = !!cp->mLastKey.u.inType;
     bool floatOut = !!cp->mLastKey.u.outType;
 
-
     if(x2 > x1) {
         int32_t len = (x2 - x1) >> 2;
         if((cp->mOptKernel != NULL) && (len > 0)) {
-            //ALOGE("%p %p %i", out, in, len);
             cp->mOptKernel(out, in, cp->ip, len);
             x1 += len << 2;
             out += outstep * (len << 2);
@@ -799,7 +803,7 @@
         }
 
         while(x1 != x2) {
-            One(p, out, in, cp->tmpFp, vsin, vsout, floatIn, floatOut);
+            One(p, out, in, cp->tmpFp, cp->fpa, vsin, vsout, floatIn, floatOut);
             out += outstep;
             in += instep;
             x1++;
@@ -811,8 +815,21 @@
         uint32_t slot, const Allocation * ain, Allocation * aout,
         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
 
+    const Element *ein = ain->mHal.state.type->getElement();
+    const Element *eout = aout->mHal.state.type->getElement();
+
+    if (ein->getType() == eout->getType()) {
+        updateCoeffCache(1.f);
+    } else {
+        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
+            updateCoeffCache(255.f);
+        } else {
+            updateCoeffCache(1.f / 255.f);
+        }
+    }
+
     Key_t key = computeKey(ain->mHal.state.type->getElement(),
-                             aout->mHal.state.type->getElement());
+                           aout->mHal.state.type->getElement());
     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
         if (mBuf) munmap(mBuf, mBufSize);
         mBuf = NULL;
@@ -822,16 +839,6 @@
             mLastKey = key;
         }
     }
-
-    if (key.u.inType == key.u.outType) {
-        updateCoeffCache(1.f);
-    } else {
-        if (key.u.inType) {
-            updateCoeffCache(255.f);
-        } else {
-            updateCoeffCache(1.f / 255.f);
-        }
-    }
 }
 
 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 0d34d96..0669326 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -1097,6 +1097,18 @@
     return NULL;
 }
 
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
+                       Allocation * aout, const void * usr,
+                       uint32_t usrLen, const RsScriptCall *sc)
+{  // Empty body: virtual hook (declared in rsCpuScript.h) that does nothing in this class.
+}
+
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
+                        Allocation * aout, const void * usr,
+                        uint32_t usrLen, const RsScriptCall *sc)
+{  // Empty body: virtual hook (declared in rsCpuScript.h) that does nothing in this class.
+}
+
 
 }
 }
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 7d52507..c8a73bb 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -52,6 +52,12 @@
 
     virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
+    virtual void preLaunch(uint32_t slot, const Allocation * ain,
+                           Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation * ain,
+                            Allocation * aout, const void * usr,
+                            uint32_t usrLen, const RsScriptCall *sc);
     virtual void invokeForEach(uint32_t slot,
                        const Allocation * ain,
                        Allocation * aout,
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 91779b4..1db6e16 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -67,8 +67,12 @@
         mp->ptrOut = NULL;
         mp->out = NULL;
 
+        uint32_t istep = 0;
+        uint32_t ostep = 0;
+
         if (sl->ins[ct]) {
             mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+            istep = sl->ins[ct]->mHal.state.elementSizeBytes;
             mp->in = mp->ptrIn;
             if (sl->inExts[ct]) {
                 mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
@@ -82,6 +86,7 @@
         if (sl->outs[ct]) {
             mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
             mp->out = mp->ptrOut;
+            ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
             if (sl->outExts[ct]) {
                 mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
             } else {
@@ -92,7 +97,7 @@
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(p, xstart, xend, instep, outstep);
+        func(p, xstart, xend, istep, ostep);
     }
     //ALOGE("script group root");
 
@@ -204,6 +209,7 @@
             fnPtrs.add((void *)mtls.kernel);
             usrPtrs.add(mtls.fep.usr);
             sigs.add(mtls.fep.usrLen);
+            si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
         }
         sl.sigs = sigs.array();
         sl.usrPtrs = usrPtrs.array();
@@ -218,6 +224,12 @@
         mtls.kernel = (void (*)())&scriptGroupRoot;
         mtls.fep.usr = &sl;
         mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+        for (size_t ct=0; ct < kernels.size(); ct++) {
+            Script *s = kernels[ct]->mScript;
+            RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+            si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+        }
     }
 }