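Fix ScriptGroup performance regression

ScriptGroup now calls preLaunch() on each kernel before launching
threads and postLaunch() on each kernel afterwards, and steps every
kernel by its own input/output allocation element size instead of the
shared instep/outstep. RsdCpuScriptImpl gains empty default
preLaunch()/postLaunch() implementations so any script can be called
this way.

In the ColorMatrix intrinsic, the coefficient-cache update moves ahead
of the kernel key computation and picks its scale from the input and
output element types; One() now applies the additive terms and clamps
to [0, 255] only on the non-float output path; and 3-component output
uses pack_u8_3 when copyAlpha is set.

bug 10151545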
Change-Id: Ica4a30c6fe8718f7fdbff0b446885d9ac7083769
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index d24f9dc..e09c08a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -591,7 +591,11 @@
} else {
switch(key.u.outVecSize) {
case 3:
- ADD_CHUNK(pack_u8_4);
+ if (key.u.copyAlpha) {
+ ADD_CHUNK(pack_u8_3);
+ } else {
+ ADD_CHUNK(pack_u8_4);
+ }
break;
case 2:
ADD_CHUNK(pack_u8_3);
@@ -676,14 +680,12 @@
rsAssert(0);
break;
}
- updateCoeffCache(1.f);
-
mRootPtr = &kernel;
}
static void One(const RsForEachStubParamStruct *p, void *out,
- const void *py, const float* coeff,
+ const void *py, const float* coeff, const float *add,
uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
float4 f = 0.f;
@@ -735,10 +737,11 @@
f.w * coeff[15];
//ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
- sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
- sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
- sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
- sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+ sum.x += add[0];
+ sum.y += add[1];
+ sum.z += add[2];
+ sum.w += add[3];
+
//ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
if (fout) {
@@ -755,6 +758,11 @@
break;
}
} else {
+ sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
+ sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
+ sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
+ sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+
switch(vsout) {
case 3:
case 2:
@@ -780,18 +788,14 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
- // if (p->y > 2) return;
-
uint32_t vsin = cp->mLastKey.u.inVecSize;
uint32_t vsout = cp->mLastKey.u.outVecSize;
bool floatIn = !!cp->mLastKey.u.inType;
bool floatOut = !!cp->mLastKey.u.outType;
-
if(x2 > x1) {
int32_t len = (x2 - x1) >> 2;
if((cp->mOptKernel != NULL) && (len > 0)) {
- //ALOGE("%p %p %i", out, in, len);
cp->mOptKernel(out, in, cp->ip, len);
x1 += len << 2;
out += outstep * (len << 2);
@@ -799,7 +803,7 @@
}
while(x1 != x2) {
- One(p, out, in, cp->tmpFp, vsin, vsout, floatIn, floatOut);
+ One(p, out, in, cp->tmpFp, cp->fpa, vsin, vsout, floatIn, floatOut);
out += outstep;
in += instep;
x1++;
@@ -811,8 +815,21 @@
uint32_t slot, const Allocation * ain, Allocation * aout,
const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+ const Element *ein = ain->mHal.state.type->getElement();
+ const Element *eout = aout->mHal.state.type->getElement();
+
+ if (ein->getType() == eout->getType()) {
+ updateCoeffCache(1.f);
+ } else {
+ if (eout->getType() == RS_TYPE_UNSIGNED_8) {
+ updateCoeffCache(255.f);
+ } else {
+ updateCoeffCache(1.f / 255.f);
+ }
+ }
+
Key_t key = computeKey(ain->mHal.state.type->getElement(),
- aout->mHal.state.type->getElement());
+ aout->mHal.state.type->getElement());
if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
if (mBuf) munmap(mBuf, mBufSize);
mBuf = NULL;
@@ -822,16 +839,6 @@
mLastKey = key;
}
}
-
- if (key.u.inType == key.u.outType) {
- updateCoeffCache(1.f);
- } else {
- if (key.u.inType) {
- updateCoeffCache(255.f);
- } else {
- updateCoeffCache(1.f / 255.f);
- }
- }
}
void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 0d34d96..0669326 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -1097,6 +1097,18 @@
return NULL;
}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc)
+{
+}
+
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc)
+{
+}
+
}
}
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 7d52507..c8a73bb 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -52,6 +52,12 @@
virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
virtual int invokeRoot();
+ virtual void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
virtual void invokeForEach(uint32_t slot,
const Allocation * ain,
Allocation * aout,
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 91779b4..1db6e16 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -67,8 +67,12 @@
mp->ptrOut = NULL;
mp->out = NULL;
+ uint32_t istep = 0;
+ uint32_t ostep = 0;
+
if (sl->ins[ct]) {
mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+ istep = sl->ins[ct]->mHal.state.elementSizeBytes;
mp->in = mp->ptrIn;
if (sl->inExts[ct]) {
mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
@@ -82,6 +86,7 @@
if (sl->outs[ct]) {
mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
mp->out = mp->ptrOut;
+ ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
if (sl->outExts[ct]) {
mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
} else {
@@ -92,7 +97,7 @@
}
//ALOGE("kernel %i %p,%p %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
- func(p, xstart, xend, instep, outstep);
+ func(p, xstart, xend, istep, ostep);
}
//ALOGE("script group root");
@@ -204,6 +209,7 @@
fnPtrs.add((void *)mtls.kernel);
usrPtrs.add(mtls.fep.usr);
sigs.add(mtls.fep.usrLen);
+ si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
}
sl.sigs = sigs.array();
sl.usrPtrs = usrPtrs.array();
@@ -218,6 +224,12 @@
mtls.kernel = (void (*)())&scriptGroupRoot;
mtls.fep.usr = &sl;
mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+ for (size_t ct=0; ct < kernels.size(); ct++) {
+ Script *s = kernels[ct]->mScript;
+ RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+ si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+ }
}
}