Add array launch support.
Change-Id: I66cd89b5b44eafa92f391708a06464cd7cdde3ed
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 40f4745..0ec7b28 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -164,7 +164,7 @@
// fast path for very small launches
MTLaunchStruct *mtls = (MTLaunchStruct *)data;
- if (mtls && mtls->fep.dim.y <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+ if (mtls && mtls->fep.dim.y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
if (mWorkers.mLaunchCallback) {
mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
}
@@ -370,7 +370,7 @@
kparams->z = fep->current.z;
}
-static inline void fepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+static inline void FepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
uint32_t x, uint32_t y,
uint32_t z = 0, uint32_t lod = 0,
RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
@@ -385,31 +385,89 @@
}
}
+// Peels one launch dimension off a flattened slice index: stores the coordinate for
+// the range [start, end) in *p and returns what is left for the remaining dimensions.
+static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
+    if (end <= start) {
+        // Empty or absent dimension: pin the coordinate to start and consume nothing.
+        *p = start;
+        return val;
+    }
+    const uint32_t extent = end - start;
+    *p = start + (val % extent);
+    return val / extent;
+}
+
+// Decodes a flat slice index into the outer (non-X/Y) launch coordinates of *info.
+// Returns false once sliceNum is past the last outer slice of the launch.
+static bool SelectOuterSlice(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *info,
+                             uint32_t sliceNum) {
+    uint32_t r = sliceNum;
+    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
+    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+    return r == 0;
+}
+
+// Two-argument form kept for single-threaded callers: decodes into the shared mtls->fep.
+static bool SelectOuterSlice(MTLaunchStruct *mtls, uint32_t sliceNum) {
+    return SelectOuterSlice(mtls, &mtls->fep, sliceNum);
+}
+
+// Worker entry point for launches with outer dimensions. Several workers may run this
+// at once, so per-slice coordinates and pointers go into the thread-local copy |fep|;
+// the shared mtls->fep is only read here.
+static void walk_general(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        if (!SelectOuterSlice(mtls, &fep, slice)) {
+            return;
+        }
+        for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y; fep.current.y++) {
+            FepPtrSetup(mtls, &fep, mtls->start.x,
+                        fep.current.y, fep.current.z, fep.current.lod,
+                        (RsAllocationCubemapFace)fep.current.face,
+                        fep.current.array[0], fep.current.array[1],
+                        fep.current.array[2], fep.current.array[3]);
+            RsExpandKernelParams kparams;
+            kparamSetup(&kparams, &fep);
+            fn(&kparams, mtls->start.x, mtls->end.x, fep.outStride[0]);
+        }
+    }
+}
static void walk_2d(void *usr, uint32_t idx) {
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
RsExpandKernelDriverInfo fep = mtls->fep;
fep.lid = idx;
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
while (1) {
uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+ uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
uint32_t yEnd = yStart + mtls->mSliceSize;
- yEnd = rsMin(yEnd, mtls->yEnd);
+ yEnd = rsMin(yEnd, mtls->end.y);
if (yEnd <= yStart) {
return;
}
for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
- fepPtrSetup(mtls, &fep, mtls->xStart, fep.current.y);
+ FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
RsExpandKernelParams kparams;
kparamSetup(&kparams, &fep);
- outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
- fn(&kparams, mtls->xStart, mtls->xEnd, fep.outStride[0]);
+ fn(&kparams, mtls->start.x, mtls->end.x, fep.outStride[0]);
}
}
}
@@ -418,29 +476,28 @@
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
RsExpandKernelDriverInfo fep = mtls->fep;
fep.lid = idx;
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
while (1) {
uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+ uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
uint32_t xEnd = xStart + mtls->mSliceSize;
- xEnd = rsMin(xEnd, mtls->xEnd);
+ xEnd = rsMin(xEnd, mtls->end.x);
if (xEnd <= xStart) {
return;
}
- fepPtrSetup(mtls, &fep, xStart, 0);
+ FepPtrSetup(mtls, &fep, xStart, 0);
RsExpandKernelParams kparams;
kparamSetup(&kparams, &fep);
- outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
fn(&kparams, xStart, xEnd, fep.outStride[0]);
}
}
-
void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
uint32_t inLen,
Allocation* aout,
@@ -449,11 +506,23 @@
//android::StopWatch kernel_time("kernel time");
+ bool outerDims = (mtls->start.z != mtls->end.z) ||
+ (mtls->start.face != mtls->end.face) ||
+ (mtls->start.lod != mtls->end.lod) ||
+ (mtls->start.array[0] != mtls->end.array[0]) ||
+ (mtls->start.array[1] != mtls->end.array[1]) ||
+ (mtls->start.array[2] != mtls->end.array[2]) ||
+ (mtls->start.array[3] != mtls->end.array[3]);
+
if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
const size_t targetByteChunk = 16 * 1024;
mInForEach = true;
- if (mtls->fep.dim.y > 1) {
+ if (outerDims) {
+ // No fancy logic for chunk size
+ mtls->mSliceSize = 1;
+ launchThreads(walk_general, mtls);
+ } else if (mtls->fep.dim.y > 1) {
uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
@@ -496,21 +565,23 @@
} else {
outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
- for (uint32_t arrayIndex = mtls->arrayStart;
- arrayIndex < mtls->arrayEnd; arrayIndex++) {
+ uint32_t slice = 0;
- for (mtls->fep.current.z = mtls->zStart; mtls->fep.current.z < mtls->zEnd;
- mtls->fep.current.z++) {
- for (mtls->fep.current.y = mtls->yStart; mtls->fep.current.y < mtls->yEnd;
- mtls->fep.current.y++) {
+ while(SelectOuterSlice(mtls, slice++)) {
+ for (mtls->fep.current.y = mtls->start.y;
+ mtls->fep.current.y < mtls->end.y;
+ mtls->fep.current.y++) {
- fepPtrSetup(mtls, &mtls->fep, mtls->xStart, mtls->fep.current.y, mtls->fep.current.z);
+ FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
+ mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
+ (RsAllocationCubemapFace) mtls->fep.current.face,
+ mtls->fep.current.array[0], mtls->fep.current.array[1],
+ mtls->fep.current.array[2], mtls->fep.current.array[3]);
- RsExpandKernelParams kparams;
- kparamSetup(&kparams, &mtls->fep);
- fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.outStride[0]);
- }
+ RsExpandKernelParams kparams;
+ kparamSetup(&kparams, &mtls->fep);
+ fn(&kparams, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
}
}
}
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index a42cef7..ff087b0 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -46,7 +46,7 @@
uint32_t y;
uint32_t z;
uint32_t lod;
- uint32_t faces;
+ uint32_t face;
uint32_t array[4 /*make a define*/];
};
@@ -134,18 +134,8 @@
volatile int mSliceNum;
bool isThreadable;
- // origin of the launch
- RsLaunchDimensions origin;
-
- // TODO: convert to RsLaunchDimensions
- uint32_t xStart;
- uint32_t xEnd;
- uint32_t yStart;
- uint32_t yEnd;
- uint32_t zStart;
- uint32_t zEnd;
- uint32_t arrayStart;
- uint32_t arrayEnd;
+ RsLaunchDimensions start;
+ RsLaunchDimensions end;
};
class RsdCpuReferenceImpl : public RsdCpuReference {
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 8437c99..1636369 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -97,16 +97,17 @@
preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
- forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
- mtls.script = this;
- mtls.fep.slot = slot;
+ if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
+ mtls.script = this;
+ mtls.fep.slot = slot;
- mtls.kernel = (void (*)())mRootPtr;
- mtls.fep.usr = this;
+ mtls.kernel = (void (*)())mRootPtr;
+ mtls.fep.usr = this;
- RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
- mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
- mCtx->setTLS(oldTLS);
+ RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+ mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+ mCtx->setTLS(oldTLS);
+ }
postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
}
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 7d614da..a4e3059 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -880,7 +880,7 @@
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
uint32_t inLen,
Allocation * aout,
const void * usr, uint32_t usrLen,
@@ -898,7 +898,7 @@
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"rsForEach called with null in allocations");
- return;
+ return false;
}
}
@@ -907,7 +907,7 @@
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"rsForEach called with null out allocations");
- return;
+ return false;
}
if (inLen > 0) {
@@ -923,7 +923,7 @@
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"Failed to launch kernel; dimensions of input and output allocations do not match.");
- return;
+ return false;
}
}
@@ -937,7 +937,7 @@
} else {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"rsForEach called with null allocations");
- return;
+ return false;
}
if (inLen > 0 && aout != nullptr) {
@@ -945,49 +945,70 @@
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"Failed to launch kernel; dimensions of input and output allocations do not match.");
- return;
+ return false;
}
}
if (!sc || (sc->xEnd == 0)) {
- mtls->xEnd = mtls->fep.dim.x;
+ mtls->end.x = mtls->fep.dim.x;
} else {
- rsAssert(sc->xStart < mtls->fep.dim.x);
- rsAssert(sc->xEnd <= mtls->fep.dim.x);
- rsAssert(sc->xStart < sc->xEnd);
- mtls->xStart = rsMin(mtls->fep.dim.x, sc->xStart);
- mtls->xEnd = rsMin(mtls->fep.dim.x, sc->xEnd);
- if (mtls->xStart >= mtls->xEnd) return;
+ mtls->start.x = rsMin(mtls->fep.dim.x, sc->xStart);
+ mtls->end.x = rsMin(mtls->fep.dim.x, sc->xEnd);
+ if (mtls->start.x >= mtls->end.x) return false;
}
if (!sc || (sc->yEnd == 0)) {
- mtls->yEnd = mtls->fep.dim.y;
+ mtls->end.y = mtls->fep.dim.y;
} else {
- rsAssert(sc->yStart < mtls->fep.dim.y);
- rsAssert(sc->yEnd <= mtls->fep.dim.y);
- rsAssert(sc->yStart < sc->yEnd);
- mtls->yStart = rsMin(mtls->fep.dim.y, sc->yStart);
- mtls->yEnd = rsMin(mtls->fep.dim.y, sc->yEnd);
- if (mtls->yStart >= mtls->yEnd) return;
+ mtls->start.y = rsMin(mtls->fep.dim.y, sc->yStart);
+ mtls->end.y = rsMin(mtls->fep.dim.y, sc->yEnd);
+ if (mtls->start.y >= mtls->end.y) return false;
}
if (!sc || (sc->zEnd == 0)) {
- mtls->zEnd = mtls->fep.dim.z;
+ mtls->end.z = mtls->fep.dim.z;
} else {
- rsAssert(sc->zStart < mtls->fep.dim.z);
- rsAssert(sc->zEnd <= mtls->fep.dim.z);
- rsAssert(sc->zStart < sc->zEnd);
- mtls->zStart = rsMin(mtls->fep.dim.z, sc->zStart);
- mtls->zEnd = rsMin(mtls->fep.dim.z, sc->zEnd);
- if (mtls->zStart >= mtls->zEnd) return;
+ mtls->start.z = rsMin(mtls->fep.dim.z, sc->zStart);
+ mtls->end.z = rsMin(mtls->fep.dim.z, sc->zEnd);
+ if (mtls->start.z >= mtls->end.z) return false;
}
- mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
- mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
- mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
- mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+ if (!sc || (sc->arrayEnd == 0)) {
+ mtls->end.array[0] = mtls->fep.dim.array[0];
+ } else {
+ mtls->start.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayStart);
+ mtls->end.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayEnd);
+ if (mtls->start.array[0] >= mtls->end.array[0]) return false;
+ }
- rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
+ if (!sc || (sc->array2End == 0)) {
+ mtls->end.array[1] = mtls->fep.dim.array[1];
+ } else {
+ mtls->start.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2Start);
+ mtls->end.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2End);
+ if (mtls->start.array[1] >= mtls->end.array[1]) return false;
+ }
+
+ if (!sc || (sc->array3End == 0)) {
+ mtls->end.array[2] = mtls->fep.dim.array[2];
+ } else {
+ mtls->start.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3Start);
+ mtls->end.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3End);
+ if (mtls->start.array[2] >= mtls->end.array[2]) return false;
+ }
+
+ if (!sc || (sc->array4End == 0)) {
+ mtls->end.array[3] = mtls->fep.dim.array[3];
+ } else {
+ mtls->start.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4Start);
+ mtls->end.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4End);
+ if (mtls->start.array[3] >= mtls->end.array[3]) return false;
+ }
+
+
+ // The X & Y walkers always want 0-1 min even if dim is not present
+ mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
+ mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
mtls->rsc = mCtx;
if (ains) {
@@ -1013,6 +1034,9 @@
mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
}
+
+ // All validation passed, ok to launch threads
+ return true;
}
@@ -1026,12 +1050,13 @@
MTLaunchStruct mtls;
- forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
- forEachKernelSetup(slot, &mtls);
+ if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
+ forEachKernelSetup(slot, &mtls);
- RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
- mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
- mCtx->setTLS(oldTLS);
+ RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+ mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+ mCtx->setTLS(oldTLS);
+ }
}
void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 43be88b..b84abb3 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -182,7 +182,7 @@
const Script * getScript() {return mScript;}
- void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+ bool forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
Allocation * aout, const void * usr, uint32_t usrLen,
const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 3d32a51..166c80d 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -223,13 +223,15 @@
ains = const_cast<const Allocation**>(&ins[ct]);
}
- si->forEachMtlsSetup(ains, inLen, outs[ct], nullptr, 0, nullptr, &mtls);
+ bool launchOK = si->forEachMtlsSetup(ains, inLen, outs[ct], nullptr, 0, nullptr, &mtls);
si->forEachKernelSetup(slot, &mtls);
si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
mtls.fep.usrLen, nullptr);
- mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+ if (launchOK) {
+ mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+ }
si->postLaunch(slot, ains, inLen, outs[ct], nullptr, 0, nullptr);
}
@@ -283,13 +285,14 @@
Script *s = kernels[0]->mScript;
RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
- si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls);
+ if (si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls)) {
- mtls.script = nullptr;
- mtls.kernel = (void (*)())&scriptGroupRoot;
- mtls.fep.usr = &sl;
+ mtls.script = nullptr;
+ mtls.kernel = (void (*)())&scriptGroupRoot;
+ mtls.fep.usr = &sl;
- mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
+ mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
+ }
for (size_t ct=0; ct < kernels.size(); ct++) {
Script *s = kernels[ct]->mScript;
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 9dc4d90..52cd8a0 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -167,16 +167,17 @@
const Closure* closure = cpuClosure->mClosure;
MTLaunchStruct mtls;
- cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
- closure->mArgs.size(),
- closure->mReturnValue,
- nullptr, 0, nullptr, &mtls);
+ if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
+ closure->mArgs.size(),
+ closure->mReturnValue,
+ nullptr, 0, nullptr, &mtls)) {
- mtls.script = nullptr;
- mtls.kernel = (void (*)())&groupRoot;
- mtls.fep.usr = &batch;
+ mtls.script = nullptr;
+ mtls.kernel = (void (*)())&groupRoot;
+ mtls.fep.usr = &batch;
- mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+ mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+ }
for (CPUClosure* cpuClosure : batch) {
const Closure* closure = cpuClosure->mClosure;