Add a basic implementation of the reduce kernel API to the CPU
reference implementation.
Bug: 22631253
For now, this just runs a serial reduction on one thread.
Change-Id: I34c96d24bb6f44274de72bb53160abcf79d143b0
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 09e7ab7..5adca54 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -50,6 +50,12 @@
static const bool kDebugGlobalVariables = false;
+static bool allocationLODIsNull(const android::renderscript::Allocation *alloc) {  // True only when alloc is non-null and its lod[0].mallocPtr is null.
+ // A null alloc yields false. Even if alloc != nullptr, mallocPtr could be null if
+ // IO_OUTPUT/IO_INPUT with no bound surface.
+ return alloc && alloc->mHal.drvState.lod[0].mallocPtr == nullptr;
+}
+
#ifndef RS_COMPATIBILITY_LIB
static bool is_force_recompile() {
@@ -282,11 +288,11 @@
if (mRootExpand) {
//ALOGE("Found root.expand(): %p", mRootExpand);
}
- mInit = (InvokeFunc_t) dlsym(mScriptSO, "init");
+ mInit = (InitOrDtorFunc_t) dlsym(mScriptSO, "init");
if (mInit) {
//ALOGE("Found init(): %p", mInit);
}
- mFreeChildren = (InvokeFunc_t) dlsym(mScriptSO, ".rs.dtor");
+ mFreeChildren = (InitOrDtorFunc_t) dlsym(mScriptSO, ".rs.dtor");
if (mFreeChildren) {
//ALOGE("Found .rs.dtor(): %p", mFreeChildren);
}
@@ -490,6 +496,8 @@
void RsdCpuScriptImpl::populateScript(Script *script) {
// Copy info over to runtime
script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
+ script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
+ script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
script->mHal.info.exportedPragmaKeyList = mScriptExec->getPragmaKeys();
@@ -503,32 +511,105 @@
}
}
+// Set up the launch dimensions, and write the values of the launch
+// dimensions into the mtls start/end fields (x, y, z and array[0..3]).
+//
+// Inputs:
+// mtls - launch struct to fill in (asserted non-null); baseDim - base shape of the input
+// sc - used to constrain the launch dimensions; may be null
+// With no sc, or an sc End field of 0, end = baseDim and start is left unwritten.
+// Returns:
+// True on success; false, after setting RS_ERROR_BAD_SCRIPT, if clamped start >= clamped end
+bool RsdCpuScriptImpl::setUpMtlsDimensions(MTLaunchStructCommon *mtls,
+ const RsLaunchDimensions &baseDim,
+ const RsScriptCall *sc) {
+ rsAssert(mtls);
+#define SET_UP_DIMENSION(DIM_FIELD, SC_FIELD) do { \
+ if (!sc || (sc->SC_FIELD##End == 0)) { /* unconstrained: use the full base extent */ \
+ mtls->end.DIM_FIELD = baseDim.DIM_FIELD; \
+ } else { /* constrained: clamp both bounds to the base extent */ \
+ mtls->start.DIM_FIELD = \
+ rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##Start); \
+ mtls->end.DIM_FIELD = \
+ rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##End); \
+ if (mtls->start.DIM_FIELD >= mtls->end.DIM_FIELD) { /* empty or inverted range */ \
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, \
+ "Failed to launch kernel; Invalid " \
+ #SC_FIELD "Start or " #SC_FIELD "End."); \
+ return false; /* exits setUpMtlsDimensions itself, not just the macro body */ \
+ } \
+ }} while(0)
+
+ SET_UP_DIMENSION(x, x);  // reads sc->xStart / sc->xEnd
+ SET_UP_DIMENSION(y, y);  // reads sc->yStart / sc->yEnd
+ SET_UP_DIMENSION(z, z);  // reads sc->zStart / sc->zEnd
+ SET_UP_DIMENSION(array[0], array);  // reads sc->arrayStart / sc->arrayEnd
+ SET_UP_DIMENSION(array[1], array2);  // reads sc->array2Start / sc->array2End
+ SET_UP_DIMENSION(array[2], array3);  // reads sc->array3Start / sc->array3End
+ SET_UP_DIMENSION(array[3], array4);  // reads sc->array4Start / sc->array4End
+#undef SET_UP_DIMENSION  // the macro is only meant for use inside this function
+
+ return true;
+}
+
+// Preliminary work to prepare a reduce-style kernel for launch. Returns true on success; false (with an error set on the context) if an allocation has a null lod[0].mallocPtr or the launch dimensions are invalid.
+bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,  // input allocation (asserted non-null)
+ const Allocation *aout,  // output allocation (asserted non-null)
+ const RsScriptCall *sc,  // optional launch constraints, forwarded to setUpMtlsDimensions
+ MTLaunchStructReduce *mtls) {  // launch struct: zeroed, then filled in below
+ rsAssert(ain && aout);
+ memset(mtls, 0, sizeof(MTLaunchStructReduce));  // every field not assigned below stays zero
+ mtls->dimPtr = &mtls->inputDim;  // dimPtr refers to the inputDim member of this same struct
+
+ if (allocationLODIsNull(ain) || allocationLODIsNull(aout)) {  // no backing memory for one of the allocations
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+ "reduce called with a null allocation");
+ return false;
+ }
+
+ // Set up the dimensions of the input; only the X dimension is read.
+ const Type *inType = ain->getType();
+ mtls->inputDim.x = inType->getDimX();
+ rsAssert(inType->getDimY() == 0);  // the input is expected to have no Y dimension
+
+ if (!setUpMtlsDimensions(mtls, mtls->inputDim, sc)) {  // the helper has already reported the error
+ return false;
+ }
+
+ mtls->rs = mCtx;
+ // Currently not threaded.
+ mtls->isThreadable = false;
+ mtls->mSliceNum = -1;  // NOTE(review): -1 presumably marks that no slice is assigned - confirm
+
+ // Set up input and output buffer pointers from the two allocations.
+ mtls->inBuf = static_cast<uint8_t *>(ain->getPointerUnchecked(0, 0));
+ mtls->outBuf = static_cast<uint8_t *>(aout->getPointerUnchecked(0, 0));
+
+ rsAssert(mtls->inBuf && mtls->outBuf);  // both buffers are expected to be non-null at this point
+
+ return true;
+}
+
+// Preliminary work to prepare a forEach-style kernel for launch.
bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
uint32_t inLen,
Allocation * aout,
const void * usr, uint32_t usrLen,
const RsScriptCall *sc,
- MTLaunchStruct *mtls) {
-
- memset(mtls, 0, sizeof(MTLaunchStruct));
+ MTLaunchStructForEach *mtls) {
+ memset(mtls, 0, sizeof(MTLaunchStructForEach));
+ mtls->dimPtr = &mtls->fep.dim;
for (int index = inLen; --index >= 0;) {
- const Allocation* ain = ains[index];
-
- // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
- if (ain != nullptr &&
- (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == nullptr) {
-
+ if (allocationLODIsNull(ains[index])) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"rsForEach called with null in allocations");
return false;
}
}
- if (aout &&
- (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == nullptr) {
-
+ if (allocationLODIsNull(aout)) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
"rsForEach called with null out allocations");
return false;
@@ -578,96 +659,14 @@
}
}
- if (!sc || (sc->xEnd == 0)) {
- mtls->end.x = mtls->fep.dim.x;
- } else {
- mtls->start.x = rsMin(mtls->fep.dim.x, sc->xStart);
- mtls->end.x = rsMin(mtls->fep.dim.x, sc->xEnd);
- if (mtls->start.x >= mtls->end.x) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid xStart or xEnd.");
- return false;
- }
+ if (!setUpMtlsDimensions(mtls, mtls->fep.dim, sc)) {
+ return false;
}
- if (!sc || (sc->yEnd == 0)) {
- mtls->end.y = mtls->fep.dim.y;
- } else {
- mtls->start.y = rsMin(mtls->fep.dim.y, sc->yStart);
- mtls->end.y = rsMin(mtls->fep.dim.y, sc->yEnd);
- if (mtls->start.y >= mtls->end.y) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid yStart or yEnd.");
- return false;
- }
- }
-
- if (!sc || (sc->zEnd == 0)) {
- mtls->end.z = mtls->fep.dim.z;
- } else {
- mtls->start.z = rsMin(mtls->fep.dim.z, sc->zStart);
- mtls->end.z = rsMin(mtls->fep.dim.z, sc->zEnd);
- if (mtls->start.z >= mtls->end.z) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid zStart or zEnd.");
- return false;
- }
- }
-
- if (!sc || (sc->arrayEnd == 0)) {
- mtls->end.array[0] = mtls->fep.dim.array[0];
- } else {
- mtls->start.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayStart);
- mtls->end.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayEnd);
- if (mtls->start.array[0] >= mtls->end.array[0]) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid arrayStart or arrayEnd.");
- return false;
- }
- }
-
- if (!sc || (sc->array2End == 0)) {
- mtls->end.array[1] = mtls->fep.dim.array[1];
- } else {
- mtls->start.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2Start);
- mtls->end.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2End);
- if (mtls->start.array[1] >= mtls->end.array[1]) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid array2Start or array2End.");
- return false;
- }
- }
-
- if (!sc || (sc->array3End == 0)) {
- mtls->end.array[2] = mtls->fep.dim.array[2];
- } else {
- mtls->start.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3Start);
- mtls->end.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3End);
- if (mtls->start.array[2] >= mtls->end.array[2]) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid array3Start or array3End.");
- return false;
- }
- }
-
- if (!sc || (sc->array4End == 0)) {
- mtls->end.array[3] = mtls->fep.dim.array[3];
- } else {
- mtls->start.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4Start);
- mtls->end.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4End);
- if (mtls->start.array[3] >= mtls->end.array[3]) {
- mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; Invalid array4Start or array4End.");
- return false;
- }
- }
-
-
// The X & Y walkers always want 0-1 min even if dim is not present
mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
-
- mtls->rsc = mCtx;
+ mtls->rs = mCtx;
if (ains) {
memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
}
@@ -705,18 +704,32 @@
uint32_t usrLen,
const RsScriptCall *sc) {
- MTLaunchStruct mtls;
+ MTLaunchStructForEach mtls;
if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
forEachKernelSetup(slot, &mtls);
RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
- mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+ mCtx->launchForEach(ains, inLen, aout, sc, &mtls);
mCtx->setTLS(oldTLS);
}
}
-void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+void RsdCpuScriptImpl::invokeReduce(uint32_t slot,  // Sets up and launches the reduce kernel for this slot.
+ const Allocation *ain,  // input allocation
+ Allocation *aout,  // output allocation
+ const RsScriptCall *sc) {  // optional launch constraints, passed to reduceMtlsSetup
+ MTLaunchStructReduce mtls;  // fully initialized by reduceMtlsSetup (it zeroes the struct first)
+
+ if (reduceMtlsSetup(ain, aout, sc, &mtls)) {  // on setup failure nothing is launched
+ reduceKernelSetup(slot, &mtls);
+ RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);  // make this script the TLS script for the launch
+ mCtx->launchReduce(ain, aout, &mtls);
+ mCtx->setTLS(oldTLS);  // restore the previously installed script
+ }
+}
+
+void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
mtls->script = this;
mtls->fep.slot = slot;
mtls->kernel = mScriptExec->getForEachFunction(slot);
@@ -724,6 +737,12 @@
mtls->sig = mScriptExec->getForEachSignature(slot);
}
+void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {  // Binds this script and the reduce function for slot into mtls.
+ mtls->script = this;
+ mtls->kernel = mScriptExec->getReduceFunction(slot);  // looked up by slot in the script executable
+ rsAssert(mtls->kernel != nullptr);  // a missing reduce function for this slot trips the assert
+}
+
int RsdCpuScriptImpl::invokeRoot() {
RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
int ret = mRoot();