Adds support for multi-input kernels to Frameworks/RS.
This patch modifies Frameworks/RS in the following ways:
* Adjusts the data layout of the C/C++ version of RsForEachStubParamStruct to
accommodate a pointer to an array of input allocations and a pointer to an
array of stride sizes for each of these allocations.
* Adds a new code path for Java code to pass multiple allocations to a RS
kernel.
* Packs base pointers and step values for multi-input kernels into the new
RsForEachStubParamStruct members.
Change-Id: I46d2834c37075b2a2407fd8b010546818a4540d1
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 9755b9a..499f890 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -479,6 +479,109 @@
}
}
+// This chooses our slice size to rate limit atomic ops to one per 16k bytes of
+// reads/writes: dim / (4 * (workers + 1)), capped by 16k / stride, minimum 1.
+// The output stride is preferred and inStride is the fallback; when both are
+// zero the byte cap is skipped instead of dividing by zero.
+static uint32_t multiInputSliceSize(uint32_t dim, uint32_t workerCount,
+                                    uint32_t outStride, uint32_t inStride) {
+    const uint32_t targetByteChunk = 16 * 1024;
+    const uint32_t stride = (outStride != 0) ? outStride : inStride;
+    uint32_t slice = dim / ((workerCount + 1) * 4);
+
+    if (stride != 0) {
+        slice = rsMin(slice, targetByteChunk / stride);
+    }
+    return (slice < 1) ? 1 : slice;
+}
+
+/*
+ * Runs mtls->kernel over the launch bounds stored in mtls for a multi-input
+ * launch. When mWorkers.mCount >= 1, the launch is threadable and mInForEach
+ * is not already set, the work goes to launchThreads(wc_xy / wc_x, mtls);
+ * otherwise the kernel is called here once per (array, z, y) index.
+ *
+ * ains, aout and sc are not read here: input base pointers and strides come
+ * from mtls->fep.ptrIns / mtls->fep.inStrides (inLen entries each), and the
+ * output pointer and bounds from the remaining mtls fields.
+ */
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+    // Setup leaves the per-input arrays NULL when it had no inputs or bailed
+    // out early; treat that as zero inputs instead of dereferencing NULL.
+    const bool haveIns = (mtls->fep.ptrIns != NULL) && (mtls->fep.inStrides != NULL);
+    const uint32_t numIns = haveIns ? inLen : 0;
+
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        // NOTE(review): wc_xy/wc_x are defined outside this hunk; confirm they
+        // hand ins/eStrideIns to multi-input kernels.
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            const uint32_t inStride = (numIns > 0) ? mtls->fep.inStrides[0].yStride : 0;
+
+            mtls->mSliceSize = multiInputSliceSize(mtls->fep.dimY, mWorkers.mCount,
+                                                   mtls->fep.yStrideOut, inStride);
+            launchThreads(wc_xy, mtls);
+        } else {
+            const uint32_t inStride = (numIns > 0) ? mtls->fep.inStrides[0].eStride : 0;
+
+            mtls->mSliceSize = multiInputSliceSize(mtls->fep.dimX, mWorkers.mCount,
+                                                   mtls->fep.eStrideOut, inStride);
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+    } else {
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+
+        // Per-launch scratch arrays handed to the kernel through p.
+        p.ins = new const void*[numIns];
+        p.eStrideIns = new uint32_t[numIns];
+
+        for (uint32_t index = 0; index < numIns; ++index) {
+            p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+        }
+
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+
+        // Rows per array slice. This must not be scaled by p.ar[0] here: the
+        // loop below already multiplies by the array index.
+        const uint32_t rowsPerArraySlice = mtls->fep.dimY * mtls->fep.dimZ;
+
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            const uint32_t offset_part = rowsPerArraySlice * p.ar[0];
+
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    const uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
+
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
+                            (mtls->fep.eStrideOut * mtls->xStart);
+
+                    for (uint32_t index = 0; index < numIns; ++index) {
+                        const StridePair &strides = mtls->fep.inStrides[index];
+
+                        p.ins[index] = mtls->fep.ptrIns[index] +
+                                       (strides.yStride * offset) +
+                                       (strides.eStride * mtls->xStart);
+                    }
+
+                    /*
+                     * The fourth argument is zero here because multi-input
+                     * kernels get their stride information from a member of p
+                     * that points to an array.
+                     */
+                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                }
+            }
+        }
+
+        // Release the per-launch scratch arrays.
+        delete[] p.ins;
+        delete[] p.eStrideIns;
+    }
+}
+
RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
//ALOGE("setTls %p", sc);
ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
@@ -595,5 +698,3 @@
}
return sgi;
}
-
-
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index d2b47fb..c54dca2 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -35,6 +35,11 @@
namespace renderscript {
typedef struct {
+ uint32_t eStride; // per-element stride; filled from getType()->getElementSizeBytes()
+ uint32_t yStride; // per-row stride; filled from mHal.drvState.lod[0].stride
+} StridePair;
+
+typedef struct {
const void *in;
void *out;
const void *usr;
@@ -45,6 +50,10 @@
uint32_t lod;
RsAllocationCubemapFace face;
uint32_t ar[16];
+
+ const void **ins;
+ uint32_t *eStrideIns;
+
uint32_t lid;
uint32_t dimX;
@@ -59,6 +68,9 @@
uint32_t yStrideIn;
uint32_t yStrideOut;
uint32_t slot;
+
+ const uint8_t** ptrIns;
+ StridePair* inStrides;
} RsForEachStubParamStruct;
extern bool gArchUseSIMD;
@@ -99,6 +111,9 @@
uint32_t zEnd;
uint32_t arrayStart;
uint32_t arrayEnd;
+
+ // Multi-input data.
+ const Allocation ** ains;
} MTLaunchStruct;
@@ -126,6 +141,9 @@
void launchThreads(const Allocation * ain, Allocation * aout,
const RsScriptCall *sc, MTLaunchStruct *mtls);
+ void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+ const RsScriptCall* sc, MTLaunchStruct* mtls); // multi-input overload; ains has inLen entries
+
virtual CpuScript * createScript(const ScriptC *s,
char const *resName, char const *cacheDir,
uint8_t const *bitcode, size_t bitcodeSize,
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 7195714..d146b76 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -107,6 +107,35 @@
postLaunch(slot, ain, aout, usr, usrLen, sc);
}
+// NOTE(review): inLen is size_t here but uint32_t in rsCpuScript.h/rsd_cpu.h; confirm this overrides on LP64.
+void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
+                                               const Allocation ** ains,
+                                               size_t inLen,
+                                               Allocation * aout,
+                                               const void * usr, uint32_t usrLen,
+                                               const RsScriptCall *sc) {
+    /*
+     * FIXME: Possibly create new preLaunch and postLaunch functions that take
+     * all of the input allocation pointers; for now they only see the first.
+     */
+    // Guard the ains[0] read: pass NULL when there is no input at all.
+    const Allocation *ain0 = (ains != NULL && inLen > 0) ? ains[0] : NULL;
+    MTLaunchStruct mtls;
+
+    preLaunch(slot, ain0, aout, usr, usrLen, sc);
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    mtls.script = this;
+    mtls.fep.slot = slot;
+    mtls.kernel = (void (*)())mRootPtr;
+    mtls.fep.usr = this;
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    // forEachMtlsSetup() new[]-allocates these (or leaves them NULL); free them here.
+    delete[] mtls.fep.ptrIns;
+    delete[] mtls.fep.inStrides;
+    postLaunch(slot, ain0, aout, usr, usrLen, sc);
+}
+
void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
mtls->script = this;
@@ -114,6 +143,3 @@
mtls->kernel = (void (*)())mRootPtr;
mtls->fep.usr = this;
}
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 1cf889c..85e2ddc 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -36,6 +36,15 @@
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc);
+ // Multi-input forEach. NOTE(review): inLen is size_t here but uint32_t in rsCpuScript.h/rsd_cpu.h - confirm.
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation ** ain,
+ size_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc);
+
virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
virtual void invokeInit();
virtual void invokeFreeChildren();
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index cf7b377..b9f8aba 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -806,21 +806,34 @@
return;
}
- if (ain) {
- mtls->fep.dimX = ain->getType()->getDimX();
- mtls->fep.dimY = ain->getType()->getDimY();
- mtls->fep.dimZ = ain->getType()->getDimZ();
- //mtls->dimArray = ain->getType()->getDimArray();
- } else if (aout) {
- mtls->fep.dimX = aout->getType()->getDimX();
- mtls->fep.dimY = aout->getType()->getDimY();
- mtls->fep.dimZ = aout->getType()->getDimZ();
- //mtls->dimArray = aout->getType()->getDimArray();
+ if (ain != NULL) {
+ const Type *inType = ain->getType();
+
+ mtls->fep.dimX = inType->getDimX();
+ mtls->fep.dimY = inType->getDimY();
+ mtls->fep.dimZ = inType->getDimZ();
+
+ } else if (aout != NULL) {
+ const Type *outType = aout->getType();
+
+ mtls->fep.dimX = outType->getDimX();
+ mtls->fep.dimY = outType->getDimY();
+ mtls->fep.dimZ = outType->getDimZ();
+
} else {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
return;
}
+ if (ain != NULL && aout != NULL) {
+ if (!ain->hasSameDims(aout)) {
+ mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+ "Failed to launch kernel; dimensions of input and output allocations do not match.");
+
+ return;
+ }
+ }
+
if (!sc || (sc->xEnd == 0)) {
mtls->xEnd = mtls->fep.dimX;
} else {
@@ -888,6 +901,147 @@
}
}
+/*
+ * Multi-input variant of forEachMtlsSetup(): zeroes *mtls, validates the
+ * allocations, takes the launch dimensions from the first input (or from aout
+ * when there are no inputs), applies the bounds in sc and records base
+ * pointers and strides. Every early return happens before fep.ptrIns and
+ * fep.inStrides are allocated, so both are NULL then; otherwise, when inputs
+ * exist, both are new[] arrays of inLen entries that this function never frees.
+ */
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+                                        Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+
+    // An empty input list is treated as "no inputs" so ains[0] is never read out of bounds.
+    if (inLen == 0) {
+        ains = NULL;
+    }
+
+    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+    if (ains != NULL) {
+        for (uint32_t index = 0; index < inLen; ++index) {
+            const Allocation* ain = ains[index];
+            // A NULL entry is rejected too: every input is dereferenced below.
+            if (ain == NULL || (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+                return;
+            }
+        }
+    }
+
+    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        return;
+    }
+
+    if (ains != NULL) {
+        const Allocation *ain0 = ains[0];
+        const Type *inType = ain0->getType();
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+        for (uint32_t index = 1; index < inLen; ++index) {
+            if (!ain0->hasSameDims(ains[index])) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                  "Failed to launch kernel; dimensions of input allocations do not match.");
+                return;
+            }
+        }
+    } else if (aout != NULL) {
+        const Type *outType = aout->getType();
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+    } else {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+
+    if (ains != NULL && aout != NULL && !ains[0]->hasSameDims(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+          "Failed to launch kernel; dimensions of input and output allocations do not match.");
+        return;
+    }
+
+    if (!sc || (sc->xEnd == 0)) {
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;
+    }
+
+    if (!sc || (sc->yEnd == 0)) {
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    }
+
+    if (!sc || (sc->zEnd == 0)) {
+        mtls->zEnd = mtls->fep.dimZ;
+    } else {
+        rsAssert(sc->zStart < mtls->fep.dimZ);
+        rsAssert(sc->zEnd <= mtls->fep.dimZ);
+        rsAssert(sc->zStart < sc->zEnd);
+        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
+        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+        if (mtls->zStart >= mtls->zEnd) return;
+    }
+
+    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
+    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+    rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
+
+    mtls->rsc = mCtx;
+    mtls->ains = ains;
+    mtls->aout = aout;
+    mtls->fep.usr = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum = 0;
+
+    mtls->fep.ptrIns = NULL;
+    mtls->fep.eStrideIn = 0;
+    mtls->isThreadable = mIsThreadable;
+
+    if (ains) {
+        mtls->fep.ptrIns = new const uint8_t*[inLen];
+        mtls->fep.inStrides = new StridePair[inLen];
+        for (uint32_t index = 0; index < inLen; ++index) {
+            const Allocation *ain = ains[index];
+            mtls->fep.ptrIns[index] =
+              (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
+            mtls->fep.inStrides[index].eStride =
+              ain->getType()->getElementSizeBytes();
+            mtls->fep.inStrides[index].yStride =
+              ain->mHal.drvState.lod[0].stride;
+        }
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
const Allocation * ain,
@@ -905,6 +1059,24 @@
mCtx->setTLS(oldTLS);
}
+// Launches kernel `slot` with the inLen input allocations in ains and the optional output aout.
+void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          const void * usr, uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+    MTLaunchStruct mtls;
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    forEachKernelSetup(slot, &mtls);
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    // forEachMtlsSetup() new[]-allocates these (or leaves them NULL); free them here.
+    delete[] mtls.fep.ptrIns;
+    delete[] mtls.fep.inStrides;
+}
+
void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
mtls->script = this;
mtls->fep.slot = slot;
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 666379d..f4ca1ed 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -76,6 +76,14 @@
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc);
+ // Multi-input counterpart of invokeForEach(): ains holds inLen input allocations.
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation** ains,
+ uint32_t inLen,
+ Allocation* aout,
+ const void* usr,
+ uint32_t usrLen,
+ const RsScriptCall* sc);
virtual void invokeInit();
virtual void invokeFreeChildren();
@@ -95,6 +103,11 @@
void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
const void * usr, uint32_t usrLen,
const RsScriptCall *sc, MTLaunchStruct *mtls);
+ // Multi-input overload: fills *mtls from the inLen input allocations in ains and the optional aout.
+ void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+ Allocation * aout, const void * usr, uint32_t usrLen,
+ const RsScriptCall *sc, MTLaunchStruct *mtls);
+
virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 675bb97..0076cb9 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -75,6 +75,15 @@
const void * usr,
uint32_t usrLen,
const RsScriptCall *sc) = 0;
+ // Multi-input counterpart of invokeForEach(): ains holds inLen input allocations.
+ virtual void invokeForEachMulti(uint32_t slot,
+ const Allocation** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ const void * usr,
+ uint32_t usrLen,
+ const RsScriptCall *sc) = 0;
+
virtual void invokeInit() = 0;
virtual void invokeFreeChildren() = 0;