Adds support for multi-input kernels to Frameworks/RS.

This patch modifies Frameworks/RS in the following ways:
* Adjusts the data layout of the C/C++ version of RsForEachStubParamStruct to
  accommodate a pointer to an array of input allocations and a pointer to an
  array of stride sizes for each of these allocations.
* Adds a new code path for Java code to pass multiple allocations to a RS
  kernel.
* Packs base pointers and step values for multi-input kernels into the new
  RsForEachStubParamStruct members.

Change-Id: I46d2834c37075b2a2407fd8b010546818a4540d1
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 9755b9a..499f890 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -479,6 +479,109 @@
     }
 }
 
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+    // Runs a multi-input kernel over the range described by mtls: slices the
+    // work and hands it to launchThreads(wc_xy/wc_x, mtls) when the launch is
+    // threadable, otherwise calls the kernel inline. ains/aout/sc are not read.
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
+        // The multi-input setup leaves fep.yStrideIn/eStrideIn at zero, so when
+        // there is no output stride fall back to the first input's strides.
+        const StridePair *in0 = (inLen > 0) ? mtls->fep.inStrides : NULL;
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+            uint32_t rowBytes = mtls->fep.yStrideOut;
+            if (rowBytes == 0 && in0 != NULL) {
+                rowBytes = in0->yStride;
+            }
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes. Without a usable stride
+            // only the per-worker split s1 applies (avoids dividing by zero).
+            uint32_t s2 = (rowBytes != 0) ? (uint32_t)(targetByteChunk / rowBytes) : s1;
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if (mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+            launchThreads(wc_xy, mtls);
+        } else {
+            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t elemBytes = mtls->fep.eStrideOut;
+            if (elemBytes == 0 && in0 != NULL) {
+                elemBytes = in0->eStride;
+            }
+
+            // Same 16k rate limit as above, measured in elements instead of rows.
+            uint32_t s2 = (elemBytes != 0) ? (uint32_t)(targetByteChunk / elemBytes) : s1;
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if (mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+    } else {
+        // A setup that bailed out leaves the input arrays NULL: nothing to run.
+        if (inLen > 0 && (mtls->fep.ptrIns == NULL || mtls->fep.inStrides == NULL)) {
+            return;
+        }
+
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+
+        // Per-launch arrays handed to the kernel through p: one base pointer
+        // and one element stride per input.
+        p.ins        = new const void*[inLen];
+        p.eStrideIns = new uint32_t[inLen];
+
+        for (uint32_t index = 0; index < inLen; ++index) {
+            p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+        }
+
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        // Rows in one array slice; scaled by ar[0] inside the loop, not here.
+        const uint32_t rowsPerSlice = mtls->fep.dimY * mtls->fep.dimZ;
+
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            const uint32_t sliceRow = rowsPerSlice * p.ar[0];
+
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    const uint32_t row = sliceRow + mtls->fep.dimY * p.z + p.y;
+
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * row) +
+                            (mtls->fep.eStrideOut * mtls->xStart);
+
+                    for (uint32_t index = 0; index < inLen; ++index) {
+                        const StridePair &strides = mtls->fep.inStrides[index];
+
+                        p.ins[index] = mtls->fep.ptrIns[index] +
+                                       (strides.yStride * row) +
+                                       (strides.eStride * mtls->xStart);
+                    }
+
+                    /*
+                     * The fourth argument is zero here because multi-input
+                     * kernels get their stride information from a member of p
+                     * that points to an array.
+                     */
+                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                }
+            }
+        }
+
+        // Free our arrays.
+        delete[] p.ins;
+        delete[] p.eStrideIns;
+    }
+}
+
 RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
     //ALOGE("setTls %p", sc);
     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
@@ -595,5 +698,3 @@
     }
     return sgi;
 }
-
-
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index d2b47fb..c54dca2 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -35,6 +35,11 @@
 namespace renderscript {
 
 typedef struct {
+  uint32_t eStride;  // Bytes per element (from Type::getElementSizeBytes()).
+  uint32_t yStride;  // Bytes between rows (from drvState.lod[0].stride).
+} StridePair;
+
+typedef struct {
     const void *in;
     void *out;
     const void *usr;
@@ -45,6 +50,10 @@
     uint32_t lod;
     RsAllocationCubemapFace face;
     uint32_t ar[16];
+
+    const void **ins;
+    uint32_t *eStrideIns;
+
     uint32_t lid;
 
     uint32_t dimX;
@@ -59,6 +68,9 @@
     uint32_t yStrideIn;
     uint32_t yStrideOut;
     uint32_t slot;
+
+    const uint8_t** ptrIns;
+    StridePair* inStrides;
 } RsForEachStubParamStruct;
 
 extern bool gArchUseSIMD;
@@ -99,6 +111,9 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
+
+    // Multi-input data.
+    const Allocation ** ains;
 } MTLaunchStruct;
 
 
@@ -126,6 +141,9 @@
     void launchThreads(const Allocation * ain, Allocation * aout,
                        const RsScriptCall *sc, MTLaunchStruct *mtls);
 
+    void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                       const RsScriptCall* sc, MTLaunchStruct* mtls);
+
     virtual CpuScript * createScript(const ScriptC *s,
                                      char const *resName, char const *cacheDir,
                                      uint8_t const *bitcode, size_t bitcodeSize,
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 7195714..d146b76 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -107,6 +107,35 @@
     postLaunch(slot, ain, aout, usr, usrLen, sc);
 }
 
+void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
+                                               const Allocation ** ains,
+                                               uint32_t inLen,
+                                               Allocation * aout,
+                                               const void * usr,
+                                               uint32_t usrLen,
+                                               const RsScriptCall *sc) {
+    // Multi-input launch for intrinsics. preLaunch/postLaunch only accept a
+    // single input, so they see the first one (or NULL when there is none).
+    // FIXME: Possibly create new preLaunch and postLaunch functions that take
+    //        all of the input allocation pointers.
+    const Allocation *ain0 = (ains != NULL && inLen > 0) ? ains[0] : NULL;
+    MTLaunchStruct mtls;
+
+    preLaunch(slot, ain0, aout, usr, usrLen, sc);
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    mtls.script = this;
+    mtls.fep.slot = slot;
+    mtls.kernel = (void (*)())mRootPtr;
+    mtls.fep.usr = this;
+
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    delete[] mtls.fep.ptrIns;     // Both arrays come from forEachMtlsSetup and
+    delete[] mtls.fep.inStrides;  // are still NULL if it allocated nothing.
+    postLaunch(slot, ain0, aout, usr, usrLen, sc);
+}
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
 
     mtls->script = this;
@@ -114,6 +143,3 @@
     mtls->kernel = (void (*)())mRootPtr;
     mtls->fep.usr = this;
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 1cf889c..85e2ddc 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -36,6 +36,15 @@
                        const void * usr,
                        uint32_t usrLen,
                        const RsScriptCall *sc);
+
+    virtual void invokeForEachMulti(uint32_t slot,
+                       const Allocation ** ains,
+                       uint32_t inLen,  // uint32_t to match the base-class virtual
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index cf7b377..b9f8aba 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -806,21 +806,34 @@
         return;
     }
 
-    if (ain) {
-        mtls->fep.dimX = ain->getType()->getDimX();
-        mtls->fep.dimY = ain->getType()->getDimY();
-        mtls->fep.dimZ = ain->getType()->getDimZ();
-        //mtls->dimArray = ain->getType()->getDimArray();
-    } else if (aout) {
-        mtls->fep.dimX = aout->getType()->getDimX();
-        mtls->fep.dimY = aout->getType()->getDimY();
-        mtls->fep.dimZ = aout->getType()->getDimZ();
-        //mtls->dimArray = aout->getType()->getDimArray();
+    if (ain != NULL) {
+        const Type *inType = ain->getType();
+
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+    } else if (aout != NULL) {
+        const Type *outType = aout->getType();
+
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+
     } else {
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
         return;
     }
 
+    if (ain != NULL && aout != NULL) {
+        if (!ain->hasSameDims(aout)) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+              "Failed to launch kernel; dimensions of input and output allocations do not match.");
+
+            return;
+        }
+    }
+
     if (!sc || (sc->xEnd == 0)) {
         mtls->xEnd = mtls->fep.dimX;
     } else {
@@ -888,6 +901,147 @@
     }
 }
 
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+                                        Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    // Fills *mtls for a multi-input launch: checks the allocations, derives the
+    // launch bounds from sc and records base pointers and strides. Returns early,
+    // with *mtls only partly filled, on a setError or an empty requested range.
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+
+    // An empty input list is treated the same as no input list at all.
+    const bool haveIns = (ains != NULL) && (inLen > 0);
+
+    if (haveIns) {
+        for (uint32_t index = 0; index < inLen; ++index) {
+            const Allocation* ain = ains[index];
+
+            // A NULL backing store is possible for IO_OUTPUT/IO_INPUT with no
+            // bound surface; NULL entries are rejected as they are used below.
+            if (ain == NULL || (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+                return;
+            }
+        }
+    }
+
+    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        return;
+    }
+
+    if (haveIns) {
+        const Allocation *ain0   = ains[0];
+        const Type       *inType = ain0->getType();
+
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+        for (uint32_t index = 1; index < inLen; ++index) {
+            if (!ain0->hasSameDims(ains[index])) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                  "Failed to launch kernel; dimensions of input allocations do not match.");
+                return;
+            }
+        }
+    } else if (aout != NULL) {
+        const Type *outType = aout->getType();
+
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+    } else {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+
+    if (haveIns && aout != NULL && !ains[0]->hasSameDims(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+          "Failed to launch kernel; dimensions of input and output allocations do not match.");
+        return;
+    }
+
+    if (!sc || (sc->xEnd == 0)) {
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;
+    }
+
+    if (!sc || (sc->yEnd == 0)) {
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    }
+
+    if (!sc || (sc->zEnd == 0)) {
+        mtls->zEnd = mtls->fep.dimZ;
+    } else {
+        rsAssert(sc->zStart < mtls->fep.dimZ);
+        rsAssert(sc->zEnd <= mtls->fep.dimZ);
+        rsAssert(sc->zStart < sc->zEnd);
+        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
+        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+        if (mtls->zStart >= mtls->zEnd) return;
+    }
+
+    mtls->xEnd     = rsMax((uint32_t)1, mtls->xEnd);
+    mtls->yEnd     = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+
+    rsAssert(!haveIns || (ains[0]->getType()->getDimZ() == 0));
+
+    mtls->rsc        = mCtx;
+    mtls->ains       = ains;
+    mtls->aout       = aout;
+    mtls->fep.usr    = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum  = 0;
+
+    mtls->fep.ptrIns    = NULL;
+    mtls->fep.eStrideIn = 0;
+    mtls->isThreadable  = mIsThreadable;
+
+    if (haveIns) {
+        mtls->fep.ptrIns    = new const uint8_t*[inLen];
+        mtls->fep.inStrides = new StridePair[inLen];
+
+        for (uint32_t index = 0; index < inLen; ++index) {
+            const Allocation *ain = ains[index];
+
+            mtls->fep.ptrIns[index] =
+              (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
+
+            mtls->fep.inStrides[index].eStride =
+              ain->getType()->getElementSizeBytes();
+            mtls->fep.inStrides[index].yStride =
+              ain->mHal.drvState.lod[0].stride;
+        }
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
                                      const Allocation * ain,
@@ -905,6 +1059,24 @@
     mCtx->setTLS(oldTLS);
 }
 
+void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          const void * usr,
+                                          uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+    // Multi-input counterpart of invokeForEach: set up, launch, then clean up.
+    MTLaunchStruct mtls;
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    forEachKernelSetup(slot, &mtls);
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    delete[] mtls.fep.ptrIns;     // Both arrays come from forEachMtlsSetup and
+    delete[] mtls.fep.inStrides;  // are still NULL if it allocated nothing.
+}
+
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
     mtls->script = this;
     mtls->fep.slot = slot;
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 666379d..f4ca1ed 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -76,6 +76,14 @@
                        const void * usr,
                        uint32_t usrLen,
                        const RsScriptCall *sc);
+
+    virtual void invokeForEachMulti(uint32_t slot,
+                                     const Allocation** ains,
+                                     uint32_t inLen,
+                                     Allocation* aout,
+                                     const void* usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall* sc);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -95,6 +103,11 @@
     void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
                           const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
+
+    void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+                          Allocation * aout, const void * usr, uint32_t usrLen,
+                          const RsScriptCall *sc, MTLaunchStruct *mtls);
+
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
 
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 675bb97..0076cb9 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -75,6 +75,15 @@
                            const void * usr,
                            uint32_t usrLen,
                            const RsScriptCall *sc) = 0;
+
+        virtual void invokeForEachMulti(uint32_t slot,
+                                         const Allocation** ains,
+                                         uint32_t inLen,
+                                         Allocation * aout,
+                                         const void * usr,
+                                         uint32_t usrLen,
+                                         const RsScriptCall *sc) = 0;  // multi-input invokeForEach
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;