Revert "Collapse code paths for single- and multi-input kernels."

This reverts commit 818cfa034e257c7bb48356257f5cb67334e19aa6.

Change-Id: I59f39f52e6c8f60bb01cbcb8ccf2215eaf46a57f
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 729e702..aeb75a6 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -91,7 +91,7 @@
 endif
 include frameworks/compile/libbcc/libbcc-targets.mk
 
-LOCAL_CFLAGS += $(rs_base_CFLAGS) -std=c++11
+LOCAL_CFLAGS += $(rs_base_CFLAGS)
 
 LOCAL_MODULE_TAGS := optional
 
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index db3cc7f..a0564fc 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,134 +350,96 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
-                            RsExpandKernelParams&,
-                            outer_foreach_t);
 
-
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
+static void wc_xy(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
 
-    uint32_t inLen = mtls->fep.inLen;
-
     RsExpandKernelParams kparams;
     kparams.takeFields(mtls->fep);
 
     // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
     kparams.lid = idx;
 
-    if (inLen > 0) {
-        // Allocate space for our input base pointers.
-        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-        // Allocate space for our input stride information.
-        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+        yEnd = rsMin(yEnd, mtls->yEnd);
 
-        // Fill our stride information.
-        for (int inIndex = inLen; --inIndex >= 0;) {
-          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+        if (yEnd <= yStart) {
+            return;
+        }
+
+        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+        for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+            kparams.out = mtls->fep.ptrOut +
+                          (mtls->fep.yStrideOut * kparams.y) +
+                          (mtls->fep.eStrideOut * mtls->xStart);
+
+            kparams.in = mtls->fep.ptrIn +
+                         (mtls->fep.yStrideIn * kparams.y) +
+                         (mtls->fep.eStrideIn * mtls->xStart);
+
+
+            fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+               mtls->fep.eStrideOut);
         }
     }
+}
+
+static void wc_x(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
+
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xEnd   = xStart + mtls->mSliceSize;
 
-    walk_loop(mtls, kparams, fn);
-}
+        xEnd = rsMin(xEnd, mtls->xEnd);
 
-static void walk_2d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
-
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-            uint32_t yEnd   = yStart + mtls->mSliceSize;
-
-            yEnd = rsMin(yEnd, mtls->yEnd);
-
-            if (yEnd <= yStart) {
-                return;
-            }
-
-            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
-                kparams.out = mtls->fep.outPtr +
-                              (mtls->fep.outStride.yStride * kparams.y) +
-                              (mtls->fep.outStride.eStride * mtls->xStart);
-
-                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                    StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                    kparams.ins[inIndex] =
-                      mtls->fep.inPtrs[inIndex] +
-                      (strides.yStride * kparams.y) +
-                      (strides.eStride * mtls->xStart);
-                }
-
-                // Kernels now get their input strides from kparams.
-                fn(&kparams, mtls->xStart, mtls->xEnd, 0,
-                   mtls->fep.outStride.eStride);
-            }
+        if (xEnd <= xStart) {
+            return;
         }
-    });
+
+        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+        kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        kparams.in  = mtls->fep.ptrIn  + (mtls->fep.eStrideIn  * xStart);
+
+        fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    }
 }
 
-static void walk_1d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
-
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-            uint32_t xEnd   = xStart + mtls->mSliceSize;
-
-            xEnd = rsMin(xEnd, mtls->xEnd);
-
-            if (xEnd <= xStart) {
-                return;
-            }
-
-            kparams.out = mtls->fep.outPtr +
-                          (mtls->fep.outStride.eStride * xStart);
-
-            for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                kparams.ins[inIndex] =
-                  mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
-            }
-
-            // Kernels now get their input strides from kparams.
-            fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
-        }
-    });
-}
-
-
-void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
-                                        uint32_t inLen,
-                                        Allocation* aout,
-                                        const RsScriptCall* sc,
-                                        MTLaunchStruct* mtls) {
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+                                        const RsScriptCall *sc, MTLaunchStruct *mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
-
         if (mtls->fep.dimY > 1) {
             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.yStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.yStride;
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
             } else {
-                // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -485,18 +447,102 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_2d, mtls);
+         //   mtls->mSliceSize = 2;
+            launchThreads(wc_xy, mtls);
         } else {
             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.eStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.eStride;
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
             } else {
-                // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+
+        //ALOGE("launch 1");
+    } else {
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
+
+        //ALOGE("launch 3");
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
+
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
+
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
+
+                    uint32_t offset =
+                      kparams.dimY * kparams.dimZ * arrayIndex +
+                      kparams.dimY * kparams.z + kparams.y;
+
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
+
+                    kparams.in = mtls->fep.ptrIn +
+                                 (mtls->fep.yStrideIn * offset) +
+                                 (mtls->fep.eStrideIn * mtls->xStart);
+
+                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+                       mtls->fep.eStrideOut);
+                }
+            }
+        }
+    }
+}
+
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+
+    //android::StopWatch kernel_time("kernel time");
+
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+         //   mtls->mSliceSize = 2;
+            launchThreads(wc_xy, mtls);
+        } else {
+            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -504,26 +550,24 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(walk_1d, mtls);
+            launchThreads(wc_x, mtls);
         }
         mInForEach = false;
 
+        //ALOGE("launch 1");
     } else {
         RsExpandKernelParams kparams;
         kparams.takeFields(mtls->fep);
 
-        if (inLen > 0) {
-            // Allocate space for our input base pointers.
-            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+        // Allocate space for our input base pointers.
+        kparams.ins = new const void*[inLen];
 
-            // Allocate space for our input stride information.
-            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+        // Allocate space for our input stride information.
+        kparams.eStrideIns = new uint32_t[inLen];
 
-            // Fill our stride information.
-            for (int inIndex = inLen; --inIndex >= 0;) {
-                kparams.inEStrides[inIndex] =
-                    mtls->fep.inStrides[inIndex].eStride;
-            }
+        // Fill our stride information.
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
 
         //ALOGE("launch 3");
@@ -541,15 +585,15 @@
                       mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
                       mtls->fep.dimY * kparams.z + kparams.y;
 
-                    kparams.out = mtls->fep.outPtr +
-                                  (mtls->fep.outStride.yStride * offset) +
-                                  (mtls->fep.outStride.eStride * mtls->xStart);
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
 
                     for (int inIndex = inLen; --inIndex >= 0;) {
                         StridePair &strides = mtls->fep.inStrides[inIndex];
 
                         kparams.ins[inIndex] =
-                          mtls->fep.inPtrs[inIndex] +
+                          mtls->fep.ptrIns[inIndex] +
                           (strides.yStride * offset) +
                           (strides.eStride * mtls->xStart);
                     }
@@ -560,10 +604,14 @@
                      * that points to an array.
                      */
                     fn(&kparams, mtls->xStart, mtls->xEnd, 0,
-                       mtls->fep.outStride.eStride);
+                       mtls->fep.eStrideOut);
                 }
             }
         }
+
+        // Free our arrays.
+        delete[] kparams.ins;
+        delete[] kparams.eStrideIns;
     }
 }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 2fea3fc..5d4b6cc 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,8 +25,6 @@
 
 #include <string>
 
-#define RS_KERNEL_INPUT_THRESHOLD 32
-
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -42,36 +40,31 @@
 };
 
 struct RsExpandKernelDriverInfo {
-    const uint8_t **inPtrs;
-    uint32_t inLen;
-
-    uint8_t *outPtr;
-
-    StridePair *inStrides;
-    StridePair  outStride;
+    const void *usr;
+    uint32_t usrLen;
 
     uint32_t dimX;
     uint32_t dimY;
     uint32_t dimZ;
 
+    const uint8_t *ptrIn;
+    uint8_t *ptrOut;
+    uint32_t eStrideIn;
+    uint32_t eStrideOut;
+    uint32_t yStrideIn;
+    uint32_t yStrideOut;
     uint32_t slot;
 
-    const void *usr;
-    uint32_t usrLen;
-
-    bool heapAllocatedArrays;
-
-    RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
+    const uint8_t** ptrIns;
+    StridePair* inStrides;
 
     ~RsExpandKernelDriverInfo() {
-        if (heapAllocatedArrays) {
-            if (inPtrs != NULL) {
-                delete[] inPtrs;
-            }
+        if (ptrIns != NULL) {
+            delete[] ptrIns;
+        }
 
-            if (inStrides != NULL) {
-                delete[] inStrides;
-            }
+        if (inStrides != NULL) {
+            delete[] inStrides;
         }
     }
 };
@@ -79,13 +72,15 @@
 struct RsExpandKernelParams {
 
     // Used by kernels
-    const void **ins;
-    uint32_t *inEStrides;
+    const void *in;
     void *out;
     uint32_t y;
     uint32_t z;
     uint32_t lid;
 
+    const void **ins;
+    uint32_t *eStrideIns;
+
     // Used by ScriptGroup and user kernels.
     const void *usr;
 
@@ -120,13 +115,13 @@
 class RsdCpuScriptImpl;
 class RsdCpuReferenceImpl;
 
-struct ScriptTLSStruct {
+typedef struct ScriptTLSStructRec {
     android::renderscript::Context * mContext;
     const android::renderscript::Script * mScript;
     RsdCpuScriptImpl *mImpl;
-};
+} ScriptTLSStruct;
 
-struct MTLaunchStruct {
+typedef struct {
     RsExpandKernelDriverInfo fep;
 
     RsdCpuReferenceImpl *rsc;
@@ -134,7 +129,7 @@
 
     ForEachFunc_t kernel;
     uint32_t sig;
-    const Allocation ** ains;
+    const Allocation * ain;
     Allocation * aout;
 
     uint32_t mSliceSize;
@@ -150,9 +145,12 @@
     uint32_t arrayStart;
     uint32_t arrayEnd;
 
-    const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
-    StridePair     inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
-};
+    // Multi-input data.
+    const Allocation ** ains;
+} MTLaunchStruct;
+
+
+
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
@@ -173,6 +171,9 @@
         return mWorkers.mCount + 1;
     }
 
+    void launchThreads(const Allocation * ain, Allocation * aout,
+                       const RsScriptCall *sc, MTLaunchStruct *mtls);
+
     void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
                        const RsScriptCall* sc, MTLaunchStruct* mtls);
 
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 8437c99..5a7fffd 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,29 +73,54 @@
 }
 
 
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
-                                      uint32_t inLen, Allocation * aout,
-                                      const void * usr, uint32_t usrLen,
-                                      const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
+                                      Allocation * aout, const void * usr,
+                                      uint32_t usrLen, const RsScriptCall *sc) {
 }
 
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
-                                       uint32_t inLen, Allocation * aout,
-                                       const void * usr, uint32_t usrLen,
-                                       const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
+                                       Allocation * aout, const void * usr,
+                                       uint32_t usrLen, const RsScriptCall *sc) {
 }
 
 void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
-                                          const Allocation ** ains,
-                                          uint32_t inLen,
+                                          const Allocation * ain,
                                           Allocation * aout,
                                           const void * usr,
                                           uint32_t usrLen,
                                           const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
+    preLaunch(slot, ain, aout, usr, usrLen, sc);
 
-    preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
+    mtls.script = this;
+    mtls.fep.slot = slot;
+
+    mtls.kernel = (void (*)())mRootPtr;
+    mtls.fep.usr = this;
+
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ain, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+
+    postLaunch(slot, ain, aout, usr, usrLen, sc);
+}
+
+void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
+                                               const Allocation ** ains,
+                                               uint32_t inLen,
+                                               Allocation * aout,
+                                               const void * usr,
+                                               uint32_t usrLen,
+                                               const RsScriptCall *sc) {
+
+    MTLaunchStruct mtls;
+    /*
+     * FIXME: Possibly create new preLaunch and postLaunch functions that take
+     *        all of the input allocation pointers.
+     */
+    preLaunch(slot, ains[0], aout, usr, usrLen, sc);
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     mtls.script = this;
@@ -108,7 +133,7 @@
     mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
     mCtx->setTLS(oldTLS);
 
-    postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
+    postLaunch(slot, ains[0], aout, usr, usrLen, sc);
 }
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 95aaa14..bf6a8ac 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,42 +28,43 @@
 public:
     virtual void populateScript(Script *) = 0;
 
-    virtual void invokeFunction(uint32_t slot, const void * params,
-                                size_t paramLength);
+    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
-
     virtual void invokeForEach(uint32_t slot,
-                               const Allocation ** ain,
-                               uint32_t inLen,
-                               Allocation * aout,
-                               const void * usr,
-                               uint32_t usrLen,
-                               const RsScriptCall *sc);
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
 
-    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
+    virtual void invokeForEachMulti(uint32_t slot,
+                       const Allocation ** ain,
+                       uint32_t inLen,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
-    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
-                           uint32_t inLen, Allocation * aout, const void * usr,
-                           uint32_t usrLen, const RsScriptCall * sc);
-    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
-                            uint32_t inLen, Allocation * aout,
-                            const void * usr, uint32_t usrLen,
-                            const RsScriptCall * sc);
+    virtual void preLaunch(uint32_t slot, const Allocation * ain,
+                           Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation * ain,
+                            Allocation * aout, const void * usr,
+                            uint32_t usrLen, const RsScriptCall *sc);
 
-    virtual void setGlobalVar(uint32_t slot, const void * data,
-                              size_t dataLength);
-    virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
-                                          size_t dataLength, const Element * e,
-                                          const uint32_t * dims,
-                                          size_t dimLength);
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                  const Element *e, const uint32_t *dims, size_t dimLength);
     virtual void setGlobalBind(uint32_t slot, Allocation *data);
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsic();
-    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
-                          const Element * e, RsScriptIntrinsicID iid);
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
+                          RsScriptIntrinsicID iid);
 
 protected:
     RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index a19d885..c839c19 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -64,7 +64,7 @@
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
     uchar4 *out = (uchar4 *)p->out + xstart;
-    uchar4 *in = (uchar4 *)p->ins[0] + xstart;
+    uchar4 *in = (uchar4 *)p->in + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -161,9 +161,9 @@
     }
 }
 
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
-    RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
-        RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
 
     mRootPtr = &kernel;
 }
@@ -185,3 +185,5 @@
 
     return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
 }
+
+
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 0378e07..b604658 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -117,7 +117,7 @@
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
     uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->ins[0];
+    uchar4 *in = (uchar4 *)p->in;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -509,3 +509,6 @@
                                       const Script *s, const Element *e) {
     return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
 }
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 4e90ad7..bf78eb3 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,9 +169,10 @@
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
-    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
-                           uint32_t inLen, Allocation * aout, const void * usr,
-                           uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
 
 protected:
     float fp[16];
@@ -882,13 +883,8 @@
                                               uint32_t xstart, uint32_t xend,
                                               uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-
-    // Update the instep due to change in parameter passing.
-    instep = p->inEStrides[0];
-
-    uchar *out = (uchar *)p->out    + outstep * xstart;
-    uchar *in  = (uchar *)p->ins[0] + instep  * xstart;
-
+    uchar *out = (uchar *)p->out + outstep * xstart;
+    uchar *in = (uchar *)p->in + instep * xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -936,15 +932,11 @@
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
-                                                 const Allocation ** ains,
-                                                 uint32_t inLen,
-                                                 Allocation * aout,
-                                                 const void * usr,
-                                                 uint32_t usrLen,
-                                                 const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
+        uint32_t slot, const Allocation * ain, Allocation * aout,
+        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
 
-    const Element *ein = ains[0]->mHal.state.type->getElement();
+    const Element *ein = ain->mHal.state.type->getElement();
     const Element *eout = aout->mHal.state.type->getElement();
 
     if (ein->getType() == eout->getType()) {
@@ -961,8 +953,8 @@
         }
     }
 
-    Key_t key = computeKey(ein, eout);
-
+    Key_t key = computeKey(ain->mHal.state.type->getElement(),
+                           aout->mHal.state.type->getElement());
 #if defined(ARCH_X86_HAVE_SSSE3)
     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
@@ -1004,6 +996,12 @@
 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
 }
 
+void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
+        uint32_t slot, const Allocation * ain, Allocation * aout,
+        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+
+}
+
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index b5dbfa8..1c430b7 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@
     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
+    void preLaunch(uint32_t slot, const Allocation * ain,
                    Allocation * aout, const void * usr,
                    uint32_t usrLen, const RsScriptCall *sc);
-    void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
+    void postLaunch(uint32_t slot, const Allocation * ain,
                     Allocation * aout, const void * usr,
                     uint32_t usrLen, const RsScriptCall *sc);
 
@@ -97,12 +97,9 @@
 
 
 
-void
-RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
-                                          const Allocation ** ains,
-                                          uint32_t inLen, Allocation * aout,
-                                          const void * usr, uint32_t usrLen,
-                                          const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
+                                      Allocation * aout, const void * usr,
+                                      uint32_t usrLen, const RsScriptCall *sc) {
 
     const uint32_t threads = mCtx->getThreadCount();
     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -126,7 +123,7 @@
         }
         break;
     case 1:
-        switch(ains[0]->getType()->getElement()->getVectorSize()) {
+        switch(ain->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelP1L1;
             break;
@@ -145,12 +142,9 @@
     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
 }
 
-void
-RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
-                                           const Allocation ** ains,
-                                           uint32_t inLen,  Allocation * aout,
-                                           const void * usr, uint32_t usrLen,
-                                           const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
+                                       Allocation * aout, const void * usr,
+                                       uint32_t usrLen, const RsScriptCall *sc) {
 
     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
     uint32_t threads = mCtx->getThreadCount();
@@ -171,7 +165,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -179,7 +173,7 @@
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -188,14 +182,14 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -204,13 +198,13 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * 2 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 1)    ] ++;
         sums[(in[1] << 1) + 1] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -219,7 +213,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -228,7 +222,7 @@
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -237,7 +231,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -245,7 +239,7 @@
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -254,14 +248,14 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -270,13 +264,13 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -285,12 +279,12 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *in = (uchar *)p->in;
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[in[0]] ++;
-        in += p->inEStrides[0];
+        in += instep;
     }
 }
 
@@ -329,3 +323,5 @@
 
     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
 }
+
+
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 9d3b400..db73a83 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -59,7 +59,7 @@
     RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
 
     uchar *out = (uchar *)p->out;
-    const uchar *in = (uchar *)p->ins[0];
+    const uchar *in = (uchar *)p->in;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -103,3 +103,5 @@
 
     return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
 }
+
+
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 3a307d6..af1127e 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@
     virtual ~RsdCpuScriptIntrinsicResize();
     RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
 
-    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
-                           uint32_t inLen, Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation * ain,
+                           Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
 
     float scaleX;
@@ -308,11 +308,9 @@
 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
 }
 
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
-                                            const Allocation ** ains,
-                                            uint32_t inLen, Allocation * aout,
-                                            const void * usr, uint32_t usrLen,
-                                            const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
+                                            Allocation * aout, const void * usr,
+                                            uint32_t usrLen, const RsScriptCall *sc)
 {
     if (!mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -353,3 +351,5 @@
 
     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
 }
+
+
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 0598420..a11fda1 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -789,8 +789,119 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
-                                        uint32_t inLen,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    // Single-input setup: zeroes *mtls, then fills in dimensions, x/y/z ranges and in/out pointers and strides.
+    memset(mtls, 0, sizeof(MTLaunchStruct));  // every early return below leaves *mtls zeroed or only partially filled
+
+    // A NULL backing pointer is possible for IO_OUTPUT/IO_INPUT allocations with no bound surface.
+    if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+        return;
+    }
+    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        return;
+    }
+
+    if (ain != NULL) {  // launch dimensions are taken from the input when there is one
+        const Type *inType = ain->getType();
+
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+    } else if (aout != NULL) {  // otherwise fall back to the output's dimensions
+        const Type *outType = aout->getType();
+
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+
+    } else {  // neither an input nor an output was supplied
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+
+    if (ain != NULL && aout != NULL) {  // when both are present their dimensions must agree
+        if (!ain->hasSameDims(aout)) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+              "Failed to launch kernel; dimensions of input and output allocations do not match.");
+
+            return;
+        }
+    }
+
+    if (!sc || (sc->xEnd == 0)) {  // no explicit x range: cover [0, dimX); xStart is still 0 from the memset
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);  // clamp to dimX regardless of the asserts above
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;  // empty range: give up without filling the rest of *mtls
+    }
+
+    if (!sc || (sc->yEnd == 0)) {  // same handling for the y range
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    }
+
+    if (!sc || (sc->zEnd == 0)) {  // same handling for the z range
+        mtls->zEnd = mtls->fep.dimZ;
+    } else {
+        rsAssert(sc->zStart < mtls->fep.dimZ);
+        rsAssert(sc->zEnd <= mtls->fep.dimZ);
+        rsAssert(sc->zStart < sc->zEnd);
+        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
+        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+        if (mtls->zStart >= mtls->zEnd) return;
+    }
+
+    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);  // force every end bound to at least 1
+    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);  // arrayEnd was only zeroed above, so this makes it 1
+
+    rsAssert(!ain || (ain->getType()->getDimZ() == 0));  // an input with a non-zero Z dimension trips this assert
+
+    mtls->rsc = mCtx;
+    mtls->ain = ain;
+    mtls->aout = aout;
+    mtls->fep.usr = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum = 0;
+
+    mtls->fep.ptrIn = NULL;
+    mtls->fep.eStrideIn = 0;
+    mtls->isThreadable = mIsThreadable;
+
+    if (ain) {  // record lod[0] base pointer, element size in bytes and lod[0] stride of the input
+        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {  // same three values for the output
+        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
                                         const RsScriptCall *sc,
@@ -798,24 +909,24 @@
 
     memset(mtls, 0, sizeof(MTLaunchStruct));
 
-    for (int index = inLen; --index >= 0;) {
-        const Allocation* ain = ains[index];
+    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+    if (ains != NULL) {
+        for (int index = inLen; --index >= 0;) {
+            const Allocation* ain = ains[index];
 
-        // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-        if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                                         "rsForEach called with null in allocations");
-            return;
+            if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+                return;
+            }
         }
     }
 
     if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                                     "rsForEach called with null out allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
         return;
     }
 
-    if (inLen > 0) {
+    if (ains != NULL) {
         const Allocation *ain0   = ains[0];
         const Type       *inType = ain0->getType();
 
@@ -840,12 +951,11 @@
         mtls->fep.dimZ = outType->getDimZ();
 
     } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                                     "rsForEach called with null allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
         return;
     }
 
-    if (inLen > 0 && aout != NULL) {
+    if (ains != NULL && aout != NULL) {
         if (!ains[0]->hasSameDims(aout)) {
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -892,7 +1002,7 @@
     mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
     mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
 
-    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
+    rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
 
     mtls->rsc        = mCtx;
     mtls->ains       = ains;
@@ -902,28 +1012,18 @@
     mtls->mSliceSize = 1;
     mtls->mSliceNum  = 0;
 
-    mtls->fep.inPtrs    = NULL;
-    mtls->fep.inStrides = NULL;
+    mtls->fep.ptrIns    = NULL;
+    mtls->fep.eStrideIn = 0;
     mtls->isThreadable  = mIsThreadable;
 
-    if (inLen > 0) {
-
-        if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
-            mtls->fep.inPtrs    = (const uint8_t**)mtls->inPtrsBuff;
-            mtls->fep.inStrides = mtls->inStridesBuff;
-        } else {
-            mtls->fep.heapAllocatedArrays = true;
-
-            mtls->fep.inPtrs    = new const uint8_t*[inLen];
-            mtls->fep.inStrides = new StridePair[inLen];
-        }
-
-        mtls->fep.inLen = inLen;
+    if (ains) {
+        mtls->fep.ptrIns    = new const uint8_t*[inLen];
+        mtls->fep.inStrides = new StridePair[inLen];
 
         for (int index = inLen; --index >= 0;) {
             const Allocation *ain = ains[index];
 
-            mtls->fep.inPtrs[index] =
+            mtls->fep.ptrIns[index] =
               (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
 
             mtls->fep.inStrides[index].eStride =
@@ -933,27 +1033,41 @@
         }
     }
 
-    mtls->fep.outPtr            = NULL;
-    mtls->fep.outStride.eStride = 0;
-    mtls->fep.outStride.yStride = 0;
-    if (aout != NULL) {
-        mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-
-        mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
-        mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
     }
 }
 
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
-                                     const Allocation ** ains,
-                                     uint32_t inLen,
+                                     const Allocation * ain,
                                      Allocation * aout,
                                      const void * usr,
                                      uint32_t usrLen,
                                      const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);  // single-input overload
+    forEachKernelSetup(slot, &mtls);
+    // NOTE(review): forEachMtlsSetup returns early after setError; nothing here checks for that before launching.
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);  // keep what setTLS returned so it can be put back below
+    mCtx->launchThreads(ain, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);  // restore the saved value once the launch call has returned
+}
+
+void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          const void * usr,
+                                          uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+
+    MTLaunchStruct mtls;
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     forEachKernelSetup(slot, &mtls);
@@ -1224,15 +1338,17 @@
     return NULL;
 }
 
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
-                                 uint32_t inLen, Allocation * aout,
-                                 const void * usr, uint32_t usrLen,
-                                 const RsScriptCall *sc) {}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
+                       Allocation * aout, const void * usr,
+                       uint32_t usrLen, const RsScriptCall *sc)
+{  // intentionally empty: this base implementation ignores all of its arguments
+}
 
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
-                                  uint32_t inLen, Allocation * aout,
-                                  const void * usr, uint32_t usrLen,
-                                  const RsScriptCall *sc) {}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
+                        Allocation * aout, const void * usr,
+                        uint32_t usrLen, const RsScriptCall *sc)
+{  // intentionally empty: this base implementation ignores all of its arguments
+}
 
 
 }
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index f0843cc..d51e9e3 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -64,22 +64,26 @@
 
     virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
-    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
-                           uint32_t inLen, Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation * ain,
+                           Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
-                            uint32_t inLen, Allocation * aout,
-                            const void * usr, uint32_t usrLen,
-                            const RsScriptCall *sc);
-
+    virtual void postLaunch(uint32_t slot, const Allocation * ain,
+                            Allocation * aout, const void * usr,
+                            uint32_t usrLen, const RsScriptCall *sc);
     virtual void invokeForEach(uint32_t slot,
-                               const Allocation ** ains,
-                               uint32_t inLen,
-                               Allocation* aout,
-                               const void* usr,
-                               uint32_t usrLen,
-                               const RsScriptCall* sc);
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
 
+    virtual void invokeForEachMulti(uint32_t slot,
+                                     const Allocation** ains,
+                                     uint32_t inLen,
+                                     Allocation* aout,
+                                     const void* usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall* sc);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -96,6 +100,10 @@
 
     const Script * getScript() {return mScript;}
 
+    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                          const void * usr, uint32_t usrLen,
+                          const RsScriptCall *sc, MTLaunchStruct *mtls);  // overload taking a single input allocation
+
     void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                           Allocation * aout, const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 20ee09d..0878552 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -53,45 +53,38 @@
                                          uint32_t instep, uint32_t outstep) {
 
 
-    const ScriptList *sl           = (const ScriptList *)kparams->usr;
+    const ScriptList *sl            = (const ScriptList *)kparams->usr;
     RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
 
-    const void **oldIns  = mkparams->ins;
-    uint32_t *oldStrides = mkparams->inEStrides;
-
-    void *localIns[1];
-    uint32_t localStride[1];
-
-    mkparams->ins        = (const void**)localIns;
-    mkparams->inEStrides = localStride;
-
     for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
         func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
         mkparams->usr = sl->usrPtrs[ct];
 
-        if (sl->ins[ct]) {
-            localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+        mkparams->in  = NULL;
+        mkparams->out = NULL;
 
-            localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
+        uint32_t istep = 0;
+        uint32_t ostep = 0;
+
+        if (sl->ins[ct]) {
+            mkparams->in =
+              (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+
+            istep = sl->ins[ct]->mHal.state.elementSizeBytes;
 
             if (sl->inExts[ct]) {
-                localIns[0] = (void*)
-                  ((const uint8_t *)localIns[0] +
-                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
+                mkparams->in =
+                  (const uint8_t *)mkparams->in +
+                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y;
 
             } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
-                localIns[0] = (void*)
-                  ((const uint8_t *)localIns[0] +
-                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
+                mkparams->in =
+                  (const uint8_t *)mkparams->in +
+                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
-
-        } else {
-            localIns[0]    = NULL;
-            localStride[0] = 0;
         }
 
-        uint32_t ostep;
         if (sl->outs[ct]) {
             mkparams->out =
               (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
@@ -108,23 +101,14 @@
                   (uint8_t *)mkparams->out +
                   sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
-        } else {
-            mkparams->out = NULL;
-            ostep         = 0;
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        /*
-         * The fourth argument is zero here because kernels get their stride
-         * information from a member of p that points to an array.
-         */
-        func(kparams, xstart, xend, 0, ostep);
+        func(kparams, xstart, xend, istep, ostep);
     }
     //ALOGE("script group root");
 
-    mkparams->ins        = oldIns;
-    mkparams->inEStrides = oldStrides;
-    mkparams->usr        = sl;
+    mkparams->usr = sl;
 }
 
 
@@ -211,33 +195,17 @@
 
     MTLaunchStruct mtls;
 
-    if (fieldDep) {
+    if(fieldDep) {
         for (size_t ct=0; ct < ins.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
             uint32_t slot = kernels[ct]->mSlot;
 
-            uint32_t inLen;
-            const Allocation **ains;
-
-            if (ins[ct] == NULL) {
-                inLen = 0;
-                ains  = NULL;
-
-            } else {
-                inLen = 1;
-                ains  = const_cast<const Allocation**>(&ins[ct]);
-            }
-
-            si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls);
-
+            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
             si->forEachKernelSetup(slot, &mtls);
-            si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
-                          mtls.fep.usrLen, NULL);
-
-            mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls);
-
-            si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL);
+            si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
+            si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
         }
     } else {
         ScriptList sl;
@@ -246,18 +214,6 @@
         sl.kernels = kernels.array();
         sl.count = kernels.size();
 
-        uint32_t inLen;
-        const Allocation **ains;
-
-        if (ins[0] == NULL) {
-            inLen = 0;
-            ains  = NULL;
-
-        } else {
-            inLen = 1;
-            ains  = const_cast<const Allocation**>(&ins[0]);
-        }
-
         Vector<const void *> usrPtrs;
         Vector<const void *> fnPtrs;
         Vector<uint32_t> sigs;
@@ -269,8 +225,7 @@
             fnPtrs.add((void *)mtls.kernel);
             usrPtrs.add(mtls.fep.usr);
             sigs.add(mtls.fep.usrLen);
-            si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct],
-                          mtls.fep.usr, mtls.fep.usrLen, NULL);
+            si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
         }
         sl.sigs = sigs.array();
         sl.usrPtrs = usrPtrs.array();
@@ -280,20 +235,16 @@
 
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-
-        si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls);
-
+        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
         mtls.script = NULL;
         mtls.kernel = (void (*)())&scriptGroupRoot;
         mtls.fep.usr = &sl;
-
-        mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls);
+        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
 
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-            si->postLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], NULL, 0,
-                           NULL);
+            si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
         }
     }
 }
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 4728b7c..0076cb9 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,15 +69,21 @@
         virtual void populateScript(Script *) = 0;
         virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
         virtual int invokeRoot() = 0;
-
         virtual void invokeForEach(uint32_t slot,
-                                   const Allocation ** ains,
-                                   uint32_t inLen,
-                                   Allocation * aout,
-                                   const void * usr,
-                                   uint32_t usrLen,
-                                   const RsScriptCall *sc) = 0;
-
+                           const Allocation * ain,
+                           Allocation * aout,
+                           const void * usr,
+                           uint32_t usrLen,
+                           const RsScriptCall *sc) = 0;
+
+        virtual void invokeForEachMulti(uint32_t slot,
+                                         const Allocation** ains,
+                                         uint32_t inLen,
+                                         Allocation * aout,
+                                         const void * usr,
+                                         uint32_t usrLen,
+                                         const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;