Collapse code paths for single- and multi-input kernels.

This patch simplifies the RenderScript driver and CPU reference implementation
by removing the distinction between single- and multi-input kernels in many
places.  The distinction is kept in some places to preserve backwards
compatibility.  Collapsing the code paths permits the deletion of some
functions and struct members that are no longer needed.  Several related
functions were also cleaned up.

Change-Id: Id70a223ea5e3aa2b0b935b2b7f9af933339ae8a4
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index aeb75a6..ad7cef7 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -1,7 +1,8 @@
 
 LOCAL_PATH:=$(call my-dir)
 
-rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable \
+                  -fno-exceptions -std=c++11
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index a0564fc..db3cc7f 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -350,180 +350,134 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void (*walk_loop_t)(MTLaunchStruct*,
+                            RsExpandKernelParams&,
+                            outer_foreach_t);
 
-static void wc_xy(void *usr, uint32_t idx) {
+
+static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
 
+    uint32_t inLen = mtls->fep.inLen;
+
     RsExpandKernelParams kparams;
     kparams.takeFields(mtls->fep);
 
     // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
     kparams.lid = idx;
 
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd   = yStart + mtls->mSliceSize;
+    if (inLen > 0) {
+        // Allocate space for our input base pointers.
+        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
 
-        yEnd = rsMin(yEnd, mtls->yEnd);
+        // Allocate space for our input stride information.
+        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
 
-        if (yEnd <= yStart) {
-            return;
-        }
-
-        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-        for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
-            kparams.out = mtls->fep.ptrOut +
-                          (mtls->fep.yStrideOut * kparams.y) +
-                          (mtls->fep.eStrideOut * mtls->xStart);
-
-            kparams.in = mtls->fep.ptrIn +
-                         (mtls->fep.yStrideIn * kparams.y) +
-                         (mtls->fep.eStrideIn * mtls->xStart);
-
-
-            fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
-               mtls->fep.eStrideOut);
+        // Fill our stride information.
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
     }
-}
-
-static void wc_x(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
-    RsExpandKernelParams kparams;
-    kparams.takeFields(mtls->fep);
-
-    // Used by CpuScriptGroup, IntrinsicBlur, and IntrisicHistogram
-    kparams.lid = idx;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd   = xStart + mtls->mSliceSize;
 
-        xEnd = rsMin(xEnd, mtls->xEnd);
-
-        if (xEnd <= xStart) {
-            return;
-        }
-
-        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-        kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
-        kparams.in  = mtls->fep.ptrIn  + (mtls->fep.eStrideIn  * xStart);
-
-        fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-    }
+    walk_loop(mtls, kparams, fn);
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
-                                        const RsScriptCall *sc, MTLaunchStruct *mtls) {
+static void walk_2d(void *usr, uint32_t idx) {
+    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+                              RsExpandKernelParams &kparams,
+                              outer_foreach_t fn) {
 
-    //android::StopWatch kernel_time("kernel time");
+        while (1) {
+            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+            uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
-        const size_t targetByteChunk = 16 * 1024;
-        mInForEach = true;
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            yEnd = rsMin(yEnd, mtls->yEnd);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
-
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
+            if (yEnd <= yStart) {
+                return;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
-        } else {
-            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+                kparams.out = mtls->fep.outPtr +
+                              (mtls->fep.outStride.yStride * kparams.y) +
+                              (mtls->fep.outStride.eStride * mtls->xStart);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
+                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+                    StridePair &strides = mtls->fep.inStrides[inIndex];
 
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
-            }
-
-            launchThreads(wc_x, mtls);
-        }
-        mInForEach = false;
-
-        //ALOGE("launch 1");
-    } else {
-        RsExpandKernelParams kparams;
-        kparams.takeFields(mtls->fep);
-
-        //ALOGE("launch 3");
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (uint32_t arrayIndex = mtls->arrayStart;
-             arrayIndex < mtls->arrayEnd; arrayIndex++) {
-
-            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
-                 kparams.z++) {
-
-                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
-                     kparams.y++) {
-
-                    uint32_t offset =
-                      kparams.dimY * kparams.dimZ * arrayIndex +
-                      kparams.dimY * kparams.z + kparams.y;
-
-                    kparams.out = mtls->fep.ptrOut +
-                                  (mtls->fep.yStrideOut * offset) +
-                                  (mtls->fep.eStrideOut * mtls->xStart);
-
-                    kparams.in = mtls->fep.ptrIn +
-                                 (mtls->fep.yStrideIn * offset) +
-                                 (mtls->fep.eStrideIn * mtls->xStart);
-
-                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
-                       mtls->fep.eStrideOut);
+                    kparams.ins[inIndex] =
+                      mtls->fep.inPtrs[inIndex] +
+                      (strides.yStride * kparams.y) +
+                      (strides.eStride * mtls->xStart);
                 }
+
+                // Kernels now get their input strides from kparams.
+                fn(&kparams, mtls->xStart, mtls->xEnd, 0,
+                   mtls->fep.outStride.eStride);
             }
         }
-    }
+    });
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
-                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+static void walk_1d(void *usr, uint32_t idx) {
+    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+                              RsExpandKernelParams &kparams,
+                              outer_foreach_t fn) {
+
+        while (1) {
+            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+            uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+            uint32_t xEnd   = xStart + mtls->mSliceSize;
+
+            xEnd = rsMin(xEnd, mtls->xEnd);
+
+            if (xEnd <= xStart) {
+                return;
+            }
+
+            kparams.out = mtls->fep.outPtr +
+                          (mtls->fep.outStride.eStride * xStart);
+
+            for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+                StridePair &strides = mtls->fep.inStrides[inIndex];
+
+                kparams.ins[inIndex] =
+                  mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
+            }
+
+            // Kernels now get their input strides from kparams.
+            fn(&kparams, xStart, xEnd, 0, mtls->fep.outStride.eStride);
+        }
+    });
+}
+
+
+void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
+                                        uint32_t inLen,
+                                        Allocation* aout,
+                                        const RsScriptCall* sc,
+                                        MTLaunchStruct* mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
+
         if (mtls->fep.dimY > 1) {
             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            if (mtls->fep.outStride.yStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.yStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -531,18 +485,18 @@
                 mtls->mSliceSize = 1;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
+            launchThreads(walk_2d, mtls);
         } else {
             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            if (mtls->fep.outStride.eStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.eStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -550,24 +504,26 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(wc_x, mtls);
+            launchThreads(walk_1d, mtls);
         }
         mInForEach = false;
 
-        //ALOGE("launch 1");
     } else {
         RsExpandKernelParams kparams;
         kparams.takeFields(mtls->fep);
 
-        // Allocate space for our input base pointers.
-        kparams.ins = new const void*[inLen];
+        if (inLen > 0) {
+            // Allocate space for our input base pointers.
+            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
 
-        // Allocate space for our input stride information.
-        kparams.eStrideIns = new uint32_t[inLen];
+            // Allocate space for our input stride information.
+            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
 
-        // Fill our stride information.
-        for (int inIndex = inLen; --inIndex >= 0;) {
-          kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
+            // Fill our stride information.
+            for (int inIndex = inLen; --inIndex >= 0;) {
+                kparams.inEStrides[inIndex] =
+                    mtls->fep.inStrides[inIndex].eStride;
+            }
         }
 
         //ALOGE("launch 3");
@@ -585,15 +541,15 @@
                       mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
                       mtls->fep.dimY * kparams.z + kparams.y;
 
-                    kparams.out = mtls->fep.ptrOut +
-                                  (mtls->fep.yStrideOut * offset) +
-                                  (mtls->fep.eStrideOut * mtls->xStart);
+                    kparams.out = mtls->fep.outPtr +
+                                  (mtls->fep.outStride.yStride * offset) +
+                                  (mtls->fep.outStride.eStride * mtls->xStart);
 
                     for (int inIndex = inLen; --inIndex >= 0;) {
                         StridePair &strides = mtls->fep.inStrides[inIndex];
 
                         kparams.ins[inIndex] =
-                          mtls->fep.ptrIns[inIndex] +
+                          mtls->fep.inPtrs[inIndex] +
                           (strides.yStride * offset) +
                           (strides.eStride * mtls->xStart);
                     }
@@ -604,14 +560,10 @@
                      * that points to an array.
                      */
                     fn(&kparams, mtls->xStart, mtls->xEnd, 0,
-                       mtls->fep.eStrideOut);
+                       mtls->fep.outStride.eStride);
                 }
             }
         }
-
-        // Free our arrays.
-        delete[] kparams.ins;
-        delete[] kparams.eStrideIns;
     }
 }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 5d4b6cc..2fea3fc 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,6 +25,8 @@
 
 #include <string>
 
+#define RS_KERNEL_INPUT_THRESHOLD 32
+
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -40,31 +42,36 @@
 };
 
 struct RsExpandKernelDriverInfo {
-    const void *usr;
-    uint32_t usrLen;
+    const uint8_t **inPtrs;
+    uint32_t inLen;
+
+    uint8_t *outPtr;
+
+    StridePair *inStrides;
+    StridePair  outStride;
 
     uint32_t dimX;
     uint32_t dimY;
     uint32_t dimZ;
 
-    const uint8_t *ptrIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideIn;
-    uint32_t eStrideOut;
-    uint32_t yStrideIn;
-    uint32_t yStrideOut;
     uint32_t slot;
 
-    const uint8_t** ptrIns;
-    StridePair* inStrides;
+    const void *usr;
+    uint32_t usrLen;
+
+    bool heapAllocatedArrays;
+
+    RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
 
     ~RsExpandKernelDriverInfo() {
-        if (ptrIns != NULL) {
-            delete[] ptrIns;
-        }
+        if (heapAllocatedArrays) {
+            if (inPtrs != NULL) {
+                delete[] inPtrs;
+            }
 
-        if (inStrides != NULL) {
-            delete[] inStrides;
+            if (inStrides != NULL) {
+                delete[] inStrides;
+            }
         }
     }
 };
@@ -72,15 +79,13 @@
 struct RsExpandKernelParams {
 
     // Used by kernels
-    const void *in;
+    const void **ins;
+    uint32_t *inEStrides;
     void *out;
     uint32_t y;
     uint32_t z;
     uint32_t lid;
 
-    const void **ins;
-    uint32_t *eStrideIns;
-
     // Used by ScriptGroup and user kernels.
     const void *usr;
 
@@ -115,13 +120,13 @@
 class RsdCpuScriptImpl;
 class RsdCpuReferenceImpl;
 
-typedef struct ScriptTLSStructRec {
+struct ScriptTLSStruct {
     android::renderscript::Context * mContext;
     const android::renderscript::Script * mScript;
     RsdCpuScriptImpl *mImpl;
-} ScriptTLSStruct;
+};
 
-typedef struct {
+struct MTLaunchStruct {
     RsExpandKernelDriverInfo fep;
 
     RsdCpuReferenceImpl *rsc;
@@ -129,7 +134,7 @@
 
     ForEachFunc_t kernel;
     uint32_t sig;
-    const Allocation * ain;
+    const Allocation ** ains;
     Allocation * aout;
 
     uint32_t mSliceSize;
@@ -145,12 +150,9 @@
     uint32_t arrayStart;
     uint32_t arrayEnd;
 
-    // Multi-input data.
-    const Allocation ** ains;
-} MTLaunchStruct;
-
-
-
+    const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
+    StridePair     inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
+};
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
@@ -171,9 +173,6 @@
         return mWorkers.mCount + 1;
     }
 
-    void launchThreads(const Allocation * ain, Allocation * aout,
-                       const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
                        const RsScriptCall* sc, MTLaunchStruct* mtls);
 
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 5a7fffd..8437c99 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,54 +73,29 @@
 }
 
 
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
+                                      uint32_t inLen, Allocation * aout,
+                                      const void * usr, uint32_t usrLen,
+                                      const RsScriptCall *sc) {
 }
 
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
+                                       uint32_t inLen, Allocation * aout,
+                                       const void * usr, uint32_t usrLen,
+                                       const RsScriptCall *sc) {
 }
 
 void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
-                                          const Allocation * ain,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
                                           Allocation * aout,
                                           const void * usr,
                                           uint32_t usrLen,
                                           const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    preLaunch(slot, ain, aout, usr, usrLen, sc);
 
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    mtls.script = this;
-    mtls.fep.slot = slot;
-
-    mtls.kernel = (void (*)())mRootPtr;
-    mtls.fep.usr = this;
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-
-    postLaunch(slot, ain, aout, usr, usrLen, sc);
-}
-
-void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
-                                               const Allocation ** ains,
-                                               uint32_t inLen,
-                                               Allocation * aout,
-                                               const void * usr,
-                                               uint32_t usrLen,
-                                               const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
-    /*
-     * FIXME: Possibly create new preLaunch and postLaunch functions that take
-     *        all of the input allocation pointers.
-     */
-    preLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     mtls.script = this;
@@ -133,7 +108,7 @@
     mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
     mCtx->setTLS(oldTLS);
 
-    postLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index bf6a8ac..95aaa14 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,43 +28,42 @@
 public:
     virtual void populateScript(Script *) = 0;
 
-    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual void invokeFunction(uint32_t slot, const void * params,
+                                size_t paramLength);
     virtual int invokeRoot();
+
     virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+                               const Allocation ** ain,
+                               uint32_t inLen,
+                               Allocation * aout,
+                               const void * usr,
+                               uint32_t usrLen,
+                               const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                       const Allocation ** ain,
-                       uint32_t inLen,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
-
-    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
-                           uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall * sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall * sc);
 
-    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
-    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
-                                  const Element *e, const uint32_t *dims, size_t dimLength);
+    virtual void setGlobalVar(uint32_t slot, const void * data,
+                              size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
+                                          size_t dataLength, const Element * e,
+                                          const uint32_t * dims,
+                                          size_t dimLength);
     virtual void setGlobalBind(uint32_t slot, Allocation *data);
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsic();
-    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
-                          RsScriptIntrinsicID iid);
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
+                          const Element * e, RsScriptIntrinsicID iid);
 
 protected:
     RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index c839c19..a19d885 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -64,7 +64,7 @@
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
     uchar4 *out = (uchar4 *)p->out + xstart;
-    uchar4 *in = (uchar4 *)p->in + xstart;
+    uchar4 *in = (uchar4 *)p->ins[0] + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -161,9 +161,9 @@
     }
 }
 
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
-                                                     const Script *s, const Element *e)
-            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
+    RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
+        RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
 
     mRootPtr = &kernel;
 }
@@ -185,5 +185,3 @@
 
     return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index b604658..0378e07 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -117,7 +117,7 @@
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
     uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
+    uchar4 *in = (uchar4 *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -509,6 +509,3 @@
                                       const Script *s, const Element *e) {
     return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index bf78eb3..4e90ad7 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,10 +169,9 @@
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall *sc);
 
 protected:
     float fp[16];
@@ -883,8 +882,13 @@
                                               uint32_t xstart, uint32_t xend,
                                               uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-    uchar *out = (uchar *)p->out + outstep * xstart;
-    uchar *in = (uchar *)p->in + instep * xstart;
+
+    // Update the instep due to change in parameter passing.
+    instep = p->inEStrides[0];
+
+    uchar *out = (uchar *)p->out    + outstep * xstart;
+    uchar *in  = (uchar *)p->ins[0] + instep  * xstart;
+
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -932,11 +936,15 @@
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
+                                                 const Allocation ** ains,
+                                                 uint32_t inLen,
+                                                 Allocation * aout,
+                                                 const void * usr,
+                                                 uint32_t usrLen,
+                                                 const RsScriptCall *sc) {
 
-    const Element *ein = ain->mHal.state.type->getElement();
+    const Element *ein = ains[0]->mHal.state.type->getElement();
     const Element *eout = aout->mHal.state.type->getElement();
 
     if (ein->getType() == eout->getType()) {
@@ -953,8 +961,8 @@
         }
     }
 
-    Key_t key = computeKey(ain->mHal.state.type->getElement(),
-                           aout->mHal.state.type->getElement());
+    Key_t key = computeKey(ein, eout);
+
 #if defined(ARCH_X86_HAVE_SSSE3)
     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
@@ -996,12 +1004,6 @@
 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
-
-}
-
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index 1c430b7..b5dbfa8 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@
     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    void preLaunch(uint32_t slot, const Allocation * ain,
+    void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                    Allocation * aout, const void * usr,
                    uint32_t usrLen, const RsScriptCall *sc);
-    void postLaunch(uint32_t slot, const Allocation * ain,
+    void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                     Allocation * aout, const void * usr,
                     uint32_t usrLen, const RsScriptCall *sc);
 
@@ -97,9 +97,12 @@
 
 
 
-void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen, Allocation * aout,
+                                          const void * usr, uint32_t usrLen,
+                                          const RsScriptCall *sc) {
 
     const uint32_t threads = mCtx->getThreadCount();
     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -123,7 +126,7 @@
         }
         break;
     case 1:
-        switch(ain->getType()->getElement()->getVectorSize()) {
+        switch(ains[0]->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelP1L1;
             break;
@@ -142,9 +145,12 @@
     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
 }
 
-void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
+                                           const Allocation ** ains,
+                                           uint32_t inLen,  Allocation * aout,
+                                           const void * usr, uint32_t usrLen,
+                                           const RsScriptCall *sc) {
 
     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
     uint32_t threads = mCtx->getThreadCount();
@@ -165,7 +171,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -173,7 +179,7 @@
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -182,14 +188,14 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -198,13 +204,13 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 2 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 1)    ] ++;
         sums[(in[1] << 1) + 1] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -213,7 +219,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -222,7 +228,7 @@
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -231,7 +237,7 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -239,7 +245,7 @@
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -248,14 +254,14 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -264,13 +270,13 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -279,12 +285,12 @@
                                                 uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[in[0]] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -323,5 +329,3 @@
 
     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index db73a83..9d3b400 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -59,7 +59,7 @@
     RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
 
     uchar *out = (uchar *)p->out;
-    const uchar *in = (uchar *)p->in;
+    const uchar *in = (uchar *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -103,5 +103,3 @@
 
     return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index af1127e..3a307d6 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@
     virtual ~RsdCpuScriptIntrinsicResize();
     RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
 
     float scaleX;
@@ -308,9 +308,11 @@
 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
 }
 
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
-                                            Allocation * aout, const void * usr,
-                                            uint32_t usrLen, const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
+                                            const Allocation ** ains,
+                                            uint32_t inLen, Allocation * aout,
+                                            const void * usr, uint32_t usrLen,
+                                            const RsScriptCall *sc)
 {
     if (!mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -351,5 +353,3 @@
 
     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda1..0598420 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -789,119 +789,8 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                                        const void * usr, uint32_t usrLen,
-                                        const RsScriptCall *sc,
-                                        MTLaunchStruct *mtls) {
-
-    memset(mtls, 0, sizeof(MTLaunchStruct));
-
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-        return;
-    }
-    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
-        return;
-    }
-
-    if (ain != NULL) {
-        const Type *inType = ain->getType();
-
-        mtls->fep.dimX = inType->getDimX();
-        mtls->fep.dimY = inType->getDimY();
-        mtls->fep.dimZ = inType->getDimZ();
-
-    } else if (aout != NULL) {
-        const Type *outType = aout->getType();
-
-        mtls->fep.dimX = outType->getDimX();
-        mtls->fep.dimY = outType->getDimY();
-        mtls->fep.dimZ = outType->getDimZ();
-
-    } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
-
-    if (ain != NULL && aout != NULL) {
-        if (!ain->hasSameDims(aout)) {
-            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-              "Failed to launch kernel; dimensions of input and output allocations do not match.");
-
-            return;
-        }
-    }
-
-    if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls->fep.dimX);
-        rsAssert(sc->xEnd <= mtls->fep.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls->fep.dimY);
-        rsAssert(sc->yEnd <= mtls->fep.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
-    }
-
-    if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dimZ;
-    } else {
-        rsAssert(sc->zStart < mtls->fep.dimZ);
-        rsAssert(sc->zEnd <= mtls->fep.dimZ);
-        rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
-        if (mtls->zStart >= mtls->zEnd) return;
-    }
-
-    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
-
-    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
-    mtls->rsc = mCtx;
-    mtls->ain = ain;
-    mtls->aout = aout;
-    mtls->fep.usr = usr;
-    mtls->fep.usrLen = usrLen;
-    mtls->mSliceSize = 1;
-    mtls->mSliceNum = 0;
-
-    mtls->fep.ptrIn = NULL;
-    mtls->fep.eStrideIn = 0;
-    mtls->isThreadable = mIsThreadable;
-
-    if (ain) {
-        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
-    }
-
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
-    }
-}
-
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+                                        uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
                                         const RsScriptCall *sc,
@@ -909,24 +798,24 @@
 
     memset(mtls, 0, sizeof(MTLaunchStruct));
 
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ains != NULL) {
-        for (int index = inLen; --index >= 0;) {
-            const Allocation* ain = ains[index];
+    for (int index = inLen; --index >= 0;) {
+        const Allocation* ain = ains[index];
 
-            if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-                return;
-            }
+        // mallocPtr can be NULL for IO_INPUT/IO_OUTPUT with no bound surface
+        if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "rsForEach called with null in allocations");
+            return;
         }
     }
 
     if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null out allocations");
         return;
     }
 
-    if (ains != NULL) {
+    if (inLen > 0) {
         const Allocation *ain0   = ains[0];
         const Type       *inType = ain0->getType();
 
@@ -951,11 +840,12 @@
         mtls->fep.dimZ = outType->getDimZ();
 
     } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null allocations");
         return;
     }
 
-    if (ains != NULL && aout != NULL) {
+    if (inLen > 0 && aout != NULL) {
         if (!ains[0]->hasSameDims(aout)) {
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -1002,7 +892,7 @@
     mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
     mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
 
-    rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
+    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
 
     mtls->rsc        = mCtx;
     mtls->ains       = ains;
@@ -1012,18 +902,28 @@
     mtls->mSliceSize = 1;
     mtls->mSliceNum  = 0;
 
-    mtls->fep.ptrIns    = NULL;
-    mtls->fep.eStrideIn = 0;
+    mtls->fep.inPtrs    = NULL;
+    mtls->fep.inStrides = NULL;
     mtls->isThreadable  = mIsThreadable;
 
-    if (ains) {
-        mtls->fep.ptrIns    = new const uint8_t*[inLen];
-        mtls->fep.inStrides = new StridePair[inLen];
+    if (inLen > 0) {
+
+        if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
+            mtls->fep.inPtrs    = (const uint8_t**)mtls->inPtrsBuff;
+            mtls->fep.inStrides = mtls->inStridesBuff;
+        } else {
+            mtls->fep.heapAllocatedArrays = true;
+
+            mtls->fep.inPtrs    = new const uint8_t*[inLen];
+            mtls->fep.inStrides = new StridePair[inLen];
+        }
+
+        mtls->fep.inLen = inLen;
 
         for (int index = inLen; --index >= 0;) {
             const Allocation *ain = ains[index];
 
-            mtls->fep.ptrIns[index] =
+            mtls->fep.inPtrs[index] =
               (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
 
             mtls->fep.inStrides[index].eStride =
@@ -1033,41 +933,27 @@
         }
     }
 
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    mtls->fep.outPtr            = NULL;
+    mtls->fep.outStride.eStride = 0;
+    mtls->fep.outStride.yStride = 0;
+    if (aout != NULL) {
+        mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+
+        mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
+        mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
     }
 }
 
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
-                                     const Allocation * ain,
+                                     const Allocation ** ains,
+                                     uint32_t inLen,
                                      Allocation * aout,
                                      const void * usr,
                                      uint32_t usrLen,
                                      const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    forEachKernelSetup(slot, &mtls);
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-}
-
-void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
-                                          const Allocation ** ains,
-                                          uint32_t inLen,
-                                          Allocation * aout,
-                                          const void * usr,
-                                          uint32_t usrLen,
-                                          const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     forEachKernelSetup(slot, &mtls);
@@ -1338,17 +1224,15 @@
     return NULL;
 }
 
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
-                       Allocation * aout, const void * usr,
-                       uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
+                                 uint32_t inLen, Allocation * aout,
+                                 const void * usr, uint32_t usrLen,
+                                 const RsScriptCall *sc) {}
 
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
-                        Allocation * aout, const void * usr,
-                        uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
+                                  uint32_t inLen, Allocation * aout,
+                                  const void * usr, uint32_t usrLen,
+                                  const RsScriptCall *sc) {}
 
 
 }
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index d51e9e3..f0843cc 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -64,26 +64,22 @@
 
     virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                                     const Allocation** ains,
-                                     uint32_t inLen,
-                                     Allocation* aout,
-                                     const void* usr,
-                                     uint32_t usrLen,
-                                     const RsScriptCall* sc);
+    virtual void invokeForEach(uint32_t slot,
+                               const Allocation ** ains,
+                               uint32_t inLen,
+                               Allocation* aout,
+                               const void* usr,
+                               uint32_t usrLen,
+                               const RsScriptCall* sc);
+
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -100,10 +96,6 @@
 
     const Script * getScript() {return mScript;}
 
-    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                          const void * usr, uint32_t usrLen,
-                          const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                           Allocation * aout, const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 0878552..20ee09d 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -53,38 +53,45 @@
                                          uint32_t instep, uint32_t outstep) {
 
 
-    const ScriptList *sl            = (const ScriptList *)kparams->usr;
+    const ScriptList *sl           = (const ScriptList *)kparams->usr;
     RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
 
+    const void **oldIns  = mkparams->ins;
+    uint32_t *oldStrides = mkparams->inEStrides;
+
+    void *localIns[1];
+    uint32_t localStride[1];
+
+    mkparams->ins        = (const void**)localIns;
+    mkparams->inEStrides = localStride;
+
     for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
         func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
         mkparams->usr = sl->usrPtrs[ct];
 
-        mkparams->in  = NULL;
-        mkparams->out = NULL;
-
-        uint32_t istep = 0;
-        uint32_t ostep = 0;
-
         if (sl->ins[ct]) {
-            mkparams->in =
-              (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+            localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
 
-            istep = sl->ins[ct]->mHal.state.elementSizeBytes;
+            localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
 
             if (sl->inExts[ct]) {
-                mkparams->in =
-                  (const uint8_t *)mkparams->in +
-                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y;
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
 
             } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
-                mkparams->in =
-                  (const uint8_t *)mkparams->in +
-                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid;
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
             }
+
+        } else {
+            localIns[0]    = NULL;
+            localStride[0] = 0;
         }
 
+        uint32_t ostep;
         if (sl->outs[ct]) {
             mkparams->out =
               (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
@@ -101,14 +108,23 @@
                   (uint8_t *)mkparams->out +
                   sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
+        } else {
+            mkparams->out = NULL;
+            ostep         = 0;
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(kparams, xstart, xend, istep, ostep);
+        /*
+         * The fourth argument (instep) is zero here because kernels now read
+         * their input strides from the kparams->inEStrides array instead.
+         */
+        func(kparams, xstart, xend, 0, ostep);
     }
     //ALOGE("script group root");
 
-    mkparams->usr = sl;
+    mkparams->ins        = oldIns;
+    mkparams->inEStrides = oldStrides;
+    mkparams->usr        = sl;
 }
 
 
@@ -195,17 +211,33 @@
 
     MTLaunchStruct mtls;
 
-    if(fieldDep) {
+    if (fieldDep) {
         for (size_t ct=0; ct < ins.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
             uint32_t slot = kernels[ct]->mSlot;
 
-            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+            uint32_t inLen;
+            const Allocation **ains;
+
+            if (ins[ct] == NULL) {
+                inLen = 0;
+                ains  = NULL;
+
+            } else {
+                inLen = 1;
+                ains  = const_cast<const Allocation**>(&ins[ct]);
+            }
+
+            si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls);
+
             si->forEachKernelSetup(slot, &mtls);
-            si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
-            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
-            si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
+            si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
+                          mtls.fep.usrLen, NULL);
+
+            mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls);
+
+            si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL);
         }
     } else {
         ScriptList sl;
@@ -214,6 +246,18 @@
         sl.kernels = kernels.array();
         sl.count = kernels.size();
 
+        uint32_t inLen;
+        const Allocation **ains;
+
+        if (ins[0] == NULL) {
+            inLen = 0;
+            ains  = NULL;
+
+        } else {
+            inLen = 1;
+            ains  = const_cast<const Allocation**>(&ins[0]);
+        }
+
         Vector<const void *> usrPtrs;
         Vector<const void *> fnPtrs;
         Vector<uint32_t> sigs;
@@ -225,7 +269,14 @@
             fnPtrs.add((void *)mtls.kernel);
             usrPtrs.add(mtls.fep.usr);
             sigs.add(mtls.fep.usrLen);
-            si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+
+            // Pass this kernel's own input; ains/inLen describe only ins[0].
+            const Allocation **kIns =
+                ins[ct] ? const_cast<const Allocation**>(&ins[ct]) : NULL;
+            uint32_t kInLen = ins[ct] ? 1 : 0;
+
+            si->preLaunch(kernels[ct]->mSlot, kIns, kInLen, outs[ct],
+                          mtls.fep.usr, mtls.fep.usrLen, NULL);
         }
         sl.sigs = sigs.array();
         sl.usrPtrs = usrPtrs.array();
@@ -235,16 +286,26 @@
 
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
+
+        si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls);
+
         mtls.script = NULL;
         mtls.kernel = (void (*)())&scriptGroupRoot;
         mtls.fep.usr = &sl;
-        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+        mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls);
 
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-            si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+
+            // Pass this kernel's own input; ains/inLen describe only ins[0].
+            const Allocation **kIns =
+                ins[ct] ? const_cast<const Allocation**>(&ins[ct]) : NULL;
+            uint32_t kInLen = ins[ct] ? 1 : 0;
+
+            si->postLaunch(kernels[ct]->mSlot, kIns, kInLen, outs[ct], NULL, 0,
+                           NULL);
         }
     }
 }
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 0076cb9..4728b7c 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,21 +69,15 @@
         virtual void populateScript(Script *) = 0;
         virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
         virtual int invokeRoot() = 0;
+
         virtual void invokeForEach(uint32_t slot,
-                           const Allocation * ain,
-                           Allocation * aout,
-                           const void * usr,
-                           uint32_t usrLen,
-                           const RsScriptCall *sc) = 0;
-                           
-        virtual void invokeForEachMulti(uint32_t slot,
-                                         const Allocation** ains,
-                                         uint32_t inLen,
-                                         Allocation * aout,
-                                         const void * usr,
-                                         uint32_t usrLen,
-                                         const RsScriptCall *sc) = 0;
-        
+                                   const Allocation ** ains,
+                                   uint32_t inLen,
+                                   Allocation * aout,
+                                   const void * usr,
+                                   uint32_t usrLen,
+                                   const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 27029cf..b7c7f2e 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -43,8 +43,9 @@
                      size_t bitcodeSize,
                      uint32_t flags) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-    RsdCpuReference::CpuScript * cs = dc->mCpuRef->createScript(script, resName, cacheDir,
-                                                                bitcode, bitcodeSize, flags);
+    RsdCpuReference::CpuScript * cs =
+        dc->mCpuRef->createScript(script, resName, cacheDir, bitcode,
+                                  bitcodeSize, flags);
     if (cs == NULL) {
         return false;
     }
@@ -53,7 +54,8 @@
     return true;
 }
 
-bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid,
+                      Element *e) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     RsdCpuReference::CpuScript * cs = dc->mCpuRef->createIntrinsic(s, iid, e);
     if (cs == NULL) {
@@ -73,8 +75,15 @@
                             size_t usrLen,
                             const RsScriptCall *sc) {
 
-    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEach(slot, ain, aout, usr, usrLen, sc);
+    if (ain == NULL) {
+        rsdScriptInvokeForEachMulti(rsc, s, slot, NULL, 0, aout, usr, usrLen,
+                                    sc);
+    } else {
+        const Allocation *ains[1] = {ain};
+
+        rsdScriptInvokeForEachMulti(rsc, s, slot, ains, 1, aout, usr, usrLen,
+                                    sc);
+    }
 }
 
 void rsdScriptInvokeForEachMulti(const Context *rsc,
@@ -88,7 +97,7 @@
                                  const RsScriptCall *sc) {
 
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEachMulti(slot, ains, inLen, aout, usr, usrLen, sc);
+    cs->invokeForEach(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 
diff --git a/rsRuntime.h b/rsRuntime.h
index eb93e25..5a05883 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -158,7 +158,7 @@
                 Allocation *in,
                 Allocation *out,
                 const void *usr,
-                 uint32_t usrBytes,
+                uint32_t usrBytes,
                 const RsScriptCall *call);
 
 
diff --git a/rsScript.cpp b/rsScript.cpp
index ea1b3ac..a4fa196 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -187,23 +187,13 @@
     free(tz);
 }
 
-void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
-                       RsAllocation vain, RsAllocation vaout,
-                       const void *params, size_t paramLen,
-                       const RsScriptCall *sc, size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    s->runForEach(rsc, slot,
-                  static_cast<const Allocation *>(vain), static_cast<Allocation *>(vaout),
-                  params, paramLen, sc);
-
-}
-
 void rsi_ScriptForEachMulti(Context *rsc, RsScript vs, uint32_t slot,
                             RsAllocation *vains, size_t inLen,
                             RsAllocation vaout, const void *params,
                             size_t paramLen, const RsScriptCall *sc,
                             size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
+
+    Script      *s    = static_cast<Script *>(vs);
     Allocation **ains = (Allocation**)(vains);
 
     s->runForEach(rsc, slot,
@@ -212,6 +202,23 @@
 
 }
 
+void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
+                       RsAllocation vain, RsAllocation vaout,
+                       const void *params, size_t paramLen,
+                       const RsScriptCall *sc, size_t scLen) {
+
+    // Single-input entry point. The work is delegated to the multi-input
+    // variant: a non-NULL vain is passed as a one-element array, a NULL
+    // vain as an empty input list.
+    const bool    hasInput  = (vain != NULL);
+    RsAllocation  single[1] = {vain};
+    RsAllocation *inputs    = hasInput ? single : NULL;
+    const size_t  numInputs = hasInput ? 1 : 0;
+
+    rsi_ScriptForEachMulti(rsc, vs, slot, inputs, numInputs, vaout, params,
+                           paramLen, sc, scLen);
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     s->Invoke(rsc, slot, NULL, 0);
diff --git a/rsScript.h b/rsScript.h
index 1ad013f..2e232f0 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -108,17 +108,9 @@
 
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL) = 0;
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index e7ff8c7..892807b 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -156,36 +156,6 @@
 
 void ScriptC::runForEach(Context *rsc,
                          uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-    // Trace this function call.
-    // To avoid overhead, we only build the string, if tracing is actually
-    // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
-    if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
-    }
-    ATRACE_NAME(String);
-    (void)String;
-
-    Context::PushState ps(rsc);
-
-    setupGLState(rsc);
-    setupScript(rsc);
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-
-    if (AString)
-        delete AString;
-}
-
-void ScriptC::runForEach(Context *rsc,
-                         uint32_t slot,
                          const Allocation ** ains,
                          size_t inLen,
                          Allocation * aout,
@@ -210,10 +180,22 @@
     setupGLState(rsc);
     setupScript(rsc);
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != NULL) {
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
 
-    if (AString)
+    } else if (inLen <= 1) {  // Legacy entry point; NULL input means none.
+        const Allocation *ain = (inLen == 1) ? ains[0] : NULL;
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout,
+                                             usr, usrBytes, sc);
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
+
+    if (AString) {
         delete AString;
+    }
 }
 
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
diff --git a/rsScriptC.h b/rsScriptC.h
index d3d9d51..5735bea 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -44,14 +44,6 @@
 
     virtual void runForEach(Context *rsc,
                             uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
                             const Allocation ** ains,
                             size_t inLen,
                             Allocation * aout,
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index a41f4a7..cacb37a 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -230,7 +230,17 @@
                 Allocation *in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
                 const RsScriptCall *call) {
-    target->runForEach(rsc, /* root slot */ 0, in, out, usr, usrBytes, call);
+
+    if (in == NULL) {
+        target->runForEach(rsc, /* root slot */ 0, NULL, 0, out, usr,
+                           usrBytes, call);
+
+    } else {
+        const Allocation *ins[1] = {in};  // lone input, wrapped as an array
+        target->runForEach(rsc, /* root slot */ 0, ins,
+                           sizeof(ins) / sizeof(ins[0]), out, usr,
+                           usrBytes, call);
+    }
 }
 
 void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
index d1dd9d8..a03cb78 100644
--- a/rsScriptGroup.cpp
+++ b/rsScriptGroup.cpp
@@ -346,7 +346,15 @@
                 }
             }
 
-            n->mScript->runForEach(rsc, k->mSlot, ain, aout, NULL, 0);
+            if (ain == NULL) {
+                n->mScript->runForEach(rsc, k->mSlot, NULL, 0, aout, NULL, 0);
+
+            } else {
+                const Allocation *ains[1] = {ain};  // lone input as array
+                n->mScript->runForEach(rsc, k->mSlot, ains,
+                                       sizeof(ains) / sizeof(ains[0]),
+                                       aout, NULL, 0);
+            }
         }
 
     }
@@ -409,4 +417,3 @@
 
 }
 }
-
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 86f1c50..7461d34 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -55,18 +55,6 @@
     return 0;
 }
 
-
-void ScriptIntrinsic::runForEach(Context *rsc,
-                         uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-}
-
 void ScriptIntrinsic::runForEach(Context* rsc,
                          uint32_t slot,
                          const Allocation** ains,
@@ -76,7 +64,18 @@
                          size_t usrBytes,
                          const RsScriptCall* sc) {
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != NULL) {
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
+
+    } else if (inLen <= 1) {  // Legacy entry point; NULL input means none.
+        const Allocation *ain = (inLen == 1) ? ains[0] : NULL;
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout,
+                                             usr, usrBytes, sc);
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
 }
 
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
@@ -107,5 +106,3 @@
 
 }
 }
-
-
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 66b6031..87b7353 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -40,17 +40,9 @@
     virtual RsA3DClassID getClassId() const;
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
@@ -69,5 +61,3 @@
 }
 }
 #endif
-
-