resolved conflicts for merge of adbc54f3 to lmp-dev-plus-aosp

Change-Id: I04f438a6fc4cd374a821c32102cc758d9a30d731
diff --git a/Android.mk b/Android.mk
index e8ec005..f6aab50 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,7 +1,7 @@
 
 LOCAL_PATH:=$(call my-dir)
 
-rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions -std=c++11
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
@@ -176,6 +176,9 @@
 LOCAL_C_INCLUDES += external/libcxx/include
 
 LOCAL_CFLAGS += $(rs_base_CFLAGS)
+# TODO: external/freetype still uses the register keyword
+# Bug: 17163086
+LOCAL_CFLAGS += -Wno-deprecated-register
 
 LOCAL_CPPFLAGS += -fno-exceptions
 
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index aa26c6a..3bd7d1e 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -1,7 +1,8 @@
 
 LOCAL_PATH:=$(call my-dir)
 
-rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable \
+                  -fno-exceptions -std=c++11
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
@@ -73,7 +74,7 @@
 ifeq ($(ARCH_X86_HAVE_SSSE3),true)
     LOCAL_CFLAGS += -DARCH_X86_HAVE_SSSE3
     LOCAL_SRC_FILES+= \
-    rsCpuIntrinsics_x86.c
+    rsCpuIntrinsics_x86.cpp
 endif
 
 LOCAL_SHARED_LIBRARIES += libRS libcutils libutils liblog libsync libc++
diff --git a/cpu_ref/linkloader/include/ELFObject.h b/cpu_ref/linkloader/include/ELFObject.h
index 0c195b9..86ac6bf 100644
--- a/cpu_ref/linkloader/include/ELFObject.h
+++ b/cpu_ref/linkloader/include/ELFObject.h
@@ -140,6 +140,11 @@
                     void *context,
                     ELFSectionRelTableTy *reltab,
                     ELFSectionProgBitsTy *text);
+
+  void relocateMIPS64(void *(*find_sym)(void *context, char const *name),
+                      void *context,
+                      ELFSectionRelTableTy *reltab,
+                      ELFSectionProgBitsTy *text);
 };
 
 #include "impl/ELFObject.hxx"
diff --git a/cpu_ref/linkloader/include/ELFReloc.h b/cpu_ref/linkloader/include/ELFReloc.h
index 84754b9..a6d7f5e 100644
--- a/cpu_ref/linkloader/include/ELFReloc.h
+++ b/cpu_ref/linkloader/include/ELFReloc.h
@@ -134,13 +134,27 @@
 
 public:
   xword_t getSymTabIndex() const {
+#if defined(__mips__)
+/*
+ * Packed r_info on MIPS is (field widths in bytes):
+ * r_sym (4) - r_ssym (1) - r_type3 (1) - r_type2 (1) - r_type (1)
+ * Each entry represents up to three actual relocations, so the symbol
+ * index is read from the low 32 bits rather than the high 32 bits.
+ */
+#define ELF64_R_SYM(i)    ((i)&0xffffffffL)
+#else
 #define ELF64_R_SYM(i)    ((i)>>32)
+#endif
     return ELF64_R_SYM(this->r_info);
 #undef ELF64_R_SYM
   }
 
   xword_t getType() const {
+#if defined(__mips__)
+#define ELF64_R_TYPE(i)   ((i)>>32)
+#else
 #define ELF64_R_TYPE(i)   ((i)&0xffffffffL)
+#endif
     return ELF64_R_TYPE(this->r_info);
 #undef ELF64_R_TYPE
   }
diff --git a/cpu_ref/linkloader/include/GOT.h b/cpu_ref/linkloader/include/GOT.h
index b72bf66..c86ca77 100644
--- a/cpu_ref/linkloader/include/GOT.h
+++ b/cpu_ref/linkloader/include/GOT.h
@@ -20,10 +20,15 @@
 #include "utils/rsl_assert.h"
 #include "ELF.h"
 
-#define GP_OFFSET	((int)0x8000)
-#define GOT_SIZE	(1 << 16)	// bytes
-#define GOT_ENTRY_SIZE	4	// bytes
-#define NUM_OF_GOT_ENTRY	(GOT_SIZE/GOT_ENTRY_SIZE)
+#define GP_OFFSET    ((int)0x8000)
+#ifdef __LP64__
+#define GOT_SIZE    (1 << 17) // bytes
+#define GOT_ENTRY_SIZE    8   // bytes
+#else
+#define GOT_SIZE    (1 << 16) // bytes
+#define GOT_ENTRY_SIZE    4   // bytes
+#endif
+#define NUM_OF_GOT_ENTRY  (GOT_SIZE/GOT_ENTRY_SIZE)
 
 void *got_address();
 int search_got(int symbol_index, void *addr, uint8_t bind_type);
diff --git a/cpu_ref/linkloader/include/impl/ELFObject.hxx b/cpu_ref/linkloader/include/impl/ELFObject.hxx
index 81736b5..d849a93 100644
--- a/cpu_ref/linkloader/include/impl/ELFObject.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFObject.hxx
@@ -1022,6 +1022,167 @@
   }
 }
 
+// Applies the relocation entries in |reltab| to |text| for 64-bit MIPS.
+// Each entry packs up to three relocation types that are evaluated in order;
+// while |applyRelocation| is false the intermediate result is carried in
+// |calculatedValue| and becomes the addend of the next step (with S = 0)
+// instead of being written.  Symbols whose address is still zero are looked
+// up through |find_sym|; a failed lookup sets |missingSymbols|.
+// |calculatedValue| is zero-initialized so no step can read garbage.
+template <unsigned Bitwidth>
+inline void ELFObject<Bitwidth>::
+relocateMIPS64(void *(*find_sym)(void *context, char const *name),
+               void *context,
+               ELFSectionRelTableTy *reltab,
+               ELFSectionProgBitsTy *text) {
+  ELFSectionSymTabTy *symtab =
+    static_cast<ELFSectionSymTabTy *>(getSectionByName(".symtab"));
+  rsl_assert(symtab && "Symtab is required.");
+
+  int64_t calculatedValue = 0;
+  bool applyRelocation = true;
+  bool useCalculatedValue;
+
+  for (size_t i = 0; i < reltab->size(); ++i) {
+    ELFRelocTy *rel = (*reltab)[i];
+    ELFSymbolTy *sym = (*symtab)[rel->getSymTabIndex()];
+
+    typedef int64_t Inst_t;
+    Inst_t *inst = (Inst_t *)&(*text)[rel->getOffset()];
+    Inst_t P = (Inst_t)(uintptr_t)inst;
+    Inst_t A = (Inst_t)rel->getAddend();
+    Inst_t S = (Inst_t)(uintptr_t)sym->getAddress(EM_MIPS);
+
+    if (S == 0) {
+      S = (Inst_t)(uintptr_t)find_sym(context, sym->getName());
+      if (!S) {
+        missingSymbols = true;
+      }
+      sym->setAddress((void *)S);
+    }
+
+    uint8_t rtype[3];
+    rtype[0] = (rel->getType() >> 24) & 0xFF;
+    rtype[1] = (rel->getType() >> 16) & 0xFF;
+    rtype[2] = (rel->getType() >> 8) & 0xFF;
+
+    for (size_t j = 0; j < 3; ++j) {
+      useCalculatedValue = !applyRelocation;
+      if (j < 2) {
+        applyRelocation = (rtype[j+1] == R_MIPS_NONE);
+      } else {
+        // Defer the write only when another table entry follows and targets
+        // the same location; the last entry must always be written out.
+        const bool hasNext = (i + 1) < reltab->size();
+        applyRelocation = !hasNext ||
+            (inst != (Inst_t *)&(*text)[(*reltab)[i + 1]->getOffset()]);
+      }
+
+      if (useCalculatedValue) {
+        S = 0;
+        A = calculatedValue;
+      }
+
+      switch (rtype[j]) {
+      default:
+        rsl_assert(0 && "Not implemented relocation type.");
+        break;
+
+      case R_MIPS_NONE:
+        break;
+
+      case R_MIPS_64:
+        calculatedValue = S + A;
+        if (applyRelocation) {
+          *inst = calculatedValue;
+        }
+        break;
+
+      case R_MIPS_26:
+        if (sym->getBindingAttribute() == STB_LOCAL) {
+          // Local binding: merge in the top bits of the address after P.
+          A |= ((P + 4) & 0xF0000000);
+        }
+        A += S;
+        calculatedValue = (A >> 2);
+        if (applyRelocation) {
+          *inst |= (calculatedValue & 0x3FFFFFF);
+        }
+        break;
+
+      case R_MIPS_CALL16:
+      case R_MIPS_GOT_PAGE:
+      case R_MIPS_GOT_DISP: {
+        A = A & 0xFFFF;
+        int got_index = search_got((int)rel->getSymTabIndex(),
+                                   (void *)(S + A),
+                                   sym->getBindingAttribute());
+        calculatedValue = (got_index << 3) - 0x7FF0;
+        if (applyRelocation) {
+          *inst |= (calculatedValue & 0xFFFF);
+        }
+        break;
+      }
+
+      case R_MIPS_GPREL32:
+        calculatedValue = A + S - ((int64_t)got_address() + 0x7FF0);
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_GOT_OFST:
+        calculatedValue = (S + A) & 0xFFFF;
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_GPREL16:
+        calculatedValue = A + S - ((int64_t)got_address() + 0x7FF0);
+        if (applyRelocation) {
+          *inst |= (calculatedValue & 0xFFFF);
+        }
+        break;
+
+      case R_MIPS_SUB:
+        calculatedValue = S - A;
+        if (applyRelocation) {
+          *inst = calculatedValue;
+        }
+        break;
+
+      case R_MIPS_HI16:
+        calculatedValue = ((S + A + 0x8000) >> 16) & 0xFFFF;
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_LO16:
+        calculatedValue = (S + A) & 0xFFFF;
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_HIGHER:
+        calculatedValue = ((S + A + 0x80008000) >> 32) & 0xFFFF;
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_HIGHEST:
+        calculatedValue = ((S + A + 0x800080008000) >> 48) & 0xFFFF;
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+      }
+    }
+  }
+}
 
 // TODO: Refactor all relocations.
 template <unsigned Bitwidth>
@@ -1121,7 +1282,11 @@
         relocateX86_64(find_sym, context, reltab, need_rel);
         break;
       case EM_MIPS:
-        relocateMIPS(find_sym, context, reltab, need_rel);
+        if (getHeader()->getClass() == ELFCLASS64) {
+          relocateMIPS64(find_sym, context, reltab, need_rel);
+        } else {
+          relocateMIPS(find_sym, context, reltab, need_rel);
+        }
         break;
 
       default:
diff --git a/cpu_ref/linkloader/lib/GOT.cpp b/cpu_ref/linkloader/lib/GOT.cpp
index 3f523c5..7e85fb1 100644
--- a/cpu_ref/linkloader/lib/GOT.cpp
+++ b/cpu_ref/linkloader/lib/GOT.cpp
@@ -33,7 +33,12 @@
   // For local symbols (R_MIPS_GOT16), we only store the high 16-bit value
   // after adding 0x8000.
   if (bind_type == STB_LOCAL)
+#ifdef __LP64__
+    addr = (void *)(((intptr_t)addr + 0x8000) & 0xFFFFFFFFFFFF0000);
+#else
     addr = (void *)(((intptr_t)addr + 0x8000) & 0xFFFF0000);
+#endif
+
 
   for (i = 0; i < got_symbol_count; i++) {
     if (got_symbol_indexes[i] == symbol_index) {
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 3124ba1..752f169 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -48,9 +48,8 @@
 using namespace android::renderscript;
 
 typedef void (*outer_foreach_t)(
-    const android::renderscript::RsForEachStubParamStruct *,
-    uint32_t x1, uint32_t x2,
-    uint32_t instep, uint32_t outstep);
+    const android::renderscript::RsExpandKernelParams *,
+    uint32_t x1, uint32_t x2, uint32_t outstep);
 
 
 static pthread_key_t gThreadTLSKey = 0;
@@ -350,153 +349,132 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void (*walk_loop_t)(MTLaunchStruct*,
+                            RsExpandKernelParams&,
+                            outer_foreach_t);
 
-static void wc_xy(void *usr, uint32_t idx) {
+
+static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
 
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd = yStart + mtls->mSliceSize;
-        yEnd = rsMin(yEnd, mtls->yEnd);
-        if (yEnd <= yStart) {
-            return;
-        }
+    uint32_t inLen = mtls->fep.inLen;
 
-        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
 
-        for (p.y = yStart; p.y < yEnd; p.y++) {
-            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
-                    (mtls->fep.eStrideOut * mtls->xStart);
-            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
-                   (mtls->fep.eStrideIn * mtls->xStart);
-            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
+
+    if (inLen > 0) {
+        // Allocate space for our input base pointers.
+        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+
+        // Allocate space for our input stride information.
+        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+
+        // Fill our stride information.
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
     }
-}
-
-static void wc_x(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd = xStart + mtls->mSliceSize;
-        xEnd = rsMin(xEnd, mtls->xEnd);
-        if (xEnd <= xStart) {
-            return;
-        }
 
-        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
-        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-    }
+    walk_loop(mtls, kparams, fn);
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
-                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
+static void walk_2d(void *usr, uint32_t idx) {
+    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+                              RsExpandKernelParams &kparams,
+                              outer_foreach_t fn) {
 
-    //android::StopWatch kernel_time("kernel time");
+        while (1) {
+            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+            uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
-        const size_t targetByteChunk = 16 * 1024;
-        mInForEach = true;
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            yEnd = rsMin(yEnd, mtls->yEnd);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
-
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
+            if (yEnd <= yStart) {
+                return;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
-        } else {
-            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+                kparams.out = mtls->fep.outPtr +
+                              (mtls->fep.outStride.yStride * kparams.y) +
+                              (mtls->fep.outStride.eStride * mtls->xStart);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
+                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+                    StridePair &strides = mtls->fep.inStrides[inIndex];
 
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
-            }
-
-            launchThreads(wc_x, mtls);
-        }
-        mInForEach = false;
-
-        //ALOGE("launch 1");
-    } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
-
-        //ALOGE("launch 3");
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
-                                      mtls->fep.dimY * p.z + p.y;
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
-                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
-                           (mtls->fep.eStrideIn * mtls->xStart);
-                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+                    kparams.ins[inIndex] =
+                      mtls->fep.inPtrs[inIndex] +
+                      (strides.yStride * kparams.y) +
+                      (strides.eStride * mtls->xStart);
                 }
+
+                fn(&kparams, mtls->xStart, mtls->xEnd,
+                   mtls->fep.outStride.eStride);
             }
         }
-    }
+    });
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
-                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+static void walk_1d(void *usr, uint32_t idx) {
+    // Repeatedly claims the next X slice until the range is exhausted.
+    walk_wrapper(usr, idx, [](MTLaunchStruct *launch,
+                              RsExpandKernelParams &params,
+                              outer_foreach_t kernel) {
+        RsExpandKernelDriverInfo &info = launch->fep;
+        for (;;) {
+            // Atomically reserve the next slice index from the launch struct.
+            const uint32_t sliceIdx =
+                (uint32_t)__sync_fetch_and_add(&launch->mSliceNum, 1);
+            const uint32_t first = launch->xStart + sliceIdx * launch->mSliceSize;
+            const uint32_t last  = rsMin(first + launch->mSliceSize, launch->xEnd);
+
+            // Stop once the claimed slice lies past the end of the range.
+            if (last <= first) {
+                return;
+            }
+
+            params.out = info.outPtr + (info.outStride.eStride * first);
+
+            // Point every input at the first element of this slice.
+            for (int in = (int)info.inLen - 1; in >= 0; --in) {
+                params.ins[in] =
+                    info.inPtrs[in] + (info.inStrides[in].eStride * first);
+            }
+
+            kernel(&params, first, last, info.outStride.eStride);
+        }
+    });
+}
+
+
+void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
+                                        uint32_t inLen,
+                                        Allocation* aout,
+                                        const RsScriptCall* sc,
+                                        MTLaunchStruct* mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
+
         if (mtls->fep.dimY > 1) {
             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            if (mtls->fep.outStride.yStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.yStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -504,18 +482,18 @@
                 mtls->mSliceSize = 1;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
+            launchThreads(walk_2d, mtls);
         } else {
             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            if (mtls->fep.outStride.eStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.eStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -523,62 +501,61 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(wc_x, mtls);
+            launchThreads(walk_1d, mtls);
         }
         mInForEach = false;
 
-        //ALOGE("launch 1");
     } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
 
-        // Allocate space for our input base pointers.
-        p.ins = new const void*[inLen];
+        if (inLen > 0) {
+            // Allocate space for our input base pointers.
+            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
 
-        // Allocate space for our input stride information.
-        p.eStrideIns = new uint32_t[inLen];
+            // Allocate space for our input stride information.
+            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
 
-        // Fill our stride information.
-        for (int index = inLen; --index >= 0;) {
-          p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+            // Fill our stride information.
+            for (int inIndex = inLen; --inIndex >= 0;) {
+                kparams.inEStrides[inIndex] =
+                    mtls->fep.inStrides[inIndex].eStride;
+            }
         }
 
         //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        uint32_t offset_invariant = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0];
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            uint32_t offset_part = offset_invariant * p.ar[0];
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
 
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
 
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
+                    uint32_t offset =
+                      mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
+                      mtls->fep.dimY * kparams.z + kparams.y;
 
-                    for (int index = inLen; --index >= 0;) {
-                        StridePair &strides = mtls->fep.inStrides[index];
+                    kparams.out = mtls->fep.outPtr +
+                                  (mtls->fep.outStride.yStride * offset) +
+                                  (mtls->fep.outStride.eStride * mtls->xStart);
 
-                        p.ins[index] = mtls->fep.ptrIns[index] +
-                                       (strides.yStride * offset) +
-                                       (strides.eStride * mtls->xStart);
+                    for (int inIndex = inLen; --inIndex >= 0;) {
+                        StridePair &strides = mtls->fep.inStrides[inIndex];
+
+                        kparams.ins[inIndex] =
+                          mtls->fep.inPtrs[inIndex] +
+                          (strides.yStride * offset) +
+                          (strides.eStride * mtls->xStart);
                     }
 
-                    /*
-                     * The fourth argument is zero here because multi-input
-                     * kernels get their stride information from a member of p
-                     * that points to an array.
-                     */
-                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                    fn(&kparams, mtls->xStart, mtls->xEnd,
+                       mtls->fep.outStride.eStride);
                 }
             }
         }
-
-        // Free our arrays.
-        delete[] p.ins;
-        delete[] p.eStrideIns;
     }
 }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index c54dca2..2fea3fc 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,6 +25,8 @@
 
 #include <string>
 
+#define RS_KERNEL_INPUT_THRESHOLD 32
+
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -34,44 +36,80 @@
 namespace android {
 namespace renderscript {
 
-typedef struct {
+struct StridePair {
   uint32_t eStride;
   uint32_t yStride;
-} StridePair;
+};
 
-typedef struct {
-    const void *in;
-    void *out;
-    const void *usr;
-    uint32_t usrLen;
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-    uint32_t lod;
-    RsAllocationCubemapFace face;
-    uint32_t ar[16];
+struct RsExpandKernelDriverInfo {
+    const uint8_t **inPtrs;
+    uint32_t inLen;
 
-    const void **ins;
-    uint32_t *eStrideIns;
+    uint8_t *outPtr;
 
-    uint32_t lid;
+    StridePair *inStrides;
+    StridePair  outStride;
 
     uint32_t dimX;
     uint32_t dimY;
     uint32_t dimZ;
-    uint32_t dimArray;
 
-    const uint8_t *ptrIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideIn;
-    uint32_t eStrideOut;
-    uint32_t yStrideIn;
-    uint32_t yStrideOut;
     uint32_t slot;
 
-    const uint8_t** ptrIns;
-    StridePair* inStrides;
-} RsForEachStubParamStruct;
+    const void *usr;
+    uint32_t usrLen;
+
+    bool heapAllocatedArrays;
+
+    RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
+
+    ~RsExpandKernelDriverInfo() {
+        if (heapAllocatedArrays) {
+            if (inPtrs != NULL) {
+                delete[] inPtrs;
+            }
+
+            if (inStrides != NULL) {
+                delete[] inStrides;
+            }
+        }
+    }
+};
+
+struct RsExpandKernelParams {
+    // Every field defaults to zero/NULL so a field that a launch path does
+    // not assign (e.g. lid, y or z) is never read as garbage by a kernel.
+    const void **ins = NULL;          // per-input base pointers (kernels)
+    uint32_t *inEStrides = NULL;      // per-input element strides (kernels)
+    void *out = NULL;                 // output pointer (kernels)
+    uint32_t y = 0;
+    uint32_t z = 0;
+    uint32_t lid = 0;
+
+    // Used by ScriptGroup and user kernels.
+    const void *usr = NULL;
+
+    // Used by intrinsics
+    uint32_t dimX = 0;
+    uint32_t dimY = 0;
+    uint32_t dimZ = 0;
+
+    /*
+     * FIXME: This is only used by the blend intrinsic.  If possible, we should
+     *        modify blend to not need it.
+     */
+    uint32_t slot = 0;
+
+    /// Copy fields needed by a kernel from a driver struct.
+    void takeFields(const RsExpandKernelDriverInfo &dstruct) {
+        this->usr  = dstruct.usr;
+        this->slot = dstruct.slot;
+
+        this->dimX = dstruct.dimX;
+        this->dimY = dstruct.dimY;
+        this->dimZ = dstruct.dimZ;
+    }
+};
 
 extern bool gArchUseSIMD;
 
@@ -82,21 +120,21 @@
 class RsdCpuScriptImpl;
 class RsdCpuReferenceImpl;
 
-typedef struct ScriptTLSStructRec {
+struct ScriptTLSStruct {
     android::renderscript::Context * mContext;
     const android::renderscript::Script * mScript;
     RsdCpuScriptImpl *mImpl;
-} ScriptTLSStruct;
+};
 
-typedef struct {
-    RsForEachStubParamStruct fep;
+struct MTLaunchStruct {
+    RsExpandKernelDriverInfo fep;
 
     RsdCpuReferenceImpl *rsc;
     RsdCpuScriptImpl *script;
 
     ForEachFunc_t kernel;
     uint32_t sig;
-    const Allocation * ain;
+    const Allocation ** ains;
     Allocation * aout;
 
     uint32_t mSliceSize;
@@ -112,12 +150,9 @@
     uint32_t arrayStart;
     uint32_t arrayEnd;
 
-    // Multi-input data.
-    const Allocation ** ains;
-} MTLaunchStruct;
-
-
-
+    const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
+    StridePair     inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
+};
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
@@ -138,9 +173,6 @@
         return mWorkers.mCount + 1;
     }
 
-    void launchThreads(const Allocation * ain, Allocation * aout,
-                       const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
                        const RsScriptCall* sc, MTLaunchStruct* mtls);
 
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 5a7fffd..8437c99 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,54 +73,29 @@
 }
 
 
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
+                                      uint32_t inLen, Allocation * aout,
+                                      const void * usr, uint32_t usrLen,
+                                      const RsScriptCall *sc) {
 }
 
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
+                                       uint32_t inLen, Allocation * aout,
+                                       const void * usr, uint32_t usrLen,
+                                       const RsScriptCall *sc) {
 }
 
 void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
-                                          const Allocation * ain,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
                                           Allocation * aout,
                                           const void * usr,
                                           uint32_t usrLen,
                                           const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    preLaunch(slot, ain, aout, usr, usrLen, sc);
 
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    mtls.script = this;
-    mtls.fep.slot = slot;
-
-    mtls.kernel = (void (*)())mRootPtr;
-    mtls.fep.usr = this;
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-
-    postLaunch(slot, ain, aout, usr, usrLen, sc);
-}
-
-void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
-                                               const Allocation ** ains,
-                                               uint32_t inLen,
-                                               Allocation * aout,
-                                               const void * usr,
-                                               uint32_t usrLen,
-                                               const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
-    /*
-     * FIXME: Possibly create new preLaunch and postLaunch functions that take
-     *        all of the input allocation pointers.
-     */
-    preLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     mtls.script = this;
@@ -133,7 +108,7 @@
     mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
     mCtx->setTLS(oldTLS);
 
-    postLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index bf6a8ac..95aaa14 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,43 +28,42 @@
 public:
     virtual void populateScript(Script *) = 0;
 
-    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual void invokeFunction(uint32_t slot, const void * params,
+                                size_t paramLength);
     virtual int invokeRoot();
+
     virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+                               const Allocation ** ain,
+                               uint32_t inLen,
+                               Allocation * aout,
+                               const void * usr,
+                               uint32_t usrLen,
+                               const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                       const Allocation ** ain,
-                       uint32_t inLen,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
-
-    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
-                           uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall * sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall * sc);
 
-    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
-    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
-                                  const Element *e, const uint32_t *dims, size_t dimLength);
+    virtual void setGlobalVar(uint32_t slot, const void * data,
+                              size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
+                                          size_t dataLength, const Element * e,
+                                          const uint32_t * dims,
+                                          size_t dimLength);
     virtual void setGlobalBind(uint32_t slot, Allocation *data);
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsic();
-    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
-                          RsScriptIntrinsicID iid);
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
+                          const Element * e, RsScriptIntrinsicID iid);
 
 protected:
     RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index a7c9487..ce7c9c6 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -38,9 +38,9 @@
 protected:
     ObjectBaseRef<Allocation> mLUT;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -58,13 +58,13 @@
                                       int dimx, int dimy, int dimz);
 
 
-void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
-                                      uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
+                                        uint32_t xstart, uint32_t xend,
+                                        uint32_t outstep) {
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
     uchar4 *out = (uchar4 *)p->out + xstart;
-    uchar4 *in = (uchar4 *)p->in + xstart;
+    uchar4 *in = (uchar4 *)p->ins[0] + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -161,9 +161,9 @@
     }
 }
 
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
-                                                     const Script *s, const Element *e)
-            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
+    RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
+        RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
 
     mRootPtr = &kernel;
 }
@@ -185,5 +185,3 @@
 
     return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 228b887..2beec3d 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -33,9 +33,8 @@
     RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    static void kernel(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
+    static void kernel(const RsExpandKernelParams *p, uint32_t xstart,
+                       uint32_t xend, uint32_t outstep);
 };
 
 }
@@ -110,14 +109,14 @@
 extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 #endif
 
-void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
                                         uint32_t xstart, uint32_t xend,
-                                        uint32_t instep, uint32_t outstep) {
+                                        uint32_t outstep) {
     RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)p->usr;
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
     uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
+    uchar4 *in = (uchar4 *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -509,6 +508,3 @@
                                       const Script *s, const Element *e) {
     return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index c1ca4e2..7f888e9 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -44,12 +44,12 @@
     int mIradius;
     ObjectBaseRef<Allocation> mAlloc;
 
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
     void ComputeGaussianWeights();
 };
 
@@ -113,7 +113,7 @@
 
 
 
-static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+static void OneVU4(const RsExpandKernelParams *p, float4 *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x*4;
@@ -131,7 +131,7 @@
     out->xyzw = blurredPixel;
 }
 
-static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
+static void OneVU1(const RsExpandKernelParams *p, float *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x;
@@ -243,7 +243,7 @@
     }
 }
 
-static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+static void OneHU4(const RsExpandKernelParams *p, uchar4 *out, int32_t x,
                    const float4 *ptrIn, const float* gPtr, int iradius) {
 
     float4 blurredPixel = 0;
@@ -258,7 +258,7 @@
     out->xyzw = convert_uchar4(blurredPixel);
 }
 
-static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
+static void OneHU1(const RsExpandKernelParams *p, uchar *out, int32_t x,
                    const float *ptrIn, const float* gPtr, int iradius) {
 
     float blurredPixel = 0;
@@ -274,9 +274,9 @@
 }
 
 
-void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
 
     float4 stackbuf[2048];
     float4 *buf = &stackbuf[0];
@@ -345,9 +345,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
     float buf[4 * 2048];
     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
     if (!cp->mAlloc.get()) {
@@ -464,5 +464,3 @@
 
     return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 19894bc..e298d94 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,10 +169,9 @@
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall *sc);
 
 protected:
     float fp[16];
@@ -188,9 +187,9 @@
     FunctionTab_t mFnTab;
 #endif
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
     void updateCoeffCache(float fpMul, float addMul);
 
     Key_t mLastKey;
@@ -778,7 +777,7 @@
 }
 
 
-static void One(const RsForEachStubParamStruct *p, void *out,
+static void One(const RsExpandKernelParams *p, void *out,
                 const void *py, const float* coeff, const float *add,
                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
 
@@ -879,10 +878,13 @@
     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
                                               uint32_t xstart, uint32_t xend,
-                                              uint32_t instep, uint32_t outstep) {
+                                              uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+
+    uint32_t instep = p->inEStrides[0];
+
     uchar *out = (uchar *)p->out;
     uchar *in = (uchar *)p->in;
     uint32_t x1 = xstart;
@@ -932,11 +934,15 @@
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
+                                                 const Allocation ** ains,
+                                                 uint32_t inLen,
+                                                 Allocation * aout,
+                                                 const void * usr,
+                                                 uint32_t usrLen,
+                                                 const RsScriptCall *sc) {
 
-    const Element *ein = ain->mHal.state.type->getElement();
+    const Element *ein = ains[0]->mHal.state.type->getElement();
     const Element *eout = aout->mHal.state.type->getElement();
 
     if (ein->getType() == eout->getType()) {
@@ -953,8 +959,8 @@
         }
     }
 
-    Key_t key = computeKey(ain->mHal.state.type->getElement(),
-                           aout->mHal.state.type->getElement());
+    Key_t key = computeKey(ein, eout);
+
 #if defined(ARCH_X86_HAVE_SSSE3)
     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
@@ -996,12 +1002,6 @@
 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
-
-}
-
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a835..f9b70cc 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -42,24 +42,24 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 };
 
 }
@@ -88,7 +88,7 @@
                                           const void *y2, const short *coef, uint32_t count);
 
 
-static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void ConvolveOneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
                           const float* coeff) {
 
@@ -110,7 +110,7 @@
     *out = o;
 }
 
-static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void ConvolveOneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
                           const float* coeff) {
 
@@ -131,7 +131,7 @@
     *out = convert_uchar2(px);
 }
 
-static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void ConvolveOneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                           const uchar *py0, const uchar *py1, const uchar *py2,
                           const float* coeff) {
 
@@ -150,7 +150,7 @@
     *out = clamp(px, 0.f, 255.f);
 }
 
-static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void ConvolveOneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                           const float4 *py0, const float4 *py1, const float4 *py2,
                           const float* coeff) {
 
@@ -161,7 +161,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void ConvolveOneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                           const float2 *py0, const float2 *py1, const float2 *py2,
                           const float* coeff) {
 
@@ -172,7 +172,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void ConvolveOneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                           const float *py0, const float *py1, const float *py2,
                           const float* coeff) {
 
@@ -183,9 +183,9 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -230,9 +230,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -275,9 +275,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -320,9 +320,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -365,9 +365,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -409,9 +409,9 @@
         }
     }
 }
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -507,5 +507,3 @@
 
     return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index ffa9543..e263e74 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -42,24 +42,24 @@
     ObjectBaseRef<Allocation> alloc;
 
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 
 
 };
@@ -86,7 +86,7 @@
 }
 
 
-static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void OneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
                   const float* coeff) {
 
@@ -129,7 +129,7 @@
     *out = convert_uchar4(px);
 }
 
-static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void OneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
                   const float* coeff) {
 
@@ -172,7 +172,7 @@
     *out = convert_uchar2(px);
 }
 
-static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void OneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
                   const float* coeff) {
 
@@ -215,7 +215,7 @@
     *out = px;
 }
 
-static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void OneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
                   const float* coeff) {
 
@@ -257,7 +257,7 @@
     *out = px;
 }
 
-static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void OneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
                   const float* coeff) {
 
@@ -299,7 +299,7 @@
     *out = px;
 }
 
-static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void OneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
                   const float* coeff) {
 
@@ -346,9 +346,9 @@
                                           const void *y2, const void *y3, const void *y4,
                                           const short *coef, uint32_t count);
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -406,9 +406,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -455,9 +455,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -504,9 +504,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -553,9 +553,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -602,9 +602,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -705,6 +705,3 @@
 
     return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index cdfe7d1..d3dce6d 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@
     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    void preLaunch(uint32_t slot, const Allocation * ain,
+    void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                    Allocation * aout, const void * usr,
                    uint32_t usrLen, const RsScriptCall *sc);
-    void postLaunch(uint32_t slot, const Allocation * ain,
+    void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                     Allocation * aout, const void * usr,
                     uint32_t usrLen, const RsScriptCall *sc);
 
@@ -49,31 +49,31 @@
     int *mSums;
     ObjectBaseRef<Allocation> mAllocOut;
 
-    static void kernelP1U4(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U3(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U2(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U1(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
+    static void kernelP1U4(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U3(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U2(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U1(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
 
-    static void kernelP1L4(const RsForEachStubParamStruct *p,
+    static void kernelP1L4(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L3(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L3(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L2(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L2(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L1(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L1(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
+                           uint32_t outstep);
 
 };
 
@@ -97,9 +97,12 @@
 
 
 
-void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen, Allocation * aout,
+                                          const void * usr, uint32_t usrLen,
+                                          const RsScriptCall *sc) {
 
     const uint32_t threads = mCtx->getThreadCount();
     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -123,7 +126,7 @@
         }
         break;
     case 1:
-        switch(ain->getType()->getElement()->getVectorSize()) {
+        switch(ains[0]->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelP1L1;
             break;
@@ -142,9 +145,12 @@
     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
 }
 
-void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
+                                           const Allocation ** ains,
+                                           uint32_t inLen,  Allocation * aout,
+                                           const void * usr, uint32_t usrLen,
+                                           const RsScriptCall *sc) {
 
     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
     uint32_t threads = mCtx->getThreadCount();
@@ -160,12 +166,12 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -173,47 +179,47 @@
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 2 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 1)    ] ++;
         sums[(in[1] << 1) + 1] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -222,16 +228,16 @@
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -239,52 +245,52 @@
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[in[0]] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -323,5 +329,3 @@
 
     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 5b2adc5..b08a0e5 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -38,9 +38,9 @@
 protected:
     ObjectBaseRef<Allocation> lut;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -53,13 +53,13 @@
 }
 
 
-void RsdCpuScriptIntrinsicLUT::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
                                       uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep) {
+                                      uint32_t outstep) {
     RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
 
     uchar *out = (uchar *)p->out;
-    const uchar *in = (uchar *)p->in;
+    const uchar *in = (uchar *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -103,5 +103,3 @@
 
     return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
index c31fcdf..05ccfd6 100644
--- a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
+++ b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
@@ -174,17 +174,17 @@
     ObjectBaseRef<Allocation> mFrameBuffer;
 
     void doLoopFilter();
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
 }
 
-void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsExpandKernelParams *p,
                                              uint32_t xstart, uint32_t xend,
-                                             uint32_t instep, uint32_t outstep) {
+                                             uint32_t outstep) {
     RsdCpuScriptIntrinsicLoopFilter *cp = (RsdCpuScriptIntrinsicLoopFilter*)p->usr;
     memset((void*)&cp->mPrch.chart, 0, sizeof(cp->mPrch.chart));
     cp->mPrch.chart[0] = 0x0fffffff;
@@ -1232,4 +1232,3 @@
         rsAssert(rv == 0);
     }
 }
-
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 474f82d..fa0e8ee 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@
     virtual ~RsdCpuScriptIntrinsicResize();
     RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
 
     float scaleX;
@@ -46,15 +46,15 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 };
 
 }
@@ -175,9 +175,9 @@
     return (uchar)p;
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -215,9 +215,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -255,9 +255,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -304,9 +304,11 @@
 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
 }
 
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
-                                            Allocation * aout, const void * usr,
-                                            uint32_t usrLen, const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
+                                            const Allocation ** ains,
+                                            uint32_t inLen, Allocation * aout,
+                                            const void * usr, uint32_t usrLen,
+                                            const RsScriptCall *sc)
 {
     if (!mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -347,5 +349,3 @@
 
     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index c53ef31..390ca3c 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -46,9 +46,9 @@
 protected:
     ObjectBaseRef<Allocation> alloc;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -101,9 +101,9 @@
 extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
-void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsExpandKernelParams *p,
                                            uint32_t xstart, uint32_t xend,
-                                           uint32_t instep, uint32_t outstep) {
+                                           uint32_t outstep) {
     RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("YuvToRGB executed without input, skipping");
diff --git a/cpu_ref/rsCpuIntrinsics_x86.c b/cpu_ref/rsCpuIntrinsics_x86.cpp
similarity index 100%
rename from cpu_ref/rsCpuIntrinsics_x86.c
rename to cpu_ref/rsCpuIntrinsics_x86.cpp
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda1..e0b4004 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -176,7 +176,7 @@
         // library fallback path. Those applications don't have a private
         // library path, so they need to install to the system directly.
         // Note that this is really just a testing path.
-        android::String8 scriptSONameSystem("/system/lib/librs.");
+        std::string scriptSONameSystem("/system/lib/librs.");
         scriptSONameSystem.append(resName);
         scriptSONameSystem.append(".so");
         loaded = loadSOHelper(scriptSONameSystem.c_str(), cacheDir,
@@ -216,9 +216,11 @@
 
 const static char *BCC_EXE_PATH = "/system/bin/bcc";
 
-static void setCompileArguments(std::vector<const char*>* args, const android::String8& bcFileName,
-                                const char* cacheDir, const char* resName, const char* core_lib,
-                                bool useRSDebugContext, const char* bccPluginName) {
+static void setCompileArguments(std::vector<const char*>* args,
+                                const std::string& bcFileName,
+                                const char* cacheDir, const char* resName,
+                                const char* core_lib, bool useRSDebugContext,
+                                const char* bccPluginName) {
     rsAssert(cacheDir && resName && core_lib);
     args->push_back(BCC_EXE_PATH);
     args->push_back("-o");
@@ -242,27 +244,27 @@
         }
     }
 
-    args->push_back(bcFileName.string());
+    args->push_back(bcFileName.c_str());
     args->push_back(NULL);
 }
 
-static bool compileBitcode(const android::String8& bcFileName,
+static bool compileBitcode(const std::string &bcFileName,
                            const char *bitcode,
                            size_t bitcodeSize,
-                           const char** compileArguments,
-                           const std::string& compileCommandLine) {
+                           const char **compileArguments,
+                           const std::string &compileCommandLine) {
     rsAssert(bitcode && bitcodeSize);
 
-    FILE *bcfile = fopen(bcFileName.string(), "w");
+    FILE *bcfile = fopen(bcFileName.c_str(), "w");
     if (!bcfile) {
-        ALOGE("Could not write to %s", bcFileName.string());
+        ALOGE("Could not write to %s", bcFileName.c_str());
         return false;
     }
     size_t nwritten = fwrite(bitcode, 1, bitcodeSize, bcfile);
     fclose(bcfile);
     if (nwritten != bitcodeSize) {
         ALOGE("Could not write %zu bytes to %s", bitcodeSize,
-              bcFileName.string());
+              bcFileName.c_str());
         return false;
     }
 
@@ -426,7 +428,7 @@
         useRSDebugContext = true;
     }
 
-    android::String8 bcFileName(cacheDir);
+    std::string bcFileName(cacheDir);
     bcFileName.append("/");
     bcFileName.append(resName);
     bcFileName.append(".bc");
@@ -760,9 +762,9 @@
     script->mHal.info.exportedForeachFuncList = &mExportedForEachFuncList[0];
     script->mHal.info.exportedPragmaCount = mExecutable->getPragmaKeys().size();
     script->mHal.info.exportedPragmaKeyList =
-        const_cast<const char**>(mExecutable->getPragmaKeys().array());
+        const_cast<const char**>(&mExecutable->getPragmaKeys().front());
     script->mHal.info.exportedPragmaValueList =
-        const_cast<const char**>(mExecutable->getPragmaValues().array());
+        const_cast<const char**>(&mExecutable->getPragmaValues().front());
 
     if (mRootExpand) {
         script->mHal.info.root = mRootExpand;
@@ -789,119 +791,8 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                                        const void * usr, uint32_t usrLen,
-                                        const RsScriptCall *sc,
-                                        MTLaunchStruct *mtls) {
-
-    memset(mtls, 0, sizeof(MTLaunchStruct));
-
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-        return;
-    }
-    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
-        return;
-    }
-
-    if (ain != NULL) {
-        const Type *inType = ain->getType();
-
-        mtls->fep.dimX = inType->getDimX();
-        mtls->fep.dimY = inType->getDimY();
-        mtls->fep.dimZ = inType->getDimZ();
-
-    } else if (aout != NULL) {
-        const Type *outType = aout->getType();
-
-        mtls->fep.dimX = outType->getDimX();
-        mtls->fep.dimY = outType->getDimY();
-        mtls->fep.dimZ = outType->getDimZ();
-
-    } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
-
-    if (ain != NULL && aout != NULL) {
-        if (!ain->hasSameDims(aout)) {
-            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-              "Failed to launch kernel; dimensions of input and output allocations do not match.");
-
-            return;
-        }
-    }
-
-    if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls->fep.dimX);
-        rsAssert(sc->xEnd <= mtls->fep.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls->fep.dimY);
-        rsAssert(sc->yEnd <= mtls->fep.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
-    }
-
-    if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dimZ;
-    } else {
-        rsAssert(sc->zStart < mtls->fep.dimZ);
-        rsAssert(sc->zEnd <= mtls->fep.dimZ);
-        rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
-        if (mtls->zStart >= mtls->zEnd) return;
-    }
-
-    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
-
-    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
-    mtls->rsc = mCtx;
-    mtls->ain = ain;
-    mtls->aout = aout;
-    mtls->fep.usr = usr;
-    mtls->fep.usrLen = usrLen;
-    mtls->mSliceSize = 1;
-    mtls->mSliceNum = 0;
-
-    mtls->fep.ptrIn = NULL;
-    mtls->fep.eStrideIn = 0;
-    mtls->isThreadable = mIsThreadable;
-
-    if (ain) {
-        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
-    }
-
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
-    }
-}
-
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+                                        uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
                                         const RsScriptCall *sc,
@@ -909,24 +800,24 @@
 
     memset(mtls, 0, sizeof(MTLaunchStruct));
 
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ains != NULL) {
-        for (int index = inLen; --index >= 0;) {
-            const Allocation* ain = ains[index];
+    for (int index = inLen; --index >= 0;) {
+        const Allocation* ain = ains[index];
 
-            if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-                return;
-            }
+        // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+        if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "rsForEach called with null in allocations");
+            return;
         }
     }
 
     if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null out allocations");
         return;
     }
 
-    if (ains != NULL) {
+    if (inLen > 0) {
         const Allocation *ain0   = ains[0];
         const Type       *inType = ain0->getType();
 
@@ -951,11 +842,12 @@
         mtls->fep.dimZ = outType->getDimZ();
 
     } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null allocations");
         return;
     }
 
-    if (ains != NULL && aout != NULL) {
+    if (inLen > 0 && aout != NULL) {
         if (!ains[0]->hasSameDims(aout)) {
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -1002,7 +894,7 @@
     mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
     mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
 
-    rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
+    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
 
     mtls->rsc        = mCtx;
     mtls->ains       = ains;
@@ -1012,18 +904,28 @@
     mtls->mSliceSize = 1;
     mtls->mSliceNum  = 0;
 
-    mtls->fep.ptrIns    = NULL;
-    mtls->fep.eStrideIn = 0;
+    mtls->fep.inPtrs    = NULL;
+    mtls->fep.inStrides = NULL;
     mtls->isThreadable  = mIsThreadable;
 
-    if (ains) {
-        mtls->fep.ptrIns    = new const uint8_t*[inLen];
-        mtls->fep.inStrides = new StridePair[inLen];
+    if (inLen > 0) {
+
+        if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
+            mtls->fep.inPtrs    = (const uint8_t**)mtls->inPtrsBuff;
+            mtls->fep.inStrides = mtls->inStridesBuff;
+        } else {
+            mtls->fep.heapAllocatedArrays = true;
+
+            mtls->fep.inPtrs    = new const uint8_t*[inLen];
+            mtls->fep.inStrides = new StridePair[inLen];
+        }
+
+        mtls->fep.inLen = inLen;
 
         for (int index = inLen; --index >= 0;) {
             const Allocation *ain = ains[index];
 
-            mtls->fep.ptrIns[index] =
+            mtls->fep.inPtrs[index] =
               (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
 
             mtls->fep.inStrides[index].eStride =
@@ -1033,41 +935,27 @@
         }
     }
 
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    mtls->fep.outPtr            = NULL;
+    mtls->fep.outStride.eStride = 0;
+    mtls->fep.outStride.yStride = 0;
+    if (aout != NULL) {
+        mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+
+        mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
+        mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
     }
 }
 
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
-                                     const Allocation * ain,
+                                     const Allocation ** ains,
+                                     uint32_t inLen,
                                      Allocation * aout,
                                      const void * usr,
                                      uint32_t usrLen,
                                      const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    forEachKernelSetup(slot, &mtls);
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-}
-
-void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
-                                          const Allocation ** ains,
-                                          uint32_t inLen,
-                                          Allocation * aout,
-                                          const void * usr,
-                                          uint32_t usrLen,
-                                          const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     forEachKernelSetup(slot, &mtls);
@@ -1255,9 +1143,9 @@
 RsdCpuScriptImpl::~RsdCpuScriptImpl() {
 #ifndef RS_COMPATIBILITY_LIB
     if (mExecutable) {
-        Vector<void *>::const_iterator var_addr_iter =
+        std::vector<void *>::const_iterator var_addr_iter =
             mExecutable->getExportVarAddrs().begin();
-        Vector<void *>::const_iterator var_addr_end =
+        std::vector<void *>::const_iterator var_addr_end =
             mExecutable->getExportVarAddrs().end();
 
         bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_iter =
@@ -1338,17 +1226,15 @@
     return NULL;
 }
 
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
-                       Allocation * aout, const void * usr,
-                       uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
+                                 uint32_t inLen, Allocation * aout,
+                                 const void * usr, uint32_t usrLen,
+                                 const RsScriptCall *sc) {}
 
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
-                        Allocation * aout, const void * usr,
-                        uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
+                                  uint32_t inLen, Allocation * aout,
+                                  const void * usr, uint32_t usrLen,
+                                  const RsScriptCall *sc) {}
 
 
 }
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index f4ca1ed..c5fc183 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -47,9 +47,9 @@
 class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
 public:
     typedef void (*outer_foreach_t)(
-        const RsForEachStubParamStruct *,
+        const RsExpandKernelParams *,
         uint32_t x1, uint32_t x2,
-        uint32_t instep, uint32_t outstep);
+        uint32_t outstep);
 #ifdef RS_COMPATIBILITY_LIB
     typedef void (* InvokeFunc_t)(void);
     typedef void (* ForEachFunc_t)(void);
@@ -64,26 +64,22 @@
 
     virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                                     const Allocation** ains,
-                                     uint32_t inLen,
-                                     Allocation* aout,
-                                     const void* usr,
-                                     uint32_t usrLen,
-                                     const RsScriptCall* sc);
+    virtual void invokeForEach(uint32_t slot,
+                               const Allocation ** ains,
+                               uint32_t inLen,
+                               Allocation* aout,
+                               const void* usr,
+                               uint32_t usrLen,
+                               const RsScriptCall* sc);
+
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -100,10 +96,6 @@
 
     const Script * getScript() {return mScript;}
 
-    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                          const void * usr, uint32_t usrLen,
-                          const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                           Allocation * aout, const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index a9de00c..1d26f59 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -44,76 +44,93 @@
 }
 
 
-typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
+typedef void (*ScriptGroupRootFunc_t)(const RsExpandKernelParams *kparams,
                                       uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep);
+                                      uint32_t outstep);
 
-void CpuScriptGroupImpl::scriptGroupRoot(const RsForEachStubParamStruct *p,
+void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
 
 
-    const ScriptList *sl = (const ScriptList *)p->usr;
-    RsForEachStubParamStruct *mp = (RsForEachStubParamStruct *)p;
-    const void *oldUsr = p->usr;
+    const ScriptList *sl           = (const ScriptList *)kparams->usr;
+    RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
 
-    for(size_t ct=0; ct < sl->count; ct++) {
+    const void **oldIns  = mkparams->ins;
+    uint32_t *oldStrides = mkparams->inEStrides;
+
+    void *localIns[1];
+    uint32_t localStride[1];
+
+    mkparams->ins        = (const void**)localIns;
+    mkparams->inEStrides = localStride;
+
+    for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
-        func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
-        mp->usr = sl->usrPtrs[ct];
-
-        mp->ptrIn = NULL;
-        mp->in = NULL;
-        mp->ptrOut = NULL;
-        mp->out = NULL;
-
-        uint32_t istep = 0;
-        uint32_t ostep = 0;
+        func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
+        mkparams->usr = sl->usrPtrs[ct];
 
         if (sl->ins[ct]) {
-            mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
-            istep = sl->ins[ct]->mHal.state.elementSizeBytes;
-            mp->in = mp->ptrIn;
+            localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+
+            localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
+
             if (sl->inExts[ct]) {
-                mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->ins[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
+
+            } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
             }
+
+        } else {
+            localIns[0]    = NULL;
+            localStride[0] = 0;
         }
 
+        uint32_t ostep;
         if (sl->outs[ct]) {
-            mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
-            mp->out = mp->ptrOut;
+            mkparams->out =
+              (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
+
             ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
+
             if (sl->outExts[ct]) {
-                mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->outs[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->y;
+
+            } else if (sl->outs[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
+        } else {
+            mkparams->out = NULL;
+            ostep         = 0;
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(p, xstart, xend, istep, ostep);
+        func(kparams, xstart, xend, ostep);
     }
     //ALOGE("script group root");
 
-    //ConvolveParams *cp = (ConvolveParams *)p->usr;
-
-    mp->usr = oldUsr;
+    mkparams->ins        = oldIns;
+    mkparams->inEStrides = oldStrides;
+    mkparams->usr        = sl;
 }
 
 
 
 void CpuScriptGroupImpl::execute() {
-    Vector<Allocation *> ins;
-    Vector<bool> inExts;
-    Vector<Allocation *> outs;
-    Vector<bool> outExts;
-    Vector<const ScriptKernelID *> kernels;
+    std::vector<Allocation *> ins;
+    std::vector<char> inExts;
+    std::vector<Allocation *> outs;
+    std::vector<char> outExts;
+    std::vector<const ScriptKernelID *> kernels;
     bool fieldDep = false;
 
     for (size_t ct=0; ct < mSG->mNodes.size(); ct++) {
@@ -179,69 +196,108 @@
             rsAssert((k->mHasKernelOutput == (aout != NULL)) &&
                      (k->mHasKernelInput == (ain != NULL)));
 
-            ins.add(ain);
-            inExts.add(inExt);
-            outs.add(aout);
-            outExts.add(outExt);
-            kernels.add(k);
+            ins.push_back(ain);
+            inExts.push_back(inExt);
+            outs.push_back(aout);
+            outExts.push_back(outExt);
+            kernels.push_back(k);
         }
 
     }
 
     MTLaunchStruct mtls;
 
-    if(fieldDep) {
+    if (fieldDep) {
         for (size_t ct=0; ct < ins.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
             uint32_t slot = kernels[ct]->mSlot;
 
-            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+            uint32_t inLen;
+            const Allocation **ains;
+
+            if (ins[ct] == NULL) {
+                inLen = 0;
+                ains  = NULL;
+
+            } else {
+                inLen = 1;
+                ains  = const_cast<const Allocation**>(&ins[ct]);
+            }
+
+            si->forEachMtlsSetup(ains, inLen, outs[ct], NULL, 0, NULL, &mtls);
+
             si->forEachKernelSetup(slot, &mtls);
-            si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
-            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
-            si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
+            si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
+                          mtls.fep.usrLen, NULL);
+
+            mCtx->launchThreads(ains, inLen, outs[ct], NULL, &mtls);
+
+            si->postLaunch(slot, ains, inLen, outs[ct], NULL, 0, NULL);
         }
     } else {
         ScriptList sl;
-        sl.ins = ins.array();
-        sl.outs = outs.array();
-        sl.kernels = kernels.array();
-        sl.count = kernels.size();
 
-        Vector<const void *> usrPtrs;
-        Vector<const void *> fnPtrs;
-        Vector<uint32_t> sigs;
+        /*
+         * TODO: This is a hacky way of doing this and should be replaced by a
+         *       call to std::vector's data() member once we have a C++11
+         *       version of the STL.
+         */
+        sl.ins     = &ins.front();
+        sl.outs    = &outs.front();
+        sl.kernels = &kernels.front();
+        sl.count   = kernels.size();
+
+        uint32_t inLen;
+        const Allocation **ains;
+
+        if (ins[0] == NULL) {
+            inLen = 0;
+            ains  = NULL;
+
+        } else {
+            inLen = 1;
+            ains  = const_cast<const Allocation**>(&ins[0]);
+        }
+
+        std::vector<const void *> usrPtrs;
+        std::vector<const void *> fnPtrs;
+        std::vector<uint32_t> sigs;
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
 
             si->forEachKernelSetup(kernels[ct]->mSlot, &mtls);
-            fnPtrs.add((void *)mtls.kernel);
-            usrPtrs.add(mtls.fep.usr);
-            sigs.add(mtls.fep.usrLen);
-            si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+            fnPtrs.push_back((void *)mtls.kernel);
+            usrPtrs.push_back(mtls.fep.usr);
+            sigs.push_back(mtls.fep.usrLen);
+            si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct],
+                          mtls.fep.usr, mtls.fep.usrLen, NULL);
         }
-        sl.sigs = sigs.array();
-        sl.usrPtrs = usrPtrs.array();
-        sl.fnPtrs = fnPtrs.array();
-        sl.inExts = inExts.array();
-        sl.outExts = outExts.array();
+
+        sl.sigs    = &sigs.front();
+        sl.usrPtrs = &usrPtrs.front();
+        sl.fnPtrs  = &fnPtrs.front();
+
+        sl.inExts  = (bool*)&inExts.front();
+        sl.outExts = (bool*)&outExts.front();
 
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
+
+        si->forEachMtlsSetup(ains, inLen, outs[0], NULL, 0, NULL, &mtls);
+
         mtls.script = NULL;
         mtls.kernel = (void (*)())&scriptGroupRoot;
         mtls.fep.usr = &sl;
-        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+        mCtx->launchThreads(ains, inLen, outs[0], NULL, &mtls);
 
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-            si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+            si->postLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct], NULL, 0,
+                           NULL);
         }
     }
 }
-
-
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 78e179d..1a4af05 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -33,9 +33,9 @@
     CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
     bool init();
 
-    static void scriptGroupRoot(const RsForEachStubParamStruct *p,
+    static void scriptGroupRoot(const RsExpandKernelParams *p,
                                 uint32_t xstart, uint32_t xend,
-                                uint32_t instep, uint32_t outstep);
+                                uint32_t outstep);
 
 protected:
     struct ScriptList {
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 0076cb9..4728b7c 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,21 +69,15 @@
         virtual void populateScript(Script *) = 0;
         virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
         virtual int invokeRoot() = 0;
+
         virtual void invokeForEach(uint32_t slot,
-                           const Allocation * ain,
-                           Allocation * aout,
-                           const void * usr,
-                           uint32_t usrLen,
-                           const RsScriptCall *sc) = 0;
-                           
-        virtual void invokeForEachMulti(uint32_t slot,
-                                         const Allocation** ains,
-                                         uint32_t inLen,
-                                         Allocation * aout,
-                                         const void * usr,
-                                         uint32_t usrLen,
-                                         const RsScriptCall *sc) = 0;
-        
+                                   const Allocation ** ains,
+                                   uint32_t inLen,
+                                   Allocation * aout,
+                                   const void * usr,
+                                   uint32_t usrLen,
+                                   const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 9a40756..0586785 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -1213,5 +1213,3 @@
     }
 #endif
 }
-
-
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 27029cf..419422a 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <vector>
+
 #include "../cpu_ref/rsd_cpu.h"
 
 #include "rsdCore.h"
@@ -26,7 +28,6 @@
 #include "rsScriptC.h"
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-#include "utils/Vector.h"
 #include "utils/Timers.h"
 #include "utils/StopWatch.h"
 #endif
@@ -43,8 +44,9 @@
                      size_t bitcodeSize,
                      uint32_t flags) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-    RsdCpuReference::CpuScript * cs = dc->mCpuRef->createScript(script, resName, cacheDir,
-                                                                bitcode, bitcodeSize, flags);
+    RsdCpuReference::CpuScript * cs =
+        dc->mCpuRef->createScript(script, resName, cacheDir, bitcode,
+                                  bitcodeSize, flags);
     if (cs == NULL) {
         return false;
     }
@@ -53,7 +55,8 @@
     return true;
 }
 
-bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid,
+                      Element *e) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     RsdCpuReference::CpuScript * cs = dc->mCpuRef->createIntrinsic(s, iid, e);
     if (cs == NULL) {
@@ -73,8 +76,15 @@
                             size_t usrLen,
                             const RsScriptCall *sc) {
 
-    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEach(slot, ain, aout, usr, usrLen, sc);
+    if (ain == NULL) {
+        rsdScriptInvokeForEachMulti(rsc, s, slot, NULL, 0, aout, usr, usrLen,
+                                    sc);
+    } else {
+        const Allocation *ains[1] = {ain};
+
+        rsdScriptInvokeForEachMulti(rsc, s, slot, ains, 1, aout, usr, usrLen,
+                                    sc);
+    }
 }
 
 void rsdScriptInvokeForEachMulti(const Context *rsc,
@@ -88,7 +98,7 @@
                                  const RsScriptCall *sc) {
 
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEachMulti(slot, ains, inLen, aout, usr, usrLen, sc);
+    cs->invokeForEach(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 
diff --git a/driver/rsdMeshObj.cpp b/driver/rsdMeshObj.cpp
index 66c3b18..8f072a5 100644
--- a/driver/rsdMeshObj.cpp
+++ b/driver/rsdMeshObj.cpp
@@ -112,9 +112,9 @@
             mAttribs[userNum].type = rsdTypeToGLType(f->mHal.state.dataType);
             mAttribs[userNum].normalized = f->mHal.state.dataType != RS_TYPE_FLOAT_32;
             mAttribs[userNum].stride = stride;
-            String8 tmp(RS_SHADER_ATTR);
+            std::string tmp(RS_SHADER_ATTR);
             tmp.append(elem->mHal.state.fieldNames[fieldI]);
-            mAttribs[userNum].name.setTo(tmp.string());
+            mAttribs[userNum].name = tmp.c_str();
 
             // Remember which allocation this attribute came from
             mAttribAllocationIndex[userNum] = ct;
diff --git a/driver/rsdShader.cpp b/driver/rsdShader.cpp
index 0b182ff..d1a486b 100644
--- a/driver/rsdShader.cpp
+++ b/driver/rsdShader.cpp
@@ -33,7 +33,7 @@
                      const char * shaderText, size_t shaderLength,
                      const char** textureNames, size_t textureNamesCount,
                      const size_t *textureNamesLength) {
-    mUserShader.setTo(shaderText, shaderLength);
+    mUserShader.replace(0, shaderLength, shaderText);
     mRSProgram = p;
     mType = type;
     initMemberVars();
@@ -41,13 +41,14 @@
     init(textureNames, textureNamesCount, textureNamesLength);
 
     for(size_t i=0; i < textureNamesCount; i++) {
-        mTextureNames.push(String8(textureNames[i], textureNamesLength[i]));
+        mTextureNames.push_back(std::string(textureNames[i],
+                                            textureNamesLength[i]));
     }
 }
 
 RsdShader::~RsdShader() {
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        StateBasedKey *state = mStateBasedShaders.itemAt(i);
+        StateBasedKey *state = mStateBasedShaders[i];
         if (state->mShaderID) {
             glDeleteShader(state->mShaderID);
         }
@@ -76,7 +77,7 @@
     RsdShader::StateBasedKey *returnKey = NULL;
 
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        returnKey = mStateBasedShaders.itemAt(i);
+        returnKey = mStateBasedShaders[i];
 
         for (uint32_t ct = 0; ct < mRSProgram->mHal.state.texturesCount; ct ++) {
             uint32_t texType = 0;
@@ -108,7 +109,7 @@
     // We have not created a shader for this particular state yet
     state = new StateBasedKey(mTextureCount);
     mCurrentState = state;
-    mStateBasedShaders.add(state);
+    mStateBasedShaders.push_back(state);
     createShader();
     loadShader(rsc);
     return mCurrentState->mShaderID;
@@ -129,15 +130,15 @@
 
     mTextureUniformIndexStart = uniformCount;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.texturesCount; ct++) {
-        mUniformNames[uniformCount].setTo("UNI_");
+        mUniformNames[uniformCount] = "UNI_";
         mUniformNames[uniformCount].append(textureNames[ct], textureNamesLength[ct]);
         mUniformArraySizes[uniformCount] = 1;
         uniformCount++;
     }
 }
 
-String8 RsdShader::getGLSLInputString() const {
-    String8 s;
+std::string RsdShader::getGLSLInputString() const {
+    std::string s;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.inputElementsCount; ct++) {
         const Element *e = mRSProgram->mHal.state.inputElements[ct];
         for (uint32_t field=0; field < e->mHal.state.fieldsCount; field++) {
@@ -237,11 +238,11 @@
 
     if (rsc->props.mLogShaders) {
         ALOGV("Loading shader type %x, ID %i", mType, mCurrentState->mShaderID);
-        ALOGV("%s", mShader.string());
+        ALOGV("%s", mShader.c_str());
     }
 
     if (mCurrentState->mShaderID) {
-        const char * ss = mShader.string();
+        const char * ss = mShader.c_str();
         RSD_CALL_GL(glShaderSource, mCurrentState->mShaderID, 1, &ss, NULL);
         RSD_CALL_GL(glCompileShader, mCurrentState->mShaderID);
 
@@ -299,7 +300,9 @@
 
             mShader.append(fn);
             if (e->mHal.state.fieldArraySizes[field] > 1) {
-                mShader.appendFormat("[%d]", e->mHal.state.fieldArraySizes[field]);
+                mShader += "[";
+                mShader += std::to_string(e->mHal.state.fieldArraySizes[field]);
+                mShader += "]";
             }
             mShader.append(";\n");
         }
@@ -585,27 +588,28 @@
     mUniformCount += mRSProgram->mHal.state.texturesCount;
 
     if (mAttribCount) {
-        mAttribNames = new String8[mAttribCount];
+        mAttribNames = new std::string[mAttribCount];
     }
     if (mUniformCount) {
-        mUniformNames = new String8[mUniformCount];
+        mUniformNames = new std::string[mUniformCount];
         mUniformArraySizes = new uint32_t[mUniformCount];
     }
 
     mTextureCount = mRSProgram->mHal.state.texturesCount;
 }
 
-void RsdShader::initAddUserElement(const Element *e, String8 *names, uint32_t *arrayLengths,
-                                   uint32_t *count, const char *prefix) {
+void RsdShader::initAddUserElement(const Element *e, std::string *names,
+                                   uint32_t *arrayLengths, uint32_t *count,
+                                   const char *prefix) {
     rsAssert(e->mHal.state.fieldsCount);
     for (uint32_t ct=0; ct < e->mHal.state.fieldsCount; ct++) {
         const Element *ce = e->mHal.state.fields[ct];
         if (ce->mHal.state.fieldsCount) {
             initAddUserElement(ce, names, arrayLengths, count, prefix);
         } else {
-            String8 tmp(prefix);
+            std::string tmp(prefix);
             tmp.append(e->mHal.state.fieldNames[ct]);
-            names[*count].setTo(tmp.string());
+            names[*count] = tmp;
             if (arrayLengths) {
                 arrayLengths[*count] = e->mHal.state.fieldArraySizes[ct];
             }
diff --git a/driver/rsdShader.h b/driver/rsdShader.h
index fba1790..0dc5102 100644
--- a/driver/rsdShader.h
+++ b/driver/rsdShader.h
@@ -17,7 +17,7 @@
 #ifndef ANDROID_RSD_SHADER_H
 #define ANDROID_RSD_SHADER_H
 
-#include <utils/String8.h>
+#include <string>
 
 // ---------------------------------------------------------------------------
 namespace android {
@@ -49,16 +49,16 @@
     // Add ability to get all ID's to clean up the cached program objects
     uint32_t getStateBasedIDCount() const { return mStateBasedShaders.size(); }
     uint32_t getStateBasedID(uint32_t index) const {
-        return mStateBasedShaders.itemAt(index)->mShaderID;
+        return mStateBasedShaders[index]->mShaderID;
     }
 
     uint32_t getAttribCount() const {return mAttribCount;}
     uint32_t getUniformCount() const {return mUniformCount;}
-    const android::String8 & getAttribName(uint32_t i) const {return mAttribNames[i];}
-    const android::String8 & getUniformName(uint32_t i) const {return mUniformNames[i];}
+    const std::string & getAttribName(uint32_t i) const {return mAttribNames[i];}
+    const std::string & getUniformName(uint32_t i) const {return mUniformNames[i];}
     uint32_t getUniformArraySize(uint32_t i) const {return mUniformArraySizes[i];}
 
-    android::String8 getGLSLInputString() const;
+    std::string getGLSLInputString() const;
 
     bool isValid() const {return mIsValid;}
     void forceDirty() const {mDirty = true;}
@@ -91,7 +91,7 @@
     void setupUserConstants(const android::renderscript::Context *rsc,
                             RsdShaderCache *sc, bool isFragment);
     void initAddUserElement(const android::renderscript::Element *e,
-                            android::String8 *names, uint32_t *arrayLengths,
+                            std::string *names, uint32_t *arrayLengths,
                             uint32_t *count, const char *prefix);
     void setupTextures(const android::renderscript::Context *rsc, RsdShaderCache *sc);
     void setupSampler(const android::renderscript::Context *rsc,
@@ -104,21 +104,21 @@
     void initAttribAndUniformArray();
 
     mutable bool mDirty;
-    android::String8 mShader;
-    android::String8 mUserShader;
+    std::string mShader;
+    std::string mUserShader;
     uint32_t mType;
 
     uint32_t mTextureCount;
     StateBasedKey *mCurrentState;
     uint32_t mAttribCount;
     uint32_t mUniformCount;
-    android::String8 *mAttribNames;
-    android::String8 *mUniformNames;
+    std::string *mAttribNames;
+    std::string *mUniformNames;
     uint32_t *mUniformArraySizes;
 
-    android::Vector<android::String8> mTextureNames;
+    std::vector<std::string> mTextureNames;
 
-    android::Vector<StateBasedKey*> mStateBasedShaders;
+    std::vector<StateBasedKey*> mStateBasedShaders;
 
     int32_t mTextureUniformIndexStart;
 
@@ -133,7 +133,3 @@
 };
 
 #endif //ANDROID_RSD_SHADER_H
-
-
-
-
diff --git a/driver/rsdShaderCache.cpp b/driver/rsdShaderCache.cpp
index 69b43fc..0e36b49 100644
--- a/driver/rsdShaderCache.cpp
+++ b/driver/rsdShaderCache.cpp
@@ -29,7 +29,7 @@
 
 
 RsdShaderCache::RsdShaderCache() {
-    mEntries.setCapacity(16);
+    mEntries.reserve(16);
     mVertexDirty = true;
     mFragmentDirty = true;
 }
@@ -38,9 +38,13 @@
     cleanupAll();
 }
 
-void RsdShaderCache::updateUniformArrayData(const Context *rsc, RsdShader *prog, uint32_t linkedID,
-                                         UniformData *data, const char* logTag,
-                                         UniformQueryData **uniformList, uint32_t uniListSize) {
+void RsdShaderCache::updateUniformArrayData(const Context *rsc,
+                                            RsdShader *prog,
+                                            uint32_t linkedID,
+                                            UniformData *data,
+                                            const char* logTag,
+                                            UniformQueryData **uniformList,
+                                            uint32_t uniListSize) {
 
     for (uint32_t ct=0; ct < prog->getUniformCount(); ct++) {
         if (data[ct].slot >= 0 && data[ct].arraySize > 1) {
@@ -55,14 +59,17 @@
 
         if (rsc->props.mLogShaders) {
              ALOGV("%s U, %s = %d, arraySize = %d\n", logTag,
-                  prog->getUniformName(ct).string(), data[ct].slot, data[ct].arraySize);
+                   prog->getUniformName(ct).c_str(), data[ct].slot,
+                   data[ct].arraySize);
         }
     }
 }
 
-void RsdShaderCache::populateUniformData(RsdShader *prog, uint32_t linkedID, UniformData *data) {
+void RsdShaderCache::populateUniformData(RsdShader *prog, uint32_t linkedID,
+                                         UniformData *data) {
     for (uint32_t ct=0; ct < prog->getUniformCount(); ct++) {
-       data[ct].slot = glGetUniformLocation(linkedID, prog->getUniformName(ct));
+       data[ct].slot = glGetUniformLocation(linkedID,
+                                            prog->getUniformName(ct).c_str());
        data[ct].arraySize = prog->getUniformArraySize(ct);
     }
 }
@@ -132,7 +139,7 @@
     ProgramEntry *e = new ProgramEntry(vtx->getAttribCount(),
                                        vtx->getUniformCount(),
                                        frag->getUniformCount());
-    mEntries.push(e);
+    mEntries.push_back(e);
     mCurrent = e;
     e->vtx = vID;
     e->frag = fID;
@@ -169,10 +176,12 @@
         }
 
         for (uint32_t ct=0; ct < e->vtxAttrCount; ct++) {
-            e->vtxAttrs[ct].slot = glGetAttribLocation(pgm, vtx->getAttribName(ct));
-            e->vtxAttrs[ct].name = vtx->getAttribName(ct).string();
+            e->vtxAttrs[ct].slot =
+                glGetAttribLocation(pgm, vtx->getAttribName(ct).c_str());
+            e->vtxAttrs[ct].name = vtx->getAttribName(ct).c_str();
             if (rsc->props.mLogShaders) {
-                ALOGV("vtx A %i, %s = %d\n", ct, vtx->getAttribName(ct).string(), e->vtxAttrs[ct].slot);
+                ALOGV("vtx A %i, %s = %d\n", ct,
+                      vtx->getAttribName(ct).c_str(), e->vtxAttrs[ct].slot);
             }
         }
 
@@ -228,7 +237,7 @@
     return true;
 }
 
-int32_t RsdShaderCache::vtxAttribSlot(const String8 &attrName) const {
+int32_t RsdShaderCache::vtxAttribSlot(const std::string &attrName) const {
     for (uint32_t ct=0; ct < mCurrent->vtxAttrCount; ct++) {
         if (attrName == mCurrent->vtxAttrs[ct].name) {
             return mCurrent->vtxAttrs[ct].slot;
@@ -238,46 +247,45 @@
 }
 
 void RsdShaderCache::cleanupVertex(RsdShader *s) {
-    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
-        for (int32_t ct = 0; ct < numEntries; ct ++) {
-            if (mEntries[ct]->vtx == id) {
-                glDeleteProgram(mEntries[ct]->program);
 
-                delete mEntries[ct];
-                mEntries.removeAt(ct);
-                numEntries = (int32_t)mEntries.size();
-                ct --;
+        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
+            if ((*entry)->vtx == id) {
+                glDeleteProgram((*entry)->program);
+
+                delete *entry;
+                entry = mEntries.erase(entry);
+            } else {
+                entry++;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupFragment(RsdShader *s) {
-    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
-        for (int32_t ct = 0; ct < numEntries; ct ++) {
-            if (mEntries[ct]->frag == id) {
-                glDeleteProgram(mEntries[ct]->program);
 
-                delete mEntries[ct];
-                mEntries.removeAt(ct);
-                numEntries = (int32_t)mEntries.size();
-                ct --;
+        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
+            if ((*entry)->frag == id) {
+                glDeleteProgram((*entry)->program);
+
+                delete *entry;
+                entry = mEntries.erase(entry);
+            } else {
+                entry++;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupAll() {
-    for (uint32_t ct=0; ct < mEntries.size(); ct++) {
-        glDeleteProgram(mEntries[ct]->program);
-        free(mEntries[ct]);
+    for (auto entry : mEntries) {
+        glDeleteProgram(entry->program);
+        delete entry;
     }
     mEntries.clear();
 }
-
diff --git a/driver/rsdShaderCache.h b/driver/rsdShaderCache.h
index 6de1d63..9b45092 100644
--- a/driver/rsdShaderCache.h
+++ b/driver/rsdShaderCache.h
@@ -17,6 +17,9 @@
 #ifndef ANDROID_RSD_SHADER_CACHE_H
 #define ANDROID_RSD_SHADER_CACHE_H
 
+#include <string>
+#include <vector>
+
 namespace android {
 namespace renderscript {
 
@@ -25,10 +28,7 @@
 }
 }
 
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-#include <utils/String8.h>
-#include <utils/Vector.h>
-#else
+#if defined(RS_SERVER) || defined(RS_COMPATIBILITY_LIB)
 #include "rsUtils.h"
 #endif
 class RsdShader;
@@ -58,7 +58,7 @@
 
     void cleanupAll();
 
-    int32_t vtxAttribSlot(const android::String8 &attrName) const;
+    int32_t vtxAttribSlot(const std::string &attrName) const;
     int32_t vtxUniformSlot(uint32_t a) const {return mCurrent->vtxUniforms[a].slot;}
     uint32_t vtxUniformSize(uint32_t a) const {return mCurrent->vtxUniforms[a].arraySize;}
     int32_t fragUniformSlot(uint32_t a) const {return mCurrent->fragUniforms[a].slot;}
@@ -143,7 +143,7 @@
         UniformData *fragUniforms;
         bool *fragUniformIsSTO;
     };
-    android::Vector<ProgramEntry*> mEntries;
+    std::vector<ProgramEntry*> mEntries;
     ProgramEntry *mCurrent;
 
     bool hasArrayUniforms(RsdShader *vtx, RsdShader *frag);
@@ -156,7 +156,3 @@
 
 
 #endif //ANDROID_RSD_SHADER_CACHE_H
-
-
-
-
diff --git a/driver/rsdVertexArray.cpp b/driver/rsdVertexArray.cpp
index 4e293f6..d0a9b3e 100644
--- a/driver/rsdVertexArray.cpp
+++ b/driver/rsdVertexArray.cpp
@@ -48,7 +48,7 @@
     stride = 0;
     ptr = NULL;
     normalized = false;
-    name.setTo("");
+    name = "";
 }
 
 void RsdVertexArray::Attrib::set(uint32_t type, uint32_t size, uint32_t stride,
@@ -60,7 +60,7 @@
     this->offset = offset;
     this->normalized = normalized;
     this->stride = stride;
-    this->name.setTo(name);
+    this->name = name;
 }
 
 void RsdVertexArray::logAttrib(uint32_t idx, uint32_t slot) const {
@@ -69,7 +69,7 @@
     }
     ALOGV("va %i: slot=%i name=%s buf=%i ptr=%p size=%i  type=0x%x  stride=0x%x  norm=%i  offset=0x%p",
           idx, slot,
-          mAttribs[idx].name.string(),
+          mAttribs[idx].name.c_str(),
           mAttribs[idx].buffer,
           mAttribs[idx].ptr,
           mAttribs[idx].size,
@@ -135,4 +135,3 @@
         mAttrsEnabled[ct] = false;
     }
 }
-
diff --git a/driver/rsdVertexArray.h b/driver/rsdVertexArray.h
index 975121b..1bafe3b 100644
--- a/driver/rsdVertexArray.h
+++ b/driver/rsdVertexArray.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_RSD_VERTEX_ARRAY_H
 #define ANDROID_RSD_VERTEX_ARRAY_H
 
+#include <string>
+
 #include "rsUtils.h"
 
 namespace android {
@@ -39,7 +41,7 @@
         uint32_t size;
         uint32_t stride;
         bool normalized;
-        android::String8 name;
+        std::string name;
 
         Attrib();
         void clear();
@@ -74,6 +76,3 @@
 
 
 #endif //ANDROID_RSD_VERTEX_ARRAY_H
-
-
-
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index fe45420..a79ad2a 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -548,7 +548,7 @@
 
 extern float __attribute__((overloadable)) rootn(float v, int r) {
     if (r == 0) {
-        return posinf(0);
+        return posinf();
     }
 
     if (iszero(v)) {
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index 3acfe98..3047a56 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -93,7 +93,7 @@
         /*unitTests.add(new UT_program_store(this, mRes, mCtx));
         unitTests.add(new UT_program_raster(this, mRes, mCtx));
         unitTests.add(new UT_mesh(this, mRes, mCtx));*/
-        //unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
+        unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
         unitTests.add(new UT_fp_mad(this, mRes, mCtx));
 
         /*
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
new file mode 100644
index 0000000..1a05f80
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+import android.util.Log;
+
+public class UT_foreach_multi extends UnitTest {
+    private Resources mRes;
+    private Allocation Ain0;
+    private Allocation Ain1;
+    private Allocation Ain2;
+    private Allocation Ain3;
+
+    private Allocation Out0;
+    private Allocation Out1;
+    private Allocation Out2;
+    private Allocation Out3;
+
+    protected UT_foreach_multi(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "Foreach Multi-input", ctx);
+        mRes = res;
+    }
+
+    private void initializeGlobals(RenderScript RS, ScriptC_foreach_multi s) {
+        Type.Builder type32Builder = new Type.Builder(RS, Element.U32(RS));
+        Type.Builder type16Builder = new Type.Builder(RS, Element.U16(RS));
+
+        int Xdim = 5;
+        s.set_dimX(Xdim);
+        type32Builder.setX(Xdim);
+        type16Builder.setX(Xdim);
+
+        // 32-bit input allocations
+
+        Ain0 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain0(Ain0);
+        s.forEach_init_uint32_alloc(Ain0);
+
+        Ain1 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain1(Ain1);
+        s.forEach_init_uint32_alloc(Ain1);
+
+        Ain2 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain2(Ain2);
+        s.forEach_init_uint32_alloc(Ain2);
+
+        // 16-bit input allocation
+
+        Ain3 = Allocation.createTyped(RS, type16Builder.create());
+        s.set_ain3(Ain3);
+        s.forEach_init_uint16_alloc(Ain3);
+
+        // 32-bit output allocations
+
+        Out0 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout0(Out0);
+
+        Out1 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout1(Out1);
+
+        Out2 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout2(Out2);
+
+        // RetStruct output allocations
+
+        ScriptField_RetStruct StructType = new ScriptField_RetStruct(RS, Xdim);
+        Out3 = StructType.getAllocation();
+        s.set_aout3(Out3);
+
+        return;
+    }
+
+    public void run() {
+        RenderScript pRS = RenderScript.create(mCtx);
+        ScriptC_foreach_multi s = new ScriptC_foreach_multi(pRS);
+
+        pRS.setMessageHandler(mRsMessage);
+
+        initializeGlobals(pRS, s);
+
+        s.forEach_sum2(Ain0, Ain1, Out0);
+        s.forEach_sum3(Ain0, Ain1, Ain2, Out1);
+        s.forEach_sum_mixed(Ain0, Ain3, Out2);
+        s.forEach_sum2_struct(Ain0, Ain1, Out3);
+
+        s.invoke_test_outputs();
+        s.invoke_check_test_results();
+
+        pRS.finish();
+        waitForMessage();
+        pRS.destroy();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
new file mode 100644
index 0000000..0857e86
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
@@ -0,0 +1,178 @@
+#include "shared.rsh"
+
+struct RetStruct {
+    uint32_t i0;
+    uint32_t i1;
+    uint32_t i2;
+    uint32_t i3;
+    uint32_t i4;
+    uint32_t i5;
+    uint32_t i6;
+    uint32_t i7;
+};
+
+rs_allocation ain0, ain1, ain2;
+rs_allocation ain3;
+
+rs_allocation aout0, aout1, aout2, aout3;
+
+uint32_t dimX;
+
+static bool failed = false;
+
+uint32_t RS_KERNEL init_uint32_alloc(uint32_t x) {
+    return x;
+}
+
+uint16_t RS_KERNEL init_uint16_alloc(uint32_t x) {
+    return x;
+}
+
+uint32_t RS_KERNEL sum2(uint32_t in0, uint32_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+
+    return in0 + in1;
+}
+
+struct RetStruct RS_KERNEL
+sum2_struct(uint32_t in0, uint32_t in1, uint32_t x) {
+
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+
+    struct RetStruct retval;
+
+    retval.i0 = in0 + in1;
+    retval.i1 = in0 + in1;
+    retval.i2 = in0 + in1;
+    retval.i3 = in0 + in1;
+    retval.i4 = in0 + in1;
+    retval.i5 = in0 + in1;
+    retval.i6 = in0 + in1;
+    retval.i7 = in0 + in1;
+
+    return retval;
+}
+
+uint32_t RS_KERNEL sum3(uint32_t in0, uint32_t in1, uint32_t in2, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    _RS_ASSERT(in2 == x);
+
+    return in0 + in1 + in2;
+}
+
+
+uint32_t RS_KERNEL sum_mixed(uint32_t in0, uint16_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+
+    return in0 + in1;
+}
+
+static bool test_sum2_output() {
+    bool failed = false;
+    uint32_t i;
+
+    for (i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout0, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i)));
+    }
+
+    if (failed) {
+        rsDebug("test_sum2_output FAILED", 0);
+    }
+    else {
+        rsDebug("test_sum2_output PASSED", 0);
+    }
+
+    return failed;
+}
+
+static bool test_sum3_output() {
+    bool failed = false;
+    uint32_t i;
+
+    for (i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout1, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i) +
+                    rsGetElementAt_uint(ain2, i)));
+    }
+
+    if (failed) {
+        rsDebug("test_sum3_output FAILED", 0);
+    }
+    else {
+        rsDebug("test_sum3_output PASSED", 0);
+    }
+
+    return failed;
+}
+
+static bool test_sum_mixed_output() {
+    bool failed = false;
+    uint32_t i;
+
+    for (i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout2, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_ushort(ain3, i)));
+    }
+
+    if (failed) {
+        rsDebug("test_sum_mixed_output FAILED", 0);
+    }
+    else {
+        rsDebug("test_sum_mixed_output PASSED", 0);
+    }
+
+    return failed;
+}
+
+static bool test_sum2_struct_output() {
+    bool failed = false;
+    uint32_t i;
+
+    for (i = 0; i < dimX; i++) {
+        struct RetStruct *result = (struct RetStruct*)rsGetElementAt(aout3, i);
+
+        uint32_t sum = rsGetElementAt_uint(ain0, i) +
+                       rsGetElementAt_uint(ain1, i);
+
+        _RS_ASSERT(result->i0 == sum);
+        _RS_ASSERT(result->i1 == sum);
+        _RS_ASSERT(result->i2 == sum);
+        _RS_ASSERT(result->i3 == sum);
+        _RS_ASSERT(result->i4 == sum);
+        _RS_ASSERT(result->i5 == sum);
+        _RS_ASSERT(result->i6 == sum);
+        _RS_ASSERT(result->i7 == sum);
+    }
+
+    if (failed) {
+        rsDebug("test_sum2_struct_output FAILED", 0);
+    }
+    else {
+        rsDebug("test_sum2_struct_output PASSED", 0);
+    }
+
+    return failed;
+}
+
+void test_outputs() {
+    failed |= test_sum2_output();
+    failed |= test_sum3_output();
+    failed |= test_sum_mixed_output();
+    failed |= test_sum2_struct_output();
+}
+
+void check_test_results() {
+    if (failed) {
+        rsSendToClientBlocking(RS_MSG_TEST_FAILED);
+    } else {
+        rsSendToClientBlocking(RS_MSG_TEST_PASSED);
+    }
+}
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 0443ee8..b13467a 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -223,34 +223,40 @@
     }
 
     if (y >= mHal.drvState.lod[0].dimY) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData X offset out of range.");
+        rsc->setError(RS_ERROR_BAD_VALUE,
+                      "subElementData Y offset out of range.");
         return;
     }
 
     if (cIdx >= mHal.state.type->getElement()->getFieldCount()) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData component out of range.");
+        rsc->setError(RS_ERROR_BAD_VALUE,
+                      "subElementData component out of range.");
         return;
     }
 
     const Element * e = mHal.state.type->getElement()->getField(cIdx);
-    uint32_t elemArraySize = mHal.state.type->getElement()->getFieldArraySize(cIdx);
+    uint32_t elemArraySize =
+        mHal.state.type->getElement()->getFieldArraySize(cIdx);
     if (sizeBytes != e->getSizeBytes() * elemArraySize) {
         rsc->setError(RS_ERROR_BAD_VALUE, "subElementData bad size.");
         return;
     }
 
-    rsc->mHal.funcs.allocation.elementData2D(rsc, this, x, y, data, cIdx, sizeBytes);
+    rsc->mHal.funcs.allocation.elementData2D(rsc, this, x, y, data, cIdx,
+                                             sizeBytes);
     sendDirty(rsc);
 }
 
 void Allocation::addProgramToDirty(const Program *p) {
-    mToDirtyList.push(p);
+    mToDirtyList.push_back(p);
 }
 
 void Allocation::removeProgramToDirty(const Program *p) {
-    for (size_t ct=0; ct < mToDirtyList.size(); ct++) {
-        if (mToDirtyList[ct] == p) {
-            mToDirtyList.removeAt(ct);
+    for (auto entryIter = mToDirtyList.begin(), endIter = mToDirtyList.end();
+         entryIter != endIter; entryIter++) {
+
+        if (p == *entryIter) {
+            mToDirtyList.erase(entryIter);
             return;
         }
     }
@@ -268,7 +274,8 @@
         }
     }
     ALOGV("%s allocation ptr=%p  mUsageFlags=0x04%x, mMipmapControl=0x%04x",
-         prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags, mHal.state.mipmapControl);
+          prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags,
+          mHal.state.mipmapControl);
 }
 
 uint32_t Allocation::getPackedSize() const {
diff --git a/rsAllocation.h b/rsAllocation.h
index f197efc..47344d8 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -170,7 +170,7 @@
     bool hasSameDims(const Allocation *Other) const;
 
 protected:
-    Vector<const Program *> mToDirtyList;
+    std::vector<const Program *> mToDirtyList;
     ObjectBaseRef<const Type> mType;
     void setType(const Type *t) {
         mType.set(t);
diff --git a/rsContext.cpp b/rsContext.cpp
index 2112ace..0d9ca05 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -747,13 +747,15 @@
 void Context::assignName(ObjectBase *obj, const char *name, uint32_t len) {
     rsAssert(!obj->getName());
     obj->setName(name, len);
-    mNames.add(obj);
+    mNames.push_back(obj);
 }
 
 void Context::removeName(ObjectBase *obj) {
-    for (size_t ct=0; ct < mNames.size(); ct++) {
-        if (obj == mNames[ct]) {
-            mNames.removeAt(ct);
+    for (auto nameIter = mNames.begin(), endIter = mNames.end();
+         nameIter != endIter; nameIter++) {
+
+        if (obj == *nameIter) {
+            mNames.erase(nameIter);
             return;
         }
     }
@@ -988,4 +990,3 @@
     ObjectBase *ob = static_cast<ObjectBase *>(obj);
     (*name) = ob->getName();
 }
-
diff --git a/rsContext.h b/rsContext.h
index b382358..f750670 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -297,7 +297,7 @@
     bool mHasSurface;
     bool mIsContextLite;
 
-    Vector<ObjectBase *> mNames;
+    std::vector<ObjectBase *> mNames;
 
     uint64_t mTimers[_RS_TIMER_TOTAL];
     Timers mTimerActive;
diff --git a/rsCppUtils.h b/rsCppUtils.h
index 71cf077..7432109 100644
--- a/rsCppUtils.h
+++ b/rsCppUtils.h
@@ -19,8 +19,6 @@
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
 #include <utils/Log.h>
-#include <utils/String8.h>
-#include <utils/Vector.h>
 #include <cutils/atomic.h>
 #endif
 
@@ -54,96 +52,6 @@
 #define ALOGV(...) \
     __android_log_print(ANDROID_LOG_VERBOSE, LOG_TAG, __VA_ARGS__);
 
-namespace android {
-
-    // server has no Vector or String8 classes; implement on top of STL
-    class String8: public std::string {
-    public:
-    String8(const char *ptr) : std::string(ptr) {
-
-        }
-    String8(const char *ptr, size_t len) : std::string(ptr, len) {
-
-        }
-    String8() : std::string() {
-
-        }
-
-        const char* string() const {
-            return this->c_str();
-        }
-
-        void setTo(const char* str, ssize_t len) {
-            this->assign(str, len);
-        }
-        void setTo(const char* str) {
-            this->assign(str);
-        }
-        String8 getPathDir(void) const {
-            const char* cp;
-            const char*const str = this->c_str();
-
-            cp = strrchr(str, OS_PATH_SEPARATOR);
-            if (cp == NULL)
-                return String8("");
-            else
-                return String8(str, cp - str);
-        }
-    };
-
-    template <class T> class Vector: public std::vector<T> {
-    public:
-        void push(T obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const T& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        T* editArray() {
-            return (T*)(this->begin());
-        }
-
-        const T* array() {
-            return (const T*)(this->begin());
-        }
-
-    };
-
-    template<> class Vector<bool>: public std::vector<char> {
-    public:
-        void push(bool obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const bool& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        bool* editArray() {
-            return (bool*)(this->begin());
-        }
-
-        const bool* array() {
-            return (const bool*)(this->begin());
-        }
-    };
-
-}
-
 typedef int64_t nsecs_t;  // nano-seconds
 
 enum {
@@ -286,5 +194,3 @@
 }
 
 #endif //ANDROID_RS_OBJECT_BASE_H
-
-
diff --git a/rsDefines.h b/rsDefines.h
index 1259610..e7e869e 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -367,7 +367,10 @@
     RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
     // unused 10
     RS_SCRIPT_INTRINSIC_ID_LOOP_FILTER = 11,
-    RS_SCRIPT_INTRINSIC_ID_RESIZE = 12
+    RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
+
+
+    RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
 };
 
 typedef struct {
diff --git a/rsDevice.cpp b/rsDevice.cpp
index 2688890..1ba005a 100644
--- a/rsDevice.cpp
+++ b/rsDevice.cpp
@@ -28,14 +28,16 @@
 }
 
 void Device::addContext(Context *rsc) {
-    mContexts.push(rsc);
+    mContexts.push_back(rsc);
 }
 
 void Device::removeContext(Context *rsc) {
-    for (size_t idx=0; idx < mContexts.size(); idx++) {
-        if (mContexts[idx] == rsc) {
-            mContexts.removeAt(idx);
-            break;
+    for (auto ctxIter = mContexts.begin(), endIter = mContexts.end();
+         ctxIter != endIter; ctxIter++) {
+
+        if (rsc == *ctxIter) {
+            mContexts.erase(ctxIter);
+            return;
         }
     }
 }
@@ -58,4 +60,3 @@
     }
     rsAssert(0);
 }
-
diff --git a/rsDevice.h b/rsDevice.h
index ffb514b..5961336 100644
--- a/rsDevice.h
+++ b/rsDevice.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_RS_DEVICE_H
 #define ANDROID_RS_DEVICE_H
 
+#include <vector>
+
 #include "rsUtils.h"
 
 // ---------------------------------------------------------------------------
@@ -36,7 +38,7 @@
     bool mForceSW;
 
 protected:
-    Vector<Context *> mContexts;
+    std::vector<Context *> mContexts;
 };
 
 }
diff --git a/rsElement.cpp b/rsElement.cpp
index f7b064a..0da8096 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -42,10 +42,14 @@
 }
 
 void Element::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateElement.mElements.size(); ct++) {
-        if (mRSC->mStateElement.mElements[ct] == this) {
-            mRSC->mStateElement.mElements.removeAt(ct);
-            break;
+    auto &elements = mRSC->mStateElement.mElements;
+
+    for (auto elIter = elements.begin(), endIter = elements.end();
+         elIter != endIter; elIter++) {
+
+        if (this == *elIter) {
+            elements.erase(elIter);
+            return;
         }
     }
 }
@@ -264,7 +268,7 @@
 
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push(e);
+    rsc->mStateElement.mElements.push_back(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -339,7 +343,7 @@
     e->compute();
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push(e);
+    rsc->mStateElement.mElements.push_back(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsElement.h b/rsElement.h
index 5a3bc13..2ae9404 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_STRUCTURED_ELEMENT_H
 #define ANDROID_STRUCTURED_ELEMENT_H
 
+#include <vector>
+
 #include "rsComponent.h"
 #include "rsUtils.h"
 #include "rsDefines.h"
@@ -170,7 +172,7 @@
     ~ElementState();
 
     // Cache of all existing elements.
-    Vector<Element *> mElements;
+    std::vector<Element *> mElements;
 };
 
 
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index a589033..ef5730f 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -87,7 +87,7 @@
             entry->mLength = headerStream->loadU32();
         }
         entry->mRsObj = NULL;
-        mIndex.push(entry);
+        mIndex.push_back(entry);
     }
 }
 
@@ -379,7 +379,7 @@
     indexEntry->mType = obj->getClassId();
     indexEntry->mOffset = mWriteStream->getPos();
     indexEntry->mRsObj = obj;
-    mWriteIndex.push(indexEntry);
+    mWriteIndex.push_back(indexEntry);
     obj->serialize(con, mWriteStream);
     indexEntry->mLength = mWriteStream->getPos() - indexEntry->mOffset;
     mWriteStream->align(4);
diff --git a/rsFileA3D.h b/rsFileA3D.h
index 8bf36b9..0c8b3d6 100644
--- a/rsFileA3D.h
+++ b/rsFileA3D.h
@@ -88,15 +88,13 @@
     Asset *mAsset;
 
     OStream *mWriteStream;
-    Vector<A3DIndexEntry*> mWriteIndex;
+    std::vector<A3DIndexEntry*> mWriteIndex;
 
     IStream *mReadStream;
-    Vector<A3DIndexEntry*> mIndex;
+    std::vector<A3DIndexEntry*> mIndex;
 };
 
 
 }
 }
 #endif //ANDROID_RS_FILE_A3D_H
-
-
diff --git a/rsFont.cpp b/rsFont.cpp
index 8feef2d..71399af 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -33,7 +33,7 @@
 using namespace android;
 using namespace android::renderscript;
 
-Font::Font(Context *rsc) : ObjectBase(rsc), mCachedGlyphs(NULL) {
+Font::Font(Context *rsc) : ObjectBase(rsc) {
     mInitialized = false;
     mHasKerning = false;
     mFace = NULL;
@@ -76,17 +76,21 @@
 }
 
 void Font::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateFont.mActiveFonts.size(); ct++) {
-        if (mRSC->mStateFont.mActiveFonts[ct] == this) {
-            mRSC->mStateFont.mActiveFonts.removeAt(ct);
-            break;
+    auto &activeFonts = mRSC->mStateFont.mActiveFonts;
+
+    for (auto font = activeFonts.begin(), end = activeFonts.end(); font != end;
+         font++) {
+
+        if (this == *font) {
+            activeFonts.erase(font);
+            return;
         }
     }
 }
 
 void Font::invalidateTextureCache() {
-    for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        mCachedGlyphs.valueAt(i)->mIsValid = false;
+    for (auto &glyphEntry : mCachedGlyphs) {  // map entries, not key lookup
+        glyphEntry.second->mIsValid = false;
     }
 }
 
@@ -224,7 +228,7 @@
 
 Font::CachedGlyphInfo* Font::getCachedUTFChar(int32_t utfChar) {
 
-    CachedGlyphInfo *cachedGlyph = mCachedGlyphs.valueFor((uint32_t)utfChar);
+    CachedGlyphInfo *cachedGlyph = mCachedGlyphs[(uint32_t)utfChar];
     if (cachedGlyph == NULL) {
         cachedGlyph = cacheGlyph((uint32_t)utfChar);
     }
@@ -283,7 +287,7 @@
 
 Font::CachedGlyphInfo *Font::cacheGlyph(uint32_t glyph) {
     CachedGlyphInfo *newGlyph = new CachedGlyphInfo();
-    mCachedGlyphs.add(glyph, newGlyph);
+    mCachedGlyphs[glyph] = newGlyph;
 #ifndef ANDROID_RS_SERIALIZE
     newGlyph->mGlyphIndex = FT_Get_Char_Index(mFace, glyph);
     newGlyph->mIsValid = false;
@@ -296,11 +300,14 @@
 Font * Font::create(Context *rsc, const char *name, float fontSize, uint32_t dpi,
                     const void *data, uint32_t dataLen) {
     rsc->mStateFont.checkInit();
-    Vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
+    std::vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
 
     for (uint32_t i = 0; i < activeFonts.size(); i ++) {
         Font *ithFont = activeFonts[i];
-        if (ithFont->mFontName == name && ithFont->mFontSize == fontSize && ithFont->mDpi == dpi) {
+        if (ithFont->mFontName == name &&
+            ithFont->mFontSize == fontSize &&
+            ithFont->mDpi == dpi) {
+
             return ithFont;
         }
     }
@@ -308,7 +315,7 @@
     Font *newFont = new Font(rsc);
     bool isInitialized = newFont->init(name, fontSize, dpi, data, dataLen);
     if (isInitialized) {
-        activeFonts.push(newFont);
+        activeFonts.push_back(newFont);
         rsc->mStateFont.precacheLatin(newFont);
         return newFont;
     }
@@ -325,7 +332,7 @@
 #endif
 
     for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        CachedGlyphInfo *glyph = mCachedGlyphs.valueAt(i);
+        CachedGlyphInfo *glyph = mCachedGlyphs[i];
         delete glyph;
     }
 }
@@ -551,29 +558,39 @@
     mCacheHeight = 256;
     mCacheWidth = 1024;
     ObjectBaseRef<Type> texType = Type::getTypeRef(mRSC, alphaElem.get(),
-                                                   mCacheWidth, mCacheHeight, 0, false, false, 0);
+                                                   mCacheWidth, mCacheHeight,
+                                                   0, false, false, 0);
+
     mCacheBuffer = new uint8_t[mCacheWidth * mCacheHeight];
 
 
-    Allocation *cacheAlloc = Allocation::createAllocation(mRSC, texType.get(),
-                                RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
+    Allocation *cacheAlloc =
+        Allocation::createAllocation(mRSC, texType.get(),
+                                     RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
     mTextTexture.set(cacheAlloc);
 
     // Split up our cache texture into lines of certain widths
     int32_t nextLine = 0;
-    mCacheLines.push(new CacheTextureLine(16, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(40, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(texType->getDimY() - nextLine, texType->getDimX(), nextLine, 0));
+    mCacheLines.push_back(new CacheTextureLine(16, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(40, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(texType->getDimY() - nextLine,
+                          texType->getDimX(), nextLine, 0));
 }
 
 // Avoid having to reallocate memory and render quad by quad
diff --git a/rsFont.h b/rsFont.h
index 7bac508..71e6fb4 100644
--- a/rsFont.h
+++ b/rsFont.h
@@ -17,9 +17,10 @@
 #ifndef ANDROID_RS_FONT_H
 #define ANDROID_RS_FONT_H
 
+#include <map>
+#include <vector>
+
 #include "rsStream.h"
-#include <utils/Vector.h>
-#include <utils/KeyedVector.h>
 
 struct FT_LibraryRec_;
 struct FT_FaceRec_;
@@ -124,7 +125,7 @@
     bool mInitialized;
     bool mHasKerning;
 
-    DefaultKeyedVector<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
+    std::map<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
     CachedGlyphInfo* getCachedUTFChar(int32_t utfChar);
 
     CachedGlyphInfo *cacheGlyph(uint32_t glyph);
@@ -178,7 +179,7 @@
         bool fitBitmap(FT_Bitmap_ *bitmap, uint32_t *retOriginX, uint32_t *retOriginY);
     };
 
-    Vector<CacheTextureLine*> mCacheLines;
+    std::vector<CacheTextureLine*> mCacheLines;
     uint32_t getRemainingCacheCapacity();
 
     void precacheLatin(Font *font);
@@ -203,7 +204,7 @@
     FT_LibraryRec_ *mLibrary;
     FT_LibraryRec_ *getLib();
 #endif //ANDROID_RS_SERIALIZE
-    Vector<Font*> mActiveFonts;
+    std::vector<Font*> mActiveFonts;
 
     // Render state for the font
     ObjectBaseRef<Allocation> mFontShaderFConstant;
diff --git a/rsGrallocConsumer.h b/rsGrallocConsumer.h
index 9e4fc58..6f3f879 100644
--- a/rsGrallocConsumer.h
+++ b/rsGrallocConsumer.h
@@ -17,12 +17,12 @@
 #ifndef ANDROID_RS_GRALLOC_CONSUMER_H
 #define ANDROID_RS_GRALLOC_CONSUMER_H
 
+#include <vector>
+
 #include <gui/ConsumerBase.h>
 
 #include <ui/GraphicBuffer.h>
 
-#include <utils/String8.h>
-#include <utils/Vector.h>
 #include <utils/threads.h>
 
 
@@ -75,4 +75,3 @@
 } // namespace android
 
 #endif // ANDROID_RS_GRALLOC_CONSUMER_H
-
diff --git a/rsProgramFragment.h b/rsProgramFragment.h
index e7456b9..1357bfc 100644
--- a/rsProgramFragment.h
+++ b/rsProgramFragment.h
@@ -55,7 +55,7 @@
     void deinit(Context *rsc);
 
     ObjectBaseRef<ProgramFragment> mDefault;
-    Vector<ProgramFragment *> mPrograms;
+    std::vector<ProgramFragment *> mPrograms;
 
     ObjectBaseRef<ProgramFragment> mLast;
 };
@@ -63,7 +63,3 @@
 }
 }
 #endif
-
-
-
-
diff --git a/rsProgramRaster.cpp b/rsProgramRaster.cpp
index 4f27f2e..d2d0602 100644
--- a/rsProgramRaster.cpp
+++ b/rsProgramRaster.cpp
@@ -31,10 +31,14 @@
 }
 
 void ProgramRaster::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateRaster.mRasterPrograms.size(); ct++) {
-        if (mRSC->mStateRaster.mRasterPrograms[ct] == this) {
-            mRSC->mStateRaster.mRasterPrograms.removeAt(ct);
-            break;
+    auto &rasters = mRSC->mStateRaster.mRasterPrograms;
+
+    for (auto prIter = rasters.begin(), endIter = rasters.end();
+         prIter != endIter; prIter++) {
+
+        if (this == *prIter) {
+            rasters.erase(prIter);
+            return;
         }
     }
 }
@@ -94,7 +98,7 @@
     returnRef.set(pr);
 
     ObjectBase::asyncLock();
-    rsc->mStateRaster.mRasterPrograms.push(pr);
+    rsc->mStateRaster.mRasterPrograms.push_back(pr);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -111,4 +115,3 @@
 
 }
 }
-
diff --git a/rsProgramRaster.h b/rsProgramRaster.h
index e9a524b..207d74c 100644
--- a/rsProgramRaster.h
+++ b/rsProgramRaster.h
@@ -75,14 +75,10 @@
     ObjectBaseRef<ProgramRaster> mLast;
 
     // Cache of all existing raster programs.
-    Vector<ProgramRaster *> mRasterPrograms;
+    std::vector<ProgramRaster *> mRasterPrograms;
 };
 
 
 }
 }
 #endif
-
-
-
-
diff --git a/rsProgramStore.cpp b/rsProgramStore.cpp
index 83c1f2c..b2d527e 100644
--- a/rsProgramStore.cpp
+++ b/rsProgramStore.cpp
@@ -42,10 +42,14 @@
 }
 
 void ProgramStore::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateFragmentStore.mStorePrograms.size(); ct++) {
-        if (mRSC->mStateFragmentStore.mStorePrograms[ct] == this) {
-            mRSC->mStateFragmentStore.mStorePrograms.removeAt(ct);
-            break;
+    auto &stores = mRSC->mStateFragmentStore.mStorePrograms;
+
+    for (auto psIter = stores.begin(), endIter = stores.end();
+         psIter != endIter; psIter++) {
+
+        if (this == *psIter) {
+            stores.erase(psIter);
+            return;
         }
     }
 }
@@ -118,7 +122,7 @@
     pfs->init();
 
     ObjectBase::asyncLock();
-    rsc->mStateFragmentStore.mStorePrograms.push(pfs);
+    rsc->mStateFragmentStore.mStorePrograms.push_back(pfs);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsProgramStore.h b/rsProgramStore.h
index 9a7f7f1..06824fe 100644
--- a/rsProgramStore.h
+++ b/rsProgramStore.h
@@ -92,12 +92,9 @@
     ObjectBaseRef<ProgramStore> mLast;
 
     // Cache of all existing store programs.
-    Vector<ProgramStore *> mStorePrograms;
+    std::vector<ProgramStore *> mStorePrograms;
 };
 
 }
 }
 #endif
-
-
-
diff --git a/rsRuntime.h b/rsRuntime.h
index eb93e25..5a05883 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -158,7 +158,7 @@
                 Allocation *in,
                 Allocation *out,
                 const void *usr,
-                 uint32_t usrBytes,
+                uint32_t usrBytes,
                 const RsScriptCall *call);
 
 
diff --git a/rsSampler.cpp b/rsSampler.cpp
index 0cf0b55..0ea9729 100644
--- a/rsSampler.cpp
+++ b/rsSampler.cpp
@@ -49,10 +49,14 @@
 }
 
 void Sampler::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateSampler.mAllSamplers.size(); ct++) {
-        if (mRSC->mStateSampler.mAllSamplers[ct] == this) {
-            mRSC->mStateSampler.mAllSamplers.removeAt(ct);
-            break;
+    auto &samplers = mRSC->mStateSampler.mAllSamplers;
+
+    for (auto sampleIter = samplers.begin(), endIter = samplers.end();
+         sampleIter != endIter; sampleIter++) {
+
+        if (this == *sampleIter) {
+            samplers.erase(sampleIter);
+            return;
         }
     }
 }
@@ -113,7 +117,7 @@
 #endif
 
     ObjectBase::asyncLock();
-    rsc->mStateSampler.mAllSamplers.push(s);
+    rsc->mStateSampler.mAllSamplers.push_back(s);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsSampler.h b/rsSampler.h
index 2fdf707..3f5855f 100644
--- a/rsSampler.h
+++ b/rsSampler.h
@@ -96,12 +96,9 @@
         }
     }
     // Cache of all existing raster programs.
-    Vector<Sampler *> mAllSamplers;
+    std::vector<Sampler *> mAllSamplers;
 };
 
 }
 }
 #endif //ANDROID_RS_SAMPLER_H
-
-
-
diff --git a/rsScript.cpp b/rsScript.cpp
index dd962d1..a4fa196 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -187,38 +187,13 @@
     free(tz);
 }
 
-void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
-                       RsAllocation vain, RsAllocation vaout,
-                       const void *params, size_t paramLen,
-                       const RsScriptCall *sc, size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
-    s->runForEach(rsc, slot,
-                  static_cast<const Allocation *>(vain), static_cast<Allocation *>(vaout),
-                  params, paramLen, sc);
-
-}
-
 void rsi_ScriptForEachMulti(Context *rsc, RsScript vs, uint32_t slot,
                             RsAllocation *vains, size_t inLen,
                             RsAllocation vaout, const void *params,
                             size_t paramLen, const RsScriptCall *sc,
                             size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
 
+    Script      *s    = static_cast<Script *>(vs);
     Allocation **ains = (Allocation**)(vains);
 
     s->runForEach(rsc, slot,
@@ -227,6 +202,23 @@
 
 }
 
+void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
+                       RsAllocation vain, RsAllocation vaout,
+                       const void *params, size_t paramLen,
+                       const RsScriptCall *sc, size_t scLen) {
+
+    if (vain == NULL) {
+        rsi_ScriptForEachMulti(rsc, vs, slot, NULL, 0, vaout, params, paramLen,
+                               sc, scLen);
+    } else {
+        RsAllocation ains[1] = {vain};
+
+        rsi_ScriptForEachMulti(rsc, vs, slot, ains,
+                               sizeof(ains) / sizeof(RsAllocation), vaout,
+                               params, paramLen, sc, scLen);
+    }
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     s->Invoke(rsc, slot, NULL, 0);
diff --git a/rsScript.h b/rsScript.h
index 1ad013f..2e232f0 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -108,17 +108,9 @@
 
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL) = 0;
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index e7ff8c7..96a771f 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <string>
+
 #include "rsContext.h"
 #include "rsScriptC.h"
 
@@ -29,6 +31,19 @@
 
 #include <sys/stat.h>
 
+#ifdef USE_MINGW
+/* Define the default path separator for the platform. */
+#define OS_PATH_SEPARATOR     '\\'
+#define OS_PATH_SEPARATOR_STR "\\"
+
+#else /* not USE_MINGW */
+
+/* Define the default path separator for the platform. */
+#define OS_PATH_SEPARATOR     '/'
+#define OS_PATH_SEPARATOR_STR "/"
+
+#endif
+
 using namespace android;
 using namespace android::renderscript;
 
@@ -58,29 +73,45 @@
 
 #ifndef RS_COMPATIBILITY_LIB
 bool ScriptC::createCacheDir(const char *cacheDir) {
-    String8 cacheDirString, currentDir;
+    std::string currentDir;
+    const std::string cacheDirString(cacheDir);
+
     struct stat statBuf;
     int statReturn = stat(cacheDir, &statBuf);
     if (!statReturn) {
         return true;
     }
 
-    // String8 path functions strip leading /'s
-    // insert if necessary
-    if (cacheDir[0] == '/') {
-        currentDir += "/";
-    }
+    // Start from the beginning of the cacheDirString.
+    int currPos = 0;
 
-    cacheDirString.setPathName(cacheDir);
+    // Reserve space in currentDir for the entire cacheDir path.
+    currentDir.reserve(cacheDirString.length());
 
-    while (cacheDirString.length()) {
-        currentDir += (cacheDirString.walkPath(&cacheDirString));
-        statReturn = stat(currentDir.string(), &statBuf);
+    while (currPos >= 0) {
+        /*
+         * The character at currPos should be a path separator.  We need to look
+         * for the next one.
+         */
+        int nextPos = cacheDirString.find(OS_PATH_SEPARATOR_STR, currPos + 1);
+
+        if (nextPos > 0) {
+            // A new path separator has been found.
+            currentDir += cacheDirString.substr(currPos, nextPos - currPos);
+        } else {
+            // There are no more path separators.
+            currentDir += cacheDirString.substr(currPos);
+        }
+
+        currPos = nextPos;
+
+        statReturn = stat(currentDir.c_str(), &statBuf);
+
         if (statReturn) {
             if (errno == ENOENT) {
-                if (mkdir(currentDir.string(), S_IRUSR | S_IWUSR | S_IXUSR)) {
+                if (mkdir(currentDir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR)) {
                     ALOGE("Couldn't create cache directory: %s",
-                          currentDir.string());
+                          currentDir.c_str());
                     ALOGE("Error: %s", strerror(errno));
                     return false;
                 }
@@ -89,7 +120,6 @@
                 return false;
             }
         }
-        currentDir += "/";
     }
     return true;
 }
@@ -156,36 +186,6 @@
 
 void ScriptC::runForEach(Context *rsc,
                          uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-    // Trace this function call.
-    // To avoid overhead, we only build the string, if tracing is actually
-    // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
-    if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
-    }
-    ATRACE_NAME(String);
-    (void)String;
-
-    Context::PushState ps(rsc);
-
-    setupGLState(rsc);
-    setupScript(rsc);
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-
-    if (AString)
-        delete AString;
-}
-
-void ScriptC::runForEach(Context *rsc,
-                         uint32_t slot,
                          const Allocation ** ains,
                          size_t inLen,
                          Allocation * aout,
@@ -195,25 +195,36 @@
     // Trace this function call.
     // To avoid overhead we only build the string if tracing is actually
     // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
+    std::string *traceString = NULL;
+    const char  *stringData  = "";
     if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
+        traceString = new std::string("runForEach_");
+        traceString->append(mHal.info.exportedForeachFuncList[slot].first);
+        stringData = traceString->c_str();
     }
-    ATRACE_NAME(String);
-    (void)String;
+    ATRACE_NAME(stringData);
 
     Context::PushState ps(rsc);
 
     setupGLState(rsc);
     setupScript(rsc);
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != NULL) {
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
 
-    if (AString)
-        delete AString;
+    } else if (inLen <= 1) {
+        // Legacy single-input hook; with no input (inLen == 0) pass NULL.
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot,
+                (inLen == 1) ? ains[0] : NULL, aout, usr, usrBytes, sc);
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
+
+    if (traceString) {
+        delete traceString;
+    }
 }
 
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
diff --git a/rsScriptC.h b/rsScriptC.h
index d3d9d51..5735bea 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -44,14 +44,6 @@
 
     virtual void runForEach(Context *rsc,
                             uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
                             const Allocation ** ains,
                             size_t inLen,
                             Allocation * aout,
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index a41f4a7..cacb37a 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -230,7 +230,17 @@
                 Allocation *in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
                 const RsScriptCall *call) {
-    target->runForEach(rsc, /* root slot */ 0, in, out, usr, usrBytes, call);
+
+    if (in == NULL) {
+        // No input allocation: launch with an empty input list.
+        target->runForEach(rsc, /* root slot */ 0, NULL, 0, out, usr,
+                           usrBytes, call);
+    } else {
+        const Allocation *ins[1] = {in};
+        target->runForEach(rsc, /* root slot */ 0, ins,
+                           sizeof(ins) / sizeof(ins[0]), out, usr,
+                           usrBytes, call);
+    }
 }
 
 void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
index d1dd9d8..f41c65d 100644
--- a/rsScriptGroup.cpp
+++ b/rsScriptGroup.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <algorithm>
+
 #include "rsContext.h"
 #include <time.h>
 
@@ -28,8 +30,8 @@
         mRSC->mHal.funcs.scriptgroup.destroy(mRSC, this);
     }
 
-    for (size_t ct=0; ct < mLinks.size(); ct++) {
-        delete mLinks[ct];
+    for (auto link : mLinks) {
+        delete link;
     }
 }
 
@@ -44,148 +46,116 @@
 }
 
 ScriptGroup::Node * ScriptGroup::findNode(Script *s) const {
-    //ALOGE("find %p   %i", s, (int)mNodes.size());
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
-            if (n->mKernels[ct2]->mScript == s) {
-                return n;
+    for (auto node : mNodes) {
+        for (auto kernelRef : node->mKernels) {
+            if (kernelRef->mScript == s) {
+                return node;
             }
         }
     }
+
     return NULL;
 }
 
-bool ScriptGroup::calcOrderRecurse(Node *n, int depth) {
-    n->mSeen = true;
-    if (n->mOrder < depth) {
-        n->mOrder = depth;
+bool ScriptGroup::calcOrderRecurse(Node *node0, int depth) {
+    node0->mSeen = true;
+    if (node0->mOrder < depth) {
+        node0->mOrder = depth;
     }
     bool ret = true;
-    for (size_t ct=0; ct < n->mOutputs.size(); ct++) {
-        const Link *l = n->mOutputs[ct];
-        Node *nt = NULL;
-        if (l->mDstField.get()) {
-            nt = findNode(l->mDstField->mScript);
+
+    for (auto link : node0->mOutputs) {
+        Node *node1 = NULL;
+        if (link->mDstField.get()) {
+            node1 = findNode(link->mDstField->mScript);
         } else {
-            nt = findNode(l->mDstKernel->mScript);
+            node1 = findNode(link->mDstKernel->mScript);
         }
-        if (nt->mSeen) {
+        if (node1->mSeen) {
             return false;
         }
-        ret &= calcOrderRecurse(nt, n->mOrder + 1);
+        ret &= calcOrderRecurse(node1, node0->mOrder + 1);
     }
+
     return ret;
 }
 
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-static int CompareNodeForSort(ScriptGroup::Node *const* lhs,
-                              ScriptGroup::Node *const* rhs) {
-    if (lhs[0]->mOrder > rhs[0]->mOrder) {
-        return 1;
-    }
-    return 0;
-}
-#else
-class NodeCompare {
-public:
-    bool operator() (const ScriptGroup::Node* lhs,
-                     const ScriptGroup::Node* rhs) {
-        if (lhs->mOrder > rhs->mOrder) {
-            return true;
-        }
-        return false;
-    }
-};
-#endif
-
 bool ScriptGroup::calcOrder() {
     // Make nodes
-    for (size_t ct=0; ct < mKernels.size(); ct++) {
-        const ScriptKernelID *k = mKernels[ct].get();
-        //ALOGE(" kernel %i, %p  s=%p", (int)ct, k, mKernels[ct]->mScript);
-        Node *n = findNode(k->mScript);
-        //ALOGE("    n = %p", n);
-        if (n == NULL) {
-            n = new Node(k->mScript);
-            mNodes.add(n);
+
+    for (auto kernelRef : mKernels) {
+        const ScriptKernelID *kernel = kernelRef.get();
+        Node *node = findNode(kernel->mScript);
+        if (node == NULL) {
+            node = new Node(kernel->mScript);
+            mNodes.push_back(node);
         }
-        n->mKernels.add(k);
+        node->mKernels.push_back(kernel);
     }
 
     // add links
-    //ALOGE("link count %i", (int)mLinks.size());
-    for (size_t ct=0; ct < mLinks.size(); ct++) {
-        Link *l = mLinks[ct];
-        //ALOGE("link  %i %p", (int)ct, l);
-        Node *n = findNode(l->mSource->mScript);
-        //ALOGE("link n %p", n);
-        n->mOutputs.add(l);
+    for (auto link : mLinks) {
+        Node *node = findNode(link->mSource->mScript);
+        node->mOutputs.push_back(link);
 
-        if (l->mDstKernel.get()) {
-            //ALOGE("l->mDstKernel.get() %p", l->mDstKernel.get());
-            n = findNode(l->mDstKernel->mScript);
-            //ALOGE("  n1 %p", n);
-            n->mInputs.add(l);
+        if (link->mDstKernel.get()) {
+            node = findNode(link->mDstKernel->mScript);
+            node->mInputs.push_back(link);
         } else {
-            n = findNode(l->mDstField->mScript);
-            //ALOGE("  n2 %p", n);
-            n->mInputs.add(l);
+            node = findNode(link->mDstField->mScript);
+            node->mInputs.push_back(link);
         }
     }
 
-    //ALOGE("node count %i", (int)mNodes.size());
     // Order nodes
     bool ret = true;
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        if (n->mInputs.size() == 0) {
-            for (size_t ct2=0; ct2 < mNodes.size(); ct2++) {
-                mNodes[ct2]->mSeen = false;
+    for (auto n0 : mNodes) {
+        if (n0->mInputs.size() == 0) {
+            for (auto n1 : mNodes) {
+                n1->mSeen = false;
             }
-            ret &= calcOrderRecurse(n, 0);
+            ret &= calcOrderRecurse(n0, 1);
         }
     }
 
-    for (size_t ct=0; ct < mKernels.size(); ct++) {
-        const ScriptKernelID *k = mKernels[ct].get();
-        const Node *n = findNode(k->mScript);
+    for (auto kernelRef : mKernels) {
+        const ScriptKernelID *kernel = kernelRef.get();
+        const Node *node = findNode(kernel->mScript);
 
-        if (k->mHasKernelOutput) {
+        if (kernel->mHasKernelOutput) {
             bool found = false;
-            for (size_t ct2=0; ct2 < n->mOutputs.size(); ct2++) {
-                if (n->mOutputs[ct2]->mSource.get() == k) {
+            for (auto output : node->mOutputs) {
+                if (output->mSource.get() == kernel) {
                     found = true;
                     break;
                 }
             }
+
             if (!found) {
-                //ALOGE("add io out %p", k);
-                mOutputs.add(new IO(k));
+                mOutputs.push_back(new IO(kernel));
             }
         }
 
-        if (k->mHasKernelInput) {
+        if (kernel->mHasKernelInput) {
             bool found = false;
-            for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
-                if (n->mInputs[ct2]->mDstKernel.get() == k) {
+            for (auto input : node->mInputs) {
+                if (input->mDstKernel.get() == kernel) {
                     found = true;
                     break;
                 }
             }
             if (!found) {
-                //ALOGE("add io in %p", k);
-                mInputs.add(new IO(k));
+                mInputs.push_back(new IO(kernel));
             }
         }
     }
 
     // sort
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-    mNodes.sort(&CompareNodeForSort);
-#else
-    std::sort(mNodes.begin(), mNodes.end(), NodeCompare());
-#endif
+    std::stable_sort(mNodes.begin(), mNodes.end(),
+                     [](const ScriptGroup::Node* lhs,
+                        const ScriptGroup::Node* rhs) {
+        return lhs->mOrder < rhs->mOrder;
+    });
 
     return ret;
 }
@@ -209,7 +179,7 @@
 
     sg->mKernels.reserve(kernelCount);
     for (size_t ct=0; ct < kernelCount; ct++) {
-        sg->mKernels.add(kernels[ct]);
+        sg->mKernels.push_back(kernels[ct]);
     }
 
     sg->mLinks.reserve(linkCount);
@@ -219,7 +189,7 @@
         l->mSource = src[ct];
         l->mDstField = dstF[ct];
         l->mDstKernel = dstK[ct];
-        sg->mLinks.add(l);
+        sg->mLinks.push_back(l);
     }
 
     sg->calcOrder();
@@ -254,9 +224,9 @@
 }
 
 void ScriptGroup::setInput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (size_t ct=0; ct < mInputs.size(); ct++) {
-        if (mInputs[ct]->mKernel == kid) {
-            mInputs[ct]->mAlloc = a;
+    for (auto input : mInputs) {
+        if (input->mKernel == kid) {
+            input->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setInput) {
                 rsc->mHal.funcs.scriptgroup.setInput(rsc, this, kid, a);
@@ -268,9 +238,9 @@
 }
 
 void ScriptGroup::setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (size_t ct=0; ct < mOutputs.size(); ct++) {
-        if (mOutputs[ct]->mKernel == kid) {
-            mOutputs[ct]->mAlloc = a;
+    for (auto output : mOutputs) {
+        if (output->mKernel == kid) {
+            output->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setOutput) {
                 rsc->mHal.funcs.scriptgroup.setOutput(rsc, this, kid, a);
@@ -311,44 +281,45 @@
         return;
     }
 
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        //ALOGE("node %i, order %i, in %i out %i", (int)ct, n->mOrder, (int)n->mInputs.size(), (int)n->mOutputs.size());
-
-        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
-            const ScriptKernelID *k = n->mKernels[ct2];
-            Allocation *ain = NULL;
+    for (auto node : mNodes) {
+        for (auto kernel : node->mKernels) {
+            Allocation *ain  = NULL;
             Allocation *aout = NULL;
 
-            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
-                if (n->mInputs[ct3]->mDstKernel.get() == k) {
-                    ain = n->mInputs[ct3]->mAlloc.get();
-                    //ALOGE(" link in %p", ain);
-                }
-            }
-            for (size_t ct3=0; ct3 < mInputs.size(); ct3++) {
-                if (mInputs[ct3]->mKernel == k) {
-                    ain = mInputs[ct3]->mAlloc.get();
-                    //ALOGE(" io in %p", ain);
+            for (auto nodeInput : node->mInputs) {
+                if (nodeInput->mDstKernel.get() == kernel) {
+                    ain = nodeInput->mAlloc.get();
                 }
             }
 
-            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
-                if (n->mOutputs[ct3]->mSource.get() == k) {
-                    aout = n->mOutputs[ct3]->mAlloc.get();
-                    //ALOGE(" link out %p", aout);
-                }
-            }
-            for (size_t ct3=0; ct3 < mOutputs.size(); ct3++) {
-                if (mOutputs[ct3]->mKernel == k) {
-                    aout = mOutputs[ct3]->mAlloc.get();
-                    //ALOGE(" io out %p", aout);
+            for (auto sgInput : mInputs) {
+                if (sgInput->mKernel == kernel) {
+                    ain = sgInput->mAlloc.get();
                 }
             }
 
-            n->mScript->runForEach(rsc, k->mSlot, ain, aout, NULL, 0);
+            for (auto nodeOutput : node->mOutputs) { // links leaving node
+                if (nodeOutput->mSource.get() == kernel) {
+                    aout = nodeOutput->mAlloc.get();
+                }
+            }
+            // A group-level output bound to this kernel takes precedence.
+            for (auto sgOutput : mOutputs) {
+                if (sgOutput->mKernel == kernel) {
+                    aout = sgOutput->mAlloc.get();
+                }
+            }
+            // Launch with an empty input list when nothing feeds the kernel.
+            if (ain == NULL) {
+                node->mScript->runForEach(rsc, kernel->mSlot, NULL, 0, aout,
+                                          NULL, 0);
+            } else {
+                const Allocation *ains[1] = {ain};
+                node->mScript->runForEach(rsc, kernel->mSlot, ains,
+                                          sizeof(ains) / sizeof(ains[0]),
+                                          aout, NULL, 0);
+            }
         }
-
     }
 
 }
@@ -389,24 +360,20 @@
 
 void rsi_ScriptGroupSetInput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
-    //ALOGE("rsi_ScriptGroupSetInput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setInput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
 
 void rsi_ScriptGroupSetOutput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
-    //ALOGE("rsi_ScriptGroupSetOutput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setOutput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
 
 void rsi_ScriptGroupExecute(Context *rsc, RsScriptGroup sg) {
-    //ALOGE("rsi_ScriptGroupExecute");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->execute(rsc);
 }
 
 }
 }
-
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
index af98b50..974e3ba 100644
--- a/rsScriptGroup.h
+++ b/rsScriptGroup.h
@@ -32,7 +32,7 @@
 
 class ScriptGroup : public ObjectBase {
 public:
-    Vector<ObjectBaseRef<ScriptKernelID> > mKernels;
+    std::vector<ObjectBaseRef<ScriptKernelID> > mKernels;
 
     class Link {
     public:
@@ -49,9 +49,9 @@
     public:
         Node(Script *);
 
-        Vector<const ScriptKernelID *> mKernels;
-        Vector<Link *> mOutputs;
-        Vector<Link *> mInputs;
+        std::vector<const ScriptKernelID *> mKernels;
+        std::vector<Link *> mOutputs;
+        std::vector<Link *> mInputs;
         bool mSeen;
         int mOrder;
         Script *mScript;
@@ -65,10 +65,10 @@
         ObjectBaseRef<Allocation> mAlloc;
     };
 
-    Vector<Link *> mLinks;
-    Vector<Node *> mNodes;
-    Vector<IO *> mInputs;
-    Vector<IO *> mOutputs;
+    std::vector<Link *> mLinks;
+    std::vector<Node *> mNodes;
+    std::vector<IO *> mInputs;
+    std::vector<IO *> mOutputs;
 
     struct Hal {
         void * drv;
@@ -115,4 +115,3 @@
 }
 }
 #endif
-
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 86f1c50..7461d34 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -55,18 +55,6 @@
     return 0;
 }
 
-
-void ScriptIntrinsic::runForEach(Context *rsc,
-                         uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-}
-
 void ScriptIntrinsic::runForEach(Context* rsc,
                          uint32_t slot,
                          const Allocation** ains,
@@ -76,7 +64,18 @@
                          size_t usrBytes,
                          const RsScriptCall* sc) {
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != NULL) {
+        // Preferred path: the driver accepts the whole input array.
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
+    } else if (inLen <= 1) {
+        // Legacy single-input hook; with no input (inLen == 0) pass NULL.
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot,
+                (inLen == 1) ? ains[0] : NULL, aout, usr, usrBytes, sc);
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
 }
 
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
@@ -107,5 +106,3 @@
 
 }
 }
-
-
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 66b6031..87b7353 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -40,17 +40,9 @@
     virtual RsA3DClassID getClassId() const;
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
@@ -69,5 +61,3 @@
 }
 }
 #endif
-
-
diff --git a/rsType.cpp b/rsType.cpp
index 31d6ce8..d009816 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -33,10 +33,14 @@
 }
 
 void Type::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateType.mTypes.size(); ct++) {
-        if (mRSC->mStateType.mTypes[ct] == this) {
-            mRSC->mStateType.mTypes.removeAt(ct);
-            break;
+    // Remove this type from mRSC's type cache (first match only).
+    auto &cache = mRSC->mStateType.mTypes;
+
+    for (auto it = cache.begin(); it != cache.end(); ++it) {
+        if (*it == this) {
+            // Leave right after erase(): the iterator is no longer valid.
+            cache.erase(it);
+            return;
         }
     }
 }
@@ -265,7 +269,7 @@
     nt->compute();
 
     ObjectBase::asyncLock();
-    stc->mTypes.push(nt);
+    stc->mTypes.push_back(nt);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsType.h b/rsType.h
index e44e270..86d6ece 100644
--- a/rsType.h
+++ b/rsType.h
@@ -146,7 +146,7 @@
     ~TypeState();
 
     // Cache of all existing types.
-    Vector<Type *> mTypes;
+    std::vector<Type *> mTypes;
 };
 
 
diff --git a/rsg_generator.c b/rsg_generator.c
index d0f0b7c..2558f67 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -294,7 +294,9 @@
                 const VarType *vt = &api->params[ct2];
                 needFlush += vt->ptrLevel;
                 if (vt->ptrLevel && hasInlineDataPointers(api)) {
-                    fprintf(f, "    if (dataSize < io->getMaxInlineSize()) {\n");
+                    fprintf(f, "    if (%s_length == 0) {\n", vt->name);
+                    fprintf(f, "        cmd->%s = NULL;\n", vt->name);
+                    fprintf(f, "    } else if (dataSize < io->getMaxInlineSize()) {\n");
                     fprintf(f, "        memcpy(payload, %s, %s_length);\n", vt->name, vt->name);
                     fprintf(f, "        cmd->%s = (", vt->name);
                     printVarType(f, vt);
@@ -489,7 +491,8 @@
             needFlush += vt->ptrLevel;
 
             if (hasInlineDataPointers(api) && vt->ptrLevel) {
-                fprintf(f, ",\n           (const %s *)&baseData[(intptr_t)cmd->%s]", vt->typeName, vt->name);
+                fprintf(f, ",\n           cmd->%s_length == 0 ? NULL : (const %s *)&baseData[(intptr_t)cmd->%s]",
+                        vt->name, vt->typeName, vt->name);
             } else {
                 fprintf(f, ",\n           cmd->%s", vt->name);
             }