Merge "Split the RsForEachStubParamStruct in two."
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 499f890..a0564fc 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -48,7 +48,7 @@
 using namespace android::renderscript;
 
 typedef void (*outer_foreach_t)(
-    const android::renderscript::RsForEachStubParamStruct *,
+    const android::renderscript::RsExpandKernelParams *,
     uint32_t x1, uint32_t x2,
     uint32_t instep, uint32_t outstep);
 
@@ -353,17 +353,21 @@
 
 static void wc_xy(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
+
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
+
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
     while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
         uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd = yStart + mtls->mSliceSize;
+        uint32_t yEnd   = yStart + mtls->mSliceSize;
+
         yEnd = rsMin(yEnd, mtls->yEnd);
+
         if (yEnd <= yStart) {
             return;
         }
@@ -371,29 +375,39 @@
         //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
 
-        for (p.y = yStart; p.y < yEnd; p.y++) {
-            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
-                    (mtls->fep.eStrideOut * mtls->xStart);
-            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
-                   (mtls->fep.eStrideIn * mtls->xStart);
-            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+        for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+            kparams.out = mtls->fep.ptrOut +
+                          (mtls->fep.yStrideOut * kparams.y) +
+                          (mtls->fep.eStrideOut * mtls->xStart);
+
+            kparams.in = mtls->fep.ptrIn +
+                         (mtls->fep.yStrideIn * kparams.y) +
+                         (mtls->fep.eStrideIn * mtls->xStart);
+
+
+            fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+               mtls->fep.eStrideOut);
         }
     }
 }
 
 static void wc_x(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
+
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
+
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
     while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
         uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd = xStart + mtls->mSliceSize;
+        uint32_t xEnd   = xStart + mtls->mSliceSize;
+
         xEnd = rsMin(xEnd, mtls->xEnd);
+
         if (xEnd <= xStart) {
             return;
         }
@@ -401,14 +415,15 @@
         //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
 
-        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
-        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+        kparams.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        kparams.in  = mtls->fep.ptrIn  + (mtls->fep.eStrideIn  * xStart);
+
+        fn(&kparams, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
     }
 }
 
 void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
-                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
+                                        const RsScriptCall *sc, MTLaunchStruct *mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
@@ -457,22 +472,34 @@
 
         //ALOGE("launch 1");
     } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
 
         //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
-                                      mtls->fep.dimY * p.z + p.y;
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
-                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
-                           (mtls->fep.eStrideIn * mtls->xStart);
-                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
+
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
+
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
+
+                    uint32_t offset =
+                      kparams.dimY * kparams.dimZ * arrayIndex +
+                      kparams.dimY * kparams.z + kparams.y;
+
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
+
+                    kparams.in = mtls->fep.ptrIn +
+                                 (mtls->fep.yStrideIn * offset) +
+                                 (mtls->fep.eStrideIn * mtls->xStart);
+
+                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn,
+                       mtls->fep.eStrideOut);
                 }
             }
         }
@@ -529,41 +556,46 @@
 
         //ALOGE("launch 1");
     } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
 
         // Allocate space for our input base pointers.
-        p.ins = new const void*[inLen];
+        kparams.ins = new const void*[inLen];
 
         // Allocate space for our input stride information.
-        p.eStrideIns = new uint32_t[inLen];
+        kparams.eStrideIns = new uint32_t[inLen];
 
         // Fill our stride information.
-        for (int index = inLen; --index >= 0;) {
-          p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.eStrideIns[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
 
         //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        uint32_t offset_invariant = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0];
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            uint32_t offset_part = offset_invariant * p.ar[0];
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
 
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
 
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
+                    uint32_t offset =
+                      mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
+                      mtls->fep.dimY * kparams.z + kparams.y;
 
-                    for (int index = inLen; --index >= 0;) {
-                        StridePair &strides = mtls->fep.inStrides[index];
+                    kparams.out = mtls->fep.ptrOut +
+                                  (mtls->fep.yStrideOut * offset) +
+                                  (mtls->fep.eStrideOut * mtls->xStart);
 
-                        p.ins[index] = mtls->fep.ptrIns[index] +
-                                       (strides.yStride * offset) +
-                                       (strides.eStride * mtls->xStart);
+                    for (int inIndex = inLen; --inIndex >= 0;) {
+                        StridePair &strides = mtls->fep.inStrides[inIndex];
+
+                        kparams.ins[inIndex] =
+                          mtls->fep.ptrIns[inIndex] +
+                          (strides.yStride * offset) +
+                          (strides.eStride * mtls->xStart);
                     }
 
                     /*
@@ -571,14 +603,15 @@
                      * kernels get their stride information from a member of p
                      * that points to an array.
                      */
-                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                    fn(&kparams, mtls->xStart, mtls->xEnd, 0,
+                       mtls->fep.eStrideOut);
                 }
             }
         }
 
         // Free our arrays.
-        delete[] p.ins;
-        delete[] p.eStrideIns;
+        delete[] kparams.ins;
+        delete[] kparams.eStrideIns;
     }
 }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index c54dca2..5d4b6cc 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -34,32 +34,18 @@
 namespace android {
 namespace renderscript {
 
-typedef struct {
+struct StridePair {
   uint32_t eStride;
   uint32_t yStride;
-} StridePair;
+};
 
-typedef struct {
-    const void *in;
-    void *out;
+struct RsExpandKernelDriverInfo {
     const void *usr;
     uint32_t usrLen;
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-    uint32_t lod;
-    RsAllocationCubemapFace face;
-    uint32_t ar[16];
-
-    const void **ins;
-    uint32_t *eStrideIns;
-
-    uint32_t lid;
 
     uint32_t dimX;
     uint32_t dimY;
     uint32_t dimZ;
-    uint32_t dimArray;
 
     const uint8_t *ptrIn;
     uint8_t *ptrOut;
@@ -71,7 +57,54 @@
 
     const uint8_t** ptrIns;
     StridePair* inStrides;
-} RsForEachStubParamStruct;
+
+    ~RsExpandKernelDriverInfo() {
+        if (ptrIns != NULL) {
+            delete[] ptrIns;
+        }
+
+        if (inStrides != NULL) {
+            delete[] inStrides;
+        }
+    }
+};
+
+struct RsExpandKernelParams {
+
+    // Used by kernels
+    const void *in;
+    void *out;
+    uint32_t y;
+    uint32_t z;
+    uint32_t lid;
+
+    const void **ins;
+    uint32_t *eStrideIns;
+
+    // Used by ScriptGroup and user kernels.
+    const void *usr;
+
+    // Used by intrinsics
+    uint32_t dimX;
+    uint32_t dimY;
+    uint32_t dimZ;
+
+    /*
+     * FIXME: This is only used by the blend intrinsic.  If possible, we should
+     *        modify blend to not need it.
+     */
+    uint32_t slot;
+
+    /// Copy fields needed by a kernel from a driver struct.
+    void takeFields(const RsExpandKernelDriverInfo &dstruct) {
+        this->usr  = dstruct.usr;
+        this->slot = dstruct.slot;
+
+        this->dimX = dstruct.dimX;
+        this->dimY = dstruct.dimY;
+        this->dimZ = dstruct.dimZ;
+    }
+};
 
 extern bool gArchUseSIMD;
 
@@ -89,7 +122,7 @@
 } ScriptTLSStruct;
 
 typedef struct {
-    RsForEachStubParamStruct fep;
+    RsExpandKernelDriverInfo fep;
 
     RsdCpuReferenceImpl *rsc;
     RsdCpuScriptImpl *script;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index a7c9487..c839c19 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -38,7 +38,7 @@
 protected:
     ObjectBaseRef<Allocation> mLUT;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
 };
@@ -58,7 +58,7 @@
                                       int dimx, int dimy, int dimz);
 
 
-void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
                                       uint32_t xstart, uint32_t xend,
                                       uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 228b887..b604658 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -33,7 +33,7 @@
     RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
 };
@@ -110,7 +110,7 @@
 extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 #endif
 
-void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index c1ca4e2..22e1176 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -44,10 +44,10 @@
     int mIradius;
     ObjectBaseRef<Allocation> mAlloc;
 
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
     void ComputeGaussianWeights();
@@ -113,7 +113,7 @@
 
 
 
-static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+static void OneVU4(const RsExpandKernelParams *p, float4 *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x*4;
@@ -131,7 +131,7 @@
     out->xyzw = blurredPixel;
 }
 
-static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
+static void OneVU1(const RsExpandKernelParams *p, float *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x;
@@ -243,7 +243,7 @@
     }
 }
 
-static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+static void OneHU4(const RsExpandKernelParams *p, uchar4 *out, int32_t x,
                    const float4 *ptrIn, const float* gPtr, int iradius) {
 
     float4 blurredPixel = 0;
@@ -258,7 +258,7 @@
     out->xyzw = convert_uchar4(blurredPixel);
 }
 
-static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
+static void OneHU1(const RsExpandKernelParams *p, uchar *out, int32_t x,
                    const float *ptrIn, const float* gPtr, int iradius) {
 
     float blurredPixel = 0;
@@ -274,7 +274,7 @@
 }
 
 
-void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t instep, uint32_t outstep) {
 
@@ -345,7 +345,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t instep, uint32_t outstep) {
     float buf[4 * 2048];
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 9b234f4..a194048 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -187,7 +187,7 @@
     FunctionTab_t mFnTab;
 #endif
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
     void updateCoeffCache(float fpMul, float addMul);
@@ -777,7 +777,7 @@
 }
 
 
-static void One(const RsForEachStubParamStruct *p, void *out,
+static void One(const RsExpandKernelParams *p, void *out,
                 const void *py, const float* coeff, const float *add,
                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
 
@@ -878,7 +878,7 @@
     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
                                               uint32_t xstart, uint32_t xend,
                                               uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a835..d5af88c 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -42,22 +42,22 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
 };
@@ -88,7 +88,7 @@
                                           const void *y2, const short *coef, uint32_t count);
 
 
-static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void ConvolveOneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
                           const float* coeff) {
 
@@ -110,7 +110,7 @@
     *out = o;
 }
 
-static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void ConvolveOneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
                           const float* coeff) {
 
@@ -131,7 +131,7 @@
     *out = convert_uchar2(px);
 }
 
-static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void ConvolveOneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                           const uchar *py0, const uchar *py1, const uchar *py2,
                           const float* coeff) {
 
@@ -150,7 +150,7 @@
     *out = clamp(px, 0.f, 255.f);
 }
 
-static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void ConvolveOneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                           const float4 *py0, const float4 *py1, const float4 *py2,
                           const float* coeff) {
 
@@ -161,7 +161,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void ConvolveOneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                           const float2 *py0, const float2 *py1, const float2 *py2,
                           const float* coeff) {
 
@@ -172,7 +172,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void ConvolveOneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                           const float *py0, const float *py1, const float *py2,
                           const float* coeff) {
 
@@ -183,7 +183,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
@@ -230,7 +230,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
@@ -275,7 +275,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
@@ -320,7 +320,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
@@ -365,7 +365,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
@@ -409,7 +409,7 @@
         }
     }
 }
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 48b5ca5..8421175 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -42,22 +42,22 @@
     ObjectBaseRef<Allocation> alloc;
 
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
 
@@ -86,7 +86,7 @@
 }
 
 
-static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void OneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
                   const float* coeff) {
 
@@ -129,7 +129,7 @@
     *out = convert_uchar4(px);
 }
 
-static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void OneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
                   const float* coeff) {
 
@@ -172,7 +172,7 @@
     *out = convert_uchar2(px);
 }
 
-static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void OneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
                   const float* coeff) {
 
@@ -215,7 +215,7 @@
     *out = px;
 }
 
-static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void OneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
                   const float* coeff) {
 
@@ -257,7 +257,7 @@
     *out = px;
 }
 
-static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void OneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
                   const float* coeff) {
 
@@ -299,7 +299,7 @@
     *out = px;
 }
 
-static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void OneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
                   const float* coeff) {
 
@@ -346,7 +346,7 @@
                                           const void *y2, const void *y3, const void *y4,
                                           const short *coef, uint32_t count);
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
@@ -406,7 +406,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
@@ -455,7 +455,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
@@ -504,7 +504,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
@@ -553,7 +553,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
@@ -602,7 +602,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index cdfe7d1..1c430b7 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -49,29 +49,29 @@
     int *mSums;
     ObjectBaseRef<Allocation> mAllocOut;
 
-    static void kernelP1U4(const RsForEachStubParamStruct *p,
+    static void kernelP1U4(const RsExpandKernelParams *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
-    static void kernelP1U3(const RsForEachStubParamStruct *p,
+    static void kernelP1U3(const RsExpandKernelParams *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
-    static void kernelP1U2(const RsForEachStubParamStruct *p,
+    static void kernelP1U2(const RsExpandKernelParams *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
-    static void kernelP1U1(const RsForEachStubParamStruct *p,
+    static void kernelP1U1(const RsExpandKernelParams *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
 
-    static void kernelP1L4(const RsForEachStubParamStruct *p,
+    static void kernelP1L4(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
                            uint32_t instep, uint32_t outstep);
-    static void kernelP1L3(const RsForEachStubParamStruct *p,
+    static void kernelP1L3(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
                            uint32_t instep, uint32_t outstep);
-    static void kernelP1L2(const RsForEachStubParamStruct *p,
+    static void kernelP1L2(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
                            uint32_t instep, uint32_t outstep);
-    static void kernelP1L1(const RsForEachStubParamStruct *p,
+    static void kernelP1L1(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
                            uint32_t instep, uint32_t outstep);
 
@@ -160,7 +160,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -177,7 +177,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -193,7 +193,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -208,7 +208,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -226,7 +226,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -243,7 +243,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -259,7 +259,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
@@ -274,7 +274,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 5b2adc5..db73a83 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -38,7 +38,7 @@
 protected:
     ObjectBaseRef<Allocation> lut;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
 };
@@ -53,7 +53,7 @@
 }
 
 
-void RsdCpuScriptIntrinsicLUT::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
                                       uint32_t xstart, uint32_t xend,
                                       uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
index c31fcdf..45f85e5 100644
--- a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
+++ b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
@@ -174,7 +174,7 @@
     ObjectBaseRef<Allocation> mFrameBuffer;
 
     void doLoopFilter();
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
 };
@@ -182,7 +182,7 @@
 }
 }
 
-void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicLoopFilter::kernel(const RsExpandKernelParams *p,
                                              uint32_t xstart, uint32_t xend,
                                              uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicLoopFilter *cp = (RsdCpuScriptIntrinsicLoopFilter*)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index d18eb8f..af1127e 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -46,13 +46,13 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
 };
@@ -179,7 +179,7 @@
     return (uchar)p;
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
@@ -219,7 +219,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
@@ -259,7 +259,7 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 563b3e1..d9ab98c 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -46,7 +46,7 @@
 protected:
     ObjectBaseRef<Allocation> alloc;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
                        uint32_t instep, uint32_t outstep);
 };
@@ -101,7 +101,7 @@
 extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
-void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsExpandKernelParams *p,
                                            uint32_t xstart, uint32_t xend,
                                            uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index f4ca1ed..d51e9e3 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -47,7 +47,7 @@
 class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
 public:
     typedef void (*outer_foreach_t)(
-        const RsForEachStubParamStruct *,
+        const RsExpandKernelParams *,
         uint32_t x1, uint32_t x2,
         uint32_t instep, uint32_t outstep);
 #ifdef RS_COMPATIBILITY_LIB
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 40eddf2..1e42185 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -44,66 +44,71 @@
 }
 
 
-typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
+typedef void (*ScriptGroupRootFunc_t)(const RsExpandKernelParams *kparams,
                                       uint32_t xstart, uint32_t xend,
                                       uint32_t instep, uint32_t outstep);
 
-void CpuScriptGroupImpl::scriptGroupRoot(const RsForEachStubParamStruct *p,
+void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams, // calls every function in the ScriptList stored in kparams->usr, in list order
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t instep, uint32_t outstep) {
 
 
-    const ScriptList *sl = (const ScriptList *)p->usr;
-    RsForEachStubParamStruct *mp = (RsForEachStubParamStruct *)p;
-    const void *oldUsr = p->usr;
+    const ScriptList *sl            = (const ScriptList *)kparams->usr; // on entry, usr holds the ScriptList
+    RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams; // writable alias of kparams (const is cast away) so usr/in/out can be rewritten per entry
 
-    for(size_t ct=0; ct < sl->count; ct++) {
+    for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
-        func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
-        mp->usr = sl->usrPtrs[ct];
+        func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
+        mkparams->usr = sl->usrPtrs[ct]; // each entry is called with its own usr pointer
 
-        mp->ptrIn = NULL;
-        mp->in = NULL;
-        mp->ptrOut = NULL;
-        mp->out = NULL;
+        mkparams->in  = NULL; // reset so an entry without an input allocation sees NULL
+        mkparams->out = NULL; // reset so an entry without an output allocation sees NULL
 
         uint32_t istep = 0;
         uint32_t ostep = 0;
 
         if (sl->ins[ct]) {
-            mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+            mkparams->in = // start from the base pointer of the input allocation (lod[0].mallocPtr)
+              (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+
             istep = sl->ins[ct]->mHal.state.elementSizeBytes;
-            mp->in = mp->ptrIn;
+
             if (sl->inExts[ct]) {
-                mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->ins[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                mkparams->in =
+                  (const uint8_t *)mkparams->in +
+                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y; // inExts[ct] set: advance by y strides
+
+            } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) { // otherwise advance by lid strides, but only while lid < dimY
+                mkparams->in =
+                  (const uint8_t *)mkparams->in +
+                  sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
         }
 
         if (sl->outs[ct]) {
-            mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
-            mp->out = mp->ptrOut;
+            mkparams->out = // start from the base pointer of the output allocation (lod[0].mallocPtr)
+              (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
+
             ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
+
             if (sl->outExts[ct]) {
-                mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->outs[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->y; // outExts[ct] set: advance by y strides
+
+            } else if (sl->outs[ct]->mHal.drvState.lod[0].dimY > kparams->lid) { // otherwise advance by lid strides, but only while lid < dimY
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(p, xstart, xend, istep, ostep);
+        func(kparams, xstart, xend, istep, ostep); // steps are this entry's element sizes (0 when the allocation is absent), not the instep/outstep arguments
     }
     //ALOGE("script group root");
 
-    //ConvolveParams *cp = (ConvolveParams *)p->usr;
-
-    mp->usr = oldUsr;
+    mkparams->usr = sl; // put back the ScriptList pointer that usr held on entry
 }
 
 
@@ -245,5 +250,3 @@
         }
     }
 }
-
-
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 78e179d..71f2dd8 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -33,7 +33,7 @@
     CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
     bool init();
 
-    static void scriptGroupRoot(const RsForEachStubParamStruct *p,
+    static void scriptGroupRoot(const RsExpandKernelParams *p,
                                 uint32_t xstart, uint32_t xend,
                                 uint32_t instep, uint32_t outstep);