Pass RsExpandKernelDriverInfo not RsExpandKernelParams.

Which is to say: retire RsExpandKernelParams and pass RsExpandKernelDriverInfo
directly to kernel wrapper functions instead.

Requires a related change in frameworks/compile/libbcc.

Change-Id: I453f45ec18f389e88e27fcfa57ddf245d077cb98
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 2492c22..f164517 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -49,7 +49,7 @@
 using namespace android::renderscript;
 
 typedef void (*outer_foreach_t)(
-    const android::renderscript::RsExpandKernelParams *,
+    const RsExpandKernelDriverInfo *,
     uint32_t x1, uint32_t x2, uint32_t outstep);
 
 
@@ -348,28 +348,6 @@
                             RsExpandKernelDriverInfo,
                             outer_foreach_t);
 
-static void kparamSetup(RsExpandKernelParams *kparams, const RsExpandKernelDriverInfo *fep) {
-    //ALOGE("kp  usr %p", fep->usr);
-    //ALOGE("kp  slot %i", fep->slot);
-    //ALOGE("kp  dim %i %i %i", fep->dim.x, fep->dim.y, fep->dim.z);
-    //ALOGE("kp  lid %i", fep->lid);
-    //ALOGE("kp  in[0] stide %i  ptr %p", fep->inStride[0], fep->inPtr[0]);
-    //ALOGE("kp  out[0] ptr %p", fep->outPtr[0]);
-    //ALOGE("kp  loc %i %i %i", fep->current.x, fep->current.y, fep->current.z);
-
-    kparams->usr  = fep->usr;
-    kparams->slot = fep->slot;
-    kparams->dimX = fep->dim.x;
-    kparams->dimY = fep->dim.y;
-    kparams->dimZ = fep->dim.z;
-    kparams->lid = fep->lid;
-    kparams->inEStrides = (uint32_t *)&fep->inStride[0];
-    kparams->ins = (const void **)&fep->inPtr[0];
-    kparams->out = fep->outPtr[0];
-    kparams->y = fep->current.y;
-    kparams->z = fep->current.z;
-}
-
 static inline void FepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
                                uint32_t x, uint32_t y,
                                uint32_t z = 0, uint32_t lod = 0,
@@ -436,9 +414,7 @@
                         mtls->fep.current.array[0], mtls->fep.current.array[1],
                         mtls->fep.current.array[2], mtls->fep.current.array[3]);
 
-            RsExpandKernelParams kparams;
-            kparamSetup(&kparams, &mtls->fep);
-            fn(&kparams, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
+            fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
         }
     }
 
@@ -464,10 +440,7 @@
         for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
             FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
 
-            RsExpandKernelParams kparams;
-            kparamSetup(&kparams, &fep);
-
-            fn(&kparams, mtls->start.x, mtls->end.x, fep.outStride[0]);
+            fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
         }
     }
 }
@@ -491,10 +464,7 @@
 
         FepPtrSetup(mtls, &fep, xStart, 0);
 
-        RsExpandKernelParams kparams;
-        kparamSetup(&kparams, &fep);
-
-        fn(&kparams, xStart, xEnd, fep.outStride[0]);
+        fn(&fep, xStart, xEnd, fep.outStride[0]);
     }
 }
 
@@ -579,9 +549,7 @@
                             mtls->fep.current.array[0], mtls->fep.current.array[1],
                             mtls->fep.current.array[2], mtls->fep.current.array[3]);
 
-                RsExpandKernelParams kparams;
-                kparamSetup(&kparams, &mtls->fep);
-                fn(&kparams, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
+                fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
             }
         }
     }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 5f1913f..afe8ef5 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,9 +25,6 @@
 #include "rsScriptC.h"
 #include "rsCpuCoreRuntime.h"
 
-
-#define RS_KERNEL_INPUT_LIMIT 8
-
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -37,51 +34,6 @@
 namespace android {
 namespace renderscript {
 
-struct StridePair {
-  uint32_t eStride;
-  uint32_t yStride;
-};
-
-struct RsLaunchDimensions {
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-    uint32_t lod;
-    uint32_t face;
-    uint32_t array[4 /*make a define*/];
-};
-
-struct RsExpandKernelDriverInfo {
-    // Warning: This structure is shared with the compiler
-    // Any change to the fields here requires a matching compiler change
-
-    const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
-    uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
-    uint32_t inLen;
-
-    uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
-    uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
-    uint32_t outLen;
-
-    // Dimension of the launch
-    RsLaunchDimensions dim;
-
-    // The walking itterator of the launch
-    RsLaunchDimensions current;
-
-    const void *usr;
-    uint32_t usrLen;
-
-
-
-    // Items below this line are not used by the compiler and can be change in the driver
-    uint32_t lid;
-    uint32_t slot;
-
-};
-
-typedef ::RsExpandKernelParams RsExpandKernelParams;
-
 extern bool gArchUseSIMD;
 
 typedef void (* InvokeFunc_t)(void);
diff --git a/cpu_ref/rsCpuCoreRuntime.h b/cpu_ref/rsCpuCoreRuntime.h
index 19add1b..c7841ec 100644
--- a/cpu_ref/rsCpuCoreRuntime.h
+++ b/cpu_ref/rsCpuCoreRuntime.h
@@ -20,28 +20,45 @@
 #ifndef RSD_CPU_CORE_RUNTIME_H
 #define RSD_CPU_CORE_RUNTIME_H
 
-struct RsExpandKernelParams {
+// Warning: This value is shared with the compiler
+// Any change to this value requires a matching compiler change
+#define RS_KERNEL_INPUT_LIMIT 8
 
-    // Used by kernels
-    const void **ins;
-    uint32_t *inEStrides;
-    void *out;
+struct RsLaunchDimensions {
+    // Warning: This structure is shared with the compiler
+    // Any change to the fields here requires a matching compiler change
+
+    uint32_t x;
     uint32_t y;
     uint32_t z;
-    uint32_t lid;
+    uint32_t lod;
+    uint32_t face;
+    uint32_t array[4 /*make a define*/];
+};
 
-    // Used by ScriptGroup and user kernels.
+struct RsExpandKernelDriverInfo {
+    // Warning: This structure is shared with the compiler
+    // Any change to the fields here requires a matching compiler change
+
+    const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
+    uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
+    uint32_t inLen;
+
+    uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
+    uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
+    uint32_t outLen;
+
+    // Dimension of the launch
+    struct RsLaunchDimensions dim;
+
+    // The walking iterator of the launch
+    struct RsLaunchDimensions current;
+
     const void *usr;
+    uint32_t usrLen;
 
-    // Used by intrinsics
-    uint32_t dimX;
-    uint32_t dimY;
-    uint32_t dimZ;
-
-    /*
-     * FIXME: This is only used by the blend intrinsic.  If possible, we should
-     *        modify blur to not need it.
-     */
+    // Items below this line are not used by the compiler and can be changed in the driver
+    uint32_t lid;
     uint32_t slot;
 };
 
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index 86d0478..734e5e5 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -38,7 +38,7 @@
 protected:
     ObjectBaseRef<Allocation> mLUT;
 
-    static void kernel(const RsExpandKernelParams *p,
+    static void kernel(const RsExpandKernelDriverInfo *info,
                        uint32_t xstart, uint32_t xend,
                        uint32_t outstep);
 };
@@ -58,13 +58,13 @@
                                       int dimx, int dimy, int dimz);
 
 
-void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelDriverInfo *info,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t outstep) {
-    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
+    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)info->usr;
 
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->ins[0];
+    uchar4 *out = (uchar4 *)info->outPtr[0];
+    uchar4 *in = (uchar4 *)info->inPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 27a02b7..16348c6 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -33,7 +33,7 @@
     RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    static void kernel(const RsExpandKernelParams *p, uint32_t xstart,
+    static void kernel(const RsExpandKernelDriverInfo *info, uint32_t xstart,
                        uint32_t xend, uint32_t outstep);
 };
 
@@ -109,24 +109,24 @@
 extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 #endif
 
-void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelDriverInfo *info,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t outstep) {
-    RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)p->usr;
+    RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)info->usr;
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->ins[0];
+    uchar4 *out = (uchar4 *)info->outPtr[0];
+    uchar4 *in = (uchar4 *)info->inPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
     if (gArchUseSIMD) {
-        if (rsdIntrinsicBlend_K(out, in, p->slot, x1, x2) >= 0)
+        if (rsdIntrinsicBlend_K(out, in, info->slot, x1, x2) >= 0)
             return;
     }
 #endif
-    switch (p->slot) {
+    switch (info->slot) {
     case BLEND_CLEAR:
         for (;x1 < x2; x1++, out++) {
             *out = 0;
@@ -483,7 +483,7 @@
         break;
 
     default:
-        ALOGE("Called unimplemented value %d", p->slot);
+        ALOGE("Called unimplemented value %d", info->slot);
         rsAssert(false);
 
     }
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index ffdb74b..a3ed1d1 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -44,10 +44,10 @@
     int mIradius;
     ObjectBaseRef<Allocation> mAlloc;
 
-    static void kernelU4(const RsExpandKernelParams *p,
+    static void kernelU4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU1(const RsExpandKernelParams *p,
+    static void kernelU1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
     void ComputeGaussianWeights();
@@ -113,7 +113,7 @@
 
 
 
-static void OneVU4(const RsExpandKernelParams *p, float4 *out, int32_t x, int32_t y,
+static void OneVU4(const RsExpandKernelDriverInfo *info, float4 *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x*4;
@@ -121,7 +121,7 @@
     float4 blurredPixel = 0;
     for (int r = -iradius; r <= iradius; r ++) {
         int validY = rsMax((y + r), 0);
-        validY = rsMin(validY, (int)(p->dimY - 1));
+        validY = rsMin(validY, (int)(info->dim.y- 1));
         const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
         float4 pf = convert_float4(pvy[0]);
         blurredPixel += pf * gPtr[0];
@@ -131,7 +131,7 @@
     out[0] = blurredPixel;
 }
 
-static void OneVU1(const RsExpandKernelParams *p, float *out, int32_t x, int32_t y,
+static void OneVU1(const RsExpandKernelDriverInfo *info, float *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x;
@@ -139,7 +139,7 @@
     float blurredPixel = 0;
     for (int r = -iradius; r <= iradius; r ++) {
         int validY = rsMax((y + r), 0);
-        validY = rsMin(validY, (int)(p->dimY - 1));
+        validY = rsMin(validY, (int)(info->dim.y - 1));
         float pf = (float)pi[validY * iStride];
         blurredPixel += pf * gPtr[0];
         gPtr++;
@@ -247,13 +247,13 @@
     }
 }
 
-static void OneHU4(const RsExpandKernelParams *p, uchar4 *out, int32_t x,
+static void OneHU4(const RsExpandKernelDriverInfo *info, uchar4 *out, int32_t x,
                    const float4 *ptrIn, const float* gPtr, int iradius) {
 
     float4 blurredPixel = 0;
     for (int r = -iradius; r <= iradius; r ++) {
         int validX = rsMax((x + r), 0);
-        validX = rsMin(validX, (int)(p->dimX - 1));
+        validX = rsMin(validX, (int)(info->dim.x - 1));
         float4 pf = ptrIn[validX];
         blurredPixel += pf * gPtr[0];
         gPtr++;
@@ -262,13 +262,13 @@
     out->xyzw = convert_uchar4(blurredPixel);
 }
 
-static void OneHU1(const RsExpandKernelParams *p, uchar *out, int32_t x,
+static void OneHU1(const RsExpandKernelDriverInfo *info, uchar *out, int32_t x,
                    const float *ptrIn, const float* gPtr, int iradius) {
 
     float blurredPixel = 0;
     for (int r = -iradius; r <= iradius; r ++) {
         int validX = rsMax((x + r), 0);
-        validX = rsMin(validX, (int)(p->dimX - 1));
+        validX = rsMin(validX, (int)(info->dim.x - 1));
         float pf = ptrIn[validX];
         blurredPixel += pf * gPtr[0];
         gPtr++;
@@ -278,13 +278,13 @@
 }
 
 
-void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelDriverInfo *info,
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t outstep) {
 
     float4 stackbuf[2048];
     float4 *buf = &stackbuf[0];
-    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
     if (!cp->mAlloc.get()) {
         ALOGE("Blur executed without input, skipping");
         return;
@@ -292,36 +292,37 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *out = (uchar4 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
-        rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
-                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
+      rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * info->current.y),
+                 info->dim.x, info->dim.y,
+                 stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
 
-    if (p->dimX > 2048) {
-        if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) {
+    if (info->dim.x > 2048) {
+        if ((info->dim.x > cp->mScratchSize[info->lid]) || !cp->mScratch[info->lid]) {
             // Pad the side of the allocation by one unit to allow alignment later
-            cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], (p->dimX + 1) * 16);
-            cp->mScratchSize[p->lid] = p->dimX;
+            cp->mScratch[info->lid] = realloc(cp->mScratch[info->lid], (info->dim.x + 1) * 16);
+            cp->mScratchSize[info->lid] = info->dim.x;
         }
         // realloc only aligns to 8 bytes so we manually align to 16.
-        buf = (float4 *) ((((intptr_t)cp->mScratch[p->lid]) + 15) & ~0xf);
+        buf = (float4 *) ((((intptr_t)cp->mScratch[info->lid]) + 15) & ~0xf);
     }
     float4 *fout = (float4 *)buf;
-    int y = p->y;
-    if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
+    int y = info->current.y;
+    if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
+        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
     } else {
         x1 = 0;
-        while(p->dimX > x1) {
-            OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
+        while(info->dim.x > x1) {
+            OneVU4(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;
         }
@@ -329,7 +330,7 @@
 
     x1 = xstart;
     while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
-        OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
+        OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
@@ -344,17 +345,17 @@
     }
 #endif
     while(x2 > x1) {
-        OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
+        OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelDriverInfo *info,
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t outstep) {
     float buf[4 * 2048];
-    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
     if (!cp->mAlloc.get()) {
         ALOGE("Blur executed without input, skipping");
         return;
@@ -362,27 +363,27 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uchar *out = (uchar *)p->out;
+    uchar *out = (uchar *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
-        rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
-                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
+        rsdIntrinsicBlurU1_K(out, pin + stride * info->current.y, info->dim.x, info->dim.y,
+                 stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
 
     float *fout = (float *)buf;
-    int y = p->y;
-    if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
+    int y = info->current.y;
+    if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius -1))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
+        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
     } else {
         x1 = 0;
-        while(p->dimX > x1) {
-            OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
+        while(info->dim.x > x1) {
+            OneVU1(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;
         }
@@ -391,7 +392,7 @@
     x1 = xstart;
     while ((x1 < x2) &&
            ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
-        OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
+        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
@@ -410,7 +411,7 @@
     }
 #endif
     while(x2 > x1) {
-        OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
+        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index e629dea..a7d576b 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -185,7 +185,7 @@
     FunctionTab_t mFnTab;
 #endif
 
-    static void kernel(const RsExpandKernelParams *p,
+    static void kernel(const RsExpandKernelDriverInfo *info,
                        uint32_t xstart, uint32_t xend,
                        uint32_t outstep);
     void updateCoeffCache(float fpMul, float addMul);
@@ -776,7 +776,7 @@
 }
 
 
-static void One(const RsExpandKernelParams *p, void *out,
+static void One(const RsExpandKernelDriverInfo *info, void *out,
                 const void *py, const float* coeff, const float *add,
                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
 
@@ -877,15 +877,15 @@
     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
                                               uint32_t xstart, uint32_t xend,
                                               uint32_t outstep) {
-    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
 
-    uint32_t instep = p->inEStrides[0];
+    uint32_t instep = info->inStride[0];
 
-    uchar *out = (uchar *)p->out;
-    uchar *in = (uchar *)p->ins[0];
+    uchar *out = (uchar *)info->outPtr[0];
+    uchar *in = (uchar *)info->inPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -894,7 +894,7 @@
     bool floatIn = !!cp->mLastKey.u.inType;
     bool floatOut = !!cp->mLastKey.u.outType;
 
-    //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
+    //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
 
     if(x2 > x1) {
         int32_t len = x2 - x1;
@@ -929,7 +929,7 @@
         }
 
         while(x1 != x2) {
-            One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
+            One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
             out += outstep;
             in += instep;
             x1++;
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index e3fa245..ce7be79 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -42,22 +42,22 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsExpandKernelParams *p,
+    static void kernelU1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU2(const RsExpandKernelParams *p,
+    static void kernelU2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU4(const RsExpandKernelParams *p,
+    static void kernelU4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF1(const RsExpandKernelParams *p,
+    static void kernelF1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF2(const RsExpandKernelParams *p,
+    static void kernelF2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF4(const RsExpandKernelParams *p,
+    static void kernelF4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
 };
@@ -88,12 +88,12 @@
                                           const void *y2, const short *coef, uint32_t count);
 
 
-static void ConvolveOneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
+static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
 
     float4 px = convert_float4(py0[x1]) * coeff[0] +
                 convert_float4(py0[x]) * coeff[1] +
@@ -110,12 +110,12 @@
     *out = o;
 }
 
-static void ConvolveOneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
+static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
 
     float2 px = convert_float2(py0[x1]) * coeff[0] +
                 convert_float2(py0[x]) * coeff[1] +
@@ -131,12 +131,12 @@
     *out = convert_uchar2(px);
 }
 
-static void ConvolveOneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
+static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
                           const uchar *py0, const uchar *py1, const uchar *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
 
     float px = ((float)py0[x1]) * coeff[0] +
                ((float)py0[x]) * coeff[1] +
@@ -150,43 +150,43 @@
     *out = clamp(px + 0.5f, 0.f, 255.f);
 }
 
-static void ConvolveOneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
+static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
                           const float4 *py0, const float4 *py1, const float4 *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
+static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
                           const float2 *py0, const float2 *py1, const float2 *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
+static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
                           const float *py0, const float *py1, const float *py2,
                           const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -195,17 +195,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
-    const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y);
+    const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y);
     const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
 
-    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *out = (uchar4 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -223,17 +223,17 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -242,17 +242,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
-    const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y);
+    const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y);
     const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
 
-    uchar2 *out = (uchar2 *)p->out;
+    uchar2 *out = (uchar2 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -268,17 +268,17 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -287,17 +287,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const uchar *py0 = (const uchar *)(pin + stride * y2);
-    const uchar *py1 = (const uchar *)(pin + stride * p->y);
+    const uchar *py1 = (const uchar *)(pin + stride * info->current.y);
     const uchar *py2 = (const uchar *)(pin + stride * y1);
 
-    uchar *out = (uchar *)p->out;
+    uchar *out = (uchar *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -313,17 +313,17 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -332,17 +332,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const float4 *py0 = (const float4 *)(pin + stride * y2);
-    const float4 *py1 = (const float4 *)(pin + stride * p->y);
+    const float4 *py1 = (const float4 *)(pin + stride * info->current.y);
     const float4 *py2 = (const float4 *)(pin + stride * y1);
 
-    float4 *out = (float4 *)p->out;
+    float4 *out = (float4 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -358,17 +358,17 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -377,17 +377,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const float2 *py0 = (const float2 *)(pin + stride * y2);
-    const float2 *py1 = (const float2 *)(pin + stride * p->y);
+    const float2 *py1 = (const float2 *)(pin + stride * info->current.y);
     const float2 *py2 = (const float2 *)(pin + stride * y1);
 
-    float2 *out = (float2 *)p->out;
+    float2 *out = (float2 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -403,16 +403,16 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
     }
 }
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
@@ -421,17 +421,17 @@
     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
+    uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
     const float *py0 = (const float *)(pin + stride * y2);
-    const float *py1 = (const float *)(pin + stride * p->y);
+    const float *py1 = (const float *)(pin + stride * info->current.y);
     const float *py2 = (const float *)(pin + stride * y1);
 
-    float *out = (float *)p->out;
+    float *out = (float *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -447,7 +447,7 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e591e44..29dd886 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -42,22 +42,22 @@
     ObjectBaseRef<Allocation> alloc;
 
 
-    static void kernelU1(const RsExpandKernelParams *p,
+    static void kernelU1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU2(const RsExpandKernelParams *p,
+    static void kernelU2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU4(const RsExpandKernelParams *p,
+    static void kernelU4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF1(const RsExpandKernelParams *p,
+    static void kernelF1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF2(const RsExpandKernelParams *p,
+    static void kernelF2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF4(const RsExpandKernelParams *p,
+    static void kernelF4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
 
@@ -86,15 +86,15 @@
 }
 
 
-static void OneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
+static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float4 px = convert_float4(py0[x0]) * coeff[0] +
                 convert_float4(py0[x1]) * coeff[1] +
@@ -129,15 +129,15 @@
     *out = convert_uchar4(px);
 }
 
-static void OneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
+static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float2 px = convert_float2(py0[x0]) * coeff[0] +
                 convert_float2(py0[x1]) * coeff[1] +
@@ -172,15 +172,15 @@
     *out = convert_uchar2(px);
 }
 
-static void OneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
+static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float px = (float)(py0[x0]) * coeff[0] +
                (float)(py0[x1]) * coeff[1] +
@@ -215,15 +215,15 @@
     *out = px;
 }
 
-static void OneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
+static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float4 px = py0[x0] * coeff[0] +
                 py0[x1] * coeff[1] +
@@ -257,15 +257,15 @@
     *out = px;
 }
 
-static void OneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
+static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float2 px = py0[x0] * coeff[0] +
                 py0[x1] * coeff[1] +
@@ -299,15 +299,15 @@
     *out = px;
 }
 
-static void OneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
+static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
                   const float* coeff) {
 
     uint32_t x0 = rsMax((int32_t)x-2, 0);
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
 
     float px = py0[x0] * coeff[0] +
                py0[x1] * coeff[1] +
@@ -346,10 +346,10 @@
                                           const void *y2, const void *y3, const void *y4,
                                           const short *coef, uint32_t count);
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -357,11 +357,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
     const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
@@ -369,12 +369,12 @@
     const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
     const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
 
-    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *out = (uchar4 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -400,16 +400,16 @@
 #endif
 
     while(x1 < x2) {
-        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -417,11 +417,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
     const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
@@ -429,12 +429,12 @@
     const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
     const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
 
-    uchar2 *out = (uchar2 *)p->out;
+    uchar2 *out = (uchar2 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -449,16 +449,16 @@
 #endif
 
     while(x1 < x2) {
-        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -466,11 +466,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const uchar *py0 = (const uchar *)(pin + stride * y0);
     const uchar *py1 = (const uchar *)(pin + stride * y1);
@@ -478,12 +478,12 @@
     const uchar *py3 = (const uchar *)(pin + stride * y3);
     const uchar *py4 = (const uchar *)(pin + stride * y4);
 
-    uchar *out = (uchar *)p->out;
+    uchar *out = (uchar *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -498,16 +498,16 @@
 #endif
 
     while(x1 < x2) {
-        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -515,11 +515,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const float4 *py0 = (const float4 *)(pin + stride * y0);
     const float4 *py1 = (const float4 *)(pin + stride * y1);
@@ -527,12 +527,12 @@
     const float4 *py3 = (const float4 *)(pin + stride * y3);
     const float4 *py4 = (const float4 *)(pin + stride * y4);
 
-    float4 *out = (float4 *)p->out;
+    float4 *out = (float4 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -547,16 +547,16 @@
 #endif
 
     while(x1 < x2) {
-        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -564,11 +564,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const float2 *py0 = (const float2 *)(pin + stride * y0);
     const float2 *py1 = (const float2 *)(pin + stride * y1);
@@ -576,12 +576,12 @@
     const float2 *py3 = (const float2 *)(pin + stride * y3);
     const float2 *py4 = (const float2 *)(pin + stride * y4);
 
-    float2 *out = (float2 *)p->out;
+    float2 *out = (float2 *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -596,16 +596,16 @@
 #endif
 
     while(x1 < x2) {
-        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
         return;
@@ -613,11 +613,11 @@
     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
+    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
+    uint32_t y2 = info->current.y;
+    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
+    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
 
     const float *py0 = (const float *)(pin + stride * y0);
     const float *py1 = (const float *)(pin + stride * y1);
@@ -625,12 +625,12 @@
     const float *py3 = (const float *)(pin + stride * y3);
     const float *py4 = (const float *)(pin + stride * y4);
 
-    float *out = (float *)p->out;
+    float *out = (float *)info->outPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
     while((x1 < x2) && (x1 < 2)) {
-        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
@@ -645,7 +645,7 @@
 #endif
 
     while(x1 < x2) {
-        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
+        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
         out++;
         x1++;
     }
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index 4779187..fd60794 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -49,29 +49,29 @@
     int *mSums;
     ObjectBaseRef<Allocation> mAllocOut;
 
-    static void kernelP1U4(const RsExpandKernelParams *p,
+    static void kernelP1U4(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1U3(const RsExpandKernelParams *p,
+    static void kernelP1U3(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1U2(const RsExpandKernelParams *p,
+    static void kernelP1U2(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1U1(const RsExpandKernelParams *p,
+    static void kernelP1U1(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
 
-    static void kernelP1L4(const RsExpandKernelParams *p,
+    static void kernelP1L4(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1L3(const RsExpandKernelParams *p,
+    static void kernelP1L3(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1L2(const RsExpandKernelParams *p,
+    static void kernelP1L2(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
-    static void kernelP1L1(const RsExpandKernelParams *p,
+    static void kernelP1L1(const RsExpandKernelDriverInfo *info,
                            uint32_t xstart, uint32_t xend,
                            uint32_t outstep);
 
@@ -166,61 +166,61 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * 4 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * 4 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * 4 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * 4 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * 2 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * 2 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 1)    ] ++;
         sums[(in[1] << 1) + 1] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
@@ -228,69 +228,69 @@
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]);
         sums[(t + 0x7f) >> 8] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
 
-    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->ins[0];
-    int * sums = &cp->mSums[256 * p->lid];
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
+    uchar *in = (uchar *)info->inPtr[0];
+    int * sums = &cp->mSums[256 * info->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[in[0]] ++;
-        in += p->inEStrides[0];
+        in += info->inStride[0];
     }
 }
 
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index b08a0e5..622fe1e 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -38,7 +38,7 @@
 protected:
     ObjectBaseRef<Allocation> lut;
 
-    static void kernel(const RsExpandKernelParams *p,
+    static void kernel(const RsExpandKernelDriverInfo *info,
                        uint32_t xstart, uint32_t xend,
                        uint32_t outstep);
 };
@@ -53,13 +53,13 @@
 }
 
 
-void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelDriverInfo *info,
                                       uint32_t xstart, uint32_t xend,
                                       uint32_t outstep) {
-    RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
+    RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)info->usr;
 
-    uchar *out = (uchar *)p->out;
-    const uchar *in = (uchar *)p->ins[0];
+    uchar *out = (uchar *)info->outPtr[0];
+    const uchar *in = (uchar *)info->inPtr[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 2c51b5a..5668d96 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -46,22 +46,22 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsExpandKernelParams *p,
+    static void kernelU1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU2(const RsExpandKernelParams *p,
+    static void kernelU2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelU4(const RsExpandKernelParams *p,
+    static void kernelU4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF1(const RsExpandKernelParams *p,
+    static void kernelF1(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF2(const RsExpandKernelParams *p,
+    static void kernelF2(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
-    static void kernelF4(const RsExpandKernelParams *p,
+    static void kernelF4(const RsExpandKernelDriverInfo *info,
                          uint32_t xstart, uint32_t xend,
                          uint32_t outstep);
 };
@@ -255,10 +255,10 @@
     return p;
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -269,7 +269,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -283,7 +283,7 @@
     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
 
-    uchar4 *out = ((uchar4 *)p->out) + xstart;
+    uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -295,10 +295,10 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -309,7 +309,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -323,7 +323,7 @@
     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
 
-    uchar2 *out = ((uchar2 *)p->out) + xstart;
+    uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -335,10 +335,10 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -349,7 +349,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -363,7 +363,7 @@
     const uchar *yp2 = pin + stride * ys2;
     const uchar *yp3 = pin + stride * ys3;
 
-    uchar *out = ((uchar *)p->out) + xstart;
+    uchar *out = ((uchar *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -375,10 +375,10 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -389,7 +389,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -403,7 +403,7 @@
     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
 
-    float4 *out = ((float4 *)p->out) + xstart;
+    float4 *out = ((float4 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -415,10 +415,10 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -429,7 +429,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -443,7 +443,7 @@
     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
 
-    float2 *out = ((float2 *)p->out) + xstart;
+    float2 *out = ((float2 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -455,10 +455,10 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelDriverInfo *info,
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t outstep) {
-    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
+    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
 
     if (!cp->mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -469,7 +469,7 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
@@ -483,7 +483,7 @@
     const float *yp2 = (const float *)(pin + stride * ys2);
     const float *yp3 = (const float *)(pin + stride * ys3);
 
-    float *out = ((float *)p->out) + xstart;
+    float *out = ((float *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index afd3cd3..395a158 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -46,7 +46,7 @@
 protected:
     ObjectBaseRef<Allocation> alloc;
 
-    static void kernel(const RsExpandKernelParams *p,
+    static void kernel(const RsExpandKernelDriverInfo *info,
                        uint32_t xstart, uint32_t xend,
                        uint32_t outstep);
 };
@@ -101,10 +101,10 @@
 extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
-void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsExpandKernelParams *p,
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsExpandKernelDriverInfo *info,
                                            uint32_t xstart, uint32_t xend,
                                            uint32_t outstep) {
-    RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
+    RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)info->usr;
     if (!cp->alloc.get()) {
         ALOGE("YuvToRGB executed without input, skipping");
         return;
@@ -119,11 +119,11 @@
 
     // calculate correct stride in legacy case
     if (cp->alloc->mHal.drvState.lod[0].dimY == 0) {
-        strideY = p->dimX;
+        strideY = info->dim.x;
     }
-    const uchar *Y = pinY + (p->y * strideY);
+    const uchar *Y = pinY + (info->current.y * strideY);
 
-    uchar4 *out = (uchar4 *)p->out + xstart;
+    uchar4 *out = (uchar4 *)info->outPtr[0] + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -131,23 +131,23 @@
 
     const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
     const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
-    const uchar *u = pinU + ((p->y >> 1) * strideU);
+    const uchar *u = pinU + ((info->current.y >> 1) * strideU);
 
     const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
     const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
-    const uchar *v = pinV + ((p->y >> 1) * strideV);
+    const uchar *v = pinV + ((info->current.y >> 1) * strideV);
 
-    //ALOGE("pinY, %p, Y, %p, p->y, %d, strideY, %d", pinY, Y, p->y, strideY);
-    //ALOGE("pinU, %p, U, %p, p->y, %d, strideU, %d", pinU, u, p->y, strideU);
-    //ALOGE("pinV, %p, V, %p, p->y, %d, strideV, %d", pinV, v, p->y, strideV);
+    //ALOGE("pinY, %p, Y, %p, info->current.y, %d, strideY, %d", pinY, Y, info->current.y, strideY);
+    //ALOGE("pinU, %p, U, %p, info->current.y, %d, strideU, %d", pinU, u, info->current.y, strideU);
+    //ALOGE("pinV, %p, V, %p, info->current.y, %d, strideV, %d", pinV, v, info->current.y, strideV);
     //ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX, cp->alloc->mHal.drvState.lod[0].dimY);
-    //ALOGE("p->dimX, %d, p->dimY, %d", p->dimX, p->dimY);
+    //ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y);
 
     if (pinU == nullptr) {
         // Legacy yuv support didn't fill in uv
         v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
-            (strideY * p->dimY) +
-            ((p->y >> 1) * strideY);
+            (strideY * info->dim.y) +
+            ((info->current.y >> 1) * strideY);
         u = v + 1;
         cstep = 2;
     }
@@ -166,7 +166,7 @@
     if((x2 > x1) && gArchUseSIMD) {
         int32_t len = x2 - x1;
         if (cstep == 1) {
-            rsdIntrinsicYuv2_K(p->out, Y, u, v, x1, x2);
+            rsdIntrinsicYuv2_K(info->outPtr[0], Y, u, v, x1, x2);
             x1 += len;
             out += len;
         } else if (cstep == 2) {
@@ -175,11 +175,11 @@
             intptr_t ipv = (intptr_t)v;
 
             if (ipu == (ipv + 1)) {
-                rsdIntrinsicYuv_K(p->out, Y, v, x1, x2);
+                rsdIntrinsicYuv_K(info->outPtr[0], Y, v, x1, x2);
                 x1 += len;
                 out += len;
             } else if (ipu == (ipv - 1)) {
-                rsdIntrinsicYuvR_K(p->out, Y, u, x1, x2);
+                rsdIntrinsicYuvR_K(info->outPtr[0], Y, u, x1, x2);
                 x1 += len;
                 out += len;
             }
@@ -188,7 +188,7 @@
 #endif
 
     if(x2 > x1) {
-       // ALOGE("y %i  %i  %i", p->y, x1, x2);
+       // ALOGE("y %i  %i  %i", info->current.y, x1, x2);
         while(x1 < x2) {
             int cx = (x1 >> 1) * cstep;
             *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index aaaa2a2..72da141 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -43,7 +43,7 @@
 class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
 public:
     typedef void (*outer_foreach_t)(
-        const RsExpandKernelParams *,
+        const RsExpandKernelDriverInfo *,
         uint32_t x1, uint32_t x2,
         uint32_t outstep);
 
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 281a715..82208db 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -42,83 +42,83 @@
 }
 
 
-typedef void (*ScriptGroupRootFunc_t)(const RsExpandKernelParams *kparams,
+typedef void (*ScriptGroupRootFunc_t)(const RsExpandKernelDriverInfo *kinfo,
                                       uint32_t xstart, uint32_t xend,
                                       uint32_t outstep);
 
-void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
+void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelDriverInfo *kinfo,
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t outstep) {
 
 
-    const ScriptList *sl           = (const ScriptList *)kparams->usr;
-    RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
+    const ScriptList *sl             = (const ScriptList *)kinfo->usr;
+    RsExpandKernelDriverInfo *mkinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
 
-    const void **oldIns  = mkparams->ins;
-    uint32_t *oldStrides = mkparams->inEStrides;
-
-    void *localIns[1];
-    uint32_t localStride[1];
-
-    mkparams->ins        = (const void**)localIns;
-    mkparams->inEStrides = localStride;
+    const uint32_t oldInStride = mkinfo->inStride[0];
 
     for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
         func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
-        mkparams->usr = sl->usrPtrs[ct];
+        mkinfo->usr   = sl->usrPtrs[ct];
 
         if (sl->ins[ct]) {
-            localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+            rsAssert(kinfo->inLen == 1);
 
-            localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
+            mkinfo->inPtr[0] = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+
+            mkinfo->inStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
 
             if (sl->inExts[ct]) {
-                localIns[0] = (void*)
-                  ((const uint8_t *)localIns[0] +
-                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
+                mkinfo->inPtr[0] =
+                  (mkinfo->inPtr[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kinfo->current.y);
 
-            } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
-                localIns[0] = (void*)
-                  ((const uint8_t *)localIns[0] +
-                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
+            } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kinfo->lid) {
+                mkinfo->inPtr[0] =
+                  (mkinfo->inPtr[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kinfo->lid);
             }
 
         } else {
-            localIns[0]    = nullptr;
-            localStride[0] = 0;
+            rsAssert(kinfo->inLen == 0);
+
+            mkinfo->inPtr[0]     = nullptr;
+            mkinfo->inStride[0]  = 0;
         }
 
         uint32_t ostep;
         if (sl->outs[ct]) {
-            mkparams->out =
+            rsAssert(kinfo->outLen == 1);
+
+            mkinfo->outPtr[0] =
               (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
 
             ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
 
             if (sl->outExts[ct]) {
-                mkparams->out =
-                  (uint8_t *)mkparams->out +
-                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->y;
+                mkinfo->outPtr[0] =
+                  mkinfo->outPtr[0] +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kinfo->current.y;
 
-            } else if (sl->outs[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
-                mkparams->out =
-                  (uint8_t *)mkparams->out +
-                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
+            } else if (sl->outs[ct]->mHal.drvState.lod[0].dimY > kinfo->lid) {
+                mkinfo->outPtr[0] =
+                  mkinfo->outPtr[0] +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kinfo->lid;
             }
         } else {
-            mkparams->out = nullptr;
-            ostep         = 0;
+            rsAssert(kinfo->outLen == 0);
+
+            mkinfo->outPtr[0] = nullptr;
+            ostep             = 0;
         }
 
         //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(kparams, xstart, xend, ostep);
+        func(kinfo, xstart, xend, ostep);
     }
     //ALOGE("script group root");
 
-    mkparams->ins        = oldIns;
-    mkparams->inEStrides = oldStrides;
-    mkparams->usr        = sl;
+    mkinfo->inStride[0] = oldInStride;
+    mkinfo->usr         = sl;
 }
 
 
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 50ba2ac..acfe754 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -33,7 +33,7 @@
     CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroupBase *sg);
     bool init();
 
-    static void scriptGroupRoot(const RsExpandKernelParams *p,
+    static void scriptGroupRoot(const RsExpandKernelDriverInfo *info,
                                 uint32_t xstart, uint32_t xend,
                                 uint32_t outstep);
 
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 915fa4d..2e50ecb 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -16,7 +16,6 @@
 #endif
 
 #include "cpu_ref/rsCpuCore.h"
-#include "cpu_ref/rsCpuCoreRuntime.h"
 #include "rsClosure.h"
 #include "rsContext.h"
 #include "rsCpuCore.h"
@@ -36,21 +35,21 @@
 
 const size_t DefaultKernelArgCount = 2;
 
-void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
+void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
                uint32_t xend, uint32_t outstep) {
-    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
-    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
-    const void **oldIns  = kparams->ins;
-    uint32_t *oldStrides = kparams->inEStrides;
+    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
+    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
 
-    std::vector<const void*> ins(DefaultKernelArgCount);
-    std::vector<uint32_t> strides(DefaultKernelArgCount);
+    const size_t oldInLen = mutable_kinfo->inLen;
+
+    decltype(mutable_kinfo->inStride) oldInStride;
+    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));
 
     for (CPUClosure* cpuClosure : closures) {
         const Closure* closure = cpuClosure->mClosure;
 
-        auto in_iter = ins.begin();
-        auto stride_iter = strides.begin();
+        // There had better be enough space in mutable_kinfo
+        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);
 
         for (size_t i = 0; i < closure->mNumArg; i++) {
             const void* arg = closure->mArgs[i];
@@ -58,31 +57,30 @@
             const uint32_t eStride = a->mHal.state.elementSizeBytes;
             const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
                     eStride * xstart;
-            if (kparams->dimY > 1) {
-                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
+            if (kinfo->dim.y > 1) {
+                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
             }
-            *in_iter++ = ptr;
-            *stride_iter++ = eStride;
+            mutable_kinfo->inPtr[i] = ptr;
+            mutable_kinfo->inStride[i] = eStride;
         }
-
-        mutable_kparams->ins = &ins[0];
-        mutable_kparams->inEStrides = &strides[0];
+        mutable_kinfo->inLen = closure->mNumArg;
 
         const Allocation* out = closure->mReturnValue;
         const uint32_t ostep = out->mHal.state.elementSizeBytes;
         const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
                 ostep * xstart;
-        if (kparams->dimY > 1) {
-            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
+        if (kinfo->dim.y > 1) {
+            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
         }
 
-        mutable_kparams->out = (void*)ptr;
+        rsAssert(kinfo->outLen <= 1);
+        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);
 
-        cpuClosure->mFunc(kparams, xstart, xend, ostep);
+        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
     }
 
-    mutable_kparams->ins        = oldIns;
-    mutable_kparams->inEStrides = oldStrides;
+    mutable_kinfo->inLen = oldInLen;
+    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
 }
 
 }  // namespace
diff --git a/cpu_ref/rsCpuScriptGroup2.h b/cpu_ref/rsCpuScriptGroup2.h
index 3074cc8..1883f90 100644
--- a/cpu_ref/rsCpuScriptGroup2.h
+++ b/cpu_ref/rsCpuScriptGroup2.h
@@ -4,7 +4,7 @@
 #include "rsd_cpu.h"
 #include "rsList.h"
 
-struct RsExpandKernelParams;
+struct RsExpandKernelDriverInfo;
 
 namespace android {
 namespace renderscript {
@@ -15,9 +15,7 @@
 class ScriptExecutable;
 class ScriptGroup2;
 
-typedef ::RsExpandKernelParams RsExpandKernelParams;
-
-typedef void (*ExpandFuncTy)(const RsExpandKernelParams*, uint32_t, uint32_t,
+typedef void (*ExpandFuncTy)(const RsExpandKernelDriverInfo*, uint32_t, uint32_t,
                              uint32_t);
 typedef void (*InvokeFuncTy)(const void*, uint32_t);