Cleanup pass + implement blur for single-channel uchar allocations

Change-Id: Ib7f1c5218663b468a3c11daa2c3373ae132145ac

Conflicts:

	cpu_ref/rsCpuIntrinsicBlend.cpp
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 29539da..5ea28d4 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -423,13 +423,20 @@
     return i;
 }
 
-extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
+                                                   const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
+                                                   const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
+                                           const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
+                                                   const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
+                                            const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
+                                                const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
+                                             const Script *s, const Element *e);
 
 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                     RsScriptIntrinsicID iid, Element *e) {
@@ -437,25 +444,25 @@
     RsdCpuScriptImpl *i = NULL;
     switch (iid) {
     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
-        i = rsdIntrinsic_Convolve3x3(this, s);
+        i = rsdIntrinsic_Convolve3x3(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
-        i = rsdIntrinsic_ColorMatrix(this, s);
+        i = rsdIntrinsic_ColorMatrix(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_LUT:
-        i = rsdIntrinsic_LUT(this, s);
+        i = rsdIntrinsic_LUT(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
-        i = rsdIntrinsic_Convolve5x5(this, s);
+        i = rsdIntrinsic_Convolve5x5(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_BLUR:
-        i = rsdIntrinsic_Blur(this, s);
+        i = rsdIntrinsic_Blur(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
-        i = rsdIntrinsic_YuvToRGB(this, s);
+        i = rsdIntrinsic_YuvToRGB(this, s, e);
         break;
     case RS_SCRIPT_INTRINSIC_ID_BLEND:
-        i = rsdIntrinsic_Blend(this, s);
+        i = rsdIntrinsic_Blend(this, s, e);
         break;
 
     default:
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index a4eef21..450ee30 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -21,10 +21,11 @@
 using namespace android::renderscript;
 
 RsdCpuScriptIntrinsic::RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s,
-                                             RsScriptIntrinsicID iid)
+                                             const Element *e, RsScriptIntrinsicID iid)
         : RsdCpuScriptImpl(ctx, s) {
 
     mID = iid;
+    mElement.set(e);
 }
 
 RsdCpuScriptIntrinsic::~RsdCpuScriptIntrinsic() {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 1756115..35ffc69 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -47,11 +47,13 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsic();
-    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, RsScriptIntrinsicID iid);
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
+                          RsScriptIntrinsicID iid);
 
 protected:
     RsScriptIntrinsicID mID;
     outer_foreach_t mRootPtr;
+    ObjectBaseRef<const Element> mElement;
 
 };
 
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 57286d5..d7b01b6 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -30,7 +30,7 @@
     virtual void populateScript(Script *);
 
     virtual ~RsdCpuScriptIntrinsicBlend();
-    RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     static void kernel(const RsForEachStubParamStruct *p,
@@ -456,8 +456,9 @@
 }
 
 
-RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLEND) {
+RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx,
+                                                       const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLEND) {
 
     mRootPtr = &kernel;
 }
@@ -469,8 +470,9 @@
     s->mHal.info.exportedVariableCount = 0;
 }
 
-RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s) {
-    return new RsdCpuScriptIntrinsicBlend(ctx, s);
+RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
+                                      const Script *s, const Element *e) {
+    return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 48363d1..1229f79 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -33,7 +33,7 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsicBlur();
-    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     float fp[104];
@@ -42,9 +42,12 @@
     int iradius;
     ObjectBaseRef<Allocation> alloc;
 
-    static void kernel(const RsForEachStubParamStruct *p,
-                       uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);
+    static void kernelU1(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);
     void ComputeGaussianWeights();
 };
 
@@ -104,8 +107,8 @@
 
 
 
-static void OneV(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
-                 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
+static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x*4;
 
@@ -122,19 +125,36 @@
     out->xyzw = blurredPixel;
 }
 
-extern "C" void rsdIntrinsicBlurVF_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int x2);
-extern "C" void rsdIntrinsicBlurHF_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int x2);
+static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
+                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
-static void OneVF(float4 *out,
-                  const uchar *ptrIn, int iStride, const float* gPtr, int ct,
-                  int x1, int x2) {
+    const uchar *pi = ptrIn + x;  // one byte per x step (OneVU4 uses x*4)
+    // Sum the 2 * iradius + 1 samples above/below (x, y), each scaled by its gPtr weight.
+    float blurredPixel = 0;
+    for (int r = -iradius; r <= iradius; r ++) {
+        int validY = rsMax((y + r), 0);  // clamp the sampled row to >= 0
+        validY = rsMin(validY, (int)(p->dimY - 1));  // ...and to <= dimY - 1
+        float pf = (float)pi[validY * iStride];
+        blurredPixel += pf * gPtr[0];
+        gPtr++;
+    }
+    // The sum is stored as a float, without rounding or conversion to uchar.
+    out[0] = blurredPixel;
+}
+
+extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
+extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
+
+static void OneVFU4(float4 *out,
+                    const uchar *ptrIn, int iStride, const float* gPtr, int ct,
+                    int x1, int x2) {
 
 #if defined(ARCH_ARM_HAVE_NEON)
     {
         int t = (x2 - x1);
         t &= ~1;
         if(t) {
-            rsdIntrinsicBlurVF_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
         }
         x1 += t;
     }
@@ -157,8 +177,41 @@
     }
 }
 
-static void OneH(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
-                const float4 *ptrIn, const float* gPtr, int iradius) {
+// Vertical blur pass for single-channel uchar rows: for each of len columns,
+// sums ct samples spaced iStride bytes apart, weighted by gPtr[0..ct).
+static void OneVFU1(float *out,
+                    const uchar *ptrIn, int iStride, const float* gPtr, int ct, int len) {
+#if defined(ARCH_ARM_HAVE_NEON)
+    {
+        // Four uchar columns are fed to the U4 kernel as one uchar4 group;
+        // t is rounded down to an even group count, as OneVFU4 does.
+        int t = len >> 2;
+        t &= ~1;
+        if(t) {
+            // End index is t: the kernel consumes t groups, i.e. t * 4 columns.
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t);
+        }
+        len -= t << 2;
+        ptrIn += t << 2;
+        out += t << 2;
+    }
+#endif
+    // Scalar path for the columns not covered above; input advances with output.
+    for (; len > 0; len--, out++, ptrIn++) {
+        const uchar *pi = ptrIn;
+        const float* gp = gPtr;
+        float blurredPixel = 0;
+        for (int r = 0; r < ct; r++) {
+            blurredPixel += (float)pi[0] * gp[0];
+            pi += iStride;
+            gp++;
+        }
+        out[0] = blurredPixel;
+    }
+}
+
+static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+                   const float4 *ptrIn, const float* gPtr, int iradius) {
 
     float4 blurredPixel = 0;
     for (int r = -iradius; r <= iradius; r ++) {
@@ -172,10 +225,25 @@
     out->xyzw = convert_uchar4(blurredPixel);
 }
 
+static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
+                   const float *ptrIn, const float* gPtr, int iradius) {
 
-void RsdCpuScriptIntrinsicBlur::kernel(const RsForEachStubParamStruct *p,
-                                       uint32_t xstart, uint32_t xend,
-                                       uint32_t instep, uint32_t outstep) {
+    float blurredPixel = 0;  // weighted sum of the 2 * iradius + 1 floats around x
+    for (int r = -iradius; r <= iradius; r ++) {
+        int validX = rsMax((x + r), 0);  // clamp the sampled column to >= 0
+        validX = rsMin(validX, (int)(p->dimX - 1));  // ...and to <= dimX - 1
+        float pf = ptrIn[validX];  // ptrIn is indexed by absolute column
+        blurredPixel += pf * gPtr[0];
+        gPtr++;
+    }
+    // The cast truncates toward zero; no rounding or clamping is applied here.
+    out[0] = (uchar)blurredPixel;
+}
+
+
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
+                                         uint32_t xstart, uint32_t xend,
+                                         uint32_t instep, uint32_t outstep) {
     float buf[4 * 2048];
     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
     if (!cp->alloc.get()) {
@@ -193,10 +261,10 @@
     int y = p->y;
     if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
         const uchar *pi = pin + (y - cp->iradius) * stride;
-        OneVF(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+        OneVFU4(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
     } else {
         while(x2 > x1) {
-            OneV(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
+            OneVU4(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
             fout++;
             x1++;
         }
@@ -204,29 +272,90 @@
 
     x1 = xstart;
     while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
-        OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+        OneHU4(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
         out++;
         x1++;
     }
 #if defined(ARCH_ARM_HAVE_NEON)
     if ((x1 + cp->iradius) < x2) {
-        rsdIntrinsicBlurHF_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, x2 - cp->iradius);
+        rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->iradius, cp->fp,
+                               cp->iradius * 2 + 1, x1, x2 - cp->iradius);
         out += (x2 - cp->iradius) - x1;
         x1 = x2 - cp->iradius;
     }
 #endif
     while(x2 > x1) {
-        OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+        OneHU4(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
         out++;
         x1++;
     }
-
 }
 
-RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLUR) {
+// Row kernel for single-channel uchar blur: a vertical pass writes weighted
+// float sums into buf, then a horizontal pass over buf produces the output.
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
+                                         uint32_t xstart, uint32_t xend,
+                                         uint32_t instep, uint32_t outstep) {
+    float buf[4 * 2048];
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("Blur executed without input, skipping");
+        return;
+    }
+    // OneHU1 reads buf at x up to dimX - 1, so a whole row must fit in buf.
+    if (p->dimX > sizeof(buf) / sizeof(buf[0])) {
+        ALOGE("Blur row too wide for scratch buffer, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
-    mRootPtr = &kernel;
+    uchar *out = (uchar *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    float *fout = (float *)buf;
+    int y = p->y;
+    if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
+        const uchar *pi = pin + (y - cp->iradius) * stride;
+        OneVFU1(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x2-x1);
+    } else {
+        while(x2 > x1) {
+            OneVU1(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
+            fout++;
+            x1++;
+        }
+    }
+    x1 = xstart;
+    while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
+        OneHU1(p, out, x1, buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+    // TODO: no NEON horizontal pass yet; rsdIntrinsicBlurHFU4_K works on
+    // float4 pixels and cannot be applied to this single-channel buffer.
+    while(x2 > x1) {
+        OneHU1(p, out, x1, buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+}
+
+RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
+    // Choose the row kernel from the element: U8 x1 -> kernelU1, U8 x4 -> kernelU4.
+    mRootPtr = NULL;
+    if (e->getType() == RS_TYPE_UNSIGNED_8) {  // NOTE(review): e is dereferenced unchecked
+        switch (e->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelU1;
+            break;
+        case 4:
+            mRootPtr = &kernelU4;
+            break;
+        }
+    }
+    rsAssert(mRootPtr);  // any other element type or vector size leaves mRootPtr NULL
     radius = 5;
     ComputeGaussianWeights();
 }
@@ -243,9 +372,9 @@
 }
 
 
-RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
 
-    return new RsdCpuScriptIntrinsicBlur(ctx, s);
+    return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8f3196d..3fc322c 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -32,7 +32,7 @@
     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
 
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
-    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     float fp[16];
@@ -191,8 +191,8 @@
 
 
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
-            RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
+            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
 
     const static float defaultMatrix[] = {
         1.f, 0.f, 0.f, 0.f,
@@ -210,9 +210,10 @@
     s->mHal.info.exportedVariableCount = 1;
 }
 
-RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
+                                            const Script *s, const Element *e) {
 
-    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s);
+    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 1f49e1e..020fa6f 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -34,12 +34,13 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsicConvolve3x3();
-    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
 
 protected:
-    float fp[16];
-    short ip[16];
-    ObjectBaseRef<Allocation> alloc;
+    float mFp[16];
+    short mIp[16];
+    ObjectBaseRef<const Allocation> mAlloc;
+    ObjectBaseRef<const Element> mElement;
 
     static void kernel(const RsForEachStubParamStruct *p,
                        uint32_t xstart, uint32_t xend,
@@ -52,15 +53,15 @@
 
 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
     rsAssert(slot == 1);
-    alloc.set(static_cast<Allocation *>(data));
+    mAlloc.set(static_cast<Allocation *>(data));
 }
 
 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
                                                     size_t dataLength) {
     rsAssert(slot == 0);
-    memcpy (&fp, data, dataLength);
+    memcpy (&mFp, data, dataLength);
     for(int ct=0; ct < 9; ct++) {
-        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+        mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
     }
 }
 
@@ -95,12 +96,12 @@
                                               uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
-    if (!cp->alloc.get()) {
+    if (!cp->mAlloc.get()) {
         ALOGE("Convolve3x3 executed without input, skipping");
         return;
     }
-    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
-    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
     uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
     uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
@@ -112,7 +113,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);
+        ConvolveOne(p, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -121,14 +122,14 @@
 #if defined(ARCH_ARM_HAVE_NEON)
         int32_t len = (x2 - x1 - 1) >> 1;
         if(len > 0) {
-            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
             x1 += len << 1;
             out += len << 1;
         }
 #endif
 
         while(x1 != x2) {
-            ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
+            ConvolveOne(p, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
@@ -136,13 +137,13 @@
 }
 
 RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
-            RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
+            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
 
     mRootPtr = &kernel;
     for(int ct=0; ct < 9; ct++) {
-        fp[ct] = 1.f / 9.f;
-        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+        mFp[ct] = 1.f / 9.f;
+        mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
     }
 }
 
@@ -154,13 +155,13 @@
 }
 
 void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
-    alloc.clear();
+    mAlloc.clear();
 }
 
 
-RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
 
-    return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s);
+    return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 2cae2c0..d36639f 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -34,7 +34,7 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsicConvolve5x5();
-    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     float fp[28];
@@ -167,8 +167,8 @@
 
 
 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
-            RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
+            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
 
     mRootPtr = &kernel;
     for(int ct=0; ct < 9; ct++) {
@@ -189,9 +189,10 @@
 }
 
 
-RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
+                                            const Script *s, const Element *e) {
 
-    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s);
+    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 188ed2b..0da1f75 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -33,7 +33,7 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsicLUT();
-    RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     ObjectBaseRef<Allocation> lut;
@@ -78,8 +78,9 @@
     }
 }
 
-RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_LUT) {
+RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx,
+                                                   const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_LUT) {
 
     mRootPtr = &kernel;
 }
@@ -96,9 +97,10 @@
 }
 
 
-RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
+                                    const Script *s, const Element *e) {
 
-    return new RsdCpuScriptIntrinsicLUT(ctx, s);
+    return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 7b8f768..946d1ba 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -33,7 +33,7 @@
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsicYuvToRGB();
-    RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+    RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
     ObjectBaseRef<Allocation> alloc;
@@ -144,8 +144,8 @@
 }
 
 RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
-            RsdCpuReferenceImpl *ctx, const Script *s)
-            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
+            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
 
     mRootPtr = &kernel;
 }
@@ -162,8 +162,9 @@
 }
 
 
-RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s) {
-    return new RsdCpuScriptIntrinsicYuvToRGB(ctx, s);
+RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
+                                         const Script *s, const Element *e) {
+    return new RsdCpuScriptIntrinsicYuvToRGB(ctx, s, e);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 04dd8b1..53c116d 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -284,7 +284,7 @@
     r5 = sp+4, x1
     r6 = sp+8, x2
 */
-ENTRY(rsdIntrinsicBlurVF_K)
+ENTRY(rsdIntrinsicBlurVFU4_K)
         push            {r4-r8, r10, r11, lr}
         vpush           {q4-q7}
 
@@ -324,7 +324,7 @@
         vpop            {q4-q7}
         pop             {r4-r8, r10, r11, lr}
         bx              lr
-END(rsdIntrinsicBlurVF_K)
+END(rsdIntrinsicBlurVFU4_K)
 
 /*
 static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
@@ -337,7 +337,7 @@
     r4 = sp, x1
     r5 = sp+4, x2
 */
-ENTRY(rsdIntrinsicBlurHF_K)
+ENTRY(rsdIntrinsicBlurHFU4_K)
         push            {r4-r8, r10, r11, lr}
         vpush           {q4-q7}
 
@@ -357,8 +357,7 @@
 2:
         vld1.32 {q1}, [r7]!
         vld1.32 {q2}, [r7]!
-        vld1.32 {d6[0]}, [r10]!
-        vld1.32 {d6[1]}, [r10]!
+        vld1.32 {d6}, [r10]!
         vmla.f32 q0, q1, d6[0]
         vmla.f32 q0, q2, d6[1]
         subs r11, r11, #2
@@ -376,7 +375,7 @@
         vpop            {q4-q7}
         pop             {r4-r8, r10, r11, lr}
         bx              lr
-END(rsdIntrinsicBlurHF_K)
+END(rsdIntrinsicBlurHFU4_K)
 
 /*
         r0 = dst
diff --git a/rsElement.h b/rsElement.h
index 57698f4..1eae46d 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -103,6 +103,7 @@
     RsDataKind getKind() const {return mComponent.getKind();}
     uint32_t getBits() const {return mBits;}
     uint32_t getBitsUnpadded() const {return mBitsUnpadded;}
+    uint32_t getVectorSize() const {return mComponent.getVectorSize();}
 
     void dumpLOGV(const char *prefix) const;
     virtual void serialize(Context *rsc, OStream *stream) const;