Fix Blur and 3DLUT intrinsics when the output is clipped

bug 17157250

Change-Id: I388a255380fbdd9f6b5d1c7cb9f14df6f035ae48
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index a7c9487..7eb0c01 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -63,8 +63,8 @@
                                       uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
-    uchar4 *out = (uchar4 *)p->out + xstart;
-    uchar4 *in = (uchar4 *)p->in + xstart;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index c1ca4e2..a72701d 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -128,7 +128,7 @@
         gPtr++;
     }
 
-    out->xyzw = blurredPixel;
+    out[0] = blurredPixel;
 }
 
 static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
@@ -163,6 +163,7 @@
 static void OneVFU4(float4 *out,
                     const uchar *ptrIn, int iStride, const float* gPtr, int ct,
                     int x1, int x2) {
+    out += x1;
 #if defined(ARCH_X86_HAVE_SSSE3)
     if (gArchUseSIMD) {
         int t = (x2 - x1);
@@ -195,6 +196,7 @@
                     const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
 
     int len = x2 - x1;
+    out += x1;
 
     while((x2 > x1) && (((uintptr_t)ptrIn) & 0x3)) {
         const uchar *pi = ptrIn;
@@ -293,7 +295,7 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD) {
+    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
         rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
@@ -313,9 +315,10 @@
     int y = p->y;
     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
+        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
     } else {
-        while(x2 > x1) {
+        x1 = 0;
+        while(p->dimX > x1) {
             OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;
@@ -362,9 +365,9 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD) {
+    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
         rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
-                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
+                 stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
@@ -373,9 +376,10 @@
     int y = p->y;
     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
+        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
     } else {
-        while(x2 > x1) {
+        x1 = 0;
+        while(p->dimX > x1) {
             OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;