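Fix bugs in the CPU blur and 5x5 convolve intrinsics.

Blur: pass t instead of t << 2 as the last argument of
rsdIntrinsicBlurVFU4_K, and tighten the upper row bound of the
OneVFU1 path by one row.

Convolve5x5: initialize all 25 coefficients in the constructor
(previously only 9), drop the + 0.5f bias when converting the float
coefficients to short, and store the result through the rgba member.

NEON: add a 0x7f bias to the 32-bit accumulators and use rounding
narrowing shifts (vrshrn) when reducing them to 16 bits.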

Change-Id: I027e5dcd8e538e52a21941facc5b93db2a6eac8c
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index d44b07a..1a28aab 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -211,7 +211,7 @@
         int t = (x2 - x1) >> 2;
         t &= ~1;
         if(t) {
-            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t << 2);
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
             len -= t << 2;
             ptrIn += t << 2;
             out += t << 2;
@@ -345,7 +345,7 @@
 
     float *fout = (float *)buf;
     int y = p->y;
-    if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
+    if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
         OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
     } else {
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index d36639f..bcd5ffd 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -62,7 +62,7 @@
     rsAssert(slot == 0);
     memcpy (&fp, data, dataLength);
     for(int ct=0; ct < 25; ct++) {
-        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+        ip[ct] = (short)(fp[ct] * 255.f);
     }
 }
 
@@ -109,7 +109,11 @@
 
     px = clamp(px, 0.f, 255.f);
     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
-    *out = o;
+    //if ((out[0].r != o.r) || (out[0].y != o.y) || (out[0].z != o.z) || (out[0].w != o.w)) {
+        //ALOGE("x %i  %i,%i,%i,%i  %i,%i,%i,%i", x, o.x, o.y, o.z, o.w, out[0].x, out[0].y, out[0].z, out[0].w);
+    //}
+    //o.w = 0xff;
+    out->rgba = o.rgba;
 }
 
 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
@@ -171,9 +175,9 @@
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
 
     mRootPtr = &kernel;
-    for(int ct=0; ct < 9; ct++) {
+    for(int ct=0; ct < 25; ct++) {
         fp[ct] = 1.f / 25.f;
-        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+        ip[ct] = (short)(fp[ct] * 255.f);
     }
 }
 
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 76e709e..d62b5a9 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -524,6 +524,8 @@
         vld1.16     {d0, d1, d2, d3}, [r6]!
         vld1.16     {d4, d5, d6}, [r6]
 
+        vmov.u32  q15, #0x7f
+
         /* load the count */
         ldr     r6, [sp, #32 + 64]
 
@@ -652,10 +654,13 @@
 
 
 
+        vadd.i32 q4, q4, q15
+        vadd.i32 q5, q5, q15
 
 /*      Narrow it to a d-reg 32 -> 16 bit */
-        vshrn.i32 d8, q4, #8
-        vshrn.i32 d9, q5, #8
+        vrshrn.i32 d8, q4, #8
+        vrshrn.i32 d9, q5, #8
+
 
 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
         vqmovun.s16 d8, q4