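bug 7190126
FS intrinsic perf work: in rsdIntrinsics_Convolve.S one blur loop now
processes two pixels per iteration and the other two coefficients per
iteration (after peeling the first). rsdIntrinsicBlur.cpp passes only an
even pixel count to rsdIntrinsicBlurVF_K and leaves any remaining pixel
to the existing scalar loop.
Fix correctness issue with convolve 3x3: the d0[3] coefficient was
multiplied with d4 instead of d8.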

Change-Id: I7f2657c5e9da003f91ad6a9c2f85d8d43913654b
diff --git a/driver/rsdIntrinsicBlur.cpp b/driver/rsdIntrinsicBlur.cpp
index 0d0483e..c394cc0 100644
--- a/driver/rsdIntrinsicBlur.cpp
+++ b/driver/rsdIntrinsicBlur.cpp
@@ -115,8 +115,14 @@
                   int x1, int x2) {
 
 #if defined(ARCH_ARM_HAVE_NEON)
-    rsdIntrinsicBlurVF_K(out, ptrIn, iStride, gPtr, ct, x1, x2);
-    return;
+    {
+        int t = (x2 - x1);
+        t &= ~1;
+        if(t) {
+            rsdIntrinsicBlurVF_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+        }
+        x1 += t;
+    }
 #endif
 
     while(x2 > x1) {
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S
index 52734df..e178f36 100644
--- a/driver/rsdIntrinsics_Convolve.S
+++ b/driver/rsdIntrinsics_Convolve.S
@@ -69,9 +69,9 @@
 */
 
         vmull.s16 q8, d4, d0[0]
-        vmlal.s16 q8, d4, d0[3]
         vmlal.s16 q8, d5, d0[1]
         vmlal.s16 q8, d6, d0[2]
+        vmlal.s16 q8, d8, d0[3]
         vmlal.s16 q8, d9, d1[0]
         vmlal.s16 q8, d10, d1[1]
         vmlal.s16 q8, d12, d1[2]
@@ -296,25 +296,30 @@
         ldr r6, [sp, #32+64 + 8]
 
 1:
-        veor q0, q0, q0         /* float4 blurredPixel = 0; */
+        veor q10, q10, q10         /* float4 blurredPixel = 0; */
+        veor q11, q11, q11         /* float4 blurredPixel = 0; */
         add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
         mov r10, r3
 
         mov r11, r4
 
 2:
-        vld1.32 {d2[0]}, [r7]
+        vld1.32 {d2}, [r7]
         vmovl.u8 q1, d2
-        vmovl.u16 q1, d2
-        vcvt.f32.s32 q1, q1
-        vld1.32 {d6[0]}, [r10]!
+        vmovl.u16 q3, d2
+        vmovl.u16 q4, d3
+        vcvt.f32.s32 q3, q3
+        vcvt.f32.s32 q4, q4
+        vld1.32 {d0[0]}, [r10]!
         add r7, r7, r2
-        vmla.f32 q0, q1, d6[0]
+        vmla.f32 q10, q3, d0[0]
+        vmla.f32 q11, q4, d0[0]
         subs r11, r11, #1
         bne 2b
 
-        vst1.32 {q0}, [r0]!
-        add r5, r5, #1
+        vst1.32 {q10}, [r0]!
+        vst1.32 {q11}, [r0]!
+        add r5, r5, #2
         cmp r5, r6
         bne 1b
 
@@ -343,16 +348,23 @@
         ldr r5, [sp, #32+64 + 4]
 
 1:
-        veor q0, q0, q0         /* float4 blurredPixel = 0; */
         add r7, r1, r4, lsl #4  /* const uchar *pi = ptrIn + x1 * 4; */
         mov r10, r2
         mov r11, r3
 
-2:
         vld1.32 {q1}, [r7]!
         vld1.32 {d6[0]}, [r10]!
+        vmul.f32 q0, q1, d6[0]
+        sub r11, r11, #1
+
+2:
+        vld1.32 {q1}, [r7]!
+        vld1.32 {q2}, [r7]!
+        vld1.32 {d6[0]}, [r10]!
+        vld1.32 {d6[1]}, [r10]!
         vmla.f32 q0, q1, d6[0]
-        subs r11, r11, #1
+        vmla.f32 q0, q2, d6[1]
+        subs r11, r11, #2
         bne 2b
 
         vcvt.s32.f32 q0, q0