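Fix rounding bug in cpu_ref IntrinsicConvolve 3x3 and 5x5

The uchar convolve paths clamped the float result to [0, 255] and then
converted it to uchar, which truncates instead of rounding. Add 0.5
before clamping so the stored value is rounded to nearest.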

bug 18121051

Change-Id: Ica3e32731a7f78c8869b84e1e257216a52f79f8a
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a835..e5953cf 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -105,7 +105,7 @@
                 convert_float4(py2[x]) * coeff[7] +
                 convert_float4(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
     *out = o;
 }
@@ -127,7 +127,7 @@
                 convert_float2(py2[x]) * coeff[7] +
                 convert_float2(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -147,7 +147,7 @@
                ((float)py2[x1]) * coeff[6] +
                ((float)py2[x]) * coeff[7] +
                ((float)py2[x2]) * coeff[8];
-    *out = clamp(px, 0.f, 255.f);
+    *out = clamp(px + 0.5f, 0.f, 255.f);
 }
 
 static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e2a6b8b..a2c29fd 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -125,7 +125,7 @@
                 convert_float4(py4[x2]) * coeff[22] +
                 convert_float4(py4[x3]) * coeff[23] +
                 convert_float4(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar4(px);
 }
 
@@ -168,7 +168,7 @@
                 convert_float2(py4[x2]) * coeff[22] +
                 convert_float2(py4[x3]) * coeff[23] +
                 convert_float2(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -211,7 +211,7 @@
                (float)(py4[x2]) * coeff[22] +
                (float)(py4[x3]) * coeff[23] +
                (float)(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = px;
 }