Add RS intrinsic tests for Convolve.

Change-Id: I5321b7c54e076f5fc338ea634dd699bad663e65c
diff --git a/tests/src/android/renderscript/cts/intrinsic_colormatrix.rs b/tests/src/android/renderscript/cts/intrinsic_colormatrix.rs
new file mode 100644
index 0000000..1cab00b
--- /dev/null
+++ b/tests/src/android/renderscript/cts/intrinsic_colormatrix.rs
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "shared.rsh"
+
+static rs_matrix4x4 Mat;  /* matrix applied by every cvt_in overload; replaced by setMatrix() */
+
+int gFormatIn;   /* not read by any function in this file */
+int gFormatOut;  /* not read by any function in this file */
+float4 gAdd;     /* zeroed in init(); not read by any kernel in this file */
+
+
+void init() {  /* resets script state: zero add term, identity matrix */
+    gAdd = 0.f;
+    rsMatrixLoadIdentity(&Mat);
+}
+
+void setMatrix(rs_matrix4x4 m) {  /* replaces the matrix used by all cvt_in overloads */
+    Mat = m;
+}
+
+void test(rs_matrix4x4 m, float4 add, int formatIn, int formatOut) {
+    /* Empty body: all four arguments are currently ignored. */
+}
+
+static float4 __attribute__((overloadable)) cvt_in(uchar4 in) {
+    /* Widen to float, scale by 1/255, then transform by Mat. */
+    float4 v = convert_float4(in);
+    return rsMatrixMultiply(&Mat, v * (1.f / 255.f));
+}
+static float4 __attribute__((overloadable)) cvt_in(uchar3 in) {
+    /* The missing w lane is filled with 0 before scaling by 1/255. */
+    float4 v = {in.x, in.y, in.z, 0.f};
+    return rsMatrixMultiply(&Mat, v * (1.f / 255.f));
+}
+static float4 __attribute__((overloadable)) cvt_in(uchar2 in) {
+    /* The missing z and w lanes are filled with 0 before scaling by 1/255. */
+    float4 v = {in.x, in.y, 0.f, 0.f};
+    return rsMatrixMultiply(&Mat, v * (1.f / 255.f));
+}
+static float4 __attribute__((overloadable)) cvt_in(uchar in) {
+    /* Only x carries data; y, z and w are 0 before scaling by 1/255. */
+    float4 v = {in, 0.f, 0.f, 0.f};
+    return rsMatrixMultiply(&Mat, v * (1.f / 255.f));
+}
+static float4 __attribute__((overloadable)) cvt_in(float4 in) {
+    /* Float input is not rescaled; only Mat is applied. */
+    return rsMatrixMultiply(&Mat, in);
+}
+static float4 __attribute__((overloadable)) cvt_in(float3 in) {
+    float4 v = {in.x, in.y, in.z, 0.f};  /* pad w with 0 */
+    return rsMatrixMultiply(&Mat, v);
+}
+static float4 __attribute__((overloadable)) cvt_in(float2 in) {
+    float4 v = {in.x, in.y, 0.f, 0.f};  /* pad z and w with 0 */
+    return rsMatrixMultiply(&Mat, v);
+}
+static float4 __attribute__((overloadable)) cvt_in(float in) {
+    float4 v = {in, 0.f, 0.f, 0.f};  /* pad y, z and w with 0 */
+    return rsMatrixMultiply(&Mat, v);
+}
+
+
+static uchar4 cvt_out_uchar4(float4 f) {
+    /* NOTE(review): uchar cvt_in scales by 1/255 but no x255 happens on output - confirm intended. */
+    return convert_uchar4(clamp(f, 0.f, 255.5f));  /* clamp lanes, then narrow */
+}
+static uchar3 cvt_out_uchar3(float4 f) {
+    const float3 lanes = clamp(f.xyz, 0.f, 255.5f);  /* w is dropped */
+    return convert_uchar3(lanes);
+}
+static uchar2 cvt_out_uchar2(float4 f) {
+    const float2 lanes = clamp(f.xy, 0.f, 255.5f);  /* z and w are dropped */
+    return convert_uchar2(lanes);
+}
+static uchar cvt_out_uchar(float4 f) {
+    /* Only x is kept; the float result is implicitly converted to uchar on return. */
+    return clamp(f.x, 0.f, 255.5f);
+}
+static float4 cvt_out_float4(float4 px) {
+    return px;  /* float output: passed through unchanged */
+}
+static float3 cvt_out_float3(float4 px) {
+    return px.xyz;  /* float output: first three lanes, no clamp */
+}
+static float2 cvt_out_float2(float4 px) {
+    return px.xy;  /* float output: first two lanes, no clamp */
+}
+static float cvt_out_float(float4 px) {
+    return px.x;  /* float output: first lane, no clamp */
+}
+
+#define KERN(tin, tout) /* defines kernel k_<tin>_<tout>: cvt_in() then cvt_out_<tout>() */ \
+tout __attribute__((kernel)) k_##tin##_##tout(tin in) {         \
+    float4 f = cvt_in(in);                                      \
+    return cvt_out_##tout(f);                                   \
+}
+
+#define KERN2(tin)  /* expands KERN(tin, T) once for each of the eight output types T */ \
+KERN(tin, uchar4)   \
+KERN(tin, uchar3)   \
+KERN(tin, uchar2)   \
+KERN(tin, uchar)    \
+KERN(tin, float4)   \
+KERN(tin, float3)   \
+KERN(tin, float2)   \
+KERN(tin, float)
+
+KERN2(uchar4)  /* eight input types x eight output types: 64 kernels in total */
+KERN2(uchar3)
+KERN2(uchar2)
+KERN2(uchar)
+KERN2(float4)
+KERN2(float3)
+KERN2(float2)
+KERN2(float)
+
diff --git a/tests/src/android/renderscript/cts/intrinsic_convolve3x3.rs b/tests/src/android/renderscript/cts/intrinsic_convolve3x3.rs
new file mode 100644
index 0000000..77da230
--- /dev/null
+++ b/tests/src/android/renderscript/cts/intrinsic_convolve3x3.rs
@@ -0,0 +1,305 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "shared.rsh"
+
+int32_t gWidth;     /* gWidth-1 is the upper clamp for x neighbour indices */
+int32_t gHeight;    /* gHeight-1 is the upper clamp for y neighbour indices */
+rs_allocation gIn;  /* allocation every kernel in this file samples from */
+
+float gCoeffs[9];   /* the nine weights of the 3x3 window */
+
+uchar4 __attribute__((kernel)) convolve_U4(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for uchar4: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float4 p00 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y1));
+    float4 p01 = convert_float4(rsGetElementAt_uchar4(gIn, x, y1));
+    float4 p02 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y1));
+    float4 p10 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y));
+    float4 p11 = convert_float4(rsGetElementAt_uchar4(gIn, x, y));
+    float4 p12 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y));
+    float4 p20 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y2));
+    float4 p21 = convert_float4(rsGetElementAt_uchar4(gIn, x, y2));
+    float4 p22 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y2));
+    p00 *= gCoeffs[0];
+    p01 *= gCoeffs[1];
+    p02 *= gCoeffs[2];
+    p10 *= gCoeffs[3];
+    p11 *= gCoeffs[4];
+    p12 *= gCoeffs[5];
+    p20 *= gCoeffs[6];
+    p21 *= gCoeffs[7];
+    p22 *= gCoeffs[8];
+    /* Pairwise reduction into p20; the trailing +0.5f presumably makes the conversion below round. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    p20 += 0.5f;
+    /* Saturate to [0, 255] before narrowing to uchar4. */
+    p20 = clamp(p20, 0.f, 255.f);
+    return convert_uchar4(p20);
+}
+
+uchar3 __attribute__((kernel)) convolve_U3(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for uchar3: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float3 p00 = convert_float3(rsGetElementAt_uchar3(gIn, x1, y1));
+    float3 p01 = convert_float3(rsGetElementAt_uchar3(gIn, x, y1));
+    float3 p02 = convert_float3(rsGetElementAt_uchar3(gIn, x2, y1));
+    float3 p10 = convert_float3(rsGetElementAt_uchar3(gIn, x1, y));
+    float3 p11 = convert_float3(rsGetElementAt_uchar3(gIn, x, y));
+    float3 p12 = convert_float3(rsGetElementAt_uchar3(gIn, x2, y));
+    float3 p20 = convert_float3(rsGetElementAt_uchar3(gIn, x1, y2));
+    float3 p21 = convert_float3(rsGetElementAt_uchar3(gIn, x, y2));
+    float3 p22 = convert_float3(rsGetElementAt_uchar3(gIn, x2, y2));
+    p00 *= gCoeffs[0];
+    p01 *= gCoeffs[1];
+    p02 *= gCoeffs[2];
+    p10 *= gCoeffs[3];
+    p11 *= gCoeffs[4];
+    p12 *= gCoeffs[5];
+    p20 *= gCoeffs[6];
+    p21 *= gCoeffs[7];
+    p22 *= gCoeffs[8];
+    /* Pairwise reduction into p20; the trailing +0.5f presumably makes the conversion below round. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    p20 += 0.5f;
+    /* Saturate to [0, 255] before narrowing to uchar3. */
+    p20 = clamp(p20, 0.f, 255.f);
+    return convert_uchar3(p20);
+}
+
+uchar2 __attribute__((kernel)) convolve_U2(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for uchar2: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float2 p00 = convert_float2(rsGetElementAt_uchar2(gIn, x1, y1));
+    float2 p01 = convert_float2(rsGetElementAt_uchar2(gIn, x, y1));
+    float2 p02 = convert_float2(rsGetElementAt_uchar2(gIn, x2, y1));
+    float2 p10 = convert_float2(rsGetElementAt_uchar2(gIn, x1, y));
+    float2 p11 = convert_float2(rsGetElementAt_uchar2(gIn, x, y));
+    float2 p12 = convert_float2(rsGetElementAt_uchar2(gIn, x2, y));
+    float2 p20 = convert_float2(rsGetElementAt_uchar2(gIn, x1, y2));
+    float2 p21 = convert_float2(rsGetElementAt_uchar2(gIn, x, y2));
+    float2 p22 = convert_float2(rsGetElementAt_uchar2(gIn, x2, y2));
+    p00 *= gCoeffs[0];
+    p01 *= gCoeffs[1];
+    p02 *= gCoeffs[2];
+    p10 *= gCoeffs[3];
+    p11 *= gCoeffs[4];
+    p12 *= gCoeffs[5];
+    p20 *= gCoeffs[6];
+    p21 *= gCoeffs[7];
+    p22 *= gCoeffs[8];
+    /* Pairwise reduction into p20; the trailing +0.5f presumably makes the conversion below round. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    p20 += 0.5f;
+    /* Saturate to [0, 255] before narrowing to uchar2. */
+    p20 = clamp(p20, 0.f, 255.f);
+    return convert_uchar2(p20);
+}
+
+uchar __attribute__((kernel)) convolve_U1(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for uchar: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float p00 = rsGetElementAt_uchar(gIn, x1, y1);
+    float p01 = rsGetElementAt_uchar(gIn, x, y1);
+    float p02 = rsGetElementAt_uchar(gIn, x2, y1);
+    float p10 = rsGetElementAt_uchar(gIn, x1, y);
+    float p11 = rsGetElementAt_uchar(gIn, x, y);
+    float p12 = rsGetElementAt_uchar(gIn, x2, y);
+    float p20 = rsGetElementAt_uchar(gIn, x1, y2);
+    float p21 = rsGetElementAt_uchar(gIn, x, y2);
+    float p22 = rsGetElementAt_uchar(gIn, x2, y2);
+    p00 *= gCoeffs[0];
+    p01 *= gCoeffs[1];
+    p02 *= gCoeffs[2];
+    p10 *= gCoeffs[3];
+    p11 *= gCoeffs[4];
+    p12 *= gCoeffs[5];
+    p20 *= gCoeffs[6];
+    p21 *= gCoeffs[7];
+    p22 *= gCoeffs[8];
+    /* Pairwise reduction into p20; the trailing +0.5f presumably makes the cast below round. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    p20 += 0.5f;
+    /* Saturate to [0, 255] before the cast to uchar. */
+    p20 = clamp(p20, 0.f, 255.f);
+    return (uchar)p20;
+}
+
+float4 __attribute__((kernel)) convolve_F4(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for float4: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float4 p00 = rsGetElementAt_float4(gIn, x1, y1) * gCoeffs[0];
+    float4 p01 = rsGetElementAt_float4(gIn, x, y1) * gCoeffs[1];
+    float4 p02 = rsGetElementAt_float4(gIn, x2, y1) * gCoeffs[2];
+    float4 p10 = rsGetElementAt_float4(gIn, x1, y) * gCoeffs[3];
+    float4 p11 = rsGetElementAt_float4(gIn, x, y) * gCoeffs[4];
+    float4 p12 = rsGetElementAt_float4(gIn, x2, y) * gCoeffs[5];
+    float4 p20 = rsGetElementAt_float4(gIn, x1, y2) * gCoeffs[6];
+    float4 p21 = rsGetElementAt_float4(gIn, x, y2) * gCoeffs[7];
+    float4 p22 = rsGetElementAt_float4(gIn, x2, y2) * gCoeffs[8];
+    /* Pairwise reduction into p20; float output is returned without rounding or clamping. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    return p20;
+}
+
+float3 __attribute__((kernel)) convolve_F3(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for float3: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float3 p00 = rsGetElementAt_float3(gIn, x1, y1) * gCoeffs[0];
+    float3 p01 = rsGetElementAt_float3(gIn, x, y1) * gCoeffs[1];
+    float3 p02 = rsGetElementAt_float3(gIn, x2, y1) * gCoeffs[2];
+    float3 p10 = rsGetElementAt_float3(gIn, x1, y) * gCoeffs[3];
+    float3 p11 = rsGetElementAt_float3(gIn, x, y) * gCoeffs[4];
+    float3 p12 = rsGetElementAt_float3(gIn, x2, y) * gCoeffs[5];
+    float3 p20 = rsGetElementAt_float3(gIn, x1, y2) * gCoeffs[6];
+    float3 p21 = rsGetElementAt_float3(gIn, x, y2) * gCoeffs[7];
+    float3 p22 = rsGetElementAt_float3(gIn, x2, y2) * gCoeffs[8];
+    /* Pairwise reduction into p20; float output is returned without rounding or clamping. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    return p20;
+}
+
+float2 __attribute__((kernel)) convolve_F2(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for float2: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float2 p00 = rsGetElementAt_float2(gIn, x1, y1) * gCoeffs[0];
+    float2 p01 = rsGetElementAt_float2(gIn, x, y1) * gCoeffs[1];
+    float2 p02 = rsGetElementAt_float2(gIn, x2, y1) * gCoeffs[2];
+    float2 p10 = rsGetElementAt_float2(gIn, x1, y) * gCoeffs[3];
+    float2 p11 = rsGetElementAt_float2(gIn, x, y) * gCoeffs[4];
+    float2 p12 = rsGetElementAt_float2(gIn, x2, y) * gCoeffs[5];
+    float2 p20 = rsGetElementAt_float2(gIn, x1, y2) * gCoeffs[6];
+    float2 p21 = rsGetElementAt_float2(gIn, x, y2) * gCoeffs[7];
+    float2 p22 = rsGetElementAt_float2(gIn, x2, y2) * gCoeffs[8];
+    /* Pairwise reduction into p20; float output is returned without rounding or clamping. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    return p20;
+}
+
+float __attribute__((kernel)) convolve_F1(uint32_t x, uint32_t y) {
+    /* Reference 3x3 filter for float: weights the nine elements of gIn around (x, y) by gCoeffs. */
+    /* x1/y1 are x+1/y+1 capped at gWidth-1/gHeight-1; x2/y2 are x-1/y-1 floored at 0. */
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);
+    uint32_t x2 = max((int32_t)x-1, 0);
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);
+    uint32_t y2 = max((int32_t)y-1, 0);
+
+    float p00 = rsGetElementAt_float(gIn, x1, y1) * gCoeffs[0];
+    float p01 = rsGetElementAt_float(gIn, x, y1) * gCoeffs[1];
+    float p02 = rsGetElementAt_float(gIn, x2, y1) * gCoeffs[2];
+    float p10 = rsGetElementAt_float(gIn, x1, y) * gCoeffs[3];
+    float p11 = rsGetElementAt_float(gIn, x, y) * gCoeffs[4];
+    float p12 = rsGetElementAt_float(gIn, x2, y) * gCoeffs[5];
+    float p20 = rsGetElementAt_float(gIn, x1, y2) * gCoeffs[6];
+    float p21 = rsGetElementAt_float(gIn, x, y2) * gCoeffs[7];
+    float p22 = rsGetElementAt_float(gIn, x2, y2) * gCoeffs[8];
+    /* Pairwise reduction into p20; float output is returned without rounding or clamping. */
+    p00 += p01;
+    p02 += p10;
+    p11 += p12;
+    p20 += p21;
+    p22 += p00;
+    p02 += p11;
+    p20 += p22;
+    p20 += p02;
+    return p20;
+}
+
+
diff --git a/tests/src/android/renderscript/cts/intrinsic_convolve5x5.rs b/tests/src/android/renderscript/cts/intrinsic_convolve5x5.rs
new file mode 100644
index 0000000..9f9aa2b
--- /dev/null
+++ b/tests/src/android/renderscript/cts/intrinsic_convolve5x5.rs
@@ -0,0 +1,398 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "shared.rsh"
+
+
+int32_t gWidth;     /* gWidth-1 is the upper clamp for x neighbour indices */
+int32_t gHeight;    /* gHeight-1 is the upper clamp for y neighbour indices */
+rs_allocation gIn;  /* allocation every kernel in this file samples from */
+
+float gCoeffs[25];  /* the 25 weights of the 5x5 window, five per window row */
+
+uchar4 __attribute__((kernel)) convolve_U4(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for uchar4; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float4 p0 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y0)) * gCoeffs[0]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y0)) * gCoeffs[1]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y0)) * gCoeffs[2]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y0)) * gCoeffs[3]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y0)) * gCoeffs[4];
+
+    float4 p1 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y1)) * gCoeffs[5]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y1)) * gCoeffs[6]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y1)) * gCoeffs[7]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y1)) * gCoeffs[8]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y1)) * gCoeffs[9];
+
+    float4 p2 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y2)) * gCoeffs[10]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y2)) * gCoeffs[11]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y2)) * gCoeffs[12]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y2)) * gCoeffs[13]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y2)) * gCoeffs[14];
+
+    float4 p3 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y3)) * gCoeffs[15]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y3)) * gCoeffs[16]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y3)) * gCoeffs[17]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y3)) * gCoeffs[18]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y3)) * gCoeffs[19];
+
+    float4 p4 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y4)) * gCoeffs[20]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y4)) * gCoeffs[21]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y4)) * gCoeffs[22]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y4)) * gCoeffs[23]
+              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y4)) * gCoeffs[24];
+    /* Add 0.5 before the clamp so the narrowing conversion rounds, as the 3x3 reference kernels do. */
+    p0 = clamp(p0 + p1 + p2 + p3 + p4 + 0.5f, 0.f, 255.f);
+    return convert_uchar4(p0);
+}
+
+uchar3 __attribute__((kernel)) convolve_U3(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for uchar3; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float3 p0 = convert_float3(rsGetElementAt_uchar3(gIn, x0, y0)) * gCoeffs[0]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x1, y0)) * gCoeffs[1]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x2, y0)) * gCoeffs[2]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x3, y0)) * gCoeffs[3]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x4, y0)) * gCoeffs[4];
+
+    float3 p1 = convert_float3(rsGetElementAt_uchar3(gIn, x0, y1)) * gCoeffs[5]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x1, y1)) * gCoeffs[6]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x2, y1)) * gCoeffs[7]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x3, y1)) * gCoeffs[8]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x4, y1)) * gCoeffs[9];
+
+    float3 p2 = convert_float3(rsGetElementAt_uchar3(gIn, x0, y2)) * gCoeffs[10]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x1, y2)) * gCoeffs[11]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x2, y2)) * gCoeffs[12]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x3, y2)) * gCoeffs[13]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x4, y2)) * gCoeffs[14];
+
+    float3 p3 = convert_float3(rsGetElementAt_uchar3(gIn, x0, y3)) * gCoeffs[15]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x1, y3)) * gCoeffs[16]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x2, y3)) * gCoeffs[17]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x3, y3)) * gCoeffs[18]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x4, y3)) * gCoeffs[19];
+
+    float3 p4 = convert_float3(rsGetElementAt_uchar3(gIn, x0, y4)) * gCoeffs[20]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x1, y4)) * gCoeffs[21]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x2, y4)) * gCoeffs[22]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x3, y4)) * gCoeffs[23]
+              + convert_float3(rsGetElementAt_uchar3(gIn, x4, y4)) * gCoeffs[24];
+    /* Add 0.5 before the clamp so the narrowing conversion rounds, as the 3x3 reference kernels do. */
+    p0 = clamp(p0 + p1 + p2 + p3 + p4 + 0.5f, 0.f, 255.f);
+    return convert_uchar3(p0);
+}
+
+uchar2 __attribute__((kernel)) convolve_U2(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for uchar2; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float2 p0 = convert_float2(rsGetElementAt_uchar2(gIn, x0, y0)) * gCoeffs[0]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x1, y0)) * gCoeffs[1]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x2, y0)) * gCoeffs[2]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x3, y0)) * gCoeffs[3]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x4, y0)) * gCoeffs[4];
+
+    float2 p1 = convert_float2(rsGetElementAt_uchar2(gIn, x0, y1)) * gCoeffs[5]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x1, y1)) * gCoeffs[6]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x2, y1)) * gCoeffs[7]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x3, y1)) * gCoeffs[8]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x4, y1)) * gCoeffs[9];
+
+    float2 p2 = convert_float2(rsGetElementAt_uchar2(gIn, x0, y2)) * gCoeffs[10]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x1, y2)) * gCoeffs[11]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x2, y2)) * gCoeffs[12]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x3, y2)) * gCoeffs[13]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x4, y2)) * gCoeffs[14];
+
+    float2 p3 = convert_float2(rsGetElementAt_uchar2(gIn, x0, y3)) * gCoeffs[15]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x1, y3)) * gCoeffs[16]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x2, y3)) * gCoeffs[17]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x3, y3)) * gCoeffs[18]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x4, y3)) * gCoeffs[19];
+
+    float2 p4 = convert_float2(rsGetElementAt_uchar2(gIn, x0, y4)) * gCoeffs[20]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x1, y4)) * gCoeffs[21]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x2, y4)) * gCoeffs[22]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x3, y4)) * gCoeffs[23]
+              + convert_float2(rsGetElementAt_uchar2(gIn, x4, y4)) * gCoeffs[24];
+    /* Add 0.5 before the clamp so the narrowing conversion rounds, as the 3x3 reference kernels do. */
+    p0 = clamp(p0 + p1 + p2 + p3 + p4 + 0.5f, 0.f, 255.f);
+    return convert_uchar2(p0);
+}
+
+uchar __attribute__((kernel)) convolve_U1(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for uchar; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float p0 = (float)(rsGetElementAt_uchar(gIn, x0, y0)) * gCoeffs[0]
+             + (float)(rsGetElementAt_uchar(gIn, x1, y0)) * gCoeffs[1]
+             + (float)(rsGetElementAt_uchar(gIn, x2, y0)) * gCoeffs[2]
+             + (float)(rsGetElementAt_uchar(gIn, x3, y0)) * gCoeffs[3]
+             + (float)(rsGetElementAt_uchar(gIn, x4, y0)) * gCoeffs[4];
+
+    float p1 = (float)(rsGetElementAt_uchar(gIn, x0, y1)) * gCoeffs[5]
+             + (float)(rsGetElementAt_uchar(gIn, x1, y1)) * gCoeffs[6]
+             + (float)(rsGetElementAt_uchar(gIn, x2, y1)) * gCoeffs[7]
+             + (float)(rsGetElementAt_uchar(gIn, x3, y1)) * gCoeffs[8]
+             + (float)(rsGetElementAt_uchar(gIn, x4, y1)) * gCoeffs[9];
+
+    float p2 = (float)(rsGetElementAt_uchar(gIn, x0, y2)) * gCoeffs[10]
+             + (float)(rsGetElementAt_uchar(gIn, x1, y2)) * gCoeffs[11]
+             + (float)(rsGetElementAt_uchar(gIn, x2, y2)) * gCoeffs[12]
+             + (float)(rsGetElementAt_uchar(gIn, x3, y2)) * gCoeffs[13]
+             + (float)(rsGetElementAt_uchar(gIn, x4, y2)) * gCoeffs[14];
+
+    float p3 = (float)(rsGetElementAt_uchar(gIn, x0, y3)) * gCoeffs[15]
+             + (float)(rsGetElementAt_uchar(gIn, x1, y3)) * gCoeffs[16]
+             + (float)(rsGetElementAt_uchar(gIn, x2, y3)) * gCoeffs[17]
+             + (float)(rsGetElementAt_uchar(gIn, x3, y3)) * gCoeffs[18]
+             + (float)(rsGetElementAt_uchar(gIn, x4, y3)) * gCoeffs[19];
+
+    float p4 = (float)(rsGetElementAt_uchar(gIn, x0, y4)) * gCoeffs[20]
+             + (float)(rsGetElementAt_uchar(gIn, x1, y4)) * gCoeffs[21]
+             + (float)(rsGetElementAt_uchar(gIn, x2, y4)) * gCoeffs[22]
+             + (float)(rsGetElementAt_uchar(gIn, x3, y4)) * gCoeffs[23]
+             + (float)(rsGetElementAt_uchar(gIn, x4, y4)) * gCoeffs[24];
+    /* Add 0.5 before the clamp so the cast rounds, as the 3x3 reference kernels do; cast is explicit. */
+    return (uchar)clamp(p0 + p1 + p2 + p3 + p4 + 0.5f, 0.f, 255.f);
+}
+
+float4 __attribute__((kernel)) convolve_F4(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for float4; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float4 p0 = rsGetElementAt_float4(gIn, x0, y0) * gCoeffs[0]
+              + rsGetElementAt_float4(gIn, x1, y0) * gCoeffs[1]
+              + rsGetElementAt_float4(gIn, x2, y0) * gCoeffs[2]
+              + rsGetElementAt_float4(gIn, x3, y0) * gCoeffs[3]
+              + rsGetElementAt_float4(gIn, x4, y0) * gCoeffs[4];
+
+    float4 p1 = rsGetElementAt_float4(gIn, x0, y1) * gCoeffs[5]
+              + rsGetElementAt_float4(gIn, x1, y1) * gCoeffs[6]
+              + rsGetElementAt_float4(gIn, x2, y1) * gCoeffs[7]
+              + rsGetElementAt_float4(gIn, x3, y1) * gCoeffs[8]
+              + rsGetElementAt_float4(gIn, x4, y1) * gCoeffs[9];
+
+    float4 p2 = rsGetElementAt_float4(gIn, x0, y2) * gCoeffs[10]
+              + rsGetElementAt_float4(gIn, x1, y2) * gCoeffs[11]
+              + rsGetElementAt_float4(gIn, x2, y2) * gCoeffs[12]
+              + rsGetElementAt_float4(gIn, x3, y2) * gCoeffs[13]
+              + rsGetElementAt_float4(gIn, x4, y2) * gCoeffs[14];
+
+    float4 p3 = rsGetElementAt_float4(gIn, x0, y3) * gCoeffs[15]
+              + rsGetElementAt_float4(gIn, x1, y3) * gCoeffs[16]
+              + rsGetElementAt_float4(gIn, x2, y3) * gCoeffs[17]
+              + rsGetElementAt_float4(gIn, x3, y3) * gCoeffs[18]
+              + rsGetElementAt_float4(gIn, x4, y3) * gCoeffs[19];
+
+    float4 p4 = rsGetElementAt_float4(gIn, x0, y4) * gCoeffs[20]
+              + rsGetElementAt_float4(gIn, x1, y4) * gCoeffs[21]
+              + rsGetElementAt_float4(gIn, x2, y4) * gCoeffs[22]
+              + rsGetElementAt_float4(gIn, x3, y4) * gCoeffs[23]
+              + rsGetElementAt_float4(gIn, x4, y4) * gCoeffs[24];
+    /* Float output: the row sums are added and returned without rounding or clamping. */
+    return p0 + p1 + p2 + p3 + p4;
+}
+
+float3 __attribute__((kernel)) convolve_F3(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for float3; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float3 p0 = rsGetElementAt_float3(gIn, x0, y0) * gCoeffs[0]
+              + rsGetElementAt_float3(gIn, x1, y0) * gCoeffs[1]
+              + rsGetElementAt_float3(gIn, x2, y0) * gCoeffs[2]
+              + rsGetElementAt_float3(gIn, x3, y0) * gCoeffs[3]
+              + rsGetElementAt_float3(gIn, x4, y0) * gCoeffs[4];
+
+    float3 p1 = rsGetElementAt_float3(gIn, x0, y1) * gCoeffs[5]
+              + rsGetElementAt_float3(gIn, x1, y1) * gCoeffs[6]
+              + rsGetElementAt_float3(gIn, x2, y1) * gCoeffs[7]
+              + rsGetElementAt_float3(gIn, x3, y1) * gCoeffs[8]
+              + rsGetElementAt_float3(gIn, x4, y1) * gCoeffs[9];
+
+    float3 p2 = rsGetElementAt_float3(gIn, x0, y2) * gCoeffs[10]
+              + rsGetElementAt_float3(gIn, x1, y2) * gCoeffs[11]
+              + rsGetElementAt_float3(gIn, x2, y2) * gCoeffs[12]
+              + rsGetElementAt_float3(gIn, x3, y2) * gCoeffs[13]
+              + rsGetElementAt_float3(gIn, x4, y2) * gCoeffs[14];
+
+    float3 p3 = rsGetElementAt_float3(gIn, x0, y3) * gCoeffs[15]
+              + rsGetElementAt_float3(gIn, x1, y3) * gCoeffs[16]
+              + rsGetElementAt_float3(gIn, x2, y3) * gCoeffs[17]
+              + rsGetElementAt_float3(gIn, x3, y3) * gCoeffs[18]
+              + rsGetElementAt_float3(gIn, x4, y3) * gCoeffs[19];
+
+    float3 p4 = rsGetElementAt_float3(gIn, x0, y4) * gCoeffs[20]
+              + rsGetElementAt_float3(gIn, x1, y4) * gCoeffs[21]
+              + rsGetElementAt_float3(gIn, x2, y4) * gCoeffs[22]
+              + rsGetElementAt_float3(gIn, x3, y4) * gCoeffs[23]
+              + rsGetElementAt_float3(gIn, x4, y4) * gCoeffs[24];
+    /* Float output: the row sums are added and returned without rounding or clamping. */
+    return p0 + p1 + p2 + p3 + p4;
+}
+
+float2 __attribute__((kernel)) convolve_F2(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for float2; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float2 p0 = rsGetElementAt_float2(gIn, x0, y0) * gCoeffs[0]
+              + rsGetElementAt_float2(gIn, x1, y0) * gCoeffs[1]
+              + rsGetElementAt_float2(gIn, x2, y0) * gCoeffs[2]
+              + rsGetElementAt_float2(gIn, x3, y0) * gCoeffs[3]
+              + rsGetElementAt_float2(gIn, x4, y0) * gCoeffs[4];
+
+    float2 p1 = rsGetElementAt_float2(gIn, x0, y1) * gCoeffs[5]
+              + rsGetElementAt_float2(gIn, x1, y1) * gCoeffs[6]
+              + rsGetElementAt_float2(gIn, x2, y1) * gCoeffs[7]
+              + rsGetElementAt_float2(gIn, x3, y1) * gCoeffs[8]
+              + rsGetElementAt_float2(gIn, x4, y1) * gCoeffs[9];
+
+    float2 p2 = rsGetElementAt_float2(gIn, x0, y2) * gCoeffs[10]
+              + rsGetElementAt_float2(gIn, x1, y2) * gCoeffs[11]
+              + rsGetElementAt_float2(gIn, x2, y2) * gCoeffs[12]
+              + rsGetElementAt_float2(gIn, x3, y2) * gCoeffs[13]
+              + rsGetElementAt_float2(gIn, x4, y2) * gCoeffs[14];
+
+    float2 p3 = rsGetElementAt_float2(gIn, x0, y3) * gCoeffs[15]
+              + rsGetElementAt_float2(gIn, x1, y3) * gCoeffs[16]
+              + rsGetElementAt_float2(gIn, x2, y3) * gCoeffs[17]
+              + rsGetElementAt_float2(gIn, x3, y3) * gCoeffs[18]
+              + rsGetElementAt_float2(gIn, x4, y3) * gCoeffs[19];
+
+    float2 p4 = rsGetElementAt_float2(gIn, x0, y4) * gCoeffs[20]
+              + rsGetElementAt_float2(gIn, x1, y4) * gCoeffs[21]
+              + rsGetElementAt_float2(gIn, x2, y4) * gCoeffs[22]
+              + rsGetElementAt_float2(gIn, x3, y4) * gCoeffs[23]
+              + rsGetElementAt_float2(gIn, x4, y4) * gCoeffs[24];
+    /* Float output: the row sums are added and returned without rounding or clamping. */
+    return p0 + p1 + p2 + p3 + p4;
+}
+
+float __attribute__((kernel)) convolve_F1(uint32_t x, uint32_t y) {
+    /* Reference 5x5 filter for float; x0..x4 / y0..y4 are x-2..x+2 / y-2..y+2 clamped to the bounds. */
+    uint32_t x0 = max((int32_t)x-2, 0);
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+    uint32_t y0 = max((int32_t)y-2, 0);
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+    /* One partial sum per window row (p0..p4); gCoeffs[0] weights (x0, y0), gCoeffs[24] weights (x4, y4). */
+    float p0 = rsGetElementAt_float(gIn, x0, y0) * gCoeffs[0]
+             + rsGetElementAt_float(gIn, x1, y0) * gCoeffs[1]
+             + rsGetElementAt_float(gIn, x2, y0) * gCoeffs[2]
+             + rsGetElementAt_float(gIn, x3, y0) * gCoeffs[3]
+             + rsGetElementAt_float(gIn, x4, y0) * gCoeffs[4];
+
+    float p1 = rsGetElementAt_float(gIn, x0, y1) * gCoeffs[5]
+             + rsGetElementAt_float(gIn, x1, y1) * gCoeffs[6]
+             + rsGetElementAt_float(gIn, x2, y1) * gCoeffs[7]
+             + rsGetElementAt_float(gIn, x3, y1) * gCoeffs[8]
+             + rsGetElementAt_float(gIn, x4, y1) * gCoeffs[9];
+
+    float p2 = rsGetElementAt_float(gIn, x0, y2) * gCoeffs[10]
+             + rsGetElementAt_float(gIn, x1, y2) * gCoeffs[11]
+             + rsGetElementAt_float(gIn, x2, y2) * gCoeffs[12]
+             + rsGetElementAt_float(gIn, x3, y2) * gCoeffs[13]
+             + rsGetElementAt_float(gIn, x4, y2) * gCoeffs[14];
+
+    float p3 = rsGetElementAt_float(gIn, x0, y3) * gCoeffs[15]
+             + rsGetElementAt_float(gIn, x1, y3) * gCoeffs[16]
+             + rsGetElementAt_float(gIn, x2, y3) * gCoeffs[17]
+             + rsGetElementAt_float(gIn, x3, y3) * gCoeffs[18]
+             + rsGetElementAt_float(gIn, x4, y3) * gCoeffs[19];
+
+    float p4 = rsGetElementAt_float(gIn, x0, y4) * gCoeffs[20]
+             + rsGetElementAt_float(gIn, x1, y4) * gCoeffs[21]
+             + rsGetElementAt_float(gIn, x2, y4) * gCoeffs[22]
+             + rsGetElementAt_float(gIn, x3, y4) * gCoeffs[23]
+             + rsGetElementAt_float(gIn, x4, y4) * gCoeffs[24];
+    /* Float output: the row sums are added and returned without rounding or clamping. */
+    return p0 + p1 + p2 + p3 + p4;
+}
+
+
+
diff --git a/tests/src/android/renderscript/cts/verify.rs b/tests/src/android/renderscript/cts/verify.rs
new file mode 100644
index 0000000..d100eb4
--- /dev/null
+++ b/tests/src/android/renderscript/cts/verify.rs
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "shared.rsh"
+
+rs_allocation gIn1;  /* NOTE(review): not referenced by any function in this file; confirm use. */
+rs_allocation gIn2;  /* NOTE(review): not referenced by any function in this file; confirm use. */
+float gAllowedError;  /* NOTE(review): unused here; compare_float() hard-codes 0.0001f. */
+
+static bool hadError = false;  /* Set when a comparison fails; reported by checkError(). */
+
+/* Returns true when f1 and f2 differ by at most 0.0001f; otherwise sets hadError. */
+static bool compare_float(float f1, float f2) {
+    /* "<=" is false for NaN, so a NaN difference is reported rather than accepted. */
+    bool ok = fabs(f1 - f2) <= 0.0001f;
+    hadError |= !ok;
+    return ok;
+}
+
+/* Compares in1 with in2 as float4 over in1's dimensions; logs the first mismatch. */
+static void verify_float4(rs_allocation in1, rs_allocation in2) {
+    const uint32_t width = rsAllocationGetDimX(in1);
+    const uint32_t height = rsAllocationGetDimY(in1);
+    for (uint32_t row = 0; row < height; row++) {
+        for (uint32_t col = 0; col < width; col++) {
+            float4 lhs = rsGetElementAt_float4(in1, col, row);
+            float4 rhs = rsGetElementAt_float4(in2, col, row);
+            /* Non-short-circuit '&' so every channel is still compared. */
+            bool same = compare_float(lhs.x, rhs.x) & compare_float(lhs.y, rhs.y)
+                      & compare_float(lhs.z, rhs.z) & compare_float(lhs.w, rhs.w);
+            if (same) {
+                continue;
+            }
+            rsDebug("verify_float4 x", col);
+            rsDebug("verify_float4 y", row);
+            rsDebug("verify_float4 p1", lhs);
+            rsDebug("verify_float4 p2", rhs);
+            return;
+        }
+    }
+}
+
+/* Compares in1 with in2 as float3 over in1's dimensions; logs the first mismatch. */
+static void verify_float3(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            float3 p1 = rsGetElementAt_float3(in1, x, y);
+            float3 p2 = rsGetElementAt_float3(in2, x, y);
+            bool e = !compare_float(p1.x, p2.x);
+            e |= !compare_float(p1.y, p2.y);
+            e |= !compare_float(p1.z, p2.z);
+            if (e) {
+                rsDebug("verify_float3 x", x);
+                rsDebug("verify_float3 y", y);
+                rsDebug("verify_float3 p1", p1);
+                rsDebug("verify_float3 p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Compares in1 with in2 as float2 over in1's dimensions; logs the first mismatch. */
+static void verify_float2(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            float2 p1 = rsGetElementAt_float2(in1, x, y);
+            float2 p2 = rsGetElementAt_float2(in2, x, y);
+            bool e = !compare_float(p1.x, p2.x);
+            e |= !compare_float(p1.y, p2.y);
+            if (e) {
+                rsDebug("verify_float2 x", x);
+                rsDebug("verify_float2 y", y);
+                rsDebug("verify_float2 p1", p1);
+                rsDebug("verify_float2 p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Compares in1 with in2 as scalar float over in1's dimensions; logs the first mismatch. */
+static void verify_float(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            float p1 = rsGetElementAt_float(in1, x, y);
+            float p2 = rsGetElementAt_float(in2, x, y);
+            bool e = !compare_float(p1, p2);
+            if (e) {
+                rsDebug("verify_float x", x);
+                rsDebug("verify_float y", y);
+                rsDebug("verify_float p1", p1);
+                rsDebug("verify_float p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Requires in1 and in2 (sized by in1) to match exactly as uchar4; the first
+ * differing pixel is logged and recorded in hadError so checkError() fails. */
+static void verify_uchar4(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            int4 p1 = convert_int4(rsGetElementAt_uchar4(in1, x, y));
+            int4 p2 = convert_int4(rsGetElementAt_uchar4(in2, x, y));
+            int4 d = convert_int4(abs(p1 - p2));
+            int e = 0;
+            e = max(e, d.x);
+            e = max(e, d.y);
+            e = max(e, d.z);
+            e = max(e, d.w);
+            if (e != 0) {
+                hadError = true;
+                rsDebug("verify_uchar4 x", x);
+                rsDebug("verify_uchar4 y", y);
+                rsDebug("verify_uchar4 p1", p1);
+                rsDebug("verify_uchar4 p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Requires in1 and in2 (sized by in1) to match exactly as uchar3; the first
+ * differing pixel is logged and recorded in hadError so checkError() fails. */
+static void verify_uchar3(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            int3 p1 = convert_int3(rsGetElementAt_uchar3(in1, x, y));
+            int3 p2 = convert_int3(rsGetElementAt_uchar3(in2, x, y));
+            int3 d = convert_int3(abs(p1 - p2));
+            int e = 0;
+            e = max(e, d.x);
+            e = max(e, d.y);
+            e = max(e, d.z);
+            if (e != 0) {
+                hadError = true;
+                rsDebug("verify_uchar3 x", x);
+                rsDebug("verify_uchar3 y", y);
+                rsDebug("verify_uchar3 p1", p1);
+                rsDebug("verify_uchar3 p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Requires in1 and in2 (sized by in1) to match exactly as uchar2; the first
+ * differing pixel is logged and recorded in hadError so checkError() fails. */
+static void verify_uchar2(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            int2 p1 = convert_int2(rsGetElementAt_uchar2(in1, x, y));
+            int2 p2 = convert_int2(rsGetElementAt_uchar2(in2, x, y));
+            int2 d = convert_int2(abs(p1 - p2));
+            int e = 0;
+            e = max(e, d.x);
+            e = max(e, d.y);
+            if (e != 0) {
+                hadError = true;
+                rsDebug("verify_uchar2 x", x);
+                rsDebug("verify_uchar2 y", y);
+                rsDebug("verify_uchar2 p1", p1);
+                rsDebug("verify_uchar2 p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/* Requires in1 and in2 (sized by in1) to match exactly as scalar uchar; the first
+ * differing pixel is logged and recorded in hadError so checkError() fails. */
+static void verify_uchar(rs_allocation in1, rs_allocation in2) {
+    uint32_t w = rsAllocationGetDimX(in1);
+    uint32_t h = rsAllocationGetDimY(in1);
+    for (uint32_t y=0; y < h; y++) {
+        for (uint32_t x=0; x < w; x++) {
+            int p1 = rsGetElementAt_uchar(in1, x, y);
+            int p2 = rsGetElementAt_uchar(in2, x, y);
+            int e = abs(p1 - p2);
+            if (e != 0) {
+                hadError = true;
+                rsDebug("verify_uchar x", x);
+                rsDebug("verify_uchar y", y);
+                rsDebug("verify_uchar p1", p1);
+                rsDebug("verify_uchar p2", p2);
+                return;
+            }
+        }
+    }
+}
+
+/*
+ * Invokable entry point: compares allocation in1 against in2 using the
+ * checker selected by etype.
+ *
+ *   etype 0..3 -> uchar4, uchar3, uchar2, uchar
+ *   etype 4..7 -> float4, float3, float2, float
+ *
+ * Mismatches are logged through rsDebug by the selected checker. An etype
+ * outside 0..7 is logged and recorded in hadError rather than ignored, so a
+ * bad code cannot skip verification unnoticed.
+ */
+void verify(rs_allocation in1, rs_allocation in2, int etype) {
+    switch (etype) {
+    /* Unsigned 8-bit element types. */
+    case 0: verify_uchar4(in1, in2); break;
+    case 1: verify_uchar3(in1, in2); break;
+    case 2: verify_uchar2(in1, in2); break;
+    case 3: verify_uchar(in1, in2);  break;
+    /* 32-bit float element types. */
+    case 4: verify_float4(in1, in2); break;
+    case 5: verify_float3(in1, in2); break;
+    case 6: verify_float2(in1, in2); break;
+    case 7: verify_float(in1, in2);  break;
+    /* Anything else is a caller error. */
+    default:
+        rsDebug("verify: unknown etype", etype);
+        hadError = true;
+        break;
+    }
+}
+
+/*
+ * Sends the accumulated verdict to the client: RS_MSG_TEST_FAILED if any
+ * comparison set hadError, RS_MSG_TEST_PASSED otherwise.
+ */
+void checkError() {
+    const int verdict = hadError ? RS_MSG_TEST_FAILED : RS_MSG_TEST_PASSED;
+    rsSendToClientBlocking(verdict);
+}
diff --git a/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicBase.java b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicBase.java
new file mode 100644
index 0000000..f5a6eee
--- /dev/null
+++ b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicBase.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package android.renderscript.cts;
+
+import android.util.Log;
+import android.renderscript.RenderScript;
+import android.renderscript.Allocation;
+import android.renderscript.Element;
+import android.renderscript.Type;
+
+/**
+ * Shared fixture for the intrinsic tests: owns the source, reference and
+ * destination allocations and the verify script used to compare results.
+ */
+public class IntrinsicBase extends RSBaseCompute {
+    protected final String TAG = "Img";
+
+    protected Allocation mAllocSrc;
+    protected Allocation mAllocRef;
+    protected Allocation mAllocDst;
+    protected ScriptC_verify mVerify;
+
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+        mVerify = new ScriptC_verify(mRS);
+    }
+
+    /** Destroys the verify script and any allocations left over from makeSource(). */
+    @Override
+    protected void tearDown() throws Exception {
+        if (mVerify != null) {
+            mVerify.destroy();
+            mVerify = null;
+        }
+        destroyAllocations();
+        super.tearDown();
+    }
+
+    /** Destroys whichever of the three working allocations exist and clears the fields. */
+    private void destroyAllocations() {
+        for (Allocation a : new Allocation[] {mAllocSrc, mAllocRef, mAllocDst}) {
+            if (a != null) {
+                a.destroy();
+            }
+        }
+        mAllocSrc = mAllocRef = mAllocDst = null;
+    }
+
+    /**
+     * Recreates the three allocations as w x h images of element e and fills
+     * mAllocSrc with seeded pseudo-random data (FLOAT_32 and UNSIGNED_8 only).
+     */
+    protected void makeSource(int w, int h, Element e) {
+        System.gc();
+        destroyAllocations();
+
+        Type.Builder tb = new Type.Builder(mRS, e);
+        tb.setX(w);
+        tb.setY(h);
+        Type t = tb.create();
+        mAllocSrc = Allocation.createTyped(mRS, t);
+        mAllocRef = Allocation.createTyped(mRS, t);
+        mAllocDst = Allocation.createTyped(mRS, t);
+
+        java.util.Random r = new java.util.Random(100);
+        // One value per component, in the same order as a y/x/component loop nest.
+        int count = w * h * e.getVectorSize();
+        if (e.getDataType() == Element.DataType.FLOAT_32) {
+            float[] f = new float[count];
+            for (int i = 0; i < count; i++) {
+                f[i] = r.nextFloat();
+            }
+            mAllocSrc.copyFromUnchecked(f);
+        }
+        if (e.getDataType() == Element.DataType.UNSIGNED_8) {
+            byte[] b = new byte[count];
+            for (int i = 0; i < count; i++) {
+                b[i] = (byte) r.nextInt(256);
+            }
+            mAllocSrc.copyFromUnchecked(b);
+        }
+    }
+
+    /** Finishes RS work, invokes checkError in the verify script, then awaits its message. */
+    protected void checkError() {
+        mRS.finish();
+        mVerify.invoke_checkError();
+        waitForMessage();
+        checkForErrors();
+    }
+}
diff --git a/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve3x3.java b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve3x3.java
new file mode 100644
index 0000000..e74536b
--- /dev/null
+++ b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve3x3.java
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package android.renderscript.cts;
+
+import android.renderscript.*;
+import android.util.Log;
+
+public class IntrinsicConvolve3x3 extends IntrinsicBase {
+    private void testConvolve3(int w, int h, Element.DataType dt, int vecSize, int en) {
+        float cf1[] = {0.f, 0.f, 0.f,  0.f, 1.f, 0.f,  0.f, 0.f, 0.f};
+        float cf2[] = {0.f, -1.f, 0.f,  -1.f, 5.f, -1.f,  0.f, -1.f, 0.f};
+
+
+        Element e;
+        if (vecSize > 1) {
+            e = Element.createVector(mRS, dt, vecSize);
+        } else {
+            if (dt == Element.DataType.UNSIGNED_8) {
+                e = Element.U8(mRS);
+            } else {
+                e = Element.F32(mRS);
+            }
+        }
+
+        System.gc();
+        makeSource(w, h, e);
+
+
+        ScriptIntrinsicConvolve3x3 si = ScriptIntrinsicConvolve3x3.create(mRS, e);
+        si.setCoefficients(cf1);
+        si.setInput(mAllocSrc);
+        si.forEach(mAllocRef);
+
+        ScriptC_intrinsic_convolve3x3 sr = new ScriptC_intrinsic_convolve3x3(mRS);
+        sr.set_gCoeffs(cf1);
+        sr.set_gIn(mAllocSrc);
+        sr.set_gWidth(w);
+        sr.set_gHeight(h);
+        if (dt == Element.DataType.UNSIGNED_8) {
+            switch(vecSize) {
+            case 4:
+                sr.forEach_convolve_U4(mAllocDst);
+                break;
+            case 3:
+                sr.forEach_convolve_U3(mAllocDst);
+                break;
+            case 2:
+                sr.forEach_convolve_U2(mAllocDst);
+                break;
+            case 1:
+                sr.forEach_convolve_U1(mAllocDst);
+                break;
+            }
+        } else {
+            switch(vecSize) {
+            case 4:
+                sr.forEach_convolve_F4(mAllocDst);
+                break;
+            case 3:
+                sr.forEach_convolve_F3(mAllocDst);
+                break;
+            case 2:
+                sr.forEach_convolve_F2(mAllocDst);
+                break;
+            case 1:
+                sr.forEach_convolve_F1(mAllocDst);
+                break;
+            }
+        }
+
+        android.util.Log.e("RSI test", "test convolve U8_" + vecSize + " 1 " + w + ", " + h);
+        mVerify.invoke_verify(mAllocRef, mAllocDst, en);
+
+        si.setCoefficients(cf2);
+        sr.set_gCoeffs(cf2);
+        si.forEach(mAllocRef);
+        if (dt == Element.DataType.UNSIGNED_8) {
+            switch(vecSize) {
+            case 4:
+                sr.forEach_convolve_U4(mAllocDst);
+                break;
+            case 3:
+                sr.forEach_convolve_U3(mAllocDst);
+                break;
+            case 2:
+                sr.forEach_convolve_U2(mAllocDst);
+                break;
+            case 1:
+                sr.forEach_convolve_U1(mAllocDst);
+                break;
+            }
+        } else {
+            switch(vecSize) {
+            case 4:
+                sr.forEach_convolve_F4(mAllocDst);
+                break;
+            case 3:
+                sr.forEach_convolve_F3(mAllocDst);
+                break;
+            case 2:
+                sr.forEach_convolve_F2(mAllocDst);
+                break;
+            case 1:
+                sr.forEach_convolve_F1(mAllocDst);
+                break;
+            }
+        }
+        android.util.Log.e("RSI test", "test convolve U8_" + vecSize + " 2 " + w + ", " + h);
+        mVerify.invoke_verify(mAllocRef, mAllocDst, en);
+        mRS.finish();
+    }
+
+
+    public void test() {
+        testConvolve3(100, 100, Element.DataType.UNSIGNED_8, 4, 0);
+        testConvolve3(100, 100, Element.DataType.UNSIGNED_8, 3, 1);
+        testConvolve3(100, 100, Element.DataType.UNSIGNED_8, 2, 2);
+        testConvolve3(100, 100, Element.DataType.UNSIGNED_8, 1, 3);
+
+        testConvolve3(100, 100, Element.DataType.FLOAT_32, 4, 4);
+        testConvolve3(100, 100, Element.DataType.FLOAT_32, 3, 5);
+        testConvolve3(100, 100, Element.DataType.FLOAT_32, 2, 6);
+        testConvolve3(100, 100, Element.DataType.FLOAT_32, 1, 7);
+        checkError();
+    }
+
+
+}
diff --git a/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve5x5.java b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve5x5.java
new file mode 100644
index 0000000..500b5aa
--- /dev/null
+++ b/tests/tests/renderscript/src/android/renderscript/cts/IntrinsicConvolve5x5.java
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package android.renderscript.cts;
+
+import android.renderscript.*;
+import android.util.Log;
+
+/**
+ * Checks ScriptIntrinsicConvolve5x5 against the reference kernels of
+ * ScriptC_intrinsic_convolve5x5 for U8 and F32 elements of vector size 1-4.
+ */
+public class IntrinsicConvolve5x5 extends IntrinsicBase {
+    /**
+     * Runs one coefficient set through the intrinsic (into mAllocRef) and the
+     * reference script (into mAllocDst), then invokes the verify script on both.
+     *
+     * @param name label for the log line, printed together with num
+     * @param en element-type code handed to the verify script
+     */
+    private void test5(ScriptC_intrinsic_convolve5x5 sr, ScriptIntrinsicConvolve5x5 si,
+                        Element e, float cf[], String name, int num, int w, int h, int en) {
+        si.setCoefficients(cf);
+        si.setInput(mAllocSrc);
+        si.forEach(mAllocRef);
+
+        sr.set_gWidth(w);
+        sr.set_gHeight(h);
+        sr.set_gCoeffs(cf);
+        sr.set_gIn(mAllocSrc);
+        final int vecSize = e.getVectorSize();
+        if (e.getDataType() == Element.DataType.UNSIGNED_8) {
+            switch (vecSize) {
+            case 4: sr.forEach_convolve_U4(mAllocDst); break;
+            case 3: sr.forEach_convolve_U3(mAllocDst); break;
+            case 2: sr.forEach_convolve_U2(mAllocDst); break;
+            case 1: sr.forEach_convolve_U1(mAllocDst); break;
+            default: throw new IllegalArgumentException("Unsupported vector size " + vecSize);
+            }
+        } else {
+            switch (vecSize) {
+            case 4: sr.forEach_convolve_F4(mAllocDst); break;
+            case 3: sr.forEach_convolve_F3(mAllocDst); break;
+            case 2: sr.forEach_convolve_F2(mAllocDst); break;
+            case 1: sr.forEach_convolve_F1(mAllocDst); break;
+            default: throw new IllegalArgumentException("Unsupported vector size " + vecSize);
+            }
+        }
+
+        Log.e("RSI test", name + "  " + vecSize + " " + num + " " + w + ", " + h);
+        mVerify.invoke_verify(mAllocRef, mAllocDst, en);
+        mRS.finish();
+    }
+
+    /** Builds a w x h source of the requested element and runs both coefficient sets on it. */
+    private void testConvolve5(int w, int h, Element.DataType dt, int vecSize, int en) {
+        float cf1[] = { 0.f,  0.f,  0.f,  0.f,  0.f,
+                        0.f,  0.f,  0.f,  0.f,  0.f,
+                        0.f,  0.f,  1.f,  0.f,  0.f,
+                        0.f,  0.f,  0.f,  0.f,  0.f,
+                        0.f,  0.f,  0.f,  0.f,  0.f};
+        float cf2[] = {-1.f, -1.f, -1.f, -1.f, -1.f,
+                       -1.f,  0.f,  0.f,  0.f, -1.f,
+                       -1.f,  0.f, 16.f,  0.f, -1.f,
+                       -1.f,  0.f,  0.f,  0.f, -1.f,
+                       -1.f, -1.f, -1.f, -1.f, -1.f};
+
+        Element e;
+        if (vecSize > 1) {
+            e = Element.createVector(mRS, dt, vecSize);
+        } else if (dt == Element.DataType.UNSIGNED_8) {
+            e = Element.U8(mRS);
+        } else {
+            e = Element.F32(mRS);
+        }
+
+        makeSource(w, h, e);
+
+        ScriptIntrinsicConvolve5x5 si = ScriptIntrinsicConvolve5x5.create(mRS, e);
+        ScriptC_intrinsic_convolve5x5 sr = new ScriptC_intrinsic_convolve5x5(mRS);
+        try {
+            test5(sr, si, e, cf1, "test convolve", 1, w, h, en);
+            test5(sr, si, e, cf2, "test convolve", 2, w, h, en);
+        } finally {
+            // Each call creates fresh scripts; release them instead of leaking.
+            si.destroy();
+            sr.destroy();
+        }
+    }
+
+    /** Runs every U8 and F32 vector size on a 100x100 source, then checks the verdict. */
+    public void test() {
+        testConvolve5(100, 100, Element.DataType.UNSIGNED_8, 4, 0);
+        testConvolve5(100, 100, Element.DataType.UNSIGNED_8, 3, 1);
+        testConvolve5(100, 100, Element.DataType.UNSIGNED_8, 2, 2);
+        testConvolve5(100, 100, Element.DataType.UNSIGNED_8, 1, 3);
+
+        testConvolve5(100, 100, Element.DataType.FLOAT_32, 4, 4);
+        testConvolve5(100, 100, Element.DataType.FLOAT_32, 3, 5);
+        testConvolve5(100, 100, Element.DataType.FLOAT_32, 2, 6);
+        testConvolve5(100, 100, Element.DataType.FLOAT_32, 1, 7);
+        checkError();
+    }
+}