Implement U1, U2, F1, F2, F4 variants of the 3x3 convolve intrinsic

Change-Id: Ib6ffd75ba19cf09a710f39fcd07400aae12d60ca
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 020fa6f..4ee3a23 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -42,9 +42,24 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernel(const RsForEachStubParamStruct *p,
-                       uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+    static void kernelU1(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of uchar
+    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of uchar2
+    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of uchar4
+    static void kernelF1(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of float
+    static void kernelF2(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of float2
+    static void kernelF4(const RsForEachStubParamStruct *p,
+                         uint32_t xstart, uint32_t xend,
+                         uint32_t instep, uint32_t outstep);  // rows of float4
 };
 
 }
@@ -61,7 +76,11 @@
     rsAssert(slot == 0);
     memcpy (&mFp, data, dataLength);
     for(int ct=0; ct < 9; ct++) {
-        mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
+        if (mFp[ct] >= 0) {
+            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
+        } else {
+            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
+        }
     }
 }
 
@@ -69,9 +88,9 @@
                                           const void *y2, const short *coef, uint32_t count);
 
 
-static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
-                        const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
-                        const float* coeff) {
+static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                          const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
+                          const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
@@ -91,9 +110,82 @@
     *out = o;
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernel(const RsForEachStubParamStruct *p,
-                                              uint32_t xstart, uint32_t xend,
-                                              uint32_t instep, uint32_t outstep) {
+static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+                          const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
+                          const float* coeff) {
+    // Stores in *out the 3x3 weighted sum centred on column x of rows py0/py1/py2, limited to [0, 255].
+    uint32_t x1 = rsMax((int32_t)x-1, 0);  // left neighbour, never below column 0
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);  // right neighbour, never past dimX-1
+    // coeff[0..2] weight row py0, coeff[3..5] row py1, coeff[6..8] row py2 (left, centre, right).
+    float2 px = convert_float2(py0[x1]) * coeff[0] +
+                convert_float2(py0[x]) * coeff[1] +
+                convert_float2(py0[x2]) * coeff[2] +
+                convert_float2(py1[x1]) * coeff[3] +
+                convert_float2(py1[x]) * coeff[4] +
+                convert_float2(py1[x2]) * coeff[5] +
+                convert_float2(py2[x1]) * coeff[6] +
+                convert_float2(py2[x]) * coeff[7] +
+                convert_float2(py2[x2]) * coeff[8];
+    // Limit to the uchar range first; the conversion below is a plain per-component cast.
+    px = clamp(px, 0.f, 255.f);
+    *out = convert_uchar2(px);
+}
+
+static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+                          const uchar *py0, const uchar *py1, const uchar *py2,
+                          const float* coeff) {
+    // Stores in *out the 3x3 weighted sum centred on column x of rows py0/py1/py2, limited to [0, 255].
+    uint32_t x1 = rsMax((int32_t)x-1, 0);  // left neighbour, never below column 0
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);  // right neighbour, never past dimX-1
+    // coeff[0..2] weight row py0, coeff[3..5] row py1, coeff[6..8] row py2 (left, centre, right).
+    float px = ((float)py0[x1]) * coeff[0] +
+               ((float)py0[x]) * coeff[1] +
+               ((float)py0[x2]) * coeff[2] +
+               ((float)py1[x1]) * coeff[3] +
+               ((float)py1[x]) * coeff[4] +
+               ((float)py1[x2]) * coeff[5] +
+               ((float)py2[x1]) * coeff[6] +
+               ((float)py2[x]) * coeff[7] +
+               ((float)py2[x2]) * coeff[8];
+    *out = clamp(px, 0.f, 255.f);  // clamped float is implicitly converted to uchar on assignment
+}
+
+static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+                          const float4 *py0, const float4 *py1, const float4 *py2,
+                          const float* coeff) {
+    // Stores in *out the 3x3 weighted sum centred on column x; the float result is not clamped.
+    uint32_t x1 = rsMax((int32_t)x-1, 0);  // left neighbour, never below column 0
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);  // right neighbour, never past dimX-1
+    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +  // row py0
+           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +  // row py1
+           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);   // row py2
+}
+
+static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+                          const float2 *py0, const float2 *py1, const float2 *py2,
+                          const float* coeff) {
+    // Stores in *out the 3x3 weighted sum centred on column x; the float result is not clamped.
+    uint32_t x1 = rsMax((int32_t)x-1, 0);  // left neighbour, never below column 0
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);  // right neighbour, never past dimX-1
+    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +  // row py0
+           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +  // row py1
+           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);   // row py2
+}
+
+static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+                          const float *py0, const float *py1, const float *py2,
+                          const float* coeff) {
+    // Stores in *out the 3x3 weighted sum centred on column x; the float result is not clamped.
+    uint32_t x1 = rsMax((int32_t)x-1, 0);  // left neighbour, never below column 0
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);  // right neighbour, never past dimX-1
+    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +  // row py0
+           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +  // row py1
+           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);   // row py2
+}
+
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -113,7 +205,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
     if(x1 == 0) {
-        ConvolveOne(p, 0, out, py0, py1, py2, cp->mFp);
+        ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp);
         x1 ++;
         out++;
     }
@@ -129,7 +221,231 @@
 #endif
 
         while(x1 != x2) {
-            ConvolveOne(p, x1, out, py0, py1, py2, cp->mFp);
+            ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    // Convolves columns [xstart, xend) of row p->y as uchar2, writing to p->out; instep/outstep are unused.
+    if (!cp->mAlloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
+    // Neighbouring rows are limited to [0, dimY-1]: y2 is the row above (py0), y1 the row below (py2).
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
+    const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y);
+    const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
+    // Column 0 is peeled off first; the compiled-out block below indexes x1-1.
+    uchar2 *out = (uchar2 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif  // NOTE(review): compiled out; confirm rsdIntrinsicConvolve3x3_K supports uchar2 before enabling
+        // Scalar path: one output element per iteration, using the float coefficients mFp.
+        while(x1 != x2) {
+            ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    // Convolves columns [xstart, xend) of row p->y as uchar, writing to p->out; instep/outstep are unused.
+    if (!cp->mAlloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
+    // Neighbouring rows are limited to [0, dimY-1]: y2 is the row above (py0), y1 the row below (py2).
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uchar *py0 = (const uchar *)(pin + stride * y2);
+    const uchar *py1 = (const uchar *)(pin + stride * p->y);
+    const uchar *py2 = (const uchar *)(pin + stride * y1);
+    // Column 0 is peeled off first; the compiled-out block below indexes x1-1.
+    uchar *out = (uchar *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif  // NOTE(review): compiled out; confirm rsdIntrinsicConvolve3x3_K supports uchar before enabling
+        // Scalar path: one output element per iteration, using the float coefficients mFp.
+        while(x1 != x2) {
+            ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    // Convolves columns [xstart, xend) of row p->y as float4, writing to p->out; instep/outstep are unused.
+    if (!cp->mAlloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
+    // Neighbouring rows are limited to [0, dimY-1]: y2 is the row above (py0), y1 the row below (py2).
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const float4 *py0 = (const float4 *)(pin + stride * y2);
+    const float4 *py1 = (const float4 *)(pin + stride * p->y);
+    const float4 *py2 = (const float4 *)(pin + stride * y1);
+    // Column 0 is peeled off first; the compiled-out block below indexes x1-1.
+    float4 *out = (float4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif  // NOTE(review): compiled out; confirm rsdIntrinsicConvolve3x3_K supports float4 before enabling
+        // Scalar path: one output element per iteration, using the float coefficients mFp.
+        while(x1 != x2) {
+            ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+    // Convolves columns [xstart, xend) of row p->y as float2, writing to p->out; instep/outstep are unused.
+    if (!cp->mAlloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
+    // Neighbouring rows are limited to [0, dimY-1]: y2 is the row above (py0), y1 the row below (py2).
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const float2 *py0 = (const float2 *)(pin + stride * y2);
+    const float2 *py1 = (const float2 *)(pin + stride * p->y);
+    const float2 *py2 = (const float2 *)(pin + stride * y1);
+    // Column 0 is peeled off first; the compiled-out block below indexes x1-1.
+    float2 *out = (float2 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif  // NOTE(review): compiled out; confirm rsdIntrinsicConvolve3x3_K supports float2 before enabling
+        // Scalar path: one output element per iteration, using the float coefficients mFp.
+        while(x1 != x2) {
+            ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp);
+            out++;
+            x1++;
+        }
+    }
+}
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+
+    if (!cp->mAlloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
+
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const float *py0 = (const float *)(pin + stride * y2);
+    const float *py1 = (const float *)(pin + stride * p->y);
+    const float *py2 = (const float *)(pin + stride * y1);
+
+    float *out = (float *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif
+
+        while(x1 != x2) {
+            ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp);
             out++;
             x1++;
         }
@@ -140,10 +456,36 @@
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
 
-    mRootPtr = &kernel;
+    if (e->getType() == RS_TYPE_FLOAT_32) {
+        switch(e->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelF1;
+            break;
+        case 2:
+            mRootPtr = &kernelF2;
+            break;
+        case 3:
+        case 4:
+            mRootPtr = &kernelF4;
+            break;
+        }
+    } else {
+        switch(e->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelU1;
+            break;
+        case 2:
+            mRootPtr = &kernelU2;
+            break;
+        case 3:
+        case 4:
+            mRootPtr = &kernelU4;
+            break;
+        }
+    }
     for(int ct=0; ct < 9; ct++) {
         mFp[ct] = 1.f / 9.f;
-        mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
+        mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
     }
 }
 
diff --git a/cpu_ref/rsCpuIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h
index d6644ca..fb5003a 100644
--- a/cpu_ref/rsCpuIntrinsicInlines.h
+++ b/cpu_ref/rsCpuIntrinsicInlines.h
@@ -52,55 +52,36 @@
 
 };
 
-static inline int4 convert_int4(uchar4 i) {
-    int4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline uint4 convert_uint4(uchar4 i) {
-    uint4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline int4 convert_int4(float4 i) {
-    int4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline short4 convert_short4(uchar4 i) {
-    short4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline float4 convert_float4(uchar4 i) {
-    float4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline float4 convert_float4(int4 i) {
-    float4 f4 = {i.x, i.y, i.z, i.w};
-    return f4;
-}
-
-static inline uchar4 convert_uchar4(short4 i) {
-    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
-    return f4;
-}
-
-static inline uchar4 convert_uchar4(int4 i) {
-    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
-    return f4;
-}
-
-static inline uchar4 convert_uchar4(uint4 i) {
-    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
-    return f4;
-}
-
-static inline uchar4 convert_uchar4(float4 i) {
-    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
-    return f4;
-}
+#define CVT_FUNC_2(typeout, typein) /* defines convert_<typeout>N(<typein>N) for N = 2, 3, 4 */ \
+static inline typeout##2 __attribute__((const, overloadable))   \
+    convert_##typeout##2(typein##2 i) {                         \
+        typeout##2 f = {(typeout)i.x, (typeout)i.y}; /* plain cast per component */ \
+        return f;                                               \
+    }                                                           \
+static inline typeout##3 __attribute__((const, overloadable))   \
+    convert_##typeout##3(typein##3 i) {                         \
+        typeout##3 f = {(typeout)i.x, (typeout)i.y, (typeout)i.z}; \
+        return f;                                               \
+    }                                                           \
+static inline typeout##4 __attribute__((const, overloadable))   \
+    convert_##typeout##4(typein##4 i) {                         \
+        typeout##4 f = {(typeout)i.x, (typeout)i.y, (typeout)i.z, (typeout)i.w}; \
+        return f;                                               \
+    }
+#define CVT_FUNC(type)  CVT_FUNC_2(type, uchar)     /* one target type, all seven source types */ \
+                        CVT_FUNC_2(type, char)      \
+                        CVT_FUNC_2(type, ushort)    \
+                        CVT_FUNC_2(type, short)     \
+                        CVT_FUNC_2(type, uint)      \
+                        CVT_FUNC_2(type, int)       \
+                        CVT_FUNC_2(type, float)
+CVT_FUNC(char)    // convert_char2/3/4 overloads
+CVT_FUNC(uchar)   // convert_uchar2/3/4 overloads
+CVT_FUNC(short)   // convert_short2/3/4 overloads
+CVT_FUNC(ushort)  // convert_ushort2/3/4 overloads
+CVT_FUNC(int)     // convert_int2/3/4 overloads
+CVT_FUNC(uint)    // convert_uint2/3/4 overloads
+CVT_FUNC(float)   // convert_float2/3/4 overloads
 
 
 static inline int4 clamp(int4 amount, int low, int high) {
@@ -121,4 +102,25 @@
     return r;
 }
 
+static inline int2 clamp(int2 amount, int low, int high) {
+    int2 r;  // each component of amount limited to [low, high]
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);  // lower bound is tested first
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+static inline float2 clamp(float2 amount, float low, float high) {
+    float2 r;  // each component of amount limited to [low, high]
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);  // lower bound is tested first
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+static inline int clamp(int amount, int low, int high) {
+    return amount < low ? low : (amount > high ? high : amount);  // low if below, high if above, else amount
+}
+
+static inline float clamp(float amount, float low, float high) {
+    return amount < low ? low : (amount > high ? high : amount);  // low if below, high if above, else amount
+}