AArch64 ports of Convolve3x3 and Convolve5x5.

Change-Id: I68d9e0ddda128f07d4383d20e9bad161f0c6965b
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index d44f872..85e34b4 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -43,6 +43,7 @@
 #    rsCpuIntrinsics_advsimd_3DLUT.S \
 #    rsCpuIntrinsics_advsimd_Blend.S \
 #    rsCpuIntrinsics_advsimd_Blur.S \
+#    rsCpuIntrinsics_advsimd_Convolve.S \
 #    rsCpuIntrinsics_advsimd_ColorMatrix.S \
 #    rsCpuIntrinsics_advsimd_YuvToRGB.S
 
@@ -53,10 +54,10 @@
 ifeq ($(ARCH_ARM_HAVE_VFP),true)
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP
     LOCAL_SRC_FILES_arm += \
-    rsCpuIntrinsics_neon.S \
     rsCpuIntrinsics_neon_3DLUT.S \
     rsCpuIntrinsics_neon_Blend.S \
     rsCpuIntrinsics_neon_Blur.S \
+    rsCpuIntrinsics_neon_Convolve.S \
     rsCpuIntrinsics_neon_ColorMatrix.S \
     rsCpuIntrinsics_neon_YuvToRGB.S \
     convolve/convolve_copy_neon.s \
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Convolve.S b/cpu_ref/rsCpuIntrinsics_advsimd_Convolve.S
new file mode 100644
index 0000000..0daa0c5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Convolve.S
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2012,2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
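+/*
+        x0 = dst (8 bytes, i.e. two pixels, are written per loop iteration)
+        x1 = y0 base pointer (row multiplied by coeffs 0-2)
+        x2 = y1 base pointer (row multiplied by coeffs 3-5)
+        x3 = y2 base pointer (row multiplied by coeffs 6-8)
+        x4 = coeffs (16-bit values; 16 are loaded, only the first 9 are used)
+        x5 = length / 2 (loop iteration count, one per pair of output pixels; must be >= 1)
+*/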
+
+#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:      /* open f: global function symbol in .text, label defined here */
+#define END(f) .size f, .-f;    /* close f: record the symbol size as (current location - f) */
+
+ENTRY(rsdIntrinsicConvolve3x3_K)        /* 3x3 convolution; each loop pass writes 8 bytes (two pixels) to x0 */
+        sub             x6, sp, #64             // x6 = base of the 64-byte save area
+        sub             sp, sp, #64             // reserve that area on the stack
+        st1             {v8.1d-v11.1d}, [x6], #32       // save the low 64 bits of v8-v11
+        st1             {v12.1d-v15.1d}, [x6]           // save the low 64 bits of v12-v15
+
+        /* Load 16 halfword coefficients from x4 into v0, v1; only v0.h[0-7] and v1.h[0] are used */
+        ld1     {v0.8h, v1.8h}, [x4]
+
+        /* x4 is no longer needed as a pointer; reuse it as the 8-byte post-increment step */
+        mov x4, #8
+
+1:
+        /* Load 16 bytes from each row, but advance each pointer by only x4=#8 */
+        ld1     {v13.16b}, [x1], x4
+        ld1     {v14.16b}, [x2], x4
+        ld1     {v15.16b}, [x3], x4
+
+        /* Signal memory for data that will be used in the loop after the next */
+//        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
+//        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
+//        prfm        PLDL1KEEP,[x3, x4] // TODO: test this
+
+        uxtl      v2.8h, v13.8b         // zero-extend the 8-bit channels to 16 bit: y0, bytes 0-7
+        uxtl2     v3.8h, v13.16b        // y0, bytes 8-15
+        uxtl      v4.8h, v14.8b         // y1, bytes 0-7
+        uxtl2     v5.8h, v14.16b        // y1, bytes 8-15
+        uxtl      v6.8h, v15.8b         // y2, bytes 0-7
+        uxtl2     v7.8h, v15.16b        // y2, bytes 8-15
+
+/*
+        Source for the two output pixels: four pixels per row, one pixel (4 x 16-bit lanes) per register half
+        y0: v2lo, v2hi, v3lo, v3hi
+        y1: v4lo, v4hi, v5lo, v5hi
+        y2: v6lo, v6hi, v7lo, v7hi
+*/
+
+        smull     v8.4s, v2.4h, v0.h[0]         // v8 accumulates the first output pixel (window starts at v2lo)
+        smull2    v9.4s, v2.8h, v0.h[0]         // v9 accumulates the second output pixel (window starts at v2hi)
+        smlal2    v8.4s, v2.8h, v0.h[1]
+        smlal     v9.4s, v3.4h, v0.h[1]
+        smlal     v8.4s, v3.4h, v0.h[2]
+        smlal2    v9.4s, v3.8h, v0.h[2]
+        smlal     v8.4s, v4.4h, v0.h[3]
+        smlal2    v9.4s, v4.8h, v0.h[3]
+        smlal2    v8.4s, v4.8h, v0.h[4]
+        smlal     v9.4s, v5.4h, v0.h[4]
+        smlal     v8.4s, v5.4h, v0.h[5]
+        smlal2    v9.4s, v5.8h, v0.h[5]
+        smlal     v8.4s, v6.4h, v0.h[6]
+        smlal2    v9.4s, v6.8h, v0.h[6]
+        smlal2    v8.4s, v6.8h, v0.h[7]
+        smlal     v9.4s, v7.4h, v0.h[7]
+        smlal     v8.4s, v7.4h, v1.h[0]
+        smlal2    v9.4s, v7.8h, v1.h[0]
+
+        shrn      v8.4h, v8.4s, #8      // shift right by 8 without rounding, narrowing 32 -> 16 bit
+        shrn2     v8.8h, v9.4s, #8      // second pixel goes into the upper half of v8
+
+        sqxtun      v8.8b, v8.8h        // saturate signed 16 bit to unsigned 8 bit
+        st1         {v8.8b}, [x0], #8   // store both pixels and advance dst by 8 bytes
+
+        /* Are we done yet?  NOTE(review): x5 == 0 on entry is not guarded; the body runs once before this test */
+        subs x5, x5, #1
+        bne 1b
+
+        /* We're done: restore the saved halves of v8-v15 and release the 64-byte save area */
+        ld1             {v8.1d-v11.1d}, [sp], #32
+        ld1             {v12.1d-v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
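+/*
+        x0 = dst (8 bytes, i.e. two pixels, are written per loop iteration)
+        x1 = y0 base pointer
+        x2 = y1 base pointer
+        x3 = y2 base pointer
+        x4 = y3 base pointer
+        x5 = y4 base pointer
+        x6 = coeffs (16-bit values; 28 are loaded, only the first 25 are used)
+        x7 = loop iteration count, one per pair of output pixels (i.e. length / 2, as for Convolve3x3); must be >= 1
+*/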
+ENTRY(rsdIntrinsicConvolve5x5_K)        /* 5x5 convolution; each loop pass writes 8 bytes (two pixels) to x0 */
+        sub         x8, sp, #64         // x8 = base of the 64-byte save area
+        sub         sp, sp, #64         // reserve that area on the stack
+        st1         {v8.1d-v11.1d}, [x8], #32   // save the low 64 bits of v8-v11
+        st1         {v12.1d-v15.1d}, [x8]       // save the low 64 bits of v12-v15
+
+        /* Load 28 halfword coefficients from x6 into v0-v2 and v3.4h; v0.h[0] through v3.h[0] are used */
+        ld1         {v0.8h-v2.8h}, [x6], #48
+        ld1         {v3.4h}, [x6], #8
+
+        movi      v15.4s, #0x7f         // bias added to both accumulators just before the final narrowing
+
+        /* x6 is no longer needed as a pointer; reuse it as the 8-byte post-increment step */
+        mov     x6, #8
+
+1:
+        /* Load 24 bytes (six pixels) from rows y0 and y1 into D regs and post-increase the address by x6=#8 */
+        ld1     {v9.8b-v11.8b}, [x1], x6      //  y0 ( y - 2 )
+        ld1     {v12.8b-v14.8b}, [x2], x6      //  y1 ( y - 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+//        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
+//        prfm        PLDL1KEEP,[x2, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+        uxtl      v12.8h, v12.8b
+        uxtl      v13.8h, v13.8b
+        uxtl      v14.8h, v14.8b
+
+/*
+        y0: v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi    (one pixel, 4 x 16-bit lanes, per register half)
+        y1: v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
+*/
+        smull     v4.4s, v9.4h, v0.h[0]         // v4 accumulates the first output pixel (window starts at v9lo)
+        smull2    v5.4s, v9.8h, v0.h[0]         // v5 accumulates the second output pixel (window starts at v9hi)
+        smlal2    v4.4s, v9.8h, v0.h[1]
+        smlal     v5.4s, v10.4h, v0.h[1]
+        smlal     v4.4s, v10.4h, v0.h[2]
+        smlal2    v5.4s, v10.8h, v0.h[2]
+        smlal2    v4.4s, v10.8h, v0.h[3]
+        smlal     v5.4s, v11.4h, v0.h[3]
+        smlal     v4.4s, v11.4h, v0.h[4]
+        smlal2    v5.4s, v11.8h, v0.h[4]
+
+        smlal     v4.4s, v12.4h, v0.h[5]
+        smlal2    v5.4s, v12.8h, v0.h[5]
+        smlal2    v4.4s, v12.8h, v0.h[6]
+        smlal     v5.4s, v13.4h, v0.h[6]
+        smlal     v4.4s, v13.4h, v0.h[7]
+        smlal2    v5.4s, v13.8h, v0.h[7]
+        smlal2    v4.4s, v13.8h, v1.h[0]
+        smlal     v5.4s, v14.4h, v1.h[0]
+        smlal     v4.4s, v14.4h, v1.h[1]
+        smlal2    v5.4s, v14.8h, v1.h[1]
+
+        /* Next 2 rows */
+        /* Load 24 bytes (six pixels) from rows y2 and y3 into D regs and post-increase the address by x6=#8 */
+        ld1     {v9.8b-v11.8b}, [x3], x6      //  y2 ( y )
+        ld1     {v12.8b-v14.8b}, [x4], x6      //  y3 ( y + 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+//        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
+//        prfm        PLDL1KEEP,[x4, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+        uxtl      v12.8h, v12.8b
+        uxtl      v13.8h, v13.8b
+        uxtl      v14.8h, v14.8b
+
+/*
+        y2: v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
+        y3: v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
+*/
+        smlal     v4.4s, v9.4h, v1.h[2]
+        smlal2    v5.4s, v9.8h, v1.h[2]
+        smlal2    v4.4s, v9.8h, v1.h[3]
+        smlal     v5.4s, v10.4h, v1.h[3]
+        smlal     v4.4s, v10.4h, v1.h[4]
+        smlal2    v5.4s, v10.8h, v1.h[4]
+        smlal2    v4.4s, v10.8h, v1.h[5]
+        smlal     v5.4s, v11.4h, v1.h[5]
+        smlal     v4.4s, v11.4h, v1.h[6]
+        smlal2    v5.4s, v11.8h, v1.h[6]
+
+        smlal     v4.4s, v12.4h, v1.h[7]
+        smlal2    v5.4s, v12.8h, v1.h[7]
+        smlal2    v4.4s, v12.8h, v2.h[0]
+        smlal     v5.4s, v13.4h, v2.h[0]
+        smlal     v4.4s, v13.4h, v2.h[1]
+        smlal2    v5.4s, v13.8h, v2.h[1]
+        smlal2    v4.4s, v13.8h, v2.h[2]
+        smlal     v5.4s, v14.4h, v2.h[2]
+        smlal     v4.4s, v14.4h, v2.h[3]
+        smlal2    v5.4s, v14.8h, v2.h[3]
+
+        /* Last row */
+        /* Load 24 bytes (six pixels) from row y4 into D regs and post-increase the address by x6=#8 */
+        ld1     {v9.8b- v11.8b}, [x5], x6      //  y4 ( y + 2 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+//        prfm        PLDL1KEEP,[x5, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+
+/*
+        y4: v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
+        (v12-v14 are not reloaded for this row and are not used below)
+*/
+
+        smlal     v4.4s, v9.4h, v2.h[4]
+        smlal2    v5.4s, v9.8h, v2.h[4]
+        smlal2    v4.4s, v9.8h, v2.h[5]
+        smlal     v5.4s, v10.4h, v2.h[5]
+        smlal     v4.4s, v10.4h, v2.h[6]
+        smlal2    v5.4s, v10.8h, v2.h[6]
+        smlal2    v4.4s, v10.8h, v2.h[7]
+        smlal     v5.4s, v11.4h, v2.h[7]
+        smlal     v4.4s, v11.4h, v3.h[0]
+        smlal2    v5.4s, v11.8h, v3.h[0]
+
+        add      v4.4s, v4.4s, v15.4s   // add the 0x7f bias to the first pixel's sums
+        add      v5.4s, v5.4s, v15.4s   // add the 0x7f bias to the second pixel's sums
+
+/*      Narrow 32 -> 16 bit with a rounding shift. NOTE(review): rshrn rounds on top of the 0x7f bias; Convolve3x3 uses plain shrn with no bias -- confirm intended */
+        rshrn      v4.4h, v4.4s, #8
+        rshrn2     v4.8h, v5.4s, #8
+
+
+/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+        sqxtun      v4.8b, v4.8h
+
+        st1     {v4.8b}, [x0], #8        // return the output and increase the address of x0
+
+        /* Are we done?  NOTE(review): x7 == 0 on entry is not guarded; the body runs once before this test */
+        subs x7, x7, #1
+        bne 1b
+
+        /* Yup: restore the saved halves of v8-v15 and release the 64-byte save area */
+        ld1         {v8.1d-v11.1d}, [sp], #32
+        ld1         {v12.1d-v15.1d}, [sp], #32
+        ret
+
+END(rsdIntrinsicConvolve5x5_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon_Convolve.S
similarity index 100%
rename from cpu_ref/rsCpuIntrinsics_neon.S
rename to cpu_ref/rsCpuIntrinsics_neon_Convolve.S