Enable ARM64 intrinsics.

This also moves ARM intrinsic ifdefs behind ARCH_ARM_USE_INTRINSICS instead of ARCH_ARM_HAVE_VFP.

Change-Id: I48d3d55c77feb931e22288828247e281db43d32b
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bfb9183..a4fbf3a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -39,23 +39,22 @@
 	rsCpuIntrinsicResize.cpp \
 	rsCpuIntrinsicLUT.cpp
 
-#LOCAL_CFLAGS_arm64 += -DARCH_ARM_HAVE_NEON
-LOCAL_ASFLAGS_arm64 += -no-integrated-as
+LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS
 
-#LOCAL_SRC_FILES_arm64 += \
-#    rsCpuIntrinsics_advsimd_3DLUT.S \
+LOCAL_SRC_FILES_arm64 += \
+    rsCpuIntrinsics_advsimd_3DLUT.S \
+    rsCpuIntrinsics_advsimd_Convolve.S \
+    rsCpuIntrinsics_advsimd_Blur.S \
+    rsCpuIntrinsics_advsimd_ColorMatrix.S \
+    rsCpuIntrinsics_advsimd_YuvToRGB.S
 #    rsCpuIntrinsics_advsimd_Blend.S \
-#    rsCpuIntrinsics_advsimd_Blur.S \
-#    rsCpuIntrinsics_advsimd_Convolve.S \
-#    rsCpuIntrinsics_advsimd_ColorMatrix.S \
-#    rsCpuIntrinsics_advsimd_YuvToRGB.S
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_NEON
 endif
 
 ifeq ($(ARCH_ARM_HAVE_VFP),true)
-    LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP
+    LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP -DARCH_ARM_USE_INTRINSICS
     LOCAL_SRC_FILES_arm += \
     rsCpuIntrinsics_neon_3DLUT.S \
     rsCpuIntrinsics_neon_Blend.S \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 0a3a872..9755b9a 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -204,7 +204,6 @@
     pthread_mutex_unlock(&gInitMutex);
 }
 
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
 static int
 read_file(const char*  pathname, char*  buffer, size_t  buffsize)
 {
@@ -232,13 +231,13 @@
         return;
     }
 
-#if defined(ARCH_ARM_HAVE_VFP)
-    gArchUseSIMD = !!strstr(cpuinfo, " neon");
+#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
+    gArchUseSIMD = (!!strstr(cpuinfo, " neon")) ||
+                   (!!strstr(cpuinfo, " asimd"));
 #elif defined(ARCH_X86_HAVE_SSSE3)
     gArchUseSIMD = !!strstr(cpuinfo, " ssse3");
 #endif
 }
-#endif // ARCH_ARM_HAVE_VFP || ARCH_X86_HAVE_SSSE3
 
 bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                                sym_lookup_t lfn, script_lookup_t slfn) {
@@ -265,9 +264,7 @@
         ALOGE("pthread_setspecific %i", status);
     }
 
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
     GetCpuInfo();
-#endif
 
     int cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if(mRSC->props.mDebugMaxThreads) {
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index af2973d..a7c9487 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -83,7 +83,7 @@
 
     //ALOGE("strides %zu %zu", stride_y, stride_z);
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
         int32_t len = x2 - x1;
         if(len > 0) {
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 3af425f..228b887 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -90,7 +90,7 @@
     BLEND_LUMINOSITY = 43
 };
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
 extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
                     uint32_t xstart, uint32_t xend);
 #endif
@@ -121,7 +121,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
     if (gArchUseSIMD) {
         if (rsdIntrinsicBlend_K(out, in, p->slot, x1, x2) >= 0)
             return;
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 4e2ee96..c1ca4e2 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -292,7 +292,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
         rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
@@ -361,7 +361,7 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
         rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 765e571..0f3af5b 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,7 +125,7 @@
     } u;
 } Key_t;
 
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
 typedef struct {
     void (*column[4])(void);
     void (*store)(void);
@@ -175,7 +175,7 @@
     int ipa[4];
     float tmpFp[16];
     float tmpFpa[4];
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
     FunctionTab_t mFnTab;
 #endif
 
@@ -310,7 +310,7 @@
     return key;
 }
 
-#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
 
 #define DEF_SYM(x)                                  \
     extern "C" uint32_t _N_ColorMatrix_##x;      \
@@ -473,7 +473,7 @@
 #endif
 
 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
     mBufSize = 4096;
     //StopWatch build_time("rs cm: build time");
     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -900,7 +900,7 @@
                 out += outstep * len;
                 in += instep * len;
             }
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
             else {
                 size_t done;
                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
@@ -962,7 +962,7 @@
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
         }
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
         else {
             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 0d7a86b..552a835 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -211,7 +211,7 @@
     }
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
+#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
         if (gArchUseSIMD) {
             int32_t len = (x2 - x1 - 1) >> 1;
             if(len > 0) {
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index bcffe9a..48b5ca5 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -390,7 +390,7 @@
     }
 #endif
 
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if(gArchUseSIMD && ((x1 + 3) < x2)) {
         uint32_t len = (x2 - x1 - 3) >> 1;
         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
diff --git a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
index ed58556..c31fcdf 100644
--- a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
+++ b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
@@ -377,7 +377,8 @@
                                                const uint8_t *thresh1);
 
 
-#if defined(ARCH_ARM_HAVE_VFP)
+// remove ARM64 statement when ARM64 asm available
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
 
 #define vp9_lpf_vertical_16 vp9_lpf_vertical_16_neon
 #define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_neon
@@ -446,7 +447,7 @@
 #define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
 #define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
 
-#endif // ARCH_ARM_HAVE_VFP
+#endif // ARCH_ARM_USE_INTRINSICS && !ARCH_ARM64_USE_INTRINSICS
 
 
 
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index f65d6aa..563b3e1 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -161,7 +161,7 @@
         out++;
         x1++;
     }
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if((x2 > x1) && gArchUseSIMD) {
         int32_t len = x2 - x1;
         if (cstep == 1) {