Enable ARM64 intrinsics.
This also moves ARM intrinsic ifdefs behind ARCH_ARM_USE_INTRINSICS instead of ARCH_ARM_HAVE_VFP.
Change-Id: I48d3d55c77feb931e22288828247e281db43d32b
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bfb9183..a4fbf3a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -39,23 +39,22 @@
rsCpuIntrinsicResize.cpp \
rsCpuIntrinsicLUT.cpp
-#LOCAL_CFLAGS_arm64 += -DARCH_ARM_HAVE_NEON
-LOCAL_ASFLAGS_arm64 += -no-integrated-as
+LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS
-#LOCAL_SRC_FILES_arm64 += \
-# rsCpuIntrinsics_advsimd_3DLUT.S \
+LOCAL_SRC_FILES_arm64 += \
+ rsCpuIntrinsics_advsimd_3DLUT.S \
+ rsCpuIntrinsics_advsimd_Convolve.S \
+ rsCpuIntrinsics_advsimd_Blur.S \
+ rsCpuIntrinsics_advsimd_ColorMatrix.S \
+ rsCpuIntrinsics_advsimd_YuvToRGB.S
# rsCpuIntrinsics_advsimd_Blend.S \
-# rsCpuIntrinsics_advsimd_Blur.S \
-# rsCpuIntrinsics_advsimd_Convolve.S \
-# rsCpuIntrinsics_advsimd_ColorMatrix.S \
-# rsCpuIntrinsics_advsimd_YuvToRGB.S
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_NEON
endif
ifeq ($(ARCH_ARM_HAVE_VFP),true)
- LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP
+ LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP -DARCH_ARM_USE_INTRINSICS
LOCAL_SRC_FILES_arm += \
rsCpuIntrinsics_neon_3DLUT.S \
rsCpuIntrinsics_neon_Blend.S \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 0a3a872..9755b9a 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -204,7 +204,6 @@
pthread_mutex_unlock(&gInitMutex);
}
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
static int
read_file(const char* pathname, char* buffer, size_t buffsize)
{
@@ -232,13 +231,13 @@
return;
}
-#if defined(ARCH_ARM_HAVE_VFP)
- gArchUseSIMD = !!strstr(cpuinfo, " neon");
+#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
+ gArchUseSIMD = (!!strstr(cpuinfo, " neon")) ||
+ (!!strstr(cpuinfo, " asimd"));
#elif defined(ARCH_X86_HAVE_SSSE3)
gArchUseSIMD = !!strstr(cpuinfo, " ssse3");
#endif
}
-#endif // ARCH_ARM_HAVE_VFP || ARCH_X86_HAVE_SSSE3
bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
sym_lookup_t lfn, script_lookup_t slfn) {
@@ -265,9 +264,7 @@
ALOGE("pthread_setspecific %i", status);
}
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
GetCpuInfo();
-#endif
int cpu = sysconf(_SC_NPROCESSORS_ONLN);
if(mRSC->props.mDebugMaxThreads) {
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index af2973d..a7c9487 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -83,7 +83,7 @@
//ALOGE("strides %zu %zu", stride_y, stride_z);
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
if (gArchUseSIMD) {
int32_t len = x2 - x1;
if(len > 0) {
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 3af425f..228b887 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -90,7 +90,7 @@
BLEND_LUMINOSITY = 43
};
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
uint32_t xstart, uint32_t xend);
#endif
@@ -121,7 +121,7 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
if (gArchUseSIMD) {
if (rsdIntrinsicBlend_K(out, in, p->slot, x1, x2) >= 0)
return;
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 4e2ee96..c1ca4e2 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -292,7 +292,7 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
if (gArchUseSIMD) {
rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
@@ -361,7 +361,7 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
if (gArchUseSIMD) {
rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 765e571..0f3af5b 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,7 +125,7 @@
} u;
} Key_t;
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
typedef struct {
void (*column[4])(void);
void (*store)(void);
@@ -175,7 +175,7 @@
int ipa[4];
float tmpFp[16];
float tmpFpa[4];
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
FunctionTab_t mFnTab;
#endif
@@ -310,7 +310,7 @@
return key;
}
-#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
#define DEF_SYM(x) \
extern "C" uint32_t _N_ColorMatrix_##x; \
@@ -473,7 +473,7 @@
#endif
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
mBufSize = 4096;
//StopWatch build_time("rs cm: build time");
mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -900,7 +900,7 @@
out += outstep * len;
in += instep * len;
}
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
else {
size_t done;
if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
@@ -962,7 +962,7 @@
if (build(key)) {
mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
}
-#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
else {
int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 0d7a86b..552a835 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -211,7 +211,7 @@
}
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_X86_HAVE_SSSE3)
+#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
if (gArchUseSIMD) {
int32_t len = (x2 - x1 - 1) >> 1;
if(len > 0) {
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index bcffe9a..48b5ca5 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -390,7 +390,7 @@
}
#endif
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
if(gArchUseSIMD && ((x1 + 3) < x2)) {
uint32_t len = (x2 - x1 - 3) >> 1;
rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
diff --git a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
index ed58556..c31fcdf 100644
--- a/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
+++ b/cpu_ref/rsCpuIntrinsicLoopFilter.cpp
@@ -377,7 +377,8 @@
const uint8_t *thresh1);
-#if defined(ARCH_ARM_HAVE_VFP)
+// remove ARM64 statement when ARM64 asm available
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_neon
#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_neon
@@ -446,7 +447,7 @@
#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
-#endif // ARCH_ARM_HAVE_VFP
+#endif // ARCH_ARM_USE_INTRINSICS && !ARCH_ARM64_USE_INTRINSICS
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index f65d6aa..563b3e1 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -161,7 +161,7 @@
out++;
x1++;
}
-#if defined(ARCH_ARM_HAVE_VFP)
+#if defined(ARCH_ARM_USE_INTRINSICS)
if((x2 > x1) && gArchUseSIMD) {
int32_t len = x2 - x1;
if (cstep == 1) {