Neon detection for RS SDK compat lib.
Change-Id: I3887158c7ec97ba116c28dc7b1d0c789b81fae60
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index a4cf0ad..79b908c 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -28,6 +28,10 @@
#include <string.h>
#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
#include <cutils/properties.h>
#include "utils/StopWatch.h"
@@ -53,6 +57,8 @@
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
+bool android::renderscript::gArchUseSIMD = false;
+
RsdCpuReference::~RsdCpuReference() {
}
@@ -193,6 +199,38 @@
pthread_mutex_unlock(&gInitMutex);
}
+#if defined(ARCH_ARM_HAVE_VFP)
+// Opens pathname and does one read() of up to buffsize bytes (retried on EINTR).
+// Returns read()'s result, or -1 if open() fails; buffer is not NUL-terminated.
+static int read_file(const char* pathname, char* buffer, size_t buffsize) {
+    const int fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        return -1;
+    }
+    int len;
+    for (;;) {
+        len = read(fd, buffer, buffsize);
+        if (len >= 0 || errno != EINTR) {
+            break;
+        }
+    }
+    close(fd);
+    return len;
+}
+
+// Sets gArchUseSIMD to whether the text read from /proc/cpuinfo contains " neon".
+static void GetCpuInfo() {
+    char cpuinfo[4096];
+    // Keep one byte spare: read() does not NUL-terminate, but strstr() needs it.
+    const int cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof(cpuinfo) - 1);
+    if (cpuinfo_len < 0) /* should not happen */ {
+        return;  // gArchUseSIMD is left unchanged
+    }
+    cpuinfo[cpuinfo_len] = '\0';
+    gArchUseSIMD = !!strstr(cpuinfo, " neon");
+}
+#endif // ARCH_ARM_HAVE_VFP
+
bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
sym_lookup_t lfn, script_lookup_t slfn) {
@@ -218,6 +256,10 @@
ALOGE("pthread_setspecific %i", status);
}
+#if defined(ARCH_ARM_HAVE_VFP)
+ GetCpuInfo();
+#endif
+
int cpu = sysconf(_SC_NPROCESSORS_ONLN);
if(mRSC->props.mDebugMaxThreads) {
cpu = mRSC->props.mDebugMaxThreads;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index c6704fb..0f30a48 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -32,6 +32,7 @@
namespace android {
namespace renderscript {
+extern bool gArchUseSIMD;
typedef void (* InvokeFunc_t)(void);
typedef void (* ForEachFunc_t)(void);
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index 03f24d8..4362973 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -83,19 +83,21 @@
//ALOGE("strides %zu %zu", stride_y, stride_z);
while (x1 < x2) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1 - 1) >> 1;
- if(len > 0) {
- const short neon_constants[] = {
- coordMul.x, coordMul.y, coordMul.z, 0,
- 0, 0, 0, 0xffff,
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1 - 1) >> 1;
+ if(len > 0) {
+ const short neon_constants[] = {
+ coordMul.x, coordMul.y, coordMul.z, 0,
+ 0, 0, 0, 0xffff,
- };
+ };
- rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
- x1 += len << 1;
- out += len << 1;
- in += len << 1;
+ rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
+ x1 += len << 1;
+ out += len << 1;
+ in += len << 1;
+ }
}
#endif
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 4e9470e..5e79169 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -103,8 +103,6 @@
extern "C" void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
-//#undef ARCH_ARM_HAVE_NEON
-
void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
@@ -131,13 +129,15 @@
case BLEND_DST:
break;
case BLEND_SRC_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendSrcOver_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcOver_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -148,13 +148,15 @@
}
break;
case BLEND_DST_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendDstOver_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstOver_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -165,13 +167,15 @@
}
break;
case BLEND_SRC_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendSrcIn_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcIn_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -181,13 +185,15 @@
}
break;
case BLEND_DST_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendDstIn_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstIn_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -197,13 +203,15 @@
}
break;
case BLEND_SRC_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendSrcOut_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcOut_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -213,13 +221,15 @@
}
break;
case BLEND_DST_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendDstOut_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstOut_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -229,13 +239,15 @@
}
break;
case BLEND_SRC_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendSrcAtop_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcAtop_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -247,13 +259,15 @@
}
break;
case BLEND_DST_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendDstAtop_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstAtop_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -265,13 +279,15 @@
}
break;
case BLEND_XOR:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendXor_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendXor_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -287,13 +303,15 @@
rsAssert(false);
break;
case BLEND_MULTIPLY:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendMultiply_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendMultiply_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -378,13 +396,15 @@
rsAssert(false);
break;
case BLEND_ADD:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendAdd_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendAdd_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
@@ -397,13 +417,15 @@
}
break;
case BLEND_SUBTRACT:
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 8) < x2) {
- uint32_t len = (x2 - x1) >> 3;
- rsdIntrinsicBlendSub_K(out, in, len);
- x1 += len << 3;
- out += len << 3;
- in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSub_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
}
#endif
for (;x1 < x2; x1++, out++, in++) {
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 068cc78..f3a656d 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -155,8 +155,8 @@
const uchar *ptrIn, int iStride, const float* gPtr, int ct,
int x1, int x2) {
-#if defined(ARCH_ARM_HAVE_NEON)
- {
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
int t = (x2 - x1);
t &= ~1;
if(t) {
@@ -207,8 +207,8 @@
len--;
}
-#if defined(ARCH_ARM_HAVE_NEON)
- if (x2 > x1) {
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD && (x2 > x1)) {
int t = (x2 - x1) >> 2;
t &= ~1;
if(t) {
@@ -313,12 +313,14 @@
out++;
x1++;
}
-#if defined(ARCH_ARM_HAVE_NEON)
- if ((x1 + cp->mIradius) < x2) {
- rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
- cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
- out += (x2 - cp->mIradius) - x1;
- x1 = x2 - cp->mIradius;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if ((x1 + cp->mIradius) < x2) {
+ rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
+ cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
+ out += (x2 - cp->mIradius) - x1;
+ x1 = x2 - cp->mIradius;
+ }
}
#endif
while(x2 > x1) {
@@ -364,15 +366,17 @@
out++;
x1++;
}
-#if defined(ARCH_ARM_HAVE_NEON)
- if ((x1 + cp->mIradius) < x2) {
- uint32_t len = x2 - (x1 + cp->mIradius);
- len &= ~3;
- if (len > 0) {
- rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
- cp->mIradius * 2 + 1, x1, x1 + len);
- out += len;
- x1 += len;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ if ((x1 + cp->mIradius) < x2) {
+ uint32_t len = x2 - (x1 + cp->mIradius);
+ len &= ~3;
+ if (len > 0) {
+ rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
+ cp->mIradius * 2 + 1, x1, x1 + len);
+ out += len;
+ x1 += len;
+ }
}
}
#endif
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 3fc322c..c6e38c0 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -118,13 +118,15 @@
uint32_t x2 = xend;
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
- x1 += len << 2;
- out += len << 2;
- in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1) >> 2;
+ if(len > 0) {
+ rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+ x1 += len << 2;
+ out += len << 2;
+ in += len << 2;
+ }
}
#endif
@@ -145,13 +147,15 @@
uint32_t x2 = xend;
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
- x1 += len << 2;
- out += len << 2;
- in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1) >> 2;
+ if(len > 0) {
+ rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
+ x1 += len << 2;
+ out += len << 2;
+ in += len << 2;
+ }
}
#endif
@@ -172,13 +176,15 @@
uint32_t x2 = xend;
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
- x1 += len << 2;
- out += len << 2;
- in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1) >> 2;
+ if(len > 0) {
+ rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
+ x1 += len << 2;
+ out += len << 2;
+ in += len << 2;
+ }
}
#endif
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 020fa6f..82ae1a4 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -119,12 +119,14 @@
}
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1 - 1) >> 1;
- if(len > 0) {
- rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
- x1 += len << 1;
- out += len << 1;
+#if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1 - 1) >> 1;
+ if(len > 0) {
+ rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+ x1 += len << 1;
+ out += len << 1;
+ }
}
#endif
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 112f377..b4932bd 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -153,8 +153,8 @@
x1++;
}
-#if defined(ARCH_ARM_HAVE_NEON)
- if((x1 + 3) < x2) {
+#if defined(ARCH_ARM_HAVE_VFP)
+ if(gArchUseSIMD && ((x1 + 3) < x2)) {
uint32_t len = (x2 - x1 - 3) >> 1;
rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
out += len << 1;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 94fce1c..7b2a579 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -151,15 +151,17 @@
}
if(x2 > x1) {
- #if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1 - 1) >> 3;
- if(len > 0) {
- // ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
- rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
+ if (gArchUseSIMD) {
+ #if defined(ARCH_ARM_HAVE_VFP)
+ int32_t len = (x2 - x1 - 1) >> 3;
+ if(len > 0) {
+ // ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
+ rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ }
+ #endif
}
- #endif
// ALOGE("y %i %i %i", p->y, x1, x2);
while(x1 < x2) {
@@ -188,12 +190,14 @@
const uchar *v = pinV + ((p->y >> 1) * strideV);
if(x2 > x1) {
- #if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1 - 1) >> 3;
- if(len > 0) {
- rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
+ #if defined(ARCH_ARM_HAVE_VFP)
+ if (gArchUseSIMD) {
+ int32_t len = (x2 - x1 - 1) >> 3;
+ if(len > 0) {
+ rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ }
}
#endif