Neon detection for RS SDK compat lib.

Change-Id: I3887158c7ec97ba116c28dc7b1d0c789b81fae60
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index a4cf0ad..79b908c 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -28,6 +28,10 @@
 #include <string.h>
 #include <unistd.h>
 
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
 #include <cutils/properties.h>
 #include "utils/StopWatch.h"
@@ -53,6 +57,8 @@
 static uint32_t gThreadTLSKeyCount = 0;
 static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
 
+bool android::renderscript::gArchUseSIMD = false;
+
 RsdCpuReference::~RsdCpuReference() {
 }
 
@@ -193,6 +199,38 @@
     pthread_mutex_unlock(&gInitMutex);
 }
 
+#if defined(ARCH_ARM_HAVE_VFP)
+// Reads at most buffsize bytes from pathname into buffer with one read(),
+// retried on EINTR. Returns the byte count, or -1; buffer is not terminated.
+static int read_file(const char* pathname, char* buffer, size_t buffsize) {
+    const int fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        return -1;
+    }
+    int nread;
+    for (;;) {
+        nread = read(fd, buffer, buffsize);
+        if (nread >= 0 || errno != EINTR) {
+            break;
+        }
+    }
+    close(fd);
+    return nread;
+}
+
+// Sets gArchUseSIMD when /proc/cpuinfo advertises the " neon" CPU feature.
+static void GetCpuInfo() {
+    char cpuinfo[4096];
+    // Keep one spare byte: read_file() does not NUL-terminate, strstr() needs it.
+    const int cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof(cpuinfo) - 1);
+    if (cpuinfo_len < 0) {  /* should not happen */
+        return;
+    }
+    cpuinfo[cpuinfo_len] = '\0';
+    gArchUseSIMD = !!strstr(cpuinfo, " neon");
+}
+#endif // ARCH_ARM_HAVE_VFP
+
 bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                                sym_lookup_t lfn, script_lookup_t slfn) {
 
@@ -218,6 +256,10 @@
         ALOGE("pthread_setspecific %i", status);
     }
 
+#if defined(ARCH_ARM_HAVE_VFP)
+    GetCpuInfo();
+#endif
+
     int cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if(mRSC->props.mDebugMaxThreads) {
         cpu = mRSC->props.mDebugMaxThreads;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index c6704fb..0f30a48 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -32,6 +32,7 @@
 namespace android {
 namespace renderscript {
 
+extern bool gArchUseSIMD;
 
 typedef void (* InvokeFunc_t)(void);
 typedef void (* ForEachFunc_t)(void);
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index 03f24d8..4362973 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -83,19 +83,21 @@
     //ALOGE("strides %zu %zu", stride_y, stride_z);
 
     while (x1 < x2) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 1;
-        if(len > 0) {
-            const short neon_constants[] = {
-                coordMul.x, coordMul.y, coordMul.z, 0,
-                0, 0, 0, 0xffff,
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            int32_t len = (x2 - x1 - 1) >> 1;
+            if(len > 0) {
+                const short neon_constants[] = {
+                    coordMul.x, coordMul.y, coordMul.z, 0,
+                    0, 0, 0, 0xffff,
 
-            };
+                };
 
-            rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
-            x1 += len << 1;
-            out += len << 1;
-            in += len << 1;
+                rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
+                x1 += len << 1;
+                out += len << 1;
+                in += len << 1;
+            }
         }
 
 #endif
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 4e9470e..5e79169 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -103,8 +103,6 @@
 extern "C" void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
 extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 
-//#undef ARCH_ARM_HAVE_NEON
-
 void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t instep, uint32_t outstep) {
@@ -131,13 +129,15 @@
     case BLEND_DST:
         break;
     case BLEND_SRC_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcOver_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -148,13 +148,15 @@
         }
         break;
     case BLEND_DST_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstOver_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -165,13 +167,15 @@
         }
         break;
     case BLEND_SRC_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcIn_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -181,13 +185,15 @@
         }
         break;
     case BLEND_DST_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstIn_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -197,13 +203,15 @@
         }
         break;
     case BLEND_SRC_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcOut_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -213,13 +221,15 @@
         }
         break;
     case BLEND_DST_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstOut_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -229,13 +239,15 @@
         }
         break;
     case BLEND_SRC_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcAtop_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -247,13 +259,15 @@
         }
         break;
     case BLEND_DST_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstAtop_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -265,13 +279,15 @@
         }
         break;
     case BLEND_XOR:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendXor_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendXor_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -287,13 +303,15 @@
         rsAssert(false);
         break;
     case BLEND_MULTIPLY:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendMultiply_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendMultiply_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -378,13 +396,15 @@
         rsAssert(false);
         break;
     case BLEND_ADD:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendAdd_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendAdd_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -397,13 +417,15 @@
         }
         break;
     case BLEND_SUBTRACT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSub_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSub_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 068cc78..f3a656d 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -155,8 +155,8 @@
                     const uchar *ptrIn, int iStride, const float* gPtr, int ct,
                     int x1, int x2) {
 
-#if defined(ARCH_ARM_HAVE_NEON)
-    {
+#if defined(ARCH_ARM_HAVE_VFP)
+    if (gArchUseSIMD) {
         int t = (x2 - x1);
         t &= ~1;
         if(t) {
@@ -207,8 +207,8 @@
         len--;
     }
 
-#if defined(ARCH_ARM_HAVE_NEON)
-    if (x2 > x1) {
+#if defined(ARCH_ARM_HAVE_VFP)
+    if (gArchUseSIMD && (x2 > x1)) {
         int t = (x2 - x1) >> 2;
         t &= ~1;
         if(t) {
@@ -313,12 +313,14 @@
         out++;
         x1++;
     }
-#if defined(ARCH_ARM_HAVE_NEON)
-    if ((x1 + cp->mIradius) < x2) {
-        rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
-                               cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
-        out += (x2 - cp->mIradius) - x1;
-        x1 = x2 - cp->mIradius;
+#if defined(ARCH_ARM_HAVE_VFP)
+    if (gArchUseSIMD) {
+        if ((x1 + cp->mIradius) < x2) {
+            rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
+                                   cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
+            out += (x2 - cp->mIradius) - x1;
+            x1 = x2 - cp->mIradius;
+        }
     }
 #endif
     while(x2 > x1) {
@@ -364,15 +366,17 @@
         out++;
         x1++;
     }
-#if defined(ARCH_ARM_HAVE_NEON)
-    if ((x1 + cp->mIradius) < x2) {
-        uint32_t len = x2 - (x1 + cp->mIradius);
-        len &= ~3;
-        if (len > 0) {
-            rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
-                                   cp->mIradius * 2 + 1, x1, x1 + len);
-            out += len;
-            x1 += len;
+#if defined(ARCH_ARM_HAVE_VFP)
+    if (gArchUseSIMD) {
+        if ((x1 + cp->mIradius) < x2) {
+            uint32_t len = x2 - (x1 + cp->mIradius);
+            len &= ~3;
+            if (len > 0) {
+                rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
+                                       cp->mIradius * 2 + 1, x1, x1 + len);
+                out += len;
+                x1 += len;
+            }
         }
     }
 #endif
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 3fc322c..c6e38c0 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -118,13 +118,15 @@
     uint32_t x2 = xend;
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            int32_t len = (x2 - x1) >> 2;
+            if(len > 0) {
+                rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+                x1 += len << 2;
+                out += len << 2;
+                in += len << 2;
+            }
         }
 #endif
 
@@ -145,13 +147,15 @@
     uint32_t x2 = xend;
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            int32_t len = (x2 - x1) >> 2;
+            if(len > 0) {
+                rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
+                x1 += len << 2;
+                out += len << 2;
+                in += len << 2;
+            }
         }
 #endif
 
@@ -172,13 +176,15 @@
     uint32_t x2 = xend;
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            int32_t len = (x2 - x1) >> 2;
+            if(len > 0) {
+                rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
+                x1 += len << 2;
+                out += len << 2;
+                in += len << 2;
+            }
         }
 #endif
 
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 020fa6f..82ae1a4 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -119,12 +119,14 @@
     }
 
     if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 1;
-        if(len > 0) {
-            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
-            x1 += len << 1;
-            out += len << 1;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            int32_t len = (x2 - x1 - 1) >> 1;
+            if(len > 0) {
+                rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
+                x1 += len << 1;
+                out += len << 1;
+            }
         }
 #endif
 
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 112f377..b4932bd 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -153,8 +153,8 @@
         x1++;
     }
 
-#if defined(ARCH_ARM_HAVE_NEON)
-    if((x1 + 3) < x2) {
+#if defined(ARCH_ARM_HAVE_VFP)
+    if(gArchUseSIMD && ((x1 + 3) < x2)) {
         uint32_t len = (x2 - x1 - 3) >> 1;
         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
         out += len << 1;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 94fce1c..7b2a579 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -151,15 +151,17 @@
             }
 
             if(x2 > x1) {
-        #if defined(ARCH_ARM_HAVE_NEON)
-                int32_t len = (x2 - x1 - 1) >> 3;
-                if(len > 0) {
-                    //                    ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
-                    rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
+        #if defined(ARCH_ARM_HAVE_VFP)
+                if (gArchUseSIMD) {
+                    int32_t len = (x2 - x1 - 1) >> 3;
+                    if(len > 0) {
+                        // Each unit of len advances 8 pixels; the loop below finishes the rest.
+                        rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+                        x1 += len << 3;
+                        out += len << 3;
+                    }
                 }
         #endif
 
                // ALOGE("y %i  %i  %i", p->y, x1, x2);
                 while(x1 < x2) {
@@ -188,12 +190,14 @@
             const uchar *v = pinV + ((p->y >> 1) * strideV);
 
             if(x2 > x1) {
-        #if defined(ARCH_ARM_HAVE_NEON)
-                int32_t len = (x2 - x1 - 1) >> 3;
-                if(len > 0) {
-                    rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
+        #if defined(ARCH_ARM_HAVE_VFP)
+                if (gArchUseSIMD) {
+                    int32_t len = (x2 - x1 - 1) >> 3;
+                    if(len > 0) {
+                        rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+                        x1 += len << 3;
+                        out += len << 3;
+                    }
                 }
         #endif