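Add runtime NEON detection for the RS SDK compat lib.

The blend intrinsic's NEON fast paths were previously selected at compile
time via ARCH_ARM_HAVE_NEON. They are now compiled whenever
ARCH_ARM_HAVE_VFP is defined and are only taken at run time when
gArchUseSIMD is set; otherwise the scalar per-pixel loops are used.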

Change-Id: I3887158c7ec97ba116c28dc7b1d0c789b81fae60
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 4e9470e..5e79169 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -103,8 +103,6 @@
 extern "C" void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
 extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 
-//#undef ARCH_ARM_HAVE_NEON
-
 void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t instep, uint32_t outstep) {
@@ -131,13 +129,15 @@
     case BLEND_DST:
         break;
     case BLEND_SRC_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcOver_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -148,13 +148,15 @@
         }
         break;
     case BLEND_DST_OVER:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstOver_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -165,13 +167,15 @@
         }
         break;
     case BLEND_SRC_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcIn_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -181,13 +185,15 @@
         }
         break;
     case BLEND_DST_IN:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstIn_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -197,13 +203,15 @@
         }
         break;
     case BLEND_SRC_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcOut_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -213,13 +221,15 @@
         }
         break;
     case BLEND_DST_OUT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstOut_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -229,13 +239,15 @@
         }
         break;
     case BLEND_SRC_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSrcAtop_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -247,13 +259,15 @@
         }
         break;
     case BLEND_DST_ATOP:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendDstAtop_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -265,13 +279,15 @@
         }
         break;
     case BLEND_XOR:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendXor_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendXor_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -287,13 +303,15 @@
         rsAssert(false);
         break;
     case BLEND_MULTIPLY:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendMultiply_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendMultiply_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -378,13 +396,15 @@
         rsAssert(false);
         break;
     case BLEND_ADD:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendAdd_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendAdd_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {
@@ -397,13 +417,15 @@
         }
         break;
     case BLEND_SUBTRACT:
-#if defined(ARCH_ARM_HAVE_NEON)
-        if((x1 + 8) < x2) {
-            uint32_t len = (x2 - x1) >> 3;
-            rsdIntrinsicBlendSub_K(out, in, len);
-            x1 += len << 3;
-            out += len << 3;
-            in += len << 3;
+#if defined(ARCH_ARM_HAVE_VFP)
+        if (gArchUseSIMD) {
+            if((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSub_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
         }
 #endif
         for (;x1 < x2; x1++, out++, in++) {