Optimise YuvToRGB using 16-bit arithmetic.

Reimplement the YuvToRGB intrinsic using 16-bit SIMD arithmetic to
increase throughput.  Implementations are provided for AArch32 and
AArch64 NEON.

Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 22f0962..2d905de 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -97,19 +97,9 @@
 }
 
 
-static short YuvCoeff[] = {
-    298, 409, -100, 516,   -208, 255, 0, 0,
-    16, 16, 16, 16,        16, 16, 16, 16,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    298, 298, 298, 298, 298, 298, 298, 298,
-    255, 255, 255, 255, 255, 255, 255, 255
-
-
-};
-
-extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
                                            uint32_t xstart, uint32_t xend,
@@ -164,28 +154,24 @@
 
 #if defined(ARCH_ARM_HAVE_VFP)
     if((x2 > x1) && gArchUseSIMD) {
-        // The neon paths may over-read by up to 8 bytes
-        int32_t len = (x2 - x1 - 8) >> 3;
-        if(len > 0) {
-            if (cstep == 1) {
-                rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
-                x1 += len << 3;
-                out += len << 3;
-            } else if (cstep == 2) {
-                // Check for proper interleave
-                intptr_t ipu = (intptr_t)u;
-                intptr_t ipv = (intptr_t)v;
+        int32_t len = x2 - x1;
+        if (cstep == 1) {
+            rsdIntrinsicYuv2_K(out, Y, u, v, x1, x2);
+            x1 += len;
+            out += len;
+        } else if (cstep == 2) {
+            // Check for proper interleave
+            intptr_t ipu = (intptr_t)u;
+            intptr_t ipv = (intptr_t)v;
 
-                if (ipu == (ipv + 1)) {
-                    rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
-                } else if (ipu == (ipv - 1)) {
-                    rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
-                }
-
+            if (ipu == (ipv + 1)) {
+                rsdIntrinsicYuv_K(out, Y, v, x1, x2);
+                x1 += len;
+                out += len;
+            } else if (ipu == (ipv - 1)) {
+                rsdIntrinsicYuvR_K(out, Y, u, x1, x2);
+                x1 += len;
+                out += len;
             }
         }
     }