Optimise YuvToRGB using 16-bit arithmetic.

Reimplement the YuvToRGB intrinsic using 16-bit SIMD arithmetic to
increase throughput, with implementations for both AArch32 NEON and
AArch64 AdvSIMD.
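
The refactoring halves the classic 8.8 fixed-point coefficients (298,
409, -100, -208, 516) so that every product fits in an unsigned 16-bit
lane, splits the oversized chroma terms so that a halving add brings the
sums back into range, and folds the -16/-128 biases into single
saturating add/subtract constants.  For reference, a scalar sketch of the
arithmetic the new kernels implement (illustrative only, not part of the
patch; the helper names here are invented):

    #include <stdint.h>

    static inline uint8_t satu8(int32_t x) {
        return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;
    }

    /* Scalar model of one pixel of the 16-bit NEON kernel. */
    static void yuv2rgb_model(uint8_t y, uint8_t u, uint8_t v,
                              uint8_t *r, uint8_t *g, uint8_t *b) {
        /* R: 409/2 == 204.5, split as (v >> 1) + 204*v; the halving
         * add (UHADD) keeps the running sum within 16 bits. */
        int32_t rr = ((y * 149 + (v >> 1) + v * 204) >> 1)
                   - ((16 * 149 + (128 >> 1) + 128 * 204) >> 1);
        *r = satu8((rr + 32) >> 6);                /* UQRSHRN #6 */

        /* G: halved coefficients already fit; bias via UQADD/UQSUB. */
        int32_t gg = y * 149 + (-16 * 149 + 128 * 50 + 128 * 104)
                   - (u * 50 + v * 104);
        *g = satu8((gg + 64) >> 7);                /* UQRSHRN #7 */

        /* B: 516/2 == 258, split as (u << 2) + 254*u, then halved. */
        int32_t bb = ((y * 149 + (u << 2) + u * 254) >> 1)
                   - ((16 * 149 + (128 << 2) + 128 * 254) >> 1);
        *b = satu8((bb + 32) >> 6);                /* UQRSHRN #6 */
    }

Expanding each expression recovers the usual form, e.g.
R = (298*(Y-16) + 409*(V-128) + 128) >> 8, modulo the truncation in the
halving steps.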

Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 6a2ac45..4f7d3c0 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -34,7 +34,9 @@
 
 ifeq ($(TARGET_ARCH),arm64)
     LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
-    LOCAL_SRC_FILES+=rsCpuIntrinsics_advsimd_Blend.S
+    LOCAL_SRC_FILES+= \
+        rsCpuIntrinsics_advsimd_Blend.S \
+        rsCpuIntrinsics_advsimd_YuvToRGB.S
 else
     ifeq ($(ARCH_ARM_HAVE_NEON),true)
         LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
@@ -45,7 +47,8 @@
         LOCAL_SRC_FILES+= \
             rsCpuIntrinsics_neon.S \
             rsCpuIntrinsics_neon_ColorMatrix.S \
-            rsCpuIntrinsics_neon_Blend.S
+            rsCpuIntrinsics_neon_Blend.S \
+            rsCpuIntrinsics_neon_YuvToRGB.S
         LOCAL_ASFLAGS := -mfpu=neon
     endif
 endif
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 22f0962..2d905de 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -97,19 +97,9 @@
 }
 
 
-static short YuvCoeff[] = {
-    298, 409, -100, 516,   -208, 255, 0, 0,
-    16, 16, 16, 16,        16, 16, 16, 16,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    298, 298, 298, 298, 298, 298, 298, 298,
-    255, 255, 255, 255, 255, 255, 255, 255
-
-
-};
-
-extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
                                            uint32_t xstart, uint32_t xend,
@@ -164,28 +154,24 @@
 
 #if defined(ARCH_ARM_HAVE_VFP)
     if((x2 > x1) && gArchUseSIMD) {
-        // The neon paths may over-read by up to 8 bytes
-        int32_t len = (x2 - x1 - 8) >> 3;
-        if(len > 0) {
-            if (cstep == 1) {
-                rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
-                x1 += len << 3;
-                out += len << 3;
-            } else if (cstep == 2) {
-                // Check for proper interleave
-                intptr_t ipu = (intptr_t)u;
-                intptr_t ipv = (intptr_t)v;
+        int32_t len = x2 - x1;
+        if (cstep == 1) {
+            rsdIntrinsicYuv2_K(out, Y, u, v, x1, x2);
+            x1 += len;
+            out += len;
+        } else if (cstep == 2) {
+            // Check for proper interleave
+            intptr_t ipu = (intptr_t)u;
+            intptr_t ipv = (intptr_t)v;
 
-                if (ipu == (ipv + 1)) {
-                    rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
-                } else if (ipu == (ipv - 1)) {
-                    rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
-                    x1 += len << 3;
-                    out += len << 3;
-                }
-
+            if (ipu == (ipv + 1)) {
+                rsdIntrinsicYuv_K(out, Y, v, x1, x2);
+                x1 += len;
+                out += len;
+            } else if (ipu == (ipv - 1)) {
+                rsdIntrinsicYuvR_K(out, Y, u, x1, x2);
+                x1 += len;
+                out += len;
             }
         }
     }
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
new file mode 100644
index 0000000..9232a79
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively.  U and V are in
+ * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+        movi        v7.8b, #149
+
+        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
+        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149
+
+        movi        v7.8b, #50
+        movi        v10.8b, #104
+        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
+        umlal       v8.8h, v17.8b, v10.8b
+
+        ushr        v7.8b, v17.8b, #1
+        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
+        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)
+
+        ushll       v7.8h, v16.8b, #2
+        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
+        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)
+
+        movi        v7.16b, #204
+        movi        v10.8b, #254
+        umull       v11.8h, v17.8b, v7.8b     // r2 = v * 204
+        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254
+
+        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
+        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
+        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1
+
+        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
+        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
+        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b, v0.8h, #6
+        uqrshrn     v4.8b, v4.8h, #6
+        uqrshrn     v1.8b, v1.8h, #7
+        uqrshrn     v5.8b, v5.8h, #7
+        uqrshrn     v2.8b, v2.8h, #6
+        uqrshrn     v6.8b, v6.8h, #6
+
+        zip1        v0.16b, v0.16b, v4.16b
+        zip1        v1.16b, v1.16b, v5.16b
+        zip1        v2.16b, v2.16b, v6.16b
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        dup         v13.8h, w5
+        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        dup         v14.8h, w5
+        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        dup         v15.8h, w5
+
+        movi        v3.16b, #0xff
+
+        subs        x2, x2, #16
+        bhs         1f
+        b           2f
+
+        .align 4
+1:      ld2         {v8.8b,v9.8b}, [x1], #16
+//      prfm PLDL1STRM, [x1, #256]
+  .if \interleaved
+    .if \swapuv
+        ld2         {v17.8b,v18.8b}, [x3], #16
+        mov         v16.8b, v18.8b
+    .else
+        ld2         {v16.8b,v17.8b}, [x3], #16
+    .endif
+//      prfm PLDL1STRM, [x3, #256]
+  .else
+        ld1         {v16.8b}, [x3], #8
+        ld1         {v17.8b}, [x4], #8
+//      prfm PLDL1STRM, [x3, #128]
+//      prfm PLDL1STRM, [x4, #128]
+  .endif
+
+        \kernel
+
+        subs        x2, x2, #16
+
+        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+
+        bhs         1b
+
+2:      adds        x2, x2, #16
+        beq         2f
+
+        /* To handle the tail portion of the data (something less than 16
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        movi        v8.8b, #0
+        movi        v9.8b, #0
+        movi        v16.8b, #0
+        movi        v17.8b, #0
+
+        tbz         x2, #3, 1f
+        ld1         {v9.8b}, [x1], #8
+  .if \interleaved
+        ld1         {v17.8b}, [x3], #8
+  .else
+        ld1         {v16.s}[1], [x3], #4
+        ld1         {v17.s}[1], [x4], #4
+  .endif
+1:      tbz         x2, #2, 1f
+        ld1         {v8.s}[1], [x1], #4
+  .if \interleaved
+        ld1         {v16.s}[1], [x3], #4
+  .else
+        ld1         {v16.h}[1], [x3], #2
+        ld1         {v17.h}[1], [x4], #2
+  .endif
+1:      tbz         x2, #1, 1f
+        ld1         {v8.h}[1], [x1], #2
+  .if \interleaved
+        ld1         {v16.h}[1], [x3], #2
+  .else
+        ld1         {v16.b}[1], [x3], #1
+        ld1         {v17.b}[1], [x4], #1
+  .endif
+1:      tbz         x2, #0, 1f
+        ld1         {v8.b}[1], [x1], #1
+  .if \interleaved
+        ld1         {v16.b}[1], [x3], #1
+  .else
+        ld1         {v16.b}[0], [x3], #1
+        ld1         {v17.b}[0], [x4], #1
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      uzp1        v8.16b, v8.16b, v9.16b
+  .if \interleaved
+    .if \swapuv
+        uzp1        v16.16b, v17.16b, v16.16b
+    .else
+        uzp1        v16.16b, v16.16b, v17.16b
+    .endif
+  .endif
+
+        \kernel
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        zip1        v4.16b, v0.16b, v2.16b
+        zip2        v6.16b, v0.16b, v2.16b
+        zip1        v5.16b, v1.16b, v3.16b
+        zip2        v7.16b, v1.16b, v3.16b
+        zip1        v0.16b, v4.16b, v5.16b
+        zip2        v1.16b, v4.16b, v5.16b
+        zip1        v2.16b, v6.16b, v7.16b
+        zip2        v3.16b, v6.16b, v7.16b
+
+1:      tbz         x2, #3, 1f
+        st1         {v2.16b,v3.16b}, [x0], #32
+1:      tbz         x2, #2, 1f
+        st1         {v1.16b}, [x0], #16
+1:      tbz         x2, #1, 1f
+        st1         {v0.d}[1], [x0], #8
+1:      tbz         x2, #0, 2f
+        st1         {v0.s}[1], [x0], #4
+2:
+.endm
+
+
+/*  void rsdIntrinsicYuv2_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uin,    // x2
+ *          void const *vin,    // x3
+ *          size_t xstart,      // x4
+ *          size_t xend);       // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        lsr         x6, x4, #1
+        add         x0, x0, x4, LSL #2
+        add         x1, x1, x4
+        add         x4, x3, x6
+        add         x3, x2, x6
+        sub         x2, x5, x6, LSL #2
+
+        sub         x6, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x6]
+
+        wrap_line yuvkern, 0
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv2_K)
+
+/*  void rsdIntrinsicYuv_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line yuvkern, 1, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv_K)
+
+/*  void rsdIntrinsicYuvR_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line yuvkern, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuvR_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 0ed5ea3..ec3c962 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -244,281 +244,6 @@
         bx              lr
 END(rsdIntrinsicBlurHFU1_K)
 
-/*
-    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
-        r0 = dst
-        r1 = Y
-        r2 = VU
-        r3 = length (pixels / 8)
-        ---- Args below will be in the stack ----
-        sp = YuvCoeff
-
-        This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuv_K)
-        push        {r4, r5, lr}            @ preserve clobbered int registers
-        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
-
-        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
-
-        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
-        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
-        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
-        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
-        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
-
-        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
-                                            @ the coeffs matrix (Q2)
-
-        1:
-        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
-        vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
-        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
-        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
-
-        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
-        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
-        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
-        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
-        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
-        vmov.u16    d11, d10                @ Copying V to d11
-        vmov.u16    d13, d12                @ Copying U to d13
-        vzip.u16    d10, d11                @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
-        vzip.u16    d12, d13                @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
-        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
-        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
-        vmov        Q12, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
-        vmov        Q13, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
-                                            @                  R    G    B
-                                            @     Pixel(0-3)  Q8,  Q9, Q10
-                                            @     Pixel(4-7) Q11, Q12, Q13
-                                            @
-
-                                            @ Pixel(0-3)
-        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
-        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
-        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
-                                            @ Pixel(4-7)
-        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
-        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
-        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
-                                            @ Pixel(0-3)
-        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
-                                            @ Pixel(4-7)
-        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
-        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
-        subs        r3, r3, #1              @ Checking length (r3)
-        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
-        bne 1b                              @ if not done with length, loop
-
-        vpop        {Q4-Q7}                 @ Restore Vregisters
-        pop         {r4, r5, lr}            @ Restore int registers
-        bx          lr
-END(rsdIntrinsicYuv_K)
-
-/*
-    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
-        r0 = dst
-        r1 = Y
-        r2 = UV
-        r3 = length (pixels / 8)
-        ---- Args below will be in the stack ----
-        sp = YuvCoeff
-
-        This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuvR_K)
-        push        {r4, r5, lr}            @ preserve clobbered int registers
-        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
-
-        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
-
-        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
-        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
-        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
-        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
-        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
-
-        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
-                                            @ the coeffs matrix (Q2)
-
-        1:
-        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
-        vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
-        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
-        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
-
-        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
-        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
-        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
-        vsubl.u8    Q5, d14, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
-        vsubl.u8    Q6, d12, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
-        vmov.u16    d11, d10                @ Copying V to d11
-        vmov.u16    d13, d12                @ Copying U to d13
-        vzip.u16    d10, d11                @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
-        vzip.u16    d12, d13                @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
-        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
-        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
-        vmov        Q12, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
-        vmov        Q13, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
-                                            @                  R    G    B
-                                            @     Pixel(0-3)  Q8,  Q9, Q10
-                                            @     Pixel(4-7) Q11, Q12, Q13
-                                            @
-
-                                            @ Pixel(0-3)
-        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
-        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
-        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
-                                            @ Pixel(4-7)
-        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
-        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
-        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
-                                            @ Pixel(0-3)
-        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
-                                            @ Pixel(4-7)
-        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
-        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
-        subs        r3, r3, #1              @ Checking length (r3)
-        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
-        bne 1b                              @ if not done with length, loop
-
-        vpop        {Q4-Q7}                 @ Restore Vregisters
-        pop         {r4, r5, lr}            @ Restore int registers
-        bx          lr
-END(rsdIntrinsicYuvR_K)
-
-/*
-    Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
-        r0 = dst
-        r1 = Y
-        r2 = V,
-        r3 = U
-        ---- Args below will be in the stack ----
-        sp = length (pixels / 8)
-        sp+4 = YuvCoeff
-
-        This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuv2_K)
-        push        {r4, r5, r6, lr}        @ preserve clobbered int registers
-        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
-
-        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
-
-        ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
-        ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
-        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
-        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
-        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
-        mov         r4, #4                  @ Integer 8 in r4; used as an incrementing value
-
-        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
-                                            @ the coeffs matrix (Q2)
-
-        1:
-        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
-        vld1.8      {d12}, [r3], r4         @ split V from U (r2 -> VU) and increase pointer by 4 (in r4)
-        vld1.8      {d14}, [r2], r4         @ split V from U (r2 -> VU) and increase pointer by 4 (in r4)
-        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
-        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
-
-        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
-        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
-        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
-        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
-        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
-        vmov.u16    d11, d10                @ Copying V to d11
-        vmov.u16    d13, d12                @ Copying U to d13
-        vzip.u16    d10, d11                @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
-        vzip.u16    d12, d13                @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
-        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
-        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
-        vmov        Q12, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
-        vmov        Q13, Q11                @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
-                                            @                  R    G    B
-                                            @     Pixel(0-3)  Q8,  Q9, Q10
-                                            @     Pixel(4-7) Q11, Q12, Q13
-                                            @
-
-                                            @ Pixel(0-3)
-        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
-        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
-        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
-                                            @ Pixel(4-7)
-        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
-        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
-        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
-        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
-                                            @ Pixel(0-3)
-        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
-                                            @ Pixel(4-7)
-        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
-        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
-        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
-        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
-        subs        r6, r6, #1              @ Checking length (r6)
-        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
-        bne 1b                              @ if not done with length, loop
-
-        vpop        {Q4-Q7}                 @ Restore Vregisters
-        pop         {r4, r5, r6, lr}        @ Restore int registers
-        bx          lr
-END(rsdIntrinsicYuv2_K)
 
 /* Convolve 5x5 */
 
diff --git a/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S
new file mode 100644
index 0000000..da4cded
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts in q8, but with the even
+ * and odd bytes split into d16 and d17 respectively.  U and V are in d20
+ * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+        vmov.i8     d15, #149
+
+        vmull.u8    q1, d16, d15        // g0 = y0 * 149
+        vmull.u8    q5, d17, d15        // g1 = y1 * 149
+
+        vmov.i8     d14, #50
+        vmov.i8     d15, #104
+        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
+        vmlal.u8    q8, d21, d15
+
+        vshr.u8     d14, d21, #1
+        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
+        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)
+
+        vshll.u8    q7, d20, #2
+        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
+        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)
+
+        vmov.i8     d14, #204
+        vmov.i8     d15, #254
+        vmull.u8    q11, d21, d14       // r2 = v * 204
+        vmull.u8    q12, d20, d15       // b2 = u * 254
+
+        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
+        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
+        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
+        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1
+
+        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
+        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
+        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        vqrshrn.u16 d0, q0, #6
+        vqrshrn.u16 d1, q1, #7
+        vqrshrn.u16 d2, q4, #6
+        vqrshrn.u16 d3, q5, #7
+        vqrshrn.u16 d4, q2, #6
+        vqrshrn.u16 d5, q6, #6
+
+        vzip.u8     q0, q1
+        vzip.u8     d4, d5
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vdup.i16    q13, r5
+        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vdup.i16    q14, r5
+        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        vdup.i16    q15, r5
+
+        vmov.i8     q3, #0xff
+
+        subs        r2, #16
+        bhs         1f
+        b           2f
+
+        .align 4
+1:      vld2.u8     {d16,d17}, [r1]!
+        pld         [r1, #256]
+  .if \interleaved
+        vld2.u8     {d20,d21}, [r3]!
+    .if \swapuv
+        vswp        d20, d21
+    .endif
+        pld         [r3, #256]
+  .else
+        vld1.u8     d20, [r3]!
+        vld1.u8     d21, [r4]!
+        pld         [r3, #128]
+        pld         [r4, #128]
+  .endif
+
+        \kernel
+
+        subs    r2, #16
+
+        vst4.u8     {d0,d2,d4,d6}, [r0]!
+        vst4.u8     {d1,d3,d5,d7}, [r0]!
+
+        bhs         1b
+
+2:      adds        r2, #16
+        beq         2f
+
+        /* To handle the tail portion of the data (something less than 16
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        vmov.i8     q8, #0
+        vmov.i8     q10, #0
+
+        tst         r2, #8
+        beq         1f
+        vld1.u8     d17, [r1]!
+  .if \interleaved
+        vld1.u8     d21, [r3]!
+  .else
+        vld1.u32    d20[1], [r3]!
+        vld1.u32    d21[1], [r4]!
+  .endif
+
+1:      tst         r2, #4
+        beq         1f
+        vld1.u32    d16[1], [r1]!
+  .if \interleaved
+        vld1.u32    d20[1], [r3]!
+  .else
+        vld1.u16    d20[1], [r3]!
+        vld1.u16    d21[1], [r4]!
+  .endif
+1:      tst         r2, #2
+        beq         1f
+        vld1.u16    d16[1], [r1]!
+  .if \interleaved
+        vld1.u16    d20[1], [r3]!
+  .else
+        vld1.u8     d20[1], [r3]!
+        vld1.u8     d21[1], [r4]!
+  .endif
+1:      tst         r2, #1
+        beq         1f
+        vld1.u8     d16[1], [r1]!
+  .if \interleaved
+        vld1.u8     d20[1], [r3]!
+  .else
+        vld1.u8     d20[0], [r3]!
+        vld1.u8     d21[0], [r4]!
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      vuzp.8      d16, d17
+  .if \interleaved
+        vuzp.8      d20, d21
+    .if \swapuv
+        vswp        d20, d21
+    .endif
+  .endif
+
+        \kernel
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        vzip.8  q0, q2
+        vzip.8  q1, q3
+        vzip.8  q0, q1
+        vzip.8  q2, q3
+
+1:      tst         r2, #8
+        beq         1f
+        vst1.u8     {d4,d5,d6,d7}, [r0]!
+
+1:      tst         r2, #4
+        beq         1f
+        vst1.u8     {d2,d3}, [r0]!
+1:      tst         r2, #2
+        beq         1f
+        vst1.u8     d1, [r0]!
+1:      tst         r2, #1
+        beq         2f
+        vst1.u32    d0[1], [r0]!
+2:
+.endm
+
+
+/*  void rsdIntrinsicYuv2_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uin,    // r2
+ *          void const *vin,    // r3
+ *          size_t xstart,      // [sp]
+ *          size_t xend);       // [sp+#4]
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        push        {r4,r5}
+        ldr         r5, [sp, #8]
+        mov         r4, r3
+        mov         r3, r2
+        ldr         r2, [sp, #12]
+
+        add         r0, r5, LSL #2
+        add         r1, r5
+        add         r3, r5, LSR #1
+        add         r4, r5, LSR #1
+        sub         r2, r5
+
+        vpush       {d8-d15}
+
+        wrap_line yuvkern, 0
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuv2_K)
+
+/*  void rsdIntrinsicYuv_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uvin,   // r2
+ *          size_t xstart,      // r3
+ *          size_t xend);       // [sp]
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        push        {r4,r5}
+        bic         r4, r3, #1
+        add         r3, r2, r4
+        ldr         r2, [sp, #8]
+
+        add         r0, r4, LSL #2
+        add         r1, r4
+        sub         r2, r4
+
+        vpush       {d8-d15}
+
+        wrap_line yuvkern, 1, 1
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuv_K)
+
+/*  void rsdIntrinsicYuvR_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uvin,   // r2
+ *          size_t xstart,      // r3
+ *          size_t xend);       // [sp]
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        push        {r4,r5}
+        bic         r4, r3, #1
+        add         r3, r2, r4
+        ldr         r2, [sp, #8]
+
+        add         r0, r4, LSL #2
+        add         r1, r4
+        sub         r2, r4
+
+        vpush       {d8-d15}
+
+        wrap_line yuvkern, 1
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuvR_K)