Clean up type offsets which cannot be calculated for flexible YUV.

Support flexible YUV
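
The intrinsic no longer switches on the HAL pixel format. It now reads
per-plane chroma pointers and strides from lod[1]/lod[2] plus a chroma
sample step (drvState.yuv.step), so any 4:2:0 layout the allocation
describes goes through one code path, with NEON fast paths kept for the
planar and interleaved cases. A rough sketch of the generic addressing,
assuming 4:2:0 subsampling; the names below (yBase, uBase, vBase, the
strides, cstep, yuvToRGBA) are illustrative only, not the actual kernel
code:

    #include <cstddef>
    #include <cstdint>

    // Stand-in for rsYuvToRGBA_uchar4.
    uint32_t yuvToRGBA(uint8_t y, uint8_t u, uint8_t v);

    void convertRow(uint32_t y, uint32_t width,
                    const uint8_t *yBase, size_t yStride,
                    const uint8_t *uBase, size_t uStride,
                    const uint8_t *vBase, size_t vStride,
                    size_t cstep, uint32_t *out) {
        const uint8_t *Y = yBase + y * yStride;
        const uint8_t *u = uBase + (y >> 1) * uStride;  // chroma rows: 2x vertical subsampling
        const uint8_t *v = vBase + (y >> 1) * vStride;
        for (uint32_t x = 0; x < width; x++) {
            size_t cx = (x >> 1) * cstep;               // 2x horizontal subsampling
            out[x] = yuvToRGBA(Y[x], u[cx], v[cx]);
        }
    }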

bug 10567550

Change-Id: I4f6e5a8d86eeee635605460f1751208f3320969b
(cherry picked from commit a75372759e288be3fb8835735a830b1f7d1a4c42)
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 4f56443..7546b38 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -107,6 +107,7 @@
 };
 
 extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
@@ -135,91 +136,63 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    switch (cp->alloc->mHal.state.yuv) {
-    // In API 17 there was no yuv format and the intrinsic treated everything as NV21
-    case 0:
-#if !defined(RS_SERVER)
-    case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-#endif
-        {
-            const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
-            size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
-            const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
+    const size_t cstep = cp->alloc->mHal.drvState.yuv.step;
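+    // cstep is the chroma sample step within a row: typically 1 for planar
+    // layouts (e.g. YV12, separate U and V planes) and 2 for interleaved
+    // semi-planar layouts (e.g. NV21/NV12).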
 
-            if (pinUV == NULL) {
-                // Legacy yuv support didn't fill in uv
-                strideUV = strideY;
-                uv = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
-                    (strideY * p->dimY) +
-                    ((p->y >> 1) * strideUV);
-            }
+    const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+    const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+    const uchar *u = pinU + ((p->y >> 1) * strideU);
 
-            if(x2 > x1) {
-                if (gArchUseSIMD) {
-            #if defined(ARCH_ARM_HAVE_VFP)
-                    int32_t len = (x2 - x1 - 1) >> 3;
-                    if(len > 0) {
-                        //                    ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
-                        rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-                        x1 += len << 3;
-                        out += len << 3;
-                    }
-            #endif
+    const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+    const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+    const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+    if (pinU == NULL) {
+        // Legacy yuv support didn't fill in uv
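+        // In that case the chroma data is VU-interleaved immediately after
+        // the Y plane (NV21 layout), so v points at the first chroma byte
+        // and u = v + 1.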
+        v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
+            (strideY * p->dimY) +
+            ((p->y >> 1) * strideY);
+        u = v + 1;
+    }
+
+#if defined(ARCH_ARM_HAVE_VFP)
+    if((x2 > x1) && gArchUseSIMD) {
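+        // The NEON kernels below convert 8 pixels per iteration; whatever is
+        // left over (always at least the final pixel) falls through to the
+        // scalar loop further down.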
+        int32_t len = (x2 - x1 - 1) >> 3;
+        if(len > 0) {
+            if (cstep == 1) {
+                rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+                x1 += len << 3;
+                out += len << 3;
+            } else if (cstep == 2) {
+                // Check for proper interleave
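+                // u == v + 1 means VU ordering (NV21), handled by the
+                // existing rsdIntrinsicYuv_K; u == v - 1 means UV ordering
+                // (NV12), handled by the new rsdIntrinsicYuvR_K. Any other
+                // layout falls back to the scalar loop below.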
+                intptr_t ipu = (intptr_t)u;
+                intptr_t ipv = (intptr_t)v;
+
+                if (ipu == (ipv + 1)) {
+                    rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                } else if (ipu == (ipv - 1)) {
+                    rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
                 }
 
-               // ALOGE("y %i  %i  %i", p->y, x1, x2);
-                while(x1 < x2) {
-                    uchar u = uv[(x1 & 0xffffe) + 1];
-                    uchar v = uv[(x1 & 0xffffe) + 0];
-                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-                    out++;
-                    x1++;
-                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-                    out++;
-                    x1++;
-                }
             }
         }
-        break;
-
-#if !defined(RS_SERVER)
-    case HAL_PIXEL_FORMAT_YV12:
-        {
-            const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
-            const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
-            const uchar *u = pinU + ((p->y >> 1) * strideU);
-
-            const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
-            const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
-            const uchar *v = pinV + ((p->y >> 1) * strideV);
-
-            if(x2 > x1) {
-        #if defined(ARCH_ARM_HAVE_VFP)
-                if (gArchUseSIMD) {
-                    int32_t len = (x2 - x1 - 1) >> 3;
-                    if(len > 0) {
-                        rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
-                        x1 += len << 3;
-                        out += len << 3;
-                    }
-                }
-        #endif
-
-               // ALOGE("y %i  %i  %i", p->y, x1, x2);
-                while(x1 < x2) {
-                    uchar ut = u[x1];
-                    uchar vt = v[x1];
-                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
-                    out++;
-                    x1++;
-                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
-                    out++;
-                    x1++;
-                }
-            }
-        }
-        break;
+    }
 #endif
+
+    if(x2 > x1) {
+       // ALOGE("y %i  %i  %i", p->y, x1, x2);
+        while(x1 < x2) {
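+            // Chroma is subsampled 2x horizontally, so each pair of Y
+            // samples shares the chroma sample at column (x1 >> 1) * cstep.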
+            int cx = (x1 >> 1) * cstep;
+            *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+            out++;
+            x1++;
+            *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+            out++;
+            x1++;
+        }
     }
 
 }
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 52fd565..da58f89 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -338,6 +338,97 @@
 END(rsdIntrinsicYuv_K)
 
 /*
+    Function called with the following arguments: dst, Y, uv, len, YuvCoeff
+        r0 = dst
+        r1 = Y
+        r2 = UV
+        r3 = length (pixels / 8)
+        ---- Args below will be in the stack ----
+        sp = YuvCoeff
+
+        This function converts 8 pixels per iteration
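+
+        Unlike rsdIntrinsicYuv_K, which expects the chroma plane in VU (NV21)
+        order, this variant reads it in UV (NV12) order.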
+*/
+ENTRY(rsdIntrinsicYuvR_K)
+        push        {r4, r5, lr}            @ preserve clobbered int registers
+        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
+
+        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
+
+        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
+        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
+        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
+        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
+
+        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
+                                            @ the coeffs matrix (Q2)
+
+        1:
+        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
+        vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
+        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
+        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
+
+        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+        vsubl.u8    Q5, d14, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+        vsubl.u8    Q6, d12, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+        vmov.u16    d11, d10                @ Copying V to d11
+        vmov.u16    d13, d12                @ Copying U to d13
+        vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
+        vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+                                            @                  R    G    B
+                                            @     Pixel(0-3)  Q8,  Q9, Q10
+                                            @     Pixel(4-7) Q11, Q12, Q13
+                                            @
+
+                                            @ Pixel(0-3)
+        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
+        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
+        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
+
+                                            @ Pixel(4-7)
+        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
+        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
+        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
+
+                                            @ Pixel(0-3)
+        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+                                            @ Pixel(4-7)
+        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+        subs        r3, r3, #1              @ Checking length (r3)
+        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+        bne 1b                              @ if not done with length, loop
+
+        vpop        {Q4-Q7}                 @ Restore Vregisters
+        pop         {r4, r5, lr}            @ Restore int registers
+        bx          lr
+END(rsdIntrinsicYuvR_K)
+
+/*
     Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
         r0 = dst
         r1 = Y