Clean up type offsets, which cannot be calculated for flexible YUV.
Support flexible YUV
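
The intrinsic now derives chroma addressing from the allocation itself
(per-plane base pointers, per-plane strides, and the yuv.step field)
instead of switching on a fixed HAL pixel format. Roughly, using the
names from the diff below, the scalar fallback reads each chroma sample
as (a sketch, not the literal kernel code; x and y stand in for x1 and
p->y):

    // yuv.step is the byte distance between horizontally adjacent
    // chroma samples: 1 for planar layouts (YV12-style), 2 for
    // semi-planar layouts (NV21/NV12-style).
    const size_t cstep = cp->alloc->mHal.drvState.yuv.step;
    const uchar *u = pinU + ((y >> 1) * strideU) + ((x >> 1) * cstep);
    const uchar *v = pinV + ((y >> 1) * strideV) + ((x >> 1) * cstep);

On the NEON path, rsdIntrinsicYuv2_K handles the planar case (step 1),
rsdIntrinsicYuv_K the V-before-U interleave, and the new
rsdIntrinsicYuvR_K the U-before-V interleave.
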
Bug: 10567550
Change-Id: I4f6e5a8d86eeee635605460f1751208f3320969b
(cherry picked from commit a75372759e288be3fb8835735a830b1f7d1a4c42)
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 4f56443..7546b38 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -107,6 +107,7 @@
};
extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
@@ -135,91 +136,63 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
- switch (cp->alloc->mHal.state.yuv) {
- // In API 17 there was no yuv format and the intrinsic treated everything as NV21
- case 0:
-#if !defined(RS_SERVER)
- case HAL_PIXEL_FORMAT_YCrCb_420_SP: // NV21
-#endif
- {
- const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
- size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
- const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
+ const size_t cstep = cp->alloc->mHal.drvState.yuv.step;
- if (pinUV == NULL) {
- // Legacy yuv support didn't fill in uv
- strideUV = strideY;
- uv = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
- (strideY * p->dimY) +
- ((p->y >> 1) * strideUV);
- }
+ const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+ const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+ const uchar *u = pinU + ((p->y >> 1) * strideU);
- if(x2 > x1) {
- if (gArchUseSIMD) {
- #if defined(ARCH_ARM_HAVE_VFP)
- int32_t len = (x2 - x1 - 1) >> 3;
- if(len > 0) {
- // ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
- rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- }
- #endif
+ const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+ const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+ const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+ if (pinU == NULL) {
+ // Legacy yuv support didn't fill in uv
+ v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
+ (strideY * p->dimY) +
+ ((p->y >> 1) * strideY);
+ u = v + 1;
+ }
+
+#if defined(ARCH_ARM_HAVE_VFP)
+ if((x2 > x1) && gArchUseSIMD) {
+ int32_t len = (x2 - x1 - 1) >> 3;
+ if(len > 0) {
+ if (cstep == 1) {
+ rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ } else if (cstep == 2) {
+ // Check for proper interleave
+ intptr_t ipu = (intptr_t)u;
+ intptr_t ipv = (intptr_t)v;
+
+ if (ipu == (ipv + 1)) {
+ rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ } else if (ipu == (ipv - 1)) {
+ rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
}
- // ALOGE("y %i %i %i", p->y, x1, x2);
- while(x1 < x2) {
- uchar u = uv[(x1 & 0xffffe) + 1];
- uchar v = uv[(x1 & 0xffffe) + 0];
- *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
- out++;
- x1++;
- *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
- out++;
- x1++;
- }
}
}
- break;
-
-#if !defined(RS_SERVER)
- case HAL_PIXEL_FORMAT_YV12:
- {
- const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
- const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
- const uchar *u = pinU + ((p->y >> 1) * strideU);
-
- const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
- const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
- const uchar *v = pinV + ((p->y >> 1) * strideV);
-
- if(x2 > x1) {
- #if defined(ARCH_ARM_HAVE_VFP)
- if (gArchUseSIMD) {
- int32_t len = (x2 - x1 - 1) >> 3;
- if(len > 0) {
- rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- }
- }
- #endif
-
- // ALOGE("y %i %i %i", p->y, x1, x2);
- while(x1 < x2) {
- uchar ut = u[x1];
- uchar vt = v[x1];
- *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
- out++;
- x1++;
- *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
- out++;
- x1++;
- }
- }
- }
- break;
+ }
#endif
+
+ if(x2 > x1) {
+ // ALOGE("y %i %i %i", p->y, x1, x2);
+ while(x1 < x2) {
+ int cx = (x1 >> 1) * cstep;
+ *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+ out++;
+ x1++;
+ *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+ out++;
+ x1++;
+ }
}
}
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 52fd565..da58f89 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -338,6 +338,97 @@
END(rsdIntrinsicYuv_K)
/*
+    Function called with the following arguments: dst, Y, uv, len, YuvCoeff
+ r0 = dst
+ r1 = Y
+ r2 = UV
+ r3 = length (pixels / 8)
+ ---- Args below will be in the stack ----
+ sp = YuvCoeff
+
+ This function converts 8 pixels per iteration
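+
+    Note: this is the reversed-chroma variant of rsdIntrinsicYuv_K: r2 points
+    at interleaved chroma with U in the even bytes and V in the odd bytes.
+    The cstep == 2 path in rsCpuIntrinsicYuvToRGB.cpp selects this kernel when
+    the U pointer immediately precedes the V pointer; rsdIntrinsicYuv_K
+    handles the V-before-U interleave.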
+*/
+ENTRY(rsdIntrinsicYuvR_K)
+ push {r4, r5, lr} @ preserve clobbered int registers
+ vpush {Q4-Q7} @ preserve Vregisters we clobber
+
+ mov r5, #16 @ Integer 16 in r5; used as an incrementing value
+
+ ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3)
+ vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2
+ vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6
+ vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+ mov r4, #8 @ Integer 8 in r4; used as an incrementing value
+
+ vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in
+ @ the coeffs matrix (Q2)
+
+ 1:
+ vld1.8 {d10}, [r1]! @ get Y (r1->Y)
+            vld2.8 {d12, d14}, [r2], r4 @ split U from V (r2 -> UV) and increase pointer by 8 (in r4)
+            pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops
+            pld [r2, #64] @ preloading data from address uv(r2) + 64 for subsequent loops
+
+ vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+ vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+ vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+ vsubl.u8 Q5, d14, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+ vsubl.u8 Q6, d12, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+ vmov.u16 d11, d10 @ Copying V to d11
+ vmov.u16 d13, d12 @ Copying U to d13
+            vzip.u16 d10, d11 @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
+            vzip.u16 d12, d13 @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+ vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+ vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+            vmov Q12, Q11 @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+            vmov Q13, Q11 @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+ @ R G B
+ @ Pixel(0-3) Q8, Q9, Q10
+ @ Pixel(4-7) Q11, Q12, Q13
+ @
+
+ @ Pixel(0-3)
+ vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409
+ vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208)
+ vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100)
+ vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516
+
+ @ Pixel(4-7)
+ vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409
+ vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
+ vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100)
+ vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516
+
+ @ Pixel(0-3)
+ vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+ @ Pixel(4-7)
+ vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+ vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit)
+ vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit)
+ vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+ subs r3, r3, #1 @ Checking length (r3)
+ vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+ bne 1b @ if not done with length, loop
+
+ vpop {Q4-Q7} @ Restore Vregisters
+ pop {r4, r5, lr} @ Restore int registers
+ bx lr
+END(rsdIntrinsicYuvR_K)
+
+/*
Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
r0 = dst
r1 = Y