am 70cc3f31: Merge "Fix YUV intrinsic" into jb-mr2-dev
* commit '70cc3f31ebe6dd61e173d9e0a120a2e4932b14c5':
Fix YUV intrinsic
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 3a49c0d..3d989bd 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -18,6 +18,10 @@
#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"
+#ifndef RS_COMPATIBILITY_LIB
+#include "hardware/gralloc.h"
+#endif
+
using namespace android;
using namespace android::renderscript;
@@ -99,6 +103,7 @@
};
extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
@@ -109,39 +114,87 @@
return;
}
const uchar *pinY = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
- const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
const size_t strideY = cp->alloc->mHal.drvState.lod[0].stride;
- const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
-
const uchar *Y = pinY + (p->y * strideY);
- const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
uchar4 *out = (uchar4 *)p->out;
uint32_t x1 = xstart;
uint32_t x2 = xend;
- if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1 - 1) >> 3;
- if(len > 0) {
- rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- }
+ switch (cp->alloc->mHal.state.yuv) {
+ // In API 17 there was no yuv format and the intrinsic treated everything as NV21
+ case 0:
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+ case HAL_PIXEL_FORMAT_YCrCb_420_SP: // NV21
#endif
+ {
+ const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+ const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
+ const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
- // ALOGE("y %i %i %i", p->y, x1, x2);
- while(x1 < x2) {
- uchar u = uv[(x1 & 0xffffe) + 1];
- uchar v = uv[(x1 & 0xffffe) + 0];
- *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
- out++;
- x1++;
- *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
- out++;
- x1++;
+ if(x2 > x1) {
+ #if defined(ARCH_ARM_HAVE_NEON)
+ int32_t len = (x2 - x1 - 1) >> 3;
+ if(len > 0) {
+ rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ }
+ #endif
+
+ // ALOGE("y %i %i %i", p->y, x1, x2);
+ while(x1 < x2) {
+ uchar u = uv[(x1 & 0xffffe) + 1];
+ uchar v = uv[(x1 & 0xffffe) + 0];
+ *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+ out++;
+ x1++;
+ *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+ out++;
+ x1++;
+ }
+ }
}
+ break;
+
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+ case HAL_PIXEL_FORMAT_YV12:
+ {
+ const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+ const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+ const uchar *u = pinU + ((p->y >> 1) * strideU);
+
+ const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+ const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+ const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+ if(x2 > x1) {
+ #if defined(ARCH_ARM_HAVE_NEON)
+ int32_t len = (x2 - x1 - 1) >> 3;
+ if(len > 0) {
+ rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+ x1 += len << 3;
+ out += len << 3;
+ }
+ #endif
+
+ // ALOGE("y %i %i %i", p->y, x1, x2);
+ while(x1 < x2) {
+ uchar ut = u[x1];
+ uchar vt = v[x1];
+ *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
+ out++;
+ x1++;
+ *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
+ out++;
+ x1++;
+ }
+ }
+ }
+ break;
+#endif
}
+
}
RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index b93a038..c8dc9bf 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -504,6 +504,100 @@
bx lr
END(rsdIntrinsicYuv_K)
+/*
+ Function called with the following arguments: dst, Y, u, v, len, YuvCoeff
+ r0 = dst
+ r1 = Y
+ r2 = U,
+ r3 = V
+ ---- Args below will be in the stack ----
+ sp = length (pixels / 8); must be > 0 because the loop tests the count only after an iteration
+ sp+4 = YuvCoeff
+
+ This function converts 8 pixels per iteration (8 Y bytes, 4 U bytes and 4 V bytes from separate planes)
+*/
+ENTRY(rsdIntrinsicYuv2_K)
+ push {r4, r5, r6, lr} @ preserve clobbered int registers
+ vpush {Q4-Q7} @ preserve Vregisters we clobber
+
+ mov r5, #16 @ Integer 16 in r5; used as an incrementing value
+
+ ldr r4, [sp, #64+16+4] @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
+ ldr r6, [sp, #64+16] @ load the length in r6 (16*4 + 4*4)
+ vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2
+ vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6
+ vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+ mov r4, #4 @ Integer 4 in r4; used as the per-iteration increment of the U and V pointers
+
+ vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in
+ @ the coeffs matrix (Q2)
+
+ 1:
+ vld1.8 {d10}, [r1]! @ get Y (r1->Y)
+ vld1.8 {d12}, [r3], r4 @ load 8 V bytes (r3 -> V), only the first 4 are used; increase pointer by 4 (in r4)
+ vld1.8 {d14}, [r2], r4 @ load 8 U bytes (r2 -> U), only the first 4 are used; increase pointer by 4 (in r4)
+ pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops
+ pld [r2, #64] @ preloading data from address u(r2) + 64 for subsequent loops
+
+ vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+ vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+ vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+ vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+ vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+ vmov.u16 d11, d10 @ Copying V to d11
+ vmov.u16 d13, d12 @ Copying U to d13
+ vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
+ vzip.u16 d12, d13 @ Q6 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+ vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+ vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+ vmov Q12, Q11 @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+ vmov Q13, Q11 @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+ @ R G B
+ @ Pixel(0-3) Q8, Q9, Q10
+ @ Pixel(4-7) Q11, Q12, Q13
+ @
+
+ @ Pixel(0-3)
+ vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8((Y-16)*298) + (V-128) * 409
+ vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9((Y-16)*298) + (V-128) * (-208)
+ vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100)
+ vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10((Y-16)*298) + (U-128) * 516
+
+ @ Pixel(4-7)
+ vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11((Y-16)*298) + (V-128) * 409
+ vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12((Y-16)*298) + (V-128) * (-208)
+ vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100)
+ vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13((Y-16)*298) + (U-128) * 516
+
+ @ Pixel(0-3)
+ vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+ @ Pixel(4-7)
+ vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+ vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+ vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit)
+ vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit)
+ vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+ subs r6, r6, #1 @ Checking length (r6)
+ vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+ bne 1b @ if not done with length, loop
+
+ vpop {Q4-Q7} @ Restore Vregisters
+ pop {r4, r5, r6, lr} @ Restore int registers
+ bx lr
+END(rsdIntrinsicYuv2_K)
+
/* Convolve 5x5 */
/*