Optimise YuvToRGB using 16-bit arithmetic.
Reimplement the YuvToRGB intrinsic using 16-bit SIMD arithmetic to increase
throughput. Implementations are provided for both AArch32 and AArch64 NEON.
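
The kernels keep all intermediate values within 16 bits by roughly halving
the original 298/409/-100/516/-208 coefficients and folding the -16 and -128
offsets into one saturating bias per channel. As a scalar sketch of what the
red channel now computes (sat_u16/sat_u8 denote saturation):

    r = (149*Y + (V >> 1) + 204*V) >> 1    // ~ (298*Y + 409*V) / 4
    r = sat_u16(r - 14280)                 // 14280 == (16*298 + 128*409) / 4
    R = sat_u8((r + 32) >> 6)              // ~ (298*(Y-16) + 409*(V-128) + 128) >> 8

Green and blue follow the same scheme with their own constants. The new
kernels also handle the tail of each row themselves, so the C++ wrapper no
longer restricts the SIMD path to whole groups of eight pixels.
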
Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 6a2ac45..4f7d3c0 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -34,7 +34,9 @@
ifeq ($(TARGET_ARCH),arm64)
LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
- LOCAL_SRC_FILES+=rsCpuIntrinsics_advsimd_Blend.S
+ LOCAL_SRC_FILES+= \
+ rsCpuIntrinsics_advsimd_Blend.S \
+ rsCpuIntrinsics_advsimd_YuvToRGB.S
else
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
@@ -45,7 +47,8 @@
LOCAL_SRC_FILES+= \
rsCpuIntrinsics_neon.S \
rsCpuIntrinsics_neon_ColorMatrix.S \
- rsCpuIntrinsics_neon_Blend.S
+ rsCpuIntrinsics_neon_Blend.S \
+ rsCpuIntrinsics_neon_YuvToRGB.S
LOCAL_ASFLAGS := -mfpu=neon
endif
endif
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 22f0962..2d905de 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -97,19 +97,9 @@
}
-static short YuvCoeff[] = {
- 298, 409, -100, 516, -208, 255, 0, 0,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 298, 298, 298, 298, 298, 298, 298, 298,
- 255, 255, 255, 255, 255, 255, 255, 255
-
-
-};
-
-extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
@@ -164,28 +154,24 @@
#if defined(ARCH_ARM_HAVE_VFP)
if((x2 > x1) && gArchUseSIMD) {
- // The neon paths may over-read by up to 8 bytes
- int32_t len = (x2 - x1 - 8) >> 3;
- if(len > 0) {
- if (cstep == 1) {
- rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- } else if (cstep == 2) {
- // Check for proper interleave
- intptr_t ipu = (intptr_t)u;
- intptr_t ipv = (intptr_t)v;
+ int32_t len = x2 - x1;
+ if (cstep == 1) {
+ rsdIntrinsicYuv2_K(out, Y, u, v, x1, x2);
+ x1 += len;
+ out += len;
+ } else if (cstep == 2) {
+ // Check for proper interleave
+ intptr_t ipu = (intptr_t)u;
+ intptr_t ipv = (intptr_t)v;
- if (ipu == (ipv + 1)) {
- rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- } else if (ipu == (ipv - 1)) {
- rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
- x1 += len << 3;
- out += len << 3;
- }
-
+ if (ipu == (ipv + 1)) {
+ rsdIntrinsicYuv_K(out, Y, v, x1, x2);
+ x1 += len;
+ out += len;
+ } else if (ipu == (ipv - 1)) {
+ rsdIntrinsicYuvR_K(out, Y, u, x1, x2);
+ x1 += len;
+ out += len;
}
}
}
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
new file mode 100644
index 0000000..9232a79
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register. This macro will be called from within several different wrapper
+ * variants for different data layouts. Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively. U and V are in
+ * v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
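+/* For example, the red channel is effectively computed as
+ *      r = (149*y + (v >> 1) + 204*v) >> 1
+ *      r = satu16(r - ((16*298 + 128*409) >> 2))
+ *      R = satu8((r + 32) >> 6)
+ * which corresponds to the 32-bit form (298*(y-16) + 409*(v-128) + 128) >> 8,
+ * with every intermediate kept within 16 bits.
+ */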
+.macro yuvkern
+ movi v7.8b, #149
+
+ umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149
+ umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149
+
+ movi v7.8b, #50
+ movi v10.8b, #104
+ umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104
+ umlal v8.8h, v17.8b, v10.8b
+
+ ushr v7.8b, v17.8b, #1
+ uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1)
+ uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1)
+
+ ushll v7.8h, v16.8b, #2
+ add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2)
+ add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2)
+
+ movi v7.16b, #204
+ movi v10.8b, #254
+ umull v11.8h, v17.8b, v7.8b // r2 = v * 204
+ umull v12.8h, v16.8b, v10.8b // b2 = u * 254
+
+ uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1
+ uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1
+ uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1
+ uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1
+
+ uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
+ uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2)
+ uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqrshrn v0.8b, v0.8h, #6
+ uqrshrn v4.8b, v4.8h, #6
+ uqrshrn v1.8b, v1.8h, #7
+ uqrshrn v5.8b, v5.8h, #7
+ uqrshrn v2.8b, v2.8h, #6
+ uqrshrn v6.8b, v6.8h, #6
+
+ zip1 v0.16b, v0.16b, v4.16b
+ zip1 v1.16b, v1.16b, v5.16b
+ zip1 v2.16b, v2.16b, v6.16b
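+ // v0, v1 and v2 now hold 16 R, 16 G and 16 B bytes respectively, in
+ // pixel order; v3 was pre-loaded with the 0xff alpha channel.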
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+ mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ dup v13.8h, w5
+ mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ dup v14.8h, w5
+ mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ dup v15.8h, w5
+
+ movi v3.16b, #0xff
+
+ subs x2, x2, #16
+ bhs 1f
+ b 2f
+
+ .align 4
+1: ld2 {v8.8b,v9.8b}, [x1], #16
+// prfm PLDL1STRM, [x1, #256]
+ .if \interleaved
+ .if \swapuv
+ ld2 {v17.8b,v18.8b}, [x3], #16
+ mov v16.8b, v18.8b
+ .else
+ ld2 {v16.8b,v17.8b}, [x3], #16
+ .endif
// prfm PLDL1STRM, [x3, #256]
+ .else
+ ld1 {v16.8b}, [x3], #8
+ ld1 {v17.8b}, [x4], #8
// prfm PLDL1STRM, [x3, #128]
// prfm PLDL1STRM, [x4, #128]
+ .endif
+
+ \kernel
+
+ subs x2, x2, #16
+
+ st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+
+ bhs 1b
+
+2: adds x2, x2, #16
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 16
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the
+ * interaction between neighbouring pixels is constrained to odd
+ * boundaries where the load operations don't interfere.
+ */
+ movi v8.8b, #0
+ movi v9.8b, #0
+ movi v16.8b, #0
+ movi v17.8b, #0
+
+ tbz x2, #3, 1f
+ ld1 {v9.8b}, [x1], #8
+ .if \interleaved
+ ld1 {v17.8b}, [x3], #8
+ .else
+ ld1 {v16.s}[1], [x3], #4
+ ld1 {v17.s}[1], [x4], #4
+ .endif
+1: tbz x2, #2, 1f
+ ld1 {v8.s}[1], [x1], #4
+ .if \interleaved
+ ld1 {v16.s}[1], [x3], #4
+ .else
+ ld1 {v16.h}[1], [x3], #2
+ ld1 {v17.h}[1], [x4], #2
+ .endif
+1: tbz x2, #1, 1f
+ ld1 {v8.h}[1], [x1], #2
+ .if \interleaved
+ ld1 {v16.h}[1], [x3], #2
+ .else
+ ld1 {v16.b}[1], [x3], #1
+ ld1 {v17.b}[1], [x4], #1
+ .endif
+1: tbz x2, #0, 1f
+ ld1 {v8.b}[1], [x1], #1
+ .if \interleaved
+ ld1 {v16.b}[1], [x3], #1
+ .else
+ ld1 {v16.b}[0], [x3], #1
+ ld1 {v17.b}[0], [x4], #1
+ .endif
+
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point if necessary.
+ */
+1: uzp1 v8.16b, v8.16b, v9.16b
+ .if \interleaved
+ .if \swapuv
+ uzp1 v16.16b, v17.16b, v16.16b
+ .else
+ uzp1 v16.16b, v16.16b, v17.16b
+ .endif
+ .endif
+
+ \kernel
+
+ /* As above but with the output; structured stores for partial vectors
+ * aren't available, so the data is re-packed first and stored linearly.
+ */
+ zip1 v4.16b, v0.16b, v2.16b
+ zip2 v6.16b, v0.16b, v2.16b
+ zip1 v5.16b, v1.16b, v3.16b
+ zip2 v7.16b, v1.16b, v3.16b
+ zip1 v0.16b, v4.16b, v5.16b
+ zip2 v1.16b, v4.16b, v5.16b
+ zip1 v2.16b, v6.16b, v7.16b
+ zip2 v3.16b, v6.16b, v7.16b
+
+1: tbz x2, #3, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #2, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #1, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #0, 2f
+ st1 {v0.s}[1], [x0], #4
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uin, // x2
+ * void const *vin, // x3
+ * size_t xstart, // x4
+ * size_t xend); // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+ lsr x6, x4, #1
+ add x0, x0, x4, LSL #2
+ add x1, x1, x4
+ add x4, x3, x6
+ add x3, x2, x6
+ sub x2, x5, x6, LSL #1
+
+ sub x6, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x6]
+
+ wrap_line yuvkern, 0
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+ bic x5, x3, #1
+ add x0, x0, x5, LSL #2
+ add x1, x1, x5
+ add x3, x2, x5
+ sub x2, x4, x5
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+ bic x5, x3, #1
+ add x0, x0, x5, LSL #2
+ add x1, x1, x5
+ add x3, x2, x5
+ sub x2, x4, x5
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuvR_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 0ed5ea3..ec3c962 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -244,281 +244,6 @@
bx lr
END(rsdIntrinsicBlurHFU1_K)
-/*
- Function called with the following arguments: dst, Y, vu, len, YuvCoeff
- r0 = dst
- r1 = Y
- r2 = VU
- r3 = length (pixels / 8)
- ---- Args below will be in the stack ----
- sp = YuvCoeff
-
- This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuv_K)
- push {r4, r5, lr} @ preserve clobbered int registers
- vpush {Q4-Q7} @ preserve Vregisters we clobber
-
- mov r5, #16 @ Integer 16 in r5; used as an incrementing value
-
- ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3)
- vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2
- vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6
- vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
- mov r4, #8 @ Integer 8 in r4; used as an incrementing value
-
- vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in
- @ the coeffs matrix (Q2)
-
- 1:
- vld1.8 {d10}, [r1]! @ get Y (r1->Y)
- vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
- pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops
- pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops
-
- vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7)
- vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
- vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
- vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
- vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
- vmov.u16 d11, d10 @ Copying V to d11
- vmov.u16 d13, d12 @ Copying U to d13
- vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
- vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
- vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
- vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
- vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
- vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
- @ R G B
- @ Pixel(0-3) Q8, Q9, Q10
- @ Pixel(4-7) Q11, Q12, Q13
- @
-
- @ Pixel(0-3)
- vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409
- vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208)
- vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
- @ Pixel(4-7)
- vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409
- vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
- vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
- @ Pixel(0-3)
- vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
- @ Pixel(4-7)
- vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
- vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
- subs r3, r3, #1 @ Checking length (r3)
- vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
- bne 1b @ if not done with length, loop
-
- vpop {Q4-Q7} @ Restore Vregisters
- pop {r4, r5, lr} @ Restore int registers
- bx lr
-END(rsdIntrinsicYuv_K)
-
-/*
- Function called with the following arguments: dst, Y, vu, len, YuvCoeff
- r0 = dst
- r1 = Y
- r2 = UV
- r3 = length (pixels / 8)
- ---- Args below will be in the stack ----
- sp = YuvCoeff
-
- This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuvR_K)
- push {r4, r5, lr} @ preserve clobbered int registers
- vpush {Q4-Q7} @ preserve Vregisters we clobber
-
- mov r5, #16 @ Integer 16 in r5; used as an incrementing value
-
- ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3)
- vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2
- vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6
- vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
- mov r4, #8 @ Integer 8 in r4; used as an incrementing value
-
- vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in
- @ the coeffs matrix (Q2)
-
- 1:
- vld1.8 {d10}, [r1]! @ get Y (r1->Y)
- vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
- pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops
- pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops
-
- vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7)
- vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
- vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
- vsubl.u8 Q5, d14, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
- vsubl.u8 Q6, d12, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
- vmov.u16 d11, d10 @ Copying V to d11
- vmov.u16 d13, d12 @ Copying U to d13
- vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
- vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
- vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
- vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
- vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
- vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
- @ R G B
- @ Pixel(0-3) Q8, Q9, Q10
- @ Pixel(4-7) Q11, Q12, Q13
- @
-
- @ Pixel(0-3)
- vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409
- vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208)
- vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
- @ Pixel(4-7)
- vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409
- vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
- vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
- @ Pixel(0-3)
- vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
- @ Pixel(4-7)
- vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
- vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
- subs r3, r3, #1 @ Checking length (r3)
- vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
- bne 1b @ if not done with length, loop
-
- vpop {Q4-Q7} @ Restore Vregisters
- pop {r4, r5, lr} @ Restore int registers
- bx lr
-END(rsdIntrinsicYuvR_K)
-
-/*
- Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
- r0 = dst
- r1 = Y
- r2 = V,
- r3 = U
- ---- Args below will be in the stack ----
- sp = length (pixels / 8)
- sp+4 = YuvCoeff
-
- This function converts 8 pixels per iteration
-*/
-ENTRY(rsdIntrinsicYuv2_K)
- push {r4, r5, r6, lr} @ preserve clobbered int registers
- vpush {Q4-Q7} @ preserve Vregisters we clobber
-
- mov r5, #16 @ Integer 16 in r5; used as an incrementing value
-
- ldr r4, [sp, #64+16+4] @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
- ldr r6, [sp, #64+16] @ load the length in r6 (16*4 + 4*4)
- vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2
- vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6
- vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8
-
- mov r4, #4 @ Integer 8 in r4; used as an incrementing value
-
- vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in
- @ the coeffs matrix (Q2)
-
- 1:
- vld1.8 {d10}, [r1]! @ get Y (r1->Y)
- vld1.8 {d12}, [r3], r4 @ split V from U (r2 -> VU) and increase pointer by 4 (in r4)
- vld1.8 {d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 4 (in r4)
- pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops
- pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops
-
- vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7)
- vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
- vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
-
- vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
- vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
- vmov.u16 d11, d10 @ Copying V to d11
- vmov.u16 d13, d12 @ Copying U to d13
- vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
- vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
-
-
- vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
- vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
- vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12
- vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13
-
- @ R G B
- @ Pixel(0-3) Q8, Q9, Q10
- @ Pixel(4-7) Q11, Q12, Q13
- @
-
- @ Pixel(0-3)
- vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409
- vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208)
- vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516
-
- @ Pixel(4-7)
- vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409
- vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
- vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100)
- vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516
-
- @ Pixel(0-3)
- vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
-
- @ Pixel(4-7)
- vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
- vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
-
- vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit)
- vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit)
-
- subs r6, r6, #1 @ Checking length (r6)
- vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
-
- bne 1b @ if not done with length, loop
-
- vpop {Q4-Q7} @ Restore Vregisters
- pop {r4, r5, r6, lr} @ Restore int registers
- bx lr
-END(rsdIntrinsicYuv2_K)
/* Convolve 5x5 */
diff --git a/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S
new file mode 100644
index 0000000..da4cded
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_YuvToRGB.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register. This macro will be called from within several different wrapper
+ * variants for different data layouts. Y data starts in q8, but with the even
+ * and odd bytes split into d16 and d17 respectively. U and V are in d20
+ * and d21. Working constants are pre-loaded into q13-q15, and q3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
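+/* For example, the green channel is effectively computed as
+ *      g = satu16(149*y + (128*50 + 128*104 - 16*149))
+ *      g = satu16(g - (50*u + 104*v))
+ *      G = satu8((g + 64) >> 7)
+ * where g is half of the original 32-bit accumulator, so the final shift is
+ * 7 rather than 8 and the result matches
+ * (298*(y-16) - 100*(u-128) - 208*(v-128) + 128) >> 8.
+ */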
+.macro yuvkern
+ vmov.i8 d15, #149
+
+ vmull.u8 q1, d16, d15 // g0 = y0 * 149
+ vmull.u8 q5, d17, d15 // g1 = y1 * 149
+
+ vmov.i8 d14, #50
+ vmov.i8 d15, #104
+ vmull.u8 q8, d20, d14 // g2 = u * 50 + v * 104
+ vmlal.u8 q8, d21, d15
+
+ vshr.u8 d14, d21, #1
+ vaddw.u8 q0, q1, d14 // r0 = y0 * 149 + (v >> 1)
+ vaddw.u8 q4, q5, d14 // r1 = y1 * 149 + (v >> 1)
+
+ vshll.u8 q7, d20, #2
+ vadd.u16 q2, q1, q7 // b0 = y0 * 149 + (u << 2)
+ vadd.u16 q6, q5, q7 // b1 = y1 * 149 + (u << 2)
+
+ vmov.i8 d14, #204
+ vmov.i8 d15, #254
+ vmull.u8 q11, d21, d14 // r2 = v * 204
+ vmull.u8 q12, d20, d15 // b2 = u * 254
+
+ vhadd.u16 q0, q11 // r0 = (r0 + r2) >> 1
+ vhadd.u16 q4, q11 // r1 = (r1 + r2) >> 1
+ vqadd.u16 q1, q14 // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vqadd.u16 q5, q14 // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vhadd.u16 q2, q12 // b0 = (b0 + b2) >> 1
+ vhadd.u16 q6, q12 // b1 = (b1 + b2) >> 1
+
+ vqsub.u16 q0, q13 // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vqsub.u16 q4, q13 // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vqsub.u16 q1, q8 // g0 = satu16(g0 - g2)
+ vqsub.u16 q5, q8 // g1 = satu16(g1 - g2)
+ vqsub.u16 q2, q15 // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ vqsub.u16 q6, q15 // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ vqrshrn.u16 d0, q0, #6
+ vqrshrn.u16 d1, q1, #7
+ vqrshrn.u16 d2, q4, #6
+ vqrshrn.u16 d3, q5, #7
+ vqrshrn.u16 d4, q2, #6
+ vqrshrn.u16 d5, q6, #6
+
+ vzip.u8 q0, q1
+ vzip.u8 d4, d5
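+ // q0 and q1 now hold 16 R and 16 G bytes respectively, and d4/d5 hold
+ // the 16 B bytes, all in pixel order; q3 was pre-loaded with 0xff alpha.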
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+ movw r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vdup.i16 q13, r5
+ movw r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vdup.i16 q14, r5
+ movw r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ vdup.i16 q15, r5
+
+ vmov.i8 q3, #0xff
+
+ subs r2, #16
+ bhs 1f
+ b 2f
+
+ .align 4
+1: vld2.u8 {d16,d17}, [r1]!
+ pld [r1, #256]
+ .if \interleaved
+ vld2.u8 {d20,d21}, [r3]!
+ .if \swapuv
+ vswp d20, d21
+ .endif
+ pld [r3, #256]
+ .else
+ vld1.u8 d20, [r3]!
+ vld1.u8 d21, [r4]!
+ pld [r3, #128]
+ pld [r4, #128]
+ .endif
+
+ \kernel
+
+ subs r2, #16
+
+ vst4.u8 {d0,d2,d4,d6}, [r0]!
+ vst4.u8 {d1,d3,d5,d7}, [r0]!
+
+ bhs 1b
+
+2: adds r2, #16
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 16
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the
+ * interaction between neighbouring pixels is constrained to odd
+ * boundaries where the load operations don't interfere.
+ */
+ vmov.i8 q8, #0
+ vmov.i8 q10, #0
+
+ tst r2, #8
+ beq 1f
+ vld1.u8 d17, [r1]!
+ .if \interleaved
+ vld1.u8 d21, [r3]!
+ .else
+ vld1.u32 d20[1], [r3]!
+ vld1.u32 d21[1], [r4]!
+ .endif
+
+1: tst r2, #4
+ beq 1f
+ vld1.u32 d16[1], [r1]!
+ .if \interleaved
+ vld1.u32 d20[1], [r3]!
+ .else
+ vld1.u16 d20[1], [r3]!
+ vld1.u16 d21[1], [r4]!
+ .endif
+1: tst r2, #2
+ beq 1f
+ vld1.u16 d16[1], [r1]!
+ .if \interleaved
+ vld1.u16 d20[1], [r3]!
+ .else
+ vld1.u8 d20[1], [r3]!
+ vld1.u8 d21[1], [r4]!
+ .endif
+1: tst r2, #1
+ beq 1f
+ vld1.u8 d16[1], [r1]!
+ .if \interleaved
+ vld1.u8 d20[1], [r3]!
+ .else
+ vld1.u8 d20[0], [r3]!
+ vld1.u8 d21[0], [r4]!
+ .endif
+
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point if necessary.
+ */
+1: vuzp.8 d16, d17
+ .if \interleaved
+ vuzp.8 d20, d21
+ .if \swapuv
+ vswp d20, d21
+ .endif
+ .endif
+
+ \kernel
+
+ /* As above but with the output; structured stores for partial vectors
+ * aren't available, so the data is re-packed first and stored linearly.
+ */
+ vzip.8 q0, q2
+ vzip.8 q1, q3
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+
+1: tst r2, #8
+ beq 1f
+ vst1.u8 {d4,d5,d6,d7}, [r0]!
+
+1: tst r2, #4
+ beq 1f
+ vst1.u8 {d2,d3}, [r0]!
+1: tst r2, #2
+ beq 1f
+ vst1.u8 d1, [r0]!
+1: tst r2, #1
+ beq 2f
+ vst1.u32 d0[1], [r0]!
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uin, // r2
+ * void const *vin, // r3
+ * size_t xstart, // [sp]
+ * size_t xend); // [sp+#4]
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+ push {r4,r5}
+ ldr r5, [sp, #8]
+ mov r4, r3
+ mov r3, r2
+ ldr r2, [sp, #12]
+
+ add r0, r5, LSL #2
+ add r1, r5
+ add r3, r5, LSR #1
+ add r4, r5, LSR #1
+ sub r2, r5
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 0
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uvin, // r2
+ * size_t xstart, // r3
+ * size_t xend); // [sp]
+ */
+ENTRY(rsdIntrinsicYuv_K)
+ push {r4,r5}
+ bic r4, r3, #1
+ add r3, r2, r4
+ ldr r2, [sp, #8]
+
+ add r0, r4, LSL #2
+ add r1, r4
+ sub r2, r4
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 1, 1
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uvin, // r2
+ * size_t xstart, // r3
+ * size_t xend); // [sp]
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+ push {r4,r5}
+ bic r4, r3, #1
+ add r3, r2, r4
+ ldr r2, [sp, #8]
+
+ add r0, r4, LSL #2
+ add r1, r4
+ sub r2, r4
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 1
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuvR_K)