/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
/* Perform the actual YuvToRGB conversion in a macro, from register to
* register. This macro will be called from within several different wrapper
* variants for different data layouts. Y data starts with the even and odd
* bytes split into the low parts of v8 and v9 respectively. U and V are in
* v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is
* pre-loaded with a constant 0xff alpha channel.
*
* The complicated arithmetic is the result of refactoring the original
* equations to avoid 16-bit overflow without losing any precision.
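 *
 * Working backwards from the constants used below, the kernel computes,
 * rounded and saturated to [0, 255] (a derivation, not taken from the
 * original equations):
 *
 *   R = (149 * (Y - 16) + 204.5 * (V - 128)) / 128
 *   G = (149 * (Y - 16) - 50 * (U - 128) - 104 * (V - 128)) / 128
 *   B = (149 * (Y - 16) + 258 * (U - 128)) / 128
 *
 * i.e. the BT.601 video-range coefficients (1.164, 1.596, 0.391, 0.813,
 * 2.018) scaled by 128. The multipliers 204.5 and 258 don't fit into a
 * single 8-bit operand, so the (v >> 1) and (u << 2) terms below top up
 * the 204 and 254 products, and the red/blue sums are halved (uhadd) to
 * stay within 16 bits. The green sum already fits in 16 bits, which is
 * why it alone skips the pre-halving and narrows with a shift of 7.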
*/
.macro yuvkern
movi v7.8b, #149
umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149
umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149
movi v7.8b, #50
movi v10.8b, #104
umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104
umlal v8.8h, v17.8b, v10.8b
ushr v7.8b, v17.8b, #1
uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1)
uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1)
ushll v7.8h, v16.8b, #2
add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2)
add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2)
movi v7.16b, #204
movi v10.8b, #254
umull v11.8h, v17.8b, v7.8b // r2 = v * 204
umull v12.8h, v16.8b, v10.8b // b2 = u * 254
uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1
uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1
uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1
uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1
uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2)
uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
uqrshrn v0.8b, v0.8h, #6 // r0 = satu8((r0 + 32) >> 6)
uqrshrn v4.8b, v4.8h, #6 // r1 = satu8((r1 + 32) >> 6)
uqrshrn v1.8b, v1.8h, #7 // g0 = satu8((g0 + 64) >> 7)
uqrshrn v5.8b, v5.8h, #7 // g1 = satu8((g1 + 64) >> 7)
uqrshrn v2.8b, v2.8h, #6 // b0 = satu8((b0 + 32) >> 6)
uqrshrn v6.8b, v6.8h, #6 // b1 = satu8((b1 + 32) >> 6)
zip1 v0.16b, v0.16b, v4.16b // interleave even and odd red bytes
zip1 v1.16b, v1.16b, v5.16b // interleave even and odd green bytes
zip1 v2.16b, v2.16b, v6.16b // interleave even and odd blue bytes
.endm
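/* For reference, a scalar C sketch of the arithmetic performed by yuvkern.
 * This is an illustration, not part of the original file: the helper names
 * are made up, and the rounding of the red/blue paths can differ from the
 * vector code by the low bit truncated in uhadd's halving average.
 *
 *   #include <stdint.h>
 *
 *   static uint8_t satrnd(int32_t x, int shift) {
 *       x = (x + (1 << (shift - 1))) >> shift;          // round to nearest
 *       return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;  // saturate
 *   }
 *
 *   static void yuv2rgba(uint8_t y, uint8_t u, uint8_t v, uint8_t rgba[4]) {
 *       int32_t y149 = 149 * y;
 *       rgba[0] = satrnd(y149 + 204 * v + (v >> 1)
 *                        - (16 * 149 + (128 >> 1) + 128 * 204), 7);
 *       rgba[1] = satrnd(y149 - 50 * u - 104 * v
 *                        + (-16 * 149 + 128 * 50 + 128 * 104), 7);
 *       rgba[2] = satrnd(y149 + 254 * u + (u << 2)
 *                        - (16 * 149 + (128 << 2) + 128 * 254), 7);
 *       rgba[3] = 0xff;
 *   }
 */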
/* Define the wrapper code which will load and store the data, iterate the
* correct number of times, and safely handle the remainder at the end of the
* loop. Some sections of code are switched out depending on the data packing
* being handled.
*/
.macro wrap_line kernel, interleaved=0, swapuv=0
mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1) // pre-halved red bias
dup v13.8h, w5
mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0) // green bias
dup v14.8h, w5
mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1) // pre-halved blue bias
dup v15.8h, w5
movi v3.16b, #0xff // constant 0xff alpha channel
subs x2, x2, #16 // at least one full 16-pixel block?
bhs 1f
b 2f
.align 4
1: ld2 {v8.8b,v9.8b}, [x1], #16 // 16 Y bytes: even pixels to v8, odd to v9
// prfm PLDL1STRM, [x1, #256]
.if \interleaved
.if \swapuv
ld2 {v17.8b,v18.8b}, [x3], #16 // first byte of each pair (V) to v17
mov v16.8b, v18.8b // second byte of each pair (U) to v16
.else
ld2 {v16.8b,v17.8b}, [x3], #16 // U to v16, V to v17
.endif
// prfm PLDL1STRM, [x3, #256]
.else
ld1 {v16.8b}, [x3], #8 // 8 bytes of the U plane
ld1 {v17.8b}, [x4], #8 // 8 bytes of the V plane
// prfm PLDL1STRM, [x3, #128]
// prfm PLDL1STRM, [x4, #128]
.endif
\kernel
subs x2, x2, #16 // at least 16 more pixels?
st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64 // store 16 RGBA pixels
bhs 1b
2: adds x2, x2, #16 // restore the remainder count
beq 2f // nothing left over?
/* To handle the tail portion of the data (something less than 16
* bytes) load small power-of-two chunks into working registers. It
* doesn't matter where they end up in the register; the same process
* will store them back out using the same positions and the
* interaction between neighbouring pixels is constrained to odd
* boundaries where the load operations don't interfere.
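* For example, a remainder of 13 (0b1101) takes the 8-byte, 4-byte and
* 1-byte loads below, selected by the tbz tests of bits 3, 2 and 0 of the
* count.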
*/
movi v8.8b, #0 // zero the working registers so that lanes the
movi v9.8b, #0 // tail doesn't load hold a well-defined value
movi v16.8b, #0
movi v17.8b, #0
tbz x2, #3, 1f // no 8-pixel chunk in the remainder?
ld1 {v9.8b}, [x1], #8 // 8 Y bytes
.if \interleaved
ld1 {v17.8b}, [x3], #8 // 4 interleaved chroma pairs
.else
ld1 {v16.s}[1], [x3], #4 // 4 U bytes
ld1 {v17.s}[1], [x4], #4 // 4 V bytes
.endif
1: tbz x2, #2, 1f
ld1 {v8.s}[1], [x1], #4
.if \interleaved
ld1 {v16.s}[1], [x3], #4
.else
ld1 {v16.h}[1], [x3], #2
ld1 {v17.h}[1], [x4], #2
.endif
1: tbz x2, #1, 1f
ld1 {v8.h}[1], [x1], #2
.if \interleaved
ld1 {v16.h}[1], [x3], #2
.else
ld1 {v16.b}[1], [x3], #1
ld1 {v17.b}[1], [x4], #1
.endif
1: tbz x2, #0, 1f
ld1 {v8.b}[1], [x1], #1
.if \interleaved
ld1 {v16.h}[0], [x3], #2
.else
ld1 {v16.b}[0], [x3], #1
ld1 {v17.b}[0], [x4], #1
.endif
/* One small impediment in the process above is that some of the load
* operations can't perform byte-wise structure deinterleaving at the
* same time as loading only part of a register. So the data is loaded
* linearly and unpacked manually at this point if necessary.
*/
1: mov v8.d[1], v9.d[0] // concatenate the linear Y tail in v8.16b
uzp2 v9.16b, v8.16b, v8.16b // odd-indexed Y bytes to the low half of v9
uzp1 v8.16b, v8.16b, v8.16b // even-indexed Y bytes to the low half of v8
.if \interleaved
mov v16.d[1], v17.d[0] // concatenate the linear chroma tail in v16.16b
.if \swapuv
uzp1 v17.16b, v16.16b, v16.16b // first byte of each pair (V) to v17
uzp2 v16.16b, v16.16b, v16.16b // second byte of each pair (U) to v16
.else
uzp2 v17.16b, v16.16b, v16.16b // second byte of each pair (V) to v17
uzp1 v16.16b, v16.16b, v16.16b // first byte of each pair (U) to v16
.endif
.endif
\kernel
/* As above but with the output; structured stores for partial vectors
* aren't available, so the data is re-packed first and stored linearly.
*/
zip1 v4.16b, v0.16b, v2.16b // red/blue pairs, pixels 0-7
zip2 v6.16b, v0.16b, v2.16b // red/blue pairs, pixels 8-15
zip1 v5.16b, v1.16b, v3.16b // green/alpha pairs, pixels 0-7
zip2 v7.16b, v1.16b, v3.16b // green/alpha pairs, pixels 8-15
zip1 v0.16b, v4.16b, v5.16b // linear RGBA, pixels 0-3
zip2 v1.16b, v4.16b, v5.16b // linear RGBA, pixels 4-7
zip1 v2.16b, v6.16b, v7.16b // linear RGBA, pixels 8-11
zip2 v3.16b, v6.16b, v7.16b // linear RGBA, pixels 12-15
1: tbz x2, #3, 1f
st1 {v2.16b,v3.16b}, [x0], #32 // 8 pixels
1: tbz x2, #2, 1f
st1 {v1.16b}, [x0], #16 // 4 pixels
1: tbz x2, #1, 1f
st1 {v0.d}[1], [x0], #8 // 2 pixels
1: tbz x2, #0, 2f
st1 {v0.s}[1], [x0], #4 // final odd pixel
2:
.endm
/* void rsdIntrinsicYuv2_K(
* void *out, // x0
* void const *yin, // x1
* void const *uin, // x2
* void const *vin, // x3
* size_t xstart, // x4
* size_t xend); // x5
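*
* The chroma is planar: uin and vin are separate half-width planes
* (I420-style). xstart is presumably even; the code below halves it for
* the chroma pointers without rounding.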
*/
ENTRY(rsdIntrinsicYuv2_K)
lsr x6, x4, #1 // x6 = xstart / 2 (chroma is half width)
add x0, x0, x4, LSL #2 // out += xstart * 4 (RGBA)
add x1, x1, x4 // yin += xstart
add x4, x3, x6 // x4 = vin + xstart / 2
add x3, x2, x6 // x3 = uin + xstart / 2
sub x2, x5, x6, LSL #1 // x2 = pixel count, xend - xstart
sub x6, sp, #32
sub sp, sp, #64 // save the callee-saved low halves of v8-v15
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x6]
wrap_line yuvkern, 0
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuv2_K)
/* void rsdIntrinsicYuv_K(
* void *out, // x0
* void const *yin, // x1
* void const *uvin, // x2
* size_t xstart, // x3
* size_t xend); // x4
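*
* uvin points to interleaved chroma with V in the first byte of each pair
* (NV21-style), hence the swapuv=1 wrapper instantiation below.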
*/
ENTRY(rsdIntrinsicYuv_K)
bic x5, x3, #1 // round xstart down to a pixel-pair boundary
add x0, x0, x5, LSL #2 // out += xstart * 4 (RGBA)
add x1, x1, x5 // yin += xstart
add x3, x2, x5 // uvin += xstart (i.e. xstart / 2 two-byte pairs)
sub x2, x4, x5 // x2 = pixel count, xend - xstart
sub x5, sp, #32
sub sp, sp, #64 // save the callee-saved low halves of v8-v15
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x5]
wrap_line yuvkern, 1, 1
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuv_K)
/* void rsdIntrinsicYuvR_K(
* void *out, // x0
* void const *yin, // x1
* void const *uvin, // x2
* size_t xstart, // x3
* size_t xend); // x4
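*
* As rsdIntrinsicYuv_K, but uvin carries U in the first byte of each pair
* (NV12-style), so the wrapper below is instantiated without swapuv.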
*/
ENTRY(rsdIntrinsicYuvR_K)
bic x5, x3, #1
add x0, x0, x5, LSL #2
add x1, x1, x5
add x3, x2, x5
sub x2, x4, x5
sub x5, sp, #32
sub sp, sp, #64
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x5]
wrap_line yuvkern, 1
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ret
END(rsdIntrinsicYuvR_K)