blob: 7a6d4c592bbf809ae4ca99141051404a077b7b88 [file] [log] [blame]
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
.macro vmxx_f32 i, mask, opd, opa, opb
.if (\i) & \mask
.if (\i) & (\mask - 1)
fmla \opd, \opa, \opb
.else
fmul \opd, \opa, \opb
.endif
.endif
.endm
.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
.if (\i) & \mask
.if (\i) & (\mask - 1)
fadd \opd, \opa, \opb
.else
mov \stupidsyntax1, \stupidsyntax2
.endif
.endif
.endm
.macro vmxx_s16 i, mask, opd, opa, opb
.if (\i) & \mask
.if (\i) & (\mask - 1 + 16)
smlal \opd, \opa, \opb
.else
smull \opd, \opa, \opb
.endif
.endif
.endm
.macro vmxx2_s16 i, mask, opd, opa, opb
.if (\i) & \mask
.if (\i) & (\mask - 1 + 16)
smlal2 \opd, \opa, \opb
.else
smull2 \opd, \opa, \opb
.endif
.endif
.endm
/* x0 = dst
* x1 = src
* x2 = count
* x3 = params
* x4 = column0_fn
* x5 = column1_fn
* x6 = column2_fn
* x7 = column3_fn
* x8 = store_fn
* x9 = load_fn
*/
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.align 6
colormatrix_int_col0_\i:
.if \i & 16
dup v6.4s, v4.s[0]
dup v7.4s, v4.s[0]
.endif
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4]
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4]
sqshrun v8.4h, v6.4s, #8
sqshrun2 v8.8h, v7.4s, #8
br x5
colormatrix_int_col0_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[0]
dup v7.4s, v4.s[0]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4]
sqshrun v8.4h, v6.4s, #8
sqshrun2 v8.8h, v7.4s, #8
br x5
.align 6
colormatrix_int_col1_\i:
.if \i & 16
dup v6.4s, v4.s[1]
dup v7.4s, v4.s[1]
.endif
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5]
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5]
sqshrun v9.4h, v6.4s, #8
sqshrun2 v9.8h, v7.4s, #8
br x6
colormatrix_int_col1_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[1]
dup v7.4s, v4.s[1]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5]
sqshrun v9.4h, v6.4s, #8
sqshrun2 v9.8h, v7.4s, #8
br x6
.align 6
colormatrix_int_col2_\i:
.if \i & 16
dup v6.4s, v4.s[2]
dup v7.4s, v4.s[2]
.endif
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6]
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6]
sqshrun v10.4h, v6.4s, #8
sqshrun2 v10.8h, v7.4s, #8
br x7
colormatrix_int_col2_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[2]
dup v7.4s, v4.s[2]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6]
sqshrun v10.4h, v6.4s, #8
sqshrun2 v10.8h, v7.4s, #8
br x7
.align 6
colormatrix_int_col3_\i:
.if \i & 16
dup v6.4s, v4.s[3]
dup v7.4s, v4.s[3]
.endif
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7]
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7]
sqshrun v11.4h, v6.4s, #8
sqshrun2 v11.8h, v7.4s, #8
br x8
colormatrix_int_col3_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[3]
dup v7.4s, v4.s[3]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7]
sqshrun v11.4h, v6.4s, #8
sqshrun2 v11.8h, v7.4s, #8
br x8
.align 5
colormatrix_float_col0_\i:
vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0]
vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0]
vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0]
vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0]
vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0]
vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0]
vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0]
vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0]
vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
br x5
.align 4
colormatrix_float_col0_n\i:
vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0]
vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0]
vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0]
vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0]
vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0]
vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0]
vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0]
vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0]
vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
br x5
.align 5
colormatrix_float_col1_\i:
vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1]
vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1]
vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1]
vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1]
vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1]
vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1]
vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1]
vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1]
vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
br x6
.align 4
colormatrix_float_col1_n\i:
vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1]
vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1]
vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1]
vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1]
vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1]
vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1]
vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1]
vmxx_f32 \i^31, 8, v17.4s, v23.4s, v3.s[1]
vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
br x6
.align 5
colormatrix_float_col2_\i:
vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2]
vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2]
vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2]
vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2]
vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2]
vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2]
vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2]
vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2]
vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
br x7
.align 4
colormatrix_float_col2_n\i:
vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2]
vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2]
vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2]
vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2]
vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2]
vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2]
vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2]
vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2]
vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
br x7
.align 5
colormatrix_float_col3_\i:
vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3]
vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3]
vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3]
vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3]
vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3]
vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3]
vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3]
vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3]
vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
br x8
.align 4
colormatrix_float_col3_n\i:
vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3]
vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3]
vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3]
vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3]
vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3]
vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3]
vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3]
vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3]
vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
br x8
.endr
.align 6
colormatrix_float_ldu4:
ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v23.8h, v23.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl v15.4s, v23.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
uxtl2 v23.4s, v23.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v15.4s, v15.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
ucvtf v23.4s, v23.4s
br x4
.align 5
colormatrix_int_ldu4:
ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
uxtl v15.8h, v15.8b
br x4
.align 6
colormatrix_float_ldu3:
ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
br x4
colormatrix_int_ldu3:
ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
br x4
.align 5
colormatrix_float_ldu1:
ld1 {v20.8b}, [x1], #8
uxtl v20.8h, v20.8b
uxtl v12.4s, v20.4h
uxtl2 v20.4s, v20.8h
ucvtf v12.4s, v12.4s
ucvtf v20.4s, v20.4s
br x4
.align 6
colormatrix_float_ldu2:
ld2 {v20.8b,v21.8b}, [x1], #16
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
br x4
.align 4
colormatrix_int_ldu2:
ld2 {v12.8b,v13.8b}, [x1], #16
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
br x4
.align 6
colormatrix_float_stu4:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v27.4s, v11.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
fcvtzs v31.4s, v19.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun v27.4h, v27.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
sqrshrun2 v27.8h, v31.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
uqxtn v26.8b, v26.8h
uqxtn v27.8b, v27.8h
subs x2, x2, #8
st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
blo colormatrix_float_end
br x9
.align 5
colormatrix_int_stu4:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
uqxtn v15.8b, v11.8h
subs x2, x2, #8
st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
blo colormatrix_int_end
br x9
.align 6
colormatrix_float_stu3:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
uqxtn v26.8b, v26.8h
movi v27.8b, #0
subs x2, x2, #8
st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
blo colormatrix_float_end
br x9
.align 4
colormatrix_int_ldu1:
ld1 {v12.8b}, [x1], #8
uxtl v12.8h, v12.8b
br x4
.align 5
colormatrix_int_stu3:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
movi v15.8b, #0
subs x2, x2, #8
st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
blo colormatrix_int_end
br x9
.align 6
colormatrix_float_stu2:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
subs x2, x2, #8
st2 {v24.8b,v25.8b}, [x0], #16
blo colormatrix_float_end
br x9
.align 5
colormatrix_int_stu2:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
subs x2, x2, #8
st2 {v12.8b,v13.8b}, [x0], #16
blo colormatrix_int_end
br x9
.align 5
colormatrix_int_stu1:
uqxtn v12.8b, v8.8h
subs x2, x2, #8
st1 {v12.8b}, [x0], #8
blo colormatrix_int_end
br x9
colormatrix_float_ldf3:
ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
br x4
.align 6
colormatrix_float_stu1:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v28.4s, v16.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
uqxtn v24.8b, v24.8h
subs x2, x2, #8
st1 {v24.8b}, [x0], #8
blo colormatrix_float_end
br x9
colormatrix_float_stf3:
movi v11.16b, #0
st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
movi v19.16b, #0
subs x2, x2, #8
st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
blo colormatrix_float_end
br x9
.align 5
colormatrix_float_stf4:
st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
subs x2, x2, #8
st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
blo colormatrix_float_end
br x9
colormatrix_float_ldf4:
ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
br x4
.align 5
colormatrix_float_stf2:
st2 {v8.4s, v9.4s}, [x0], #32
subs x2, x2, #8
st2 {v16.4s, v17.4s}, [x0], #32
blo colormatrix_float_end
br x9
colormatrix_float_ldf2:
ld2 {v12.4s,v13.4s}, [x1], #32
ld2 {v20.4s,v21.4s}, [x1], #32
br x4
.align 5
colormatrix_float_stf1:
st1 {v8.4s}, [x0], #16
subs x2, x2, #8
st1 {v16.4s}, [x0], #16
blo colormatrix_float_end
br x9
colormatrix_float_ldf1:
ld1 {v12.4s}, [x1], #16
ld1 {v20.4s}, [x1], #16
br x4
/* size_t rsdIntrinsicColorMatrix_int_K(
* void *out, // x0
* void const *in, // x1
* size_t count, // x2
* fntab_t const *fns, // x3
* int16_t const *mult, // x4
* int32_t const *add); // x5
*/
ENTRY(rsdIntrinsicColorMatrix_int_K)
stp x8,x9, [sp, #-16]!
sub x7, sp, #32
sub sp, sp, #64
st1 {v8.1d-v11.1d}, [sp]
st1 {v12.1d-v15.1d}, [x7]
ld1 {v0.8h,v1.8h}, [x4], #32
ld1 {v4.4s}, [x5], #16
ldp x4,x5, [x3],#16
ldp x6,x7, [x3],#16
ldp x8,x9, [x3],#16
dup v12.4s, v4.s[0]
dup v13.4s, v4.s[1]
dup v14.4s, v4.s[2]
dup v15.4s, v4.s[3]
sqshrun v8.4h, v12.4s, #8
sqshrun2 v8.8h, v12.4s, #8
sqshrun v9.4h, v13.4s, #8
sqshrun2 v9.8h, v13.4s, #8
sqshrun v10.4h, v14.4s, #8
sqshrun2 v10.8h, v14.4s, #8
sqshrun v11.4h, v15.4s, #8
sqshrun2 v11.8h, v15.4s, #8
subs x2, x2, #8
blo colormatrix_int_end
br x9
colormatrix_int_end:
ld1 {v8.1d-v11.1d}, [sp], #32
ld1 {v12.1d-v15.1d}, [sp], #32
ldp x8,x9, [sp], #16
add x0, x2, #8
ret
END(rsdIntrinsicColorMatrix_int_K)
/* void rsdIntrinsicColorMatrixSetup_int_K(
* fntab_t const *fns, // x0
* uint32_t mask, // x1
* int dt, // x2
* int st); // x3
*/
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
adr x4, 2f
ldrsh x2, [x4, x2, LSL #1]
add x2, x2, x4
adr x4, 3f
ldrsh x3, [x4, x3, LSL #1]
add x3, x3, x4
stp x2, x3, [x0, #32]
/* For each column function, if the matrix is all zeroes then write NULL,
* otherwise look up the appropriate function and store that. */
mov x3, #4
adr x4, 4f
1: ands x2, x1, #15
beq 9f
and x2, x1, #31
lsl x2, x2, #3
ldrsh x2, [x4, x2]
add x2, x2, x4
9: str x2, [x0], #8
lsr x1, x1, #5
add x4, x4, #2
subs x3, x3, #1
bne 1b
/* For every NULL entry, copy the non-NULL entry that follows it, or the store
* function. */
ldr x2, [x0]
mov x3, #4
1: ldr x1, [x0, #-8]!
cmp x1, #0
csel x2, x1, x2, ne
str x2, [x0]
subs x3, x3, #1
bne 1b
ret
.align 4
2: .hword colormatrix_int_stu1-2b
.hword colormatrix_int_stu2-2b
.hword colormatrix_int_stu3-2b
.hword colormatrix_int_stu4-2b
3: .hword colormatrix_int_ldu1-3b
.hword colormatrix_int_ldu2-3b
.hword colormatrix_int_ldu3-3b
.hword colormatrix_int_ldu4-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.hword colormatrix_int_col0_\i-4b
.hword colormatrix_int_col1_\i-4b-2
.hword colormatrix_int_col2_\i-4b-4
.hword colormatrix_int_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.hword colormatrix_int_col0_n\i-4b
.hword colormatrix_int_col1_n\i-4b-2
.hword colormatrix_int_col2_n\i-4b-4
.hword colormatrix_int_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_int_K)
/* size_t rsdIntrinsicColorMatrix_float_K(
* void *out, // x0
* void const *in, // x1
* size_t count, // x2
* fntab_t const *fns, // x3
* float const *mult, // x4
* float const *add); // x5
*/
ENTRY(rsdIntrinsicColorMatrix_float_K)
stp x8,x9, [sp, #-16]!
sub x7, sp, #32
sub sp, sp, #64
st1 {v8.1d-v11.1d}, [sp]
st1 {v12.1d-v15.1d}, [x7]
ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
ld1r {v4.4s}, [x5], #4
ld1r {v5.4s}, [x5], #4
ld1r {v6.4s}, [x5], #4
ld1r {v7.4s}, [x5], #4
ldp x4,x5, [x3], #16
ldp x6,x7, [x3], #16
ldp x8,x9, [x3], #16
mov v8.16b, v4.16b
mov v9.16b, v5.16b
mov v10.16b, v6.16b
mov v11.16b, v7.16b
mov v16.16b, v4.16b
mov v17.16b, v5.16b
mov v18.16b, v6.16b
mov v19.16b, v7.16b
subs x2, x2, #8
blo colormatrix_float_end
br x9
colormatrix_float_end:
ld1 {v8.1d-v11.1d}, [sp], #32
ld1 {v12.1d-v15.1d}, [sp], #32
ldp x8,x9, [sp], #16
add x0, x2, #8
ret
END(rsdIntrinsicColorMatrix_float_K)
/* void rsdIntrinsicColorMatrixSetup_float_K(
* fntab_t const *fns, // x0
* uint32_t mask, // x1
* int dt, // x2
* int st); // x3
*/
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
adr x4, 2f
ldrsh x2, [x4, x2, LSL #1]
add x2, x2, x4
adr x4, 3f
ldrsh x3, [x4, x3, LSL #1]
add x3, x3, x4
stp x2, x3, [x0, #32]
/* For each column function, if the matrix is all zeroes then write NULL,
* otherwise look up the appropriate function and store that. */
mov x3, #4
adr x4, 4f
1: ands x2, x1, #15
beq 9f
and x2, x1, #31
lsl x2, x2, #3
ldrsh x2, [x4, x2]
add x2, x2, x4
9: str x2, [x0], #8
lsr x1, x1, #5
add x4, x4, #2
subs x3, x3, #1
bne 1b
/* For every NULL entry, copy the non-NULL entry that follows it, or the store
* function. */
ldr x2, [x0]
mov x3, #4
1: ldr x1, [x0, #-8]!
cmp x1, #0
csel x2, x1, x2, ne
str x2, [x0]
subs x3, x3, #1
bne 1b
ret
.align 4
2: .hword colormatrix_float_stu1-2b
.hword colormatrix_float_stu2-2b
.hword colormatrix_float_stu3-2b
.hword colormatrix_float_stu4-2b
.hword colormatrix_float_stf1-2b
.hword colormatrix_float_stf2-2b
.hword colormatrix_float_stf3-2b
.hword colormatrix_float_stf4-2b
3: .hword colormatrix_float_ldu1-3b
.hword colormatrix_float_ldu2-3b
.hword colormatrix_float_ldu3-3b
.hword colormatrix_float_ldu4-3b
.hword colormatrix_float_ldf1-3b
.hword colormatrix_float_ldf2-3b
.hword colormatrix_float_ldf3-3b
.hword colormatrix_float_ldf4-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.hword colormatrix_float_col0_\i-4b
.hword colormatrix_float_col1_\i-4b-2
.hword colormatrix_float_col2_\i-4b-4
.hword colormatrix_float_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.hword colormatrix_float_col0_n\i-4b
.hword colormatrix_float_col1_n\i-4b-2
.hword colormatrix_float_col2_n\i-4b-4
.hword colormatrix_float_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_float_K)