| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| |
| |
| .data |
| .p2align 2 |
| |
| @ Jump table used for the final prediction step: entry n holds the address of |
| @ the code that builds the 4x4 prediction for intra mode n. The function |
| @ indexes it with (best mode << 2), so the entry order must follow the mode |
| @ numbering and every entry must stay one 32-bit word wide. |
| scratch_intrapred_luma_4x4_prediction: |
| .long ver @ mode 0 |
| .long hor @ mode 1 |
| .long d_c @ mode 2 |
| .long dia_dl @ mode 3 |
| .long dia_dr @ mode 4 |
| .long ver_r @ mode 5 |
| .long hor_d @ mode 6 |
| .long ver_l @ mode 7 |
| .long hor_u @ mode 8 |
| |
| |
| .text |
| .p2align 2 |
| |
| @ PC-relative offset of the jump table scratch_intrapred_luma_4x4_prediction. |
| @ The function loads this word and adds pc to it at label scrintra_4x4; the |
| @ extra -8 cancels the amount by which pc reads ahead of that instruction. |
| @ NOTE(review): the 8-byte read-ahead is the ARM-state value - confirm this |
| @ file is never assembled as Thumb. |
| scratch_intrapred_luma_4x4_prediction_addr1: |
| .long scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8 |
| |
| |
| |
| @/** |
| @****************************************************************************** |
| @* |
| @* @brief :Evaluate best intra 4x4 mode |
| @* and do the prediction. |
| @* |
| @* @par Description |
| @* This function evaluates the valid intra 4x4 modes, computes the cost of |
| @* each, and writes the block predicted with the best mode to pu1_dst. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] pu1_ngbr_pels |
| @* UWORD8 pointer to neighbouring pels |
| @* |
| @* @param[out] pu1_dst |
| @* UWORD8 pointer to the destination |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] dst_strd |
| @* integer destination stride |
| @* |
| @* @param[in] u4_n_avblty |
| @* availability of neighbouring pixels |
| @* |
| @* @param[out] u4_intra_mode |
| @* Pointer to the variable in which best mode is returned |
| @* |
| @* @param[out] pu4_sadmin |
| @* Pointer to the variable in which minimum cost is returned |
| @* |
| @* @param[in] u4_valid_intra_modes |
| @* Says what all modes are valid |
| @* |
| @* @param[in] u4_lambda |
| @* Lambda value for computing cost from SAD |
| @* |
| @* @param[in] u4_predictd_mode |
| @* Predicted mode for cost computation |
| @* |
| @* |
| @* |
| @* @return none |
| @* |
| @****************************************************************************** |
| @*/ |
| @void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src, |
| @ UWORD8 *pu1_ngbr_pels, |
| @ UWORD8 *pu1_dst, |
| @ UWORD32 src_strd, |
| @ UWORD32 dst_strd, |
| @ WORD32 u4_n_avblty, |
| @ UWORD32 *u4_intra_mode, |
| @ WORD32 *pu4_sadmin, |
| @ UWORD32 u4_valid_intra_modes, |
| @ UWORD32 u4_lambda, |
| @ UWORD32 u4_predictd_mode) |
| |
| |
| |
| .global ih264e_evaluate_intra_4x4_modes_a9q |
| |
| @------------------------------------------------------------------------------ |
| @ ih264e_evaluate_intra_4x4_modes_a9q |
| @ |
| @ For every mode n (0..8) whose bit n is set in u4_valid_intra_modes, builds |
| @ the 4x4 prediction from the neighbour pels and computes |
| @     cost = SAD(source block, prediction) + penalty |
| @     penalty = u4_lambda if n == u4_predictd_mode, otherwise 4 * u4_lambda |
| @ The first mode with the strictly lowest cost wins. The minimum cost is |
| @ stored to *pu4_sadmin, the winning mode to *u4_intra_mode, and the winning |
| @ prediction is written to pu1_dst as four 4-byte rows, dst_strd apart. |
| @ |
| @ Arguments on entry (the first four in registers, the rest on the stack): |
| @     r0 = pu1_src, r1 = pu1_ngbr_pels, r2 = pu1_dst, r3 = src_strd |
| @ Stack argument offsets from sp once both saves below are done |
| @ (40 bytes of core registers + 64 bytes of d8-d15 = 104 bytes): |
| @     #104 dst_strd       #108 u4_n_avblty           #112 u4_intra_mode |
| @     #116 pu4_sadmin     #120 u4_valid_intra_modes  #124 u4_lambda |
| @     #128 u4_predictd_mode |
| @ |
| @ Core register roles: |
| @     r0  pu1_src while loading the source, then u4_lambda |
| @     r1  pu1_ngbr_pels while loading neighbours, then u4_predictd_mode |
| @     r2  pu1_dst |
| @     r3  src_strd, later the jump-table address |
| @     r4  scratch, DC value, finally dst_strd |
| @     r5  u4_n_avblty, later the branch target |
| @     r6  mode penalty / scratch, finally u4_intra_mode pointer |
| @     r7  pu4_sadmin pointer |
| @     r8  u4_valid_intra_modes |
| @     r9  cost of the mode being evaluated |
| @     r10 throw-away result of the flag-setting ands |
| @     r11 minimum cost so far |
| @     r12 best mode so far |
| @     r14 scratch |
| @ NEON register roles: |
| @     q0  neighbour pels              q10 source block (d20 rows 0-1, d21 rows 2-3) |
| @     q3  HORZ prediction             q4  scratch, later HORZ_D prediction |
| @     q5  two-tap filtered neighbours q6  three-tap filtered neighbours |
| @     q7  DIAG_DL   q8 DIAG_DR   q9 VERT_R   q12 VERT_L   q1 HORZ_U predictions |
| @     q13 32-bit lane-select mask     q14 SAD accumulator / mask, q15 scratch |
| @ |
| @ NOTE(review): the lane usage below implies pu1_ngbr_pels holds the left |
| @ column bottom-to-top in bytes 0..3, the top-left pel in byte 4 and the top / |
| @ top-right row in bytes 5..12 - confirm against the caller. |
| @ NOTE(review): NEON instructions do not touch the core flags, so each |
| @ subs ... lslne ... moveq sequence stays valid across the interleaved vector |
| @ code; keep that in mind before reordering anything. |
| @------------------------------------------------------------------------------ |
| ih264e_evaluate_intra_4x4_modes_a9q: |
| |
|     stmfd sp!, {r4-r12, r14} @ save core registers and lr (40 bytes) |
| |
|     ldr r5, [sp, #44] @ r5 = u4_n_avblty (read before vpush moves sp) |
|     vpush {d8-d15} @ save d8-d15 (64 bytes) |
| |
|     @ Load the neighbour pels. |
|     @ NOTE(review): this reads 16 bytes although only indices 0..12 are |
|     @ addressed explicitly - confirm the buffer has 16 readable bytes. |
|     vld1.32 {q0}, [r1] |
|     add r4, r1, #12 |
|     vld1.8 d1[5], [r4] @ q0 byte 13 = pu1_ngbr_pels[12] (last pel repeated) |
|     vld1.8 d1[7], [r1] @ q0 byte 15 = pu1_ngbr_pels[0], feeds the wrap-around filter lane |
|     ldr r8, [sp, #120] @ r8 = u4_valid_intra_modes |
| |
|     @ Load the 4x4 source block, one 32-bit row per lane. |
|     vld1.32 {d20[0]}, [r0], r3 |
|     vext.8 q1, q0, q0, #1 @ q1[i] = q0[i + 1] (byte rotate) |
|     vld1.32 {d20[1]}, [r0], r3 |
|     mov r11, #1 |
|     vld1.32 {d21[0]}, [r0], r3 |
|     lsl r11, r11, #30 @ r11 = minimum cost, starts at 1 << 30 |
|     vld1.32 {d21[1]}, [r0], r3 |
|     mov r12, #0 @ best mode defaults to 0 so that the stored mode and the |
|     @ jump-table index are defined even if no mode is selected below; the |
|     @ mode-0 final path (label ver) only needs q0, which is always loaded |
| |
|     ldr r0, [sp, #124] @ r0 = u4_lambda |
|     ldr r1, [sp, #128] @ r1 = u4_predictd_mode |
| |
| |
| vert: |
|     ands r10, r8, #01 @ mode 0 (VERT) requested? |
|     beq horz |
|     vdup.32 q2, d2[1] @ every row = q1 bytes 4..7 (q0 bytes 5..8) |
|     vabdl.u8 q14, d4, d20 |
|     vabal.u8 q14, d4, d21 |
|     vadd.i16 d28, d29, d28 |
|     subs r6, r1, #0 @ Z set when u4_predictd_mode == 0 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 @ penalty = 4 * lambda when not the predicted mode |
|     vpaddl.u32 d28, d28 |
|     moveq r6, r0 @ penalty = lambda when it is the predicted mode |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 @ strictly cheaper than the current minimum? |
|     movgt r11, r9 |
|     movgt r12, #0 |
| |
| horz: |
|     ands r10, r8, #02 @ mode 1 (HORZ) requested? |
|     beq dc |
|     @ Spread q0 bytes 0..3 so that each one fills a whole row: |
|     @ d6 = rows 0-1 (bytes 3 and 2), d7 = rows 2-3 (bytes 1 and 0). |
|     vdup.32 q3, d0[0] |
|     vmov.32 q4, q3 |
|     vtrn.8 q3, q4 |
|     vtrn.16 d7, d6 |
|     vtrn.16 d9, d8 |
|     vtrn.32 d9, d7 |
|     vtrn.32 d8, d6 |
|     vabdl.u8 q14, d6, d20 |
|     subs r6, r1, #1 @ Z set when u4_predictd_mode == 1 |
|     vabal.u8 q14, d7, d21 |
|     vadd.i16 d28, d29, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u16 d28, d28 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #1 |
| |
| dc: |
|     ands r10, r8, #04 @ mode 2 (DC) requested? |
|     beq diags |
|     vext.8 q4, q0, q0, #5 |
|     vaddl.u8 q4, d0, d8 @ 16-bit lanes 0..3 = q0[i] + q0[i + 5] |
|     vpaddl.u16 d8, d8 |
|     vpaddl.u32 d8, d8 |
|     vmov.u32 r4, d8[0] @ r4 = sum of q0 bytes 0..3 and 5..8 |
|     mov r14, #1 @ r14 = shift amount |
|     ands r10, r5, #1 @ availability bit 0 set: add rounding, widen shift |
|     addne r4, r4, #2 |
|     addne r14, r14, #1 |
|     ands r10, r5, #4 @ availability bit 2 set: add rounding, widen shift |
|     addne r4, r4, #2 |
|     addne r14, r14, #1 |
|     ands r10, r5, #5 @ neither bit set: DC value is 128 |
|     moveq r4, #128 |
|     moveq r14, #0 |
|     subs r6, r1, #2 @ Z set when u4_predictd_mode == 2 |
|     lsr r4, r4, r14 @ r4 = DC value, reused by the final d_c path |
|     vdup.8 q4, r4 |
|     lslne r6, r0, #2 |
|     vabdl.u8 q14, d8, d20 |
|     vabal.u8 q14, d9, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #2 |
| |
| diags: |
|     ands r10, r8, #504 @ any of modes 3..8 requested (bits 3..8)? |
|     beq pred |
|     @ Filter all neighbours once (indices wrap around within q0): |
|     @ q5[i] = (q0[i] + q0[i+1] + 1) >> 1 |
|     @ q6[i] = (q0[i] + 2 * q0[i+1] + q0[i+2] + 2) >> 2 |
|     vext.8 q5, q0, q0, #2 |
|     vaddl.u8 q6, d0, d2 |
|     vaddl.u8 q7, d1, d3 |
|     vaddl.u8 q8, d10, d2 |
|     vaddl.u8 q9, d11, d3 |
|     vqrshrun.s16 d10, q6, #1 |
|     vqrshrun.s16 d11, q7, #1 |
|     vadd.u16 q11, q6, q8 |
|     vadd.u16 q12, q7, q9 |
|     vqrshrun.s16 d12, q11, #2 |
|     vqrshrun.s16 d13, q12, #2 |
|     @ q13 = mask with only the lowest 32-bit lane set. d26 therefore selects |
|     @ lane 0 of a d register in vbit; rotating q13 by three lanes (done per |
|     @ mode into q14) yields d28 selecting lane 1. |
|     mov r14, #0 |
|     vdup.32 q13 , r14 |
|     mov r14, #-1 |
|     vmov.i32 d26[0], r14 |
| |
| diag_dl: |
|     ands r10, r8, #0x08 @ mode 3 (DIAG_DL) requested? |
|     beq diag_dr |
| |
|     @ q7 rows 0..3 = q6 bytes 5-8, 6-9, 7-10, 8-11 |
|     vext.8 q15, q6, q6, #5 |
|     vbit.32 d14, d30, d26 |
|     vext.8 q15, q6, q6, #15 |
|     vbit.32 d15, d31, d26 |
|     vext.8 q15, q6, q6, #2 |
|     vext.32 q14, q13, q13, #3 |
|     vbit.32 d14, d30, d28 |
|     vext.8 q15, q6, q6, #4 |
|     vbit.32 d15, d30, d28 |
|     vabdl.u8 q14, d14, d20 |
|     subs r6, r1, #3 @ Z set when u4_predictd_mode == 3 |
|     vabal.u8 q14, d15, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #3 |
| |
| diag_dr: |
|     ands r10, r8, #16 @ mode 4 (DIAG_DR) requested? |
|     beq vert_r |
| |
|     @ q8 rows 0..3 = q6 bytes 3-6, 2-5, 1-4, 0-3 |
|     vext.8 q15, q6, q6, #3 |
|     vbit.32 d16, d30, d26 |
|     vext.8 q15, q6, q6, #1 |
|     vbit.32 d17, d30, d26 |
|     vext.8 q15, q6, q6, #4 |
|     vext.32 q14, q13, q13, #3 |
|     vbit.32 d17, d31, d28 |
|     vext.8 q15, q6, q6, #6 |
|     vbit.32 d16, d31, d28 |
|     vabdl.u8 q14, d16, d20 |
|     subs r6, r1, #4 @ Z set when u4_predictd_mode == 4 |
|     vabal.u8 q14, d17, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #4 |
| |
| vert_r: |
|     ands r10, r8, #32 @ mode 5 (VERT_R) requested? |
|     beq horz_d |
|     @ q9 row 0 = q5 bytes 4-7, row 1 = q6 bytes 3-6, |
|     @ row 2 = q6 byte 2 then q5 bytes 4-6, row 3 = q6 byte 1 then q6 bytes 3-5 |
|     vext.8 q15, q5, q5, #4 |
|     vbit.32 d18, d30, d26 |
|     vext.8 q15, q5, q5, #3 |
|     vbit.32 d19, d30, d26 |
|     vext.32 q14, q13, q13, #3 |
|     vext.8 q15, q6, q6, #15 |
|     vbit.32 d18, d30, d28 |
|     vext.8 q15, q6, q6, #14 |
|     vbit.32 d19, d30, d28 |
|     mov r14, #0 |
|     vdup.32 q14 , r14 |
|     mov r14, #0xff |
|     vmov.i8 d28[0], r14 @ d28 now selects byte 0 only |
|     vext.8 q15, q6, q6, #2 |
|     vbit.32 d19, d30, d28 @ patch first pel of row 2 |
|     vext.32 q14, q14, q14, #3 @ d28 now selects byte 4 only |
|     subs r6, r1, #5 @ Z set when u4_predictd_mode == 5 |
|     vext.8 q15, q6, q6, #13 |
|     vbit.32 d19, d30, d28 @ patch first pel of row 3 |
|     lslne r6, r0, #2 |
|     vabdl.u8 q14, d18, d20 |
|     vabal.u8 q14, d19, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #5 |
| |
| horz_d: |
|     @ q1 = q5 and q6 interleaved byte-wise (q5[0], q6[0], q5[1], q6[1], ...). |
|     @ Done before the mode test because horz_u also consumes q1. |
|     vmov.8 q1, q5 |
|     vmov.8 q15, q6 |
|     vzip.8 q1, q15 |
| |
|     ands r10, r8, #64 @ mode 6 (HORZ_D) requested? |
|     beq vert_l |
|     @ q4 row 0 = q5 byte 3 then q6 bytes 3-5, |
|     @ rows 1..3 = q1 bytes 4-7, 2-5, 0-3 |
|     vext.8 q15, q6, q6, #2 |
|     vbit.32 d8, d30, d26 |
|     mov r14, #0 |
|     vdup.32 q14 , r14 |
|     mov r14, #0xff |
|     vmov.i8 d28[0], r14 @ d28 selects byte 0 only |
|     vext.8 q15, q5, q5, #3 |
|     vbit.32 d8, d30, d28 @ patch first pel of row 0 |
|     vext.8 q15, q1, q1, #2 |
|     vbit.32 d9, d30, d26 |
|     vext.32 q14, q13, q13, #3 |
|     vbit.32 d8, d2, d28 |
|     subs r6, r1, #6 @ Z set when u4_predictd_mode == 6 |
|     vext.8 q15, q1, q1, #12 |
|     vbit.32 d9, d30, d28 |
|     vabdl.u8 q14, d8, d20 |
|     vabal.u8 q14, d9, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #6 |
| vert_l: |
|     ands r10, r8, #128 @ mode 7 (VERT_L) requested? |
|     beq horz_u |
|     @ q12 rows 0..3 = q5 bytes 5-8, q6 bytes 5-8, q5 bytes 6-9, q6 bytes 6-9 |
|     vext.8 q15, q5, q5, #5 |
|     vbit.32 d24, d30, d26 |
|     vext.8 q15, q15, q15, #1 |
|     vbit.32 d25, d30, d26 |
|     vext.8 q15, q6, q6, #1 |
|     vext.32 q14, q13, q13, #3 |
|     vbit.32 d24, d30, d28 |
|     vext.8 q15, q15, q15, #1 |
|     subs r6, r1, #7 @ Z set when u4_predictd_mode == 7 |
|     vbit.32 d25, d30, d28 |
|     vabdl.u8 q14, d24, d20 |
|     vabal.u8 q14, d25, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #7 |
| |
| horz_u: |
|     ands r10, r8, #256 @ mode 8 (HORZ_U) requested? |
|     beq pred |
|     @ This is the last mode, so q5 and q6 are free to be overwritten here. |
|     vrev64.8 q5, q1 @ byte-reverse each half of the interleaved filters |
|     vdup.8 q1, d0[0] @ start from q0 byte 0 in every position |
|     vext.8 q6, q6, q6, #7 @ bring the wrap-around lane q6[15] to d13 byte 0 |
|     mov r14, #0 |
|     vdup.32 q14 , r14 |
|     mov r14, #0xff |
|     vmov.i8 d28[0], r14 @ d28 selects byte 0 only |
|     vbit.32 d11, d13, d28 |
|     movw r14, #0xffff |
|     vmov.i16 d28[0], r14 @ d28 selects bytes 0-1 |
|     vext.8 q6, q5, q5, #7 |
|     subs r6, r1, #8 @ Z set when u4_predictd_mode == 8 |
|     vbit.32 d3, d12, d28 @ first two pels of row 2; rest of rows 2-3 stay q0 byte 0 |
|     vext.8 q6, q5, q5, #3 |
|     vbit.32 d2, d12, d26 @ row 0 |
|     vext.32 q14, q13, q13, #3 |
|     vext.8 q6, q5, q5, #1 |
|     vbit.32 d2, d12, d28 @ row 1 |
|     vabdl.u8 q14, d2, d20 |
|     vabal.u8 q14, d3, d21 |
|     vadd.i16 d28, d29, d28 |
|     vpaddl.u16 d28, d28 |
|     lslne r6, r0, #2 |
|     vpaddl.u32 d28, d28 |
|     vmov.u32 r9, d28[0] @ r9 = SAD |
| |
| |
|     moveq r6, r0 |
|     add r9, r6, r9 @ r9 = cost |
| |
|     subs r6, r11, r9 |
|     movgt r11, r9 |
|     movgt r12, #8 |
| |
| pred: @ publish the result and build the final prediction |
|     ldr r7, [sp, #116] @ r7 = pu4_sadmin |
|     ldr r6, [sp, #112] @ r6 = u4_intra_mode |
|     str r11, [r7] @ *pu4_sadmin = minimum cost |
|     str r12, [r6] @ *u4_intra_mode = best mode |
| |
| |
|     @ r3 = address of the jump table: the literal holds (table - scrintra_4x4 - 8) |
|     @ and pc read at scrintra_4x4 supplies the matching scrintra_4x4 + 8. |
|     ldr r3, scratch_intrapred_luma_4x4_prediction_addr1 |
| scrintra_4x4: |
|     add r3, r3, pc |
|     lsl r12, r12, #2 @ one 32-bit entry per mode |
|     add r3, r3, r12 |
| |
|     ldr r5, [r3] |
|     and r5, r5, #0xfffffffe @ clear bit 0 of the target before bx |
| |
|     bx r5 |
| |
| |
| ver: |
|     vext.8 q0, q0, q0, #1 |
|     vdup.32 q15, d0[1] @ every row = original q0 bytes 5..8 |
|     b store |
| |
| hor: |
|     vmov.32 q15, q3 |
|     b store |
| |
| d_c: |
|     vdup.8 q15, r4 @ r4 still holds the DC value computed under dc |
|     b store |
| |
| dia_dl: |
|     vmov.32 q15, q7 |
|     b store |
| |
| dia_dr: |
|     vmov.32 q15, q8 |
|     b store |
| |
| ver_r: |
|     vmov.32 q15, q9 |
|     b store |
| |
| hor_d: |
|     vmov.32 q15, q4 |
|     b store |
| |
| ver_l: |
|     vmov.32 q15, q12 |
|     b store |
| |
| hor_u: |
|     vmov.32 q15, q1 @ falls through to store |
| |
| store: @ write the four predicted rows to pu1_dst |
| |
|     ldr r4, [sp, #104] @ r4 = dst_strd |
| |
|     vst1.32 {d30[0]}, [r2], r4 |
|     vst1.32 {d30[1]}, [r2], r4 |
|     vst1.32 {d31[0]}, [r2], r4 |
|     vst1.32 {d31[1]}, [r2], r4 |
| |
| |
| end_func: |
|     vpop {d8-d15} |
|     ldmfd sp!, {r4-r12, pc} @ restore registers and return |
| |
| |
| |
| |
| |