| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| @* |
| @ ******************************************************************************* |
| @ * @file |
| @ * ih264_padding_neon.s |
| @ * |
| @ * @brief |
| @ * Contains function definitions for padding |
| @ * |
| @ * @author |
| @ * Ittiam |
| @ * |
| @ * @par List of Functions: |
| @ * - ih264_pad_top_a9q() |
| @ * - ih264_pad_left_luma_a9q() |
| @ * - ih264_pad_left_chroma_a9q() |
| @ * - ih264_pad_right_luma_a9q() |
| @ * - ih264_pad_right_chroma_a9q() |
| @ * |
| @ * @remarks |
| @ * None |
| @ * |
| @ ******************************************************************************* |
| @* |
| |
| |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief pad at the top of a 2d array |
| @* |
| @* @par Description: |
| @* The row starting at pu1_src is copied into each of the pad_size rows |
| @* directly above it. The copy works on 16-byte columns, so wd is effectively |
| @* rounded up to the next multiple of 16. Nothing is written when wd <= 0 or |
| @* pad_size <= 0. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] wd |
| @* integer width of the array |
| @* |
| @* @param[in] pad_size |
| @* integer -padding size of the array |
| @* |
| @* @returns none |
| @* |
| @* @remarks none |
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_pad_top(UWORD8 *pu1_src, |
| @ WORD32 src_strd, |
| @ WORD32 wd, |
| @ WORD32 pad_size) |
| @**************Variables Vs Registers************************* |
| @ r0 => *pu1_src |
| @ r1 => src_strd |
| @ r2 => wd |
| @ r3 => pad_size |
| @ r4 => store pointer inside the current 16-byte column |
| @ r5 => first pad row (one stride above the source) of the current column |
| @ r6 => -src_strd |
| @ r7 => pad rows still to be written in the current column |
| |
| .text |
| .p2align 2 |
| |
| .global ih264_pad_top_a9q |
| |
| ih264_pad_top_a9q: |
| |
| stmfd sp!, {r4-r7, lr} @save the scratch registers used below and the return address |
| |
| cmp r3, #0 @pad_size <= 0 : no rows to fill (also keeps the row counter from wrapping) |
| ble end_func_pad_top |
| cmp r2, #0 @wd <= 0 : no columns to copy |
| ble end_func_pad_top |
| |
| sub r5, r0, r1 @first destination row = pu1_src - src_strd |
| neg r6, r1 @each store steps one row upwards |
| |
| loop_neon_memcpy_mul_16: |
| @ Load 16 bytes |
| vld1.8 {d0, d1}, [r0]! |
| mov r4, r5 |
| mov r7, r3 |
| add r5, r5, #16 @next column starts 16 bytes to the right |
| |
| loop_neon_pad_top: |
| vst1.8 {d0, d1}, [r4], r6 |
| subs r7, r7, #1 |
| bne loop_neon_pad_top |
| |
| subs r2, r2, #16 |
| bgt loop_neon_memcpy_mul_16 @bgt (not bne) so a width that is not a multiple of 16 still terminates |
| |
| end_func_pad_top: |
| ldmfd sp!, {r4-r7, pc} @restore the saved registers and return |
| |
| |
| |
| |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* Padding (luma block) at the left of a 2d array |
| @* |
| @* @par Description: |
| @* For every row, the byte at pu1_src is replicated into the bytes that end |
| @* just before it, starting at pu1_src - pad_size. Two widths are implemented: |
| @* pad_size == 16 writes 16 bytes per row, any other value takes the path that |
| @* writes 32 bytes per row. Rows are handled eight at a time, so ht is |
| @* effectively rounded up to a multiple of 8. Nothing is written when ht <= 0. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] pad_size |
| @* integer -padding size of the array |
| @* |
| @* @returns none |
| @* |
| @* @remarks |
| @* None |
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_pad_left_luma(UWORD8 *pu1_src, |
| @ WORD32 src_strd, |
| @ WORD32 ht, |
| @ WORD32 pad_size) |
| @**************Variables Vs Registers************************* |
| @ r0 => *pu1_src (advances one row per load) |
| @ r1 => src_strd |
| @ r2 => ht (rows remaining) |
| @ r3 => pad_size |
| @ r4 => store pointer, starts at pu1_src - pad_size |
| @ r6 => src_strd - 16, row step after a 16-byte post-incremented store |
| @ r8-r11 => pixel values of four consecutive rows |
| |
| |
| |
| .global ih264_pad_left_luma_a9q |
| |
| ih264_pad_left_luma_a9q: |
| |
| stmfd sp!, {r4-r11, lr} @save r4-r11 and the return address |
| |
| cmp r2, #0 @ht <= 0 : nothing to pad (also keeps the row counter from wrapping) |
| ble end_func |
| |
| sub r4, r0, r3 |
| sub r6, r1, #16 |
| cmp r3, #16 |
| bne loop_32 @pad_size != 16 : use the 32-byte path |
| loop_16: @ /*hard coded for width=16 ,height =8,16*/ |
| ldrb r8, [r0], r1 |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4], r1 @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q1}, [r4], r1 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vdup.u8 q2, r10 |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r1 @ 16 bytes store |
| ldrb r8, [r0], r1 |
| vst1.8 {q3}, [r4], r1 @ 16 bytes store |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4], r1 @ 16 bytes store |
| vdup.u8 q1, r9 |
| ldrb r11, [r0], r1 |
| vst1.8 {q1}, [r4], r1 @ 16 bytes store |
| vdup.u8 q2, r10 |
| vdup.u8 q3, r11 |
| subs r2, r2, #8 @flags survive the NEON stores below |
| vst1.8 {q2}, [r4], r1 @ 16 bytes store |
| vst1.8 {q3}, [r4], r1 @ 16 bytes store |
| bgt loop_16 @bgt (not bne) so a height that is not a multiple of 8 still terminates |
| b end_func |
| |
| loop_32: @ /*hard coded for width=32 ,height =8,16*/ |
| ldrb r8, [r0], r1 |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store, then step to the next row |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u8 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| ldrb r8, [r0], r1 |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vdup.u8 q0, r8 |
| ldrb r9, [r0], r1 |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u8 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #8 @flags survive the NEON stores below |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| bgt loop_32 @bgt (not bne) so a height that is not a multiple of 8 still terminates |
| |
| |
| |
| end_func: |
| ldmfd sp!, {r4-r11, pc} @restore the saved registers and return |
| |
| |
| |
| |
| |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* Padding (chroma block) at the left of a 2d array |
| @* |
| @* @par Description: |
| @* For every row, the two bytes at pu1_src (presumably one interleaved chroma |
| @* pair - confirm with the caller) are replicated as a 16-bit unit into the |
| @* 32 bytes starting at pu1_src - pad_size. 32 bytes are always written, so |
| @* pad_size is expected to be 32. Rows are handled four at a time, so ht is |
| @* effectively rounded up to a multiple of 4. Nothing is written when ht <= 0. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] pad_size |
| @* integer -padding size of the array |
| @* |
| @* @returns none |
| @* |
| @* @remarks |
| @* None |
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_pad_left_chroma(UWORD8 *pu1_src, |
| @ WORD32 src_strd, |
| @ WORD32 ht, |
| @ WORD32 pad_size) |
| @**************Variables Vs Registers************************* |
| @ r0 => *pu1_src (advances one row per load) |
| @ r1 => src_strd |
| @ r2 => ht (rows remaining) |
| @ r3 => pad_size |
| @ r4 => store pointer, starts at pu1_src - pad_size |
| @ r6 => src_strd - 16, row step after a 16-byte post-incremented store |
| @ r8-r11 => 16-bit values of four consecutive rows |
| |
| |
| |
| .global ih264_pad_left_chroma_a9q |
| |
| ih264_pad_left_chroma_a9q: |
| |
| stmfd sp!, {r4-r11, lr} @save r4-r11 and the return address |
| |
| cmp r2, #0 @ht <= 0 : nothing to pad (also keeps the row counter from wrapping) |
| ble end_func_l_c |
| |
| sub r4, r0, r3 |
| sub r6, r1, #16 |
| |
| |
| loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/ |
| ldrh r8, [r0], r1 |
| ldrh r9, [r0], r1 |
| vdup.u16 q0, r8 |
| ldrh r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u16 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store, then step to the next row |
| ldrh r11, [r0], r1 |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u16 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| vdup.u16 q3, r11 |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #4 @flags survive the NEON stores below |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| |
| |
| ble end_func_l_c @/* Done after an odd number of 4-row groups (ht=4,12,...) */ |
| |
| ldrh r8, [r0], r1 |
| ldrh r9, [r0], r1 |
| vdup.u16 q0, r8 |
| ldrh r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u16 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store |
| ldrh r11, [r0], r1 |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u16 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| vdup.u16 q3, r11 |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #4 @flags survive the NEON stores below |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| |
| bgt loop_32_l_c @/* More rows left: go round again, otherwise fall through */ |
| |
| end_func_l_c: |
| ldmfd sp!, {r4-r11, pc} @restore the saved registers and return |
| |
| |
| |
| |
| |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* Padding (luma block) at the right of a 2d array |
| @* |
| @* @par Description: |
| @* For every row, the byte just before pu1_src (pu1_src - 1) is replicated |
| @* into the bytes starting at pu1_src. Two widths are implemented: |
| @* pad_size == 16 writes 16 bytes per row, any other value takes the path that |
| @* writes 32 bytes per row. Rows are handled eight at a time, so ht is |
| @* effectively rounded up to a multiple of 8. Nothing is written when ht <= 0. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] pad_size |
| @* integer -padding size of the array |
| @* |
| @* @returns none |
| @* |
| @* @remarks |
| @* None |
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_pad_right_luma(UWORD8 *pu1_src, |
| @ WORD32 src_strd, |
| @ WORD32 ht, |
| @ WORD32 pad_size) |
| @{ |
| @ WORD32 row; |
| @ |
| @ for(row = 0; row < ht; row++) |
| @ { |
| @ memset(pu1_src, *(pu1_src -1), pad_size); |
| @ |
| @ pu1_src += src_strd; |
| @ } |
| @} |
| @ |
| @ r0 => *pu1_src (moved back by one to the last real pixel, advances one row per load) |
| @ r1 => src_strd |
| @ r2 => ht (rows remaining) |
| @ r3 => pad_size |
| @ r4 => store pointer, starts at pu1_src |
| @ r6 => src_strd - 16, row step after a 16-byte post-incremented store |
| @ r8-r11 => pixel values of four consecutive rows |
| |
| |
| |
| .global ih264_pad_right_luma_a9q |
| |
| ih264_pad_right_luma_a9q: |
| |
| stmfd sp!, {r4-r11, lr} @save r4-r11 and the return address |
| |
| cmp r2, #0 @ht <= 0 : nothing to pad (also keeps the row counter from wrapping) |
| ble end_func_r |
| |
| mov r4, r0 |
| sub r6, r1, #16 |
| sub r0, r0, #1 @read from the last pixel of the row |
| cmp r3, #16 |
| bne loop_32_r @pad_size != 16 : use this function's own 32-byte path |
| loop_16_r: @ /*hard coded for width=16 ,height =8,16*/ |
| ldrb r8, [r0], r1 |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4], r1 @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q1}, [r4], r1 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vdup.u8 q2, r10 |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r1 @ 16 bytes store |
| ldrb r8, [r0], r1 |
| vst1.8 {q3}, [r4], r1 @ 16 bytes store |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4], r1 @ 16 bytes store |
| vdup.u8 q1, r9 |
| ldrb r11, [r0], r1 |
| vst1.8 {q1}, [r4], r1 @ 16 bytes store |
| vdup.u8 q2, r10 |
| vdup.u8 q3, r11 |
| subs r2, r2, #8 @flags survive the NEON stores below |
| vst1.8 {q2}, [r4], r1 @ 16 bytes store |
| vst1.8 {q3}, [r4], r1 @ 16 bytes store |
| bgt loop_16_r @bgt (not bne) so a height that is not a multiple of 8 still terminates |
| b end_func_r |
| |
| loop_32_r: @ /*hard coded for width=32 ,height =8,16*/ |
| ldrb r8, [r0], r1 |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store, then step to the next row |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u8 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| ldrb r8, [r0], r1 |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| ldrb r9, [r0], r1 |
| vdup.u8 q0, r8 |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| ldrb r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u8 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store |
| ldrb r11, [r0], r1 |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u8 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u8 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #8 @flags survive the NEON stores below |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| bgt loop_32_r @bgt (not bne) so a height that is not a multiple of 8 still terminates |
| |
| |
| |
| end_func_r: |
| ldmfd sp!, {r4-r11, pc} @restore the saved registers and return |
| |
| |
| |
| |
| |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* Padding (chroma block) at the right of a 2d array |
| @* |
| @* @par Description: |
| @* For every row, the two bytes just before pu1_src (presumably the last |
| @* interleaved chroma pair of the row - confirm with the caller) are |
| @* replicated as a 16-bit unit into the 32 bytes starting at pu1_src. |
| @* pad_size is not read: 32 bytes are always written. Rows are handled four |
| @* at a time, so ht is effectively rounded up to a multiple of 4. Nothing is |
| @* written when ht <= 0. |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] pad_size |
| @* integer -padding size of the array |
| @* |
| @* @returns none |
| @* |
| @* @remarks |
| @* None |
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_pad_right_chroma(UWORD8 *pu1_src, |
| @ WORD32 src_strd, |
| @ WORD32 ht, |
| @ WORD32 pad_size) |
| @ r0 => *pu1_src (moved back by two to the last pair, advances one row per load) |
| @ r1 => src_strd |
| @ r2 => ht (rows remaining) |
| @ r3 => pad_size (unused) |
| @ r4 => store pointer, starts at pu1_src |
| @ r6 => src_strd - 16, row step after a 16-byte post-incremented store |
| @ r8-r11 => 16-bit values of four consecutive rows |
| |
| |
| |
| .global ih264_pad_right_chroma_a9q |
| |
| ih264_pad_right_chroma_a9q: |
| |
| stmfd sp!, {r4-r11, lr} @save r4-r11 and the return address |
| |
| cmp r2, #0 @ht <= 0 : nothing to pad (also keeps the row counter from wrapping) |
| ble end_func_r_c |
| |
| mov r4, r0 |
| sub r6, r1, #16 |
| sub r0, r0, #2 @read from the last two bytes of the row |
| loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/ |
| ldrh r8, [r0], r1 |
| ldrh r9, [r0], r1 |
| vdup.u16 q0, r8 |
| ldrh r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u16 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store, then step to the next row |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u16 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #4 @flags survive the loads and NEON stores below |
| ldrh r11, [r0], r1 |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u16 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| |
| ble end_func_r_c @/* Done after an odd number of 4-row groups (ht=4,12,...) */ |
| |
| ldrh r8, [r0], r1 |
| vdup.u16 q0, r8 |
| ldrh r9, [r0], r1 |
| ldrh r10, [r0], r1 |
| vst1.8 {q0}, [r4]! @ 16 bytes store |
| vdup.u16 q1, r9 |
| vst1.8 {q0}, [r4], r6 @ 16 bytes store |
| ldrh r11, [r0], r1 |
| vst1.8 {q1}, [r4]! @ 16 bytes store |
| vdup.u16 q2, r10 |
| vst1.8 {q1}, [r4], r6 @ 16 bytes store |
| vst1.8 {q2}, [r4]! @ 16 bytes store |
| vdup.u16 q3, r11 |
| vst1.8 {q2}, [r4], r6 @ 16 bytes store |
| subs r2, r2, #4 @flags survive the NEON stores below |
| vst1.8 {q3}, [r4]! @ 16 bytes store |
| vst1.8 {q3}, [r4], r6 @ 16 bytes store |
| |
| bgt loop_32_r_c @/* More rows left: go round again, otherwise fall through */ |
| |
| end_func_r_c: |
| ldmfd sp!, {r4-r11, pc} @restore the saved registers and return |
| |
| |
| |
| |
| |