| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| |
///**
//******************************************************************************
//*
//* @brief
//*  Evaluate the best intra 16x16 mode (among VERT, HORZ and DC) and write the
//*  corresponding prediction. ARMv8 (AArch64 NEON) implementation.
//*
//* @par Description
//*  Computes the SAD of the 16x16 source block against the vertical,
//*  horizontal and DC predictions built from the neighbouring pels, selects
//*  the valid mode with the smallest SAD, stores that SAD and the mode index,
//*  and writes the 16x16 prediction of the selected mode to the destination.
//*  Ties are resolved in the order VERT, HORZ, DC.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels_i16
//*  UWORD8 pointer to neighbouring pels. As read by this routine, bytes
//*  [0..15] hold the left column stored bottom-up (byte 15 pairs with row 0)
//*  and bytes [17..32] hold the top row, left to right.
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination; 16 rows of 16 bytes are written
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels. Bit 0 (0x01) = left column available,
//*  bit 2 (0x04) = top row available. Only these two bits are examined, and
//*  they only affect the DC predictor: the VERT and HORZ SADs are always
//*  computed from the neighbour buffer, so the caller is expected to exclude
//*  unusable modes through u4_valid_intra_modes.
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which the best mode is returned
//*  (0 = VERT, 1 = HORZ, 2 = DC)
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which the minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says which modes are valid: bit 0 = VERT, bit 1 = HORZ, bit 2 = DC.
//*  An invalid mode is given a cost of 0x40000000. If no mode is valid, VERT
//*  is selected and that cost is returned as the minimum sad.
//*
//* @return none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
//                                      UWORD8 *pu1_ngbr_pels_i16,
//                                      UWORD8 *pu1_dst,
//                                      UWORD32 src_strd,
//                                      UWORD32 dst_strd,
//                                      WORD32 u4_n_avblty,
//                                      UWORD32 *u4_intra_mode,
//                                      WORD32 *pu4_sadmin,
//                                      UWORD32 u4_valid_intra_modes)
//
    .text
    .p2align 2
    .include "ih264_neon_macros.s"

    .global ih264e_evaluate_intra16x16_modes_av8

//------------------------------------------------------------------------------
// Constants
//------------------------------------------------------------------------------
    .equ    I16_LEFT_AVAIL,      0x01       // u4_n_avblty: left column available
    .equ    I16_TOP_AVAIL,       0x04       // u4_n_avblty: top row available
    .equ    I16_VALID_VERT,      0x01       // u4_valid_intra_modes bits
    .equ    I16_VALID_HORZ,      0x02
    .equ    I16_VALID_DC,        0x04
    .equ    I16_MODE_VERT,       0          // values stored to *u4_intra_mode
    .equ    I16_MODE_HORZ,       1
    .equ    I16_MODE_DC,         2
    .equ    I16_TOP_OFS,         17         // offset of the top row in pu1_ngbr_pels_i16
    .equ    I16_INVALID_COST,    0x40000000 // cost given to a mode that is not valid
    // Offset of the 9th argument (first stack argument) once push_v_regs has
    // run. This relies on push_v_regs lowering sp by exactly 64 bytes
    // (presumably d8-d15; the macro lives in ih264_neon_macros.s).
    .equ    I16_VALID_MODES_OFS, 64

//------------------------------------------------------------------------------
// i16x16_sad_row: accumulate one source row into the three running SADs.
//   lo, hi   : vector registers holding columns 0-7 / 8-15 of the row
//   lane     : byte lane of v9 that holds this row's left neighbour
//   nlo, nhi : optional consecutive registers to refill with the next unread
//              source row (x0 is post-incremented by x3)
// Uses: v9 (left column), v10/v11 (top row), v30 (DC value), v20 (scratch).
// Accumulators: v16/v18 = VERT, v26/v28 = HORZ, v22/v24 = DC.
//------------------------------------------------------------------------------
.macro i16x16_sad_row lo, hi, lane, nlo, nhi
    dup     v20.8b, v9.b[\lane]                 // horizontal predictor for this row
    uabal   v16.8h, \lo\().8b, v10.8b           // VERT
    uabal   v18.8h, \hi\().8b, v11.8b
    .ifnb \nlo
    ld1     {\nlo\().8b, \nhi\().8b}, [x0], x3  // prefetch a later source row
    .endif
    uabal   v26.8h, \lo\().8b, v20.8b           // HORZ
    uabal   v28.8h, \hi\().8b, v20.8b
    uabal   v22.8h, \lo\().8b, v30.8b           // DC
    uabal   v24.8h, \hi\().8b, v30.8b
.endm

//------------------------------------------------------------------------------
// Register usage
//   x0  = pu1_src (advanced row by row)      x1 = pu1_ngbr_pels_i16
//   x2  = pu1_dst (advanced row by row)      x3 = src_strd (sign-extended)
//   x4  = dst_strd (sign-extended)           w5 = u4_n_avblty
//   x6  = u4_intra_mode                      x7 = pu4_sadmin
//   w8 / w9 / w10 = VERT / HORZ / DC cost    w13 = u4_valid_intra_modes
//   x12 = scratch
//------------------------------------------------------------------------------
ih264e_evaluate_intra16x16_modes_av8:
    push_v_regs
    sxtw    x3, w3
    sxtw    x4, w4
    ldr     w13, [sp, #I16_VALID_MODES_OFS]     // u4_valid_intra_modes

    //--------------------------------------------------------------------------
    // DC predictor:
    //   both edges : (sum(left) + sum(top) + 16) >> 5
    //   one edge   : (sum(edge) + 8) >> 4
    //   no edge    : 128
    // w10 collects the rounding term, w11 the shift amount.
    //--------------------------------------------------------------------------
    movi    v0.16b, #0                          // left column, zero if unavailable
    movi    v1.16b, #0                          // top row, zero if unavailable
    mov     w10, #0
    mov     w11, #3

    tst     w5, #I16_LEFT_AVAIL
    b.eq    .Li16_check_top
    ld1     {v0.16b}, [x1]
    add     w10, w10, #8
    add     w11, w11, #1
.Li16_check_top:
    tst     w5, #I16_TOP_AVAIL
    b.eq    .Li16_top_missing
    add     x12, x1, #I16_TOP_OFS
    ld1     {v1.16b}, [x12]
    add     w10, w10, #8
    add     w11, w11, #1
    b       .Li16_dc_mean
.Li16_top_missing:
    // w10 is non-zero only if the left column was loaded. When neither edge
    // is available the predictor is 128, whatever other bits u4_n_avblty has.
    cbnz    w10, .Li16_dc_mean
    movi    v30.16b, #128
    b       .Li16_sad
.Li16_dc_mean:
    uaddl   v2.8h, v0.8b, v1.8b
    uaddl2  v3.8h, v0.16b, v1.16b
    add     v2.8h, v2.8h, v3.8h
    addv    h2, v2.8h                           // sum of all edge pels (<= 32 * 255)
    umov    w12, v2.h[0]
    add     w12, w12, w10                       // rounding
    lsr     w12, w12, w11
    dup     v30.16b, w12

    //--------------------------------------------------------------------------
    // SAD of the source against the three predictions
    //--------------------------------------------------------------------------
.Li16_sad:
    ld1     {v0.8b, v1.8b}, [x0], x3            // row 0
    ld1     {v2.8b, v3.8b}, [x0], x3            // row 1
    ld1     {v4.8b, v5.8b}, [x0], x3            // row 2
    ld1     {v6.8b, v7.8b}, [x0], x3            // row 3

    add     x12, x1, #I16_TOP_OFS
    ld1     {v10.8b, v11.8b}, [x12]             // top row: vertical predictor
    ld1     {v9.16b}, [x1]                      // left column: horizontal predictors

    movi    v16.16b, #0                         // VERT accumulators
    movi    v18.16b, #0
    movi    v26.16b, #0                         // HORZ accumulators
    movi    v28.16b, #0
    movi    v22.16b, #0                         // DC accumulators
    movi    v24.16b, #0

    // Source rows rotate through v0/v1, v2/v3, v4/v5, v6/v7. Each step refills
    // the register pair released by the previous step with the row three
    // ahead of the one being processed.
    i16x16_sad_row v0, v1, 15                   // row 0
    i16x16_sad_row v2, v3, 14, v0, v1           // row 1,  loads row 4
    i16x16_sad_row v4, v5, 13, v2, v3           // row 2,  loads row 5
    i16x16_sad_row v6, v7, 12, v4, v5           // row 3,  loads row 6
    i16x16_sad_row v0, v1, 11, v6, v7           // row 4,  loads row 7
    i16x16_sad_row v2, v3, 10, v0, v1           // row 5,  loads row 8
    i16x16_sad_row v4, v5, 9,  v2, v3           // row 6,  loads row 9
    i16x16_sad_row v6, v7, 8,  v4, v5           // row 7,  loads row 10
    i16x16_sad_row v0, v1, 7,  v6, v7           // row 8,  loads row 11
    i16x16_sad_row v2, v3, 6,  v0, v1           // row 9,  loads row 12
    i16x16_sad_row v4, v5, 5,  v2, v3           // row 10, loads row 13
    i16x16_sad_row v6, v7, 4,  v4, v5           // row 11, loads row 14
    i16x16_sad_row v0, v1, 3,  v6, v7           // row 12, loads row 15
    i16x16_sad_row v2, v3, 2                    // row 13
    i16x16_sad_row v4, v5, 1                    // row 14
    i16x16_sad_row v6, v7, 0                    // row 15

    // Reduce each pair of accumulators to a scalar. A 16-bit lane holds at
    // most 16 * 255 per accumulator, so adding the pair cannot overflow.
    add     v16.8h, v16.8h, v18.8h
    add     v26.8h, v26.8h, v28.8h
    add     v22.8h, v22.8h, v24.8h
    uaddlv  s16, v16.8h
    uaddlv  s26, v26.8h
    uaddlv  s22, v22.8h
    mov     w8, v16.s[0]                        // VERT sad
    mov     w9, v26.s[0]                        // HORZ sad
    mov     w10, v22.s[0]                       // DC sad

    //--------------------------------------------------------------------------
    // Replace the cost of every mode that is not valid
    //--------------------------------------------------------------------------
    mov     w11, #I16_INVALID_COST
    tst     w13, #I16_VALID_VERT
    csel    w8, w11, w8, eq
    tst     w13, #I16_VALID_HORZ
    csel    w9, w11, w9, eq
    tst     w13, #I16_VALID_DC
    csel    w10, w11, w10, eq

    //--------------------------------------------------------------------------
    // Pick the cheapest mode; on equal cost VERT beats HORZ beats DC
    //--------------------------------------------------------------------------
    cmp     w8, w9
    b.gt    .Li16_not_vert
    cmp     w8, w10
    b.gt    .Li16_pick_dc

    // VERT: every row is a copy of the top row
    str     w8, [x7]                            // minimum sad
    mov     w12, #I16_MODE_VERT
    str     w12, [x6]                           // mode
    add     x12, x1, #I16_TOP_OFS
    ld1     {v30.16b}, [x12]
    b       .Li16_store_uniform

.Li16_not_vert:
    cmp     w9, w10
    b.gt    .Li16_pick_dc

    // HORZ: row r is filled with left neighbour r, i.e. lane (15 - r) of v9,
    // which still holds the left column loaded for the SAD pass. The
    // broadcasts run a few rows ahead of the stores.
    str     w9, [x7]                            // minimum sad
    mov     w12, #I16_MODE_HORZ
    str     w12, [x6]                           // mode

    dup     v10.16b, v9.b[15]
    dup     v11.16b, v9.b[14]
    dup     v12.16b, v9.b[13]
    dup     v13.16b, v9.b[12]
    st1     {v10.16b}, [x2], x4                 // row 0
    dup     v14.16b, v9.b[11]
    st1     {v11.16b}, [x2], x4                 // row 1
    dup     v15.16b, v9.b[10]
    st1     {v12.16b}, [x2], x4                 // row 2
    dup     v16.16b, v9.b[9]
    st1     {v13.16b}, [x2], x4                 // row 3
    dup     v17.16b, v9.b[8]
    st1     {v14.16b}, [x2], x4                 // row 4
    dup     v18.16b, v9.b[7]
    st1     {v15.16b}, [x2], x4                 // row 5
    dup     v19.16b, v9.b[6]
    st1     {v16.16b}, [x2], x4                 // row 6
    dup     v20.16b, v9.b[5]
    st1     {v17.16b}, [x2], x4                 // row 7
    dup     v21.16b, v9.b[4]
    st1     {v18.16b}, [x2], x4                 // row 8
    dup     v22.16b, v9.b[3]
    st1     {v19.16b}, [x2], x4                 // row 9
    dup     v23.16b, v9.b[2]
    st1     {v20.16b}, [x2], x4                 // row 10
    dup     v24.16b, v9.b[1]
    st1     {v21.16b}, [x2], x4                 // row 11
    dup     v25.16b, v9.b[0]
    st1     {v22.16b}, [x2], x4                 // row 12
    st1     {v23.16b}, [x2], x4                 // row 13
    st1     {v24.16b}, [x2], x4                 // row 14
    st1     {v25.16b}, [x2], x4                 // row 15
    b       .Li16_end

.Li16_pick_dc:
    // DC: v30 still holds the DC value broadcast to all 16 bytes
    str     w10, [x7]                           // minimum sad
    mov     w12, #I16_MODE_DC
    str     w12, [x6]                           // mode

.Li16_store_uniform:
    // VERT and DC both write the same 16-byte vector (v30) to all 16 rows
    .rept   16
    st1     {v30.16b}, [x2], x4
    .endr

.Li16_end:
    pop_v_regs
    ret
| |
| |