| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
| // * ihevc_itrans_recon_8x8_neon.s |
| // * |
| // * @brief |
| // * contains function definitions for single stage inverse transform |
| // * |
| // * @author |
| // * anand s |
| // * |
| // * @par list of functions: |
| // * - ihevc_itrans_recon_8x8_av8() |
| // * |
| // * @remarks |
| // * none |
| // * |
| // ******************************************************************************* |
| //*/ |
| |
| ///** |
| // ******************************************************************************* |
| // * |
| // * @brief |
| // * this function performs inverse transform and reconstruction for 8x8 |
| // * input block |
| // * |
| // * @par description: |
| // * performs inverse transform and adds the prediction data and clips output |
| // * to 8 bit |
| // * |
| // * @param[in] pi2_src |
| // * input 8x8 coefficients |
| // * |
| // * @param[in] pi2_tmp |
| // * temporary 8x8 buffer for storing the output of the 1st inverse |
| // * transform stage |
| // * |
| // * @param[in] pu1_pred |
| // * prediction 8x8 block |
| // * |
| // * @param[out] pu1_dst |
| // * output 8x8 block |
| // * |
| // * @param[in] src_strd |
| // * input stride |
| // * |
| // * @param[in] pred_strd |
| // * prediction stride |
| // * |
| // * @param[in] dst_strd |
| // * output stride |
| // * |
| // * @param[in] shift |
| // * output shift |
| // * |
| // * @param[in] zero_cols |
| // * zero columns in pi2_src |
| // * |
| // * @param[in] zero_rows |
| // * zero rows in pi2_src |
| // * |
| // * @returns void |
| // * |
| // * @remarks |
| // * none |
| // * |
| // ******************************************************************************* |
| // */ |
| |
| //void ihevc_itrans_recon_8x8(word16 *pi2_src, |
| // word16 *pi2_tmp, |
| // uword8 *pu1_pred, |
| // uword8 *pu1_dst, |
| // word32 src_strd, |
| // word32 pred_strd, |
| // word32 dst_strd, |
| // word32 zero_cols, |
| // word32 zero_rows ) |
| |
| //**************variables vs registers************************* |
| // x0 => *pi2_src |
| // x1 => *pi2_tmp |
| // x2 => *pu1_pred |
| // x3 => *pu1_dst |
| // w4 => src_strd |
| // w5 => pred_strd |
| // w6 => dst_strd |
| // w7 => zero_cols |
| // [sp] => zero_rows |
| |
| |
| |
| .text |
| .align 4 |
| .include "ihevc_neon_macros.s" |
| |
| |
| |
| .set width_x_size_x5 , 40 |
| .set width_x_size_x2 , 32 |
| .set shift_stage1_idct , 7 |
| .set shift_stage2_idct , 12 |
| |
| .globl ihevc_itrans_recon_8x8_av8 |
| |
| .extern g_ai2_ihevc_trans_8_transpose |
| |
| .type ihevc_itrans_recon_8x8_av8, %function |
| |
| ihevc_itrans_recon_8x8_av8: |
| //// 8x8 inverse transform and reconstruction. |
| //// flow: column idct (>> shift_stage1_idct) -> transpose -> row idct (>> shift_stage2_idct) |
| //// -> add prediction -> saturate to unsigned 8 bit -> store |
| //// |
| //// arguments on entry |
| //// x0 - pi2_src, x1 - pi2_tmp (not referenced by this routine) |
| //// x2 - pu1_pred, x3 - pu1_dst |
| //// w4 - src_strd, w5 - pred_strd, w6 - dst_strd (word32, sign-extended below) |
| //// w7 - zero_cols, [sp] - zero_rows (only the low byte of each is used) |
| //// |
| ////register usage - loading and until idct of columns |
| //// cosine constants - v0.4h |
| //// sine constants - v1.4h |
| //// row 0 first half - v2 - y0 |
| //// row 1 first half - v6 - y1 |
| //// row 2 first half - v3 - y2 |
| //// row 3 first half - v7 - y3 |
| //// row 4 first half - v10 - y4 |
| //// row 5 first half - v14 - y5 |
| //// row 6 first half - v11 - y6 |
| //// row 7 first half - v15 - y7 |
| |
| //// row 0 second half - v4 - y0 |
| //// row 1 second half - v8 - y1 |
| //// row 2 second half - v5 - y2 |
| //// row 3 second half - v9 - y3 |
| //// row 4 second half - v12 - y4 |
| //// row 5 second half - v16 - y5 |
| //// row 6 second half - v13 - y6 |
| //// row 7 second half - v17 - y7 |
| |
| //// step 1 : pick up the arguments and load all constants |
| |
| ldr w11, [sp] // zero rows: 9th argument, read from the caller's stack before sp is moved |
| |
| push_v_regs // macro from ihevc_neon_macros.s (body not visible here) |
| stp x19, x20,[sp,#-16]! // x19/x20 are used further down as scratch to park v25 |
| |
| mov x12, x7 // zero columns (copied before x7 is reused; masked to its low byte below) |
| // The strides are word32 arguments. AAPCS64 leaves bits 63:32 of their registers |
| // unspecified, so sign-extend them before they take part in 64-bit address arithmetic. |
| sxtw x8, w5 // prediction stride |
| sxtw x7, w6 // destination stride |
| sxtw x6, w4 // src stride |
| lsl x6, x6, #1 // x sizeof(word16) |
| add x9,x0,x6, lsl #1 // 2 rows: x9 walks rows 2,3,6,7 while x0 walks rows 0,1,4,5 |
| |
| add x10,x6,x6, lsl #1 // 3 rows |
| |
| sub x10,x10, #8 // - 4 cols * sizeof(word16) |
| sub x5,x6, #8 // src_strd - 4 cols * sizeof(word16) |
| |
| adrp x14, :got:g_ai2_ihevc_trans_8_transpose |
| ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose] |
| |
| ld1 {v0.4h, v1.4h},[x14] ////v0,v1 are used for storing the constant data |
| |
| ////step 2 load all the input data |
| ////step 3 operate first 4 colums at a time |
| |
| and x11,x11,#0xff |
| and x12,x12,#0xff |
| |
| cmp x11,#0xf0 // low byte of zero_rows >= 0xf0: take the path that does not load rows 4-7 |
| bge skip_last4_rows |
| |
| |
| ld1 {v2.4h},[x0],#8 |
| ld1 {v3.4h},[x9],#8 |
| ld1 {v4.4h},[x0],x5 |
| smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) |
| ld1 {v5.4h},[x9],x5 |
| smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (part of d1) |
| ld1 {v6.4h},[x0],#8 |
| ld1 {v7.4h},[x9],#8 |
| smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) |
| ld1 {v8.4h},[x0],x10 |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| ld1 {v9.4h},[x9],x10 |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| ld1 {v10.4h},[x0],#8 |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| ld1 {v11.4h},[x9],#8 |
| smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| ld1 {v12.4h},[x0],x5 |
| smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| ld1 {v13.4h},[x9],x5 |
| smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| ld1 {v14.4h},[x0],#8 |
| smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| ld1 {v15.4h},[x9],#8 |
| smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) |
| ld1 {v16.4h},[x0],x10 |
| smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0); v6 is free, all y1 products are issued |
| ld1 {v17.4h},[x9],x10 |
| |
| ///* this following was activated when alignment is not there (ARMv7 register names, kept for reference) */ |
| //// vld1.16 d2,[x0]! |
| //// vld1.16 d3,[x2]! |
| //// vld1.16 d4,[x0]! |
| //// vld1.16 d5,[x2]! |
| //// vld1.16 d6,[x0]! |
| //// vld1.16 d7,[x2]! |
| //// vld1.16 d8,[x0],x3 |
| //// vld1.16 d9,[x2],x3 |
| //// vld1.16 d10,[x0]! |
| //// vld1.16 d11,[x2]! |
| //// vld1.16 d12,[x0]! |
| //// vld1.16 d13,[x2]! |
| //// vld1.16 d14,[x0]! |
| //// vld1.16 d15,[x2]! |
| //// vld1.16 d16,[x0],x3 |
| //// vld1.16 d17,[x2],x3 |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) |
| smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) |
| smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) |
| smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) |
| |
| add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) |
| sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) |
| |
| add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) |
| sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) |
| |
| add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) |
| sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) |
| |
| add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) |
| sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) |
| |
| add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) |
| sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) |
| |
| sqrshrn v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| |
| b last4_cols |
| |
| |
| |
| skip_last4_rows: |
| |
| |
| |
| ld1 {v2.4h},[x0],#8 |
| ld1 {v3.4h},[x9],#8 |
| ld1 {v4.4h},[x0],x5 |
| ld1 {v5.4h},[x9],x5 |
| ld1 {v6.4h},[x0],#8 |
| ld1 {v7.4h},[x9],#8 |
| ld1 {v8.4h},[x0],x10 |
| ld1 {v9.4h},[x9],x10 |
| |
| |
| |
| movi v12.4h, #0 |
| movi v13.4h, #0 |
| movi v16.4h, #0 |
| movi v17.4h, #0 |
| |
| |
| |
| |
| smull v24.4s, v6.4h, v0.h[1] //// only rows 0-3 were loaded above (v12/v13/v16/v17 zeroed instead), so the y4..y7 terms are dropped: y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v0.h[3] //// b0 = y1 * cos1 + y3 * cos3(part of x0,x7) |
| smlsl v26.4s, v7.4h, v1.h[3] //// b1 = y1 * cos3 - y3 * sin1(part of x1,x6) |
| smlsl v28.4s, v7.4h, v0.h[1] //// b2 = y1 * sin3 - y3 * cos1(part of x2,x5) |
| smlsl v30.4s, v7.4h, v1.h[1] //// b3 = y1 * sin1 - y3 * sin3(part of x3,x4) |
| |
| smull v18.4s, v3.4h, v1.h[2] //// d1 = y2 * sin2 (no y6 term on this path) |
| smull v6.4s, v3.4h, v0.h[2] //// d0 = y2 * cos2 (v6 is free: all y1 products are issued above) |
| |
| smull v20.4s, v2.4h, v0.h[0] //// c0 = c1 = y0 * cos4 (no y4 term on this path) |
| |
| |
| add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) |
| sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5), c1 == c0 here |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6), c1 == c0 here |
| |
| add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) |
| sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) |
| |
| add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) |
| sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) |
| |
| add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) |
| sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) |
| |
| add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) |
| sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) |
| |
| sqrshrn v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct), same output registers as the full column pass |
| sqrshrn v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| |
| last4_cols: |
| |
| |
| cmp x12,#0xf0 |
| bge skip_last4cols |
| |
| smull v24.4s, v8.4h, v0.h[1] //// column idct of the second-half registers (v4,v8,v5,v9,v12,v16,v13,v17): y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (part of d1) |
| smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0); v8 is free, all y1 products are issued above |
| |
| smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) |
| smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1); v12 is zero when arriving from skip_last4_rows |
| |
| smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) |
| smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) |
| smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) |
| smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) |
| |
| add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7) |
| sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4) |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5) |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6) |
| |
| add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0) |
| sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7) |
| |
| add v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2) |
| sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5) |
| |
| add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1) |
| sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6) |
| |
| add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3) |
| sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of e4) |
| |
| sqrshrn v4.4h, v20.4s,#shift_stage1_idct //// e0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v17.4h, v8.4s,#shift_stage1_idct //// e7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v5.4h, v24.4s,#shift_stage1_idct //// e2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v16.4h, v22.4s,#shift_stage1_idct //// e5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v8.4h, v28.4s,#shift_stage1_idct //// e1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v18.4s,#shift_stage1_idct //// e6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v9.4h, v26.4s,#shift_stage1_idct //// e3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v30.4s,#shift_stage1_idct //// e4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| b end_skip_last4cols |
| |
| |
| |
| skip_last4cols: |
| |
| umov x15,v25.d[0] |
| |
| trn1 v25.4h, v2.4h, v6.4h |
| trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing; v25/v27/v29/v31 are transpose scratch (low half of v25 parked in x15 above) |
| |
| trn1 v27.4h, v3.4h, v7.4h |
| trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing |
| |
| trn1 v6.2s, v29.2s, v31.2s |
| trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued: v6 = y1, v7 = y3 of rows 0-3 |
| trn1 v2.2s, v25.2s, v27.2s |
| trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first quadrant transposing continued: v2 = y0, v3 = y2 of rows 0-3 |
| |
| |
| trn1 v25.4h, v10.4h, v14.4h |
| trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing |
| |
| trn1 v27.4h, v11.4h, v15.4h |
| trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing |
| |
| trn1 v10.2s, v25.2s, v27.2s |
| trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third quadrant transposing continued: v10 = y0, v11 = y2 of rows 4-7 |
| trn1 v14.2s, v29.2s, v31.2s |
| trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued: v14 = y1, v15 = y3 of rows 4-7 |
| |
| mov v25.d[0],x15 |
| |
| smull v24.4s, v6.4h, v0.h[1] //// row idct of rows 0-3 using y0..y3 only (second-half registers are not read on this path): y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v0.h[3] //// b0 = y1 * cos1 + y3 * cos3(part of x0,x7) |
| smlsl v26.4s, v7.4h, v1.h[3] //// b1 = y1 * cos3 - y3 * sin1(part of x1,x6) |
| smlsl v28.4s, v7.4h, v0.h[1] //// b2 = y1 * sin3 - y3 * cos1(part of x2,x5) |
| smlsl v30.4s, v7.4h, v1.h[1] //// b3 = y1 * sin1 - y3 * sin3(part of x3,x4) |
| |
| smull v20.4s, v2.4h, v0.h[0] //// c0 = c1 = y0 * cos4 (no y4 term on this path) |
| // the y4 * cos4 product is not computed here (ARMv7 leftover: vmull.s16 q11,d4,d0[0]) |
| |
| smull v18.4s, v3.4h, v1.h[2] //// d1 = y2 * sin2 (no y6 term on this path) |
| smull v6.4s, v3.4h, v0.h[2] //// d0 = y2 * cos2 (v6 is free: all y1 products are issued above) |
| |
| |
| |
| |
| sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) |
| add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) |
| |
| |
| add v2.4s, v4.4s , v24.4s |
| |
| sub v6.4s, v4.4s , v24.4s |
| |
| add v8.4s, v22.4s , v30.4s |
| |
| sub v24.4s, v22.4s , v30.4s |
| |
| sqrshrn v5.4h, v8.4s,#shift_stage2_idct |
| sqrshrn v2.4h, v2.4s,#shift_stage2_idct |
| sqrshrn v9.4h, v6.4s,#shift_stage2_idct |
| sqrshrn v6.4h, v24.4s,#shift_stage2_idct |
| |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5), c1 == c0 here |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6), c1 == c0 here |
| |
| |
| add v30.4s, v22.4s , v28.4s |
| |
| sub v24.4s, v22.4s , v28.4s |
| |
| add v28.4s, v18.4s , v26.4s |
| |
| sub v22.4s, v18.4s , v26.4s |
| sqrshrn v4.4h, v30.4s,#shift_stage2_idct |
| sqrshrn v7.4h, v24.4s,#shift_stage2_idct |
| sqrshrn v3.4h, v28.4s,#shift_stage2_idct |
| sqrshrn v8.4h, v22.4s,#shift_stage2_idct |
| |
| |
| |
| umov x19,v25.d[0] |
| umov x20,v25.d[1] |
| |
| trn1 v27.4h, v2.4h, v3.4h |
| trn2 v29.4h, v2.4h, v3.4h |
| trn1 v25.4h, v4.4h, v5.4h |
| trn2 v31.4h, v4.4h, v5.4h |
| |
| trn1 v2.2s, v27.2s, v25.2s |
| trn2 v4.2s, v27.2s, v25.2s |
| trn1 v3.2s, v29.2s, v31.2s |
| trn2 v5.2s, v29.2s, v31.2s |
| |
| trn1 v27.4h, v6.4h, v7.4h |
| trn2 v29.4h, v6.4h, v7.4h |
| trn1 v25.4h, v8.4h, v9.4h |
| trn2 v31.4h, v8.4h, v9.4h |
| |
| trn1 v6.2s, v27.2s, v25.2s |
| trn2 v8.2s, v27.2s, v25.2s |
| trn1 v7.2s, v29.2s, v31.2s |
| trn2 v9.2s, v29.2s, v31.2s |
| |
| mov v25.d[0],x19 |
| mov v25.d[1],x20 |
| |
| smull v24.4s, v14.4h, v0.h[1] //// row idct of rows 4-7, again with y0..y3 only: y1 * cos1(part of b0) |
| |
| smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v15.4h, v0.h[3] //// b0 = y1 * cos1 + y3 * cos3(part of x0,x7) |
| smlsl v26.4s, v15.4h, v1.h[3] //// b1 = y1 * cos3 - y3 * sin1(part of x1,x6) |
| smlsl v28.4s, v15.4h, v0.h[1] //// b2 = y1 * sin3 - y3 * cos1(part of x2,x5) |
| smlsl v30.4s, v15.4h, v1.h[1] //// b3 = y1 * sin1 - y3 * sin3(part of x3,x4) |
| smull v20.4s, v10.4h, v0.h[0] //// c0 = c1 = y0 * cos4 (no y4 term on this path) |
| smull v18.4s, v11.4h, v1.h[2] //// d1 = y2 * sin2 (no y6 term on this path) |
| smull v14.4s, v11.4h, v0.h[2] //// d0 = y2 * cos2 (v14 is free: all y1 products are issued above) |
| |
| |
| add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data (pred rows 2,3,6,7 are read through x4, rows 0,1,4,5 through x2) |
| |
| |
| add x5,x8,x8, lsl #1 // x5 = 3 * pred_strd |
| |
| |
| add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data (dest rows 2,3,6,7 are stored through x0, rows 0,1,4,5 through x3) |
| |
| |
| add x10,x7,x7, lsl #1 // x10 = 3 * dst_strd |
| |
| // swapping v3 and v6 (through v31): afterwards v2/v3 hold the two halves of output row 0 and v6/v7 those of row 1 |
| mov v31.d[0], v3.d[0] |
| mov v3.d[0], v6.d[0] |
| mov v6.d[0], v31.d[0] |
| |
| // swapping v5 and v8 (through v31): afterwards v4/v5 hold the two halves of output row 2 and v8/v9 those of row 3 |
| mov v31.d[0], v5.d[0] |
| mov v5.d[0], v8.d[0] |
| mov v8.d[0], v31.d[0] |
| |
| |
| sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) |
| add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) |
| |
| |
| add v0.4s, v12.4s , v24.4s |
| |
| |
| sub v24.4s, v12.4s , v24.4s |
| |
| |
| add v12.4s, v22.4s , v30.4s |
| |
| |
| sub v14.4s, v22.4s , v30.4s |
| |
| sqrshrn v10.4h, v0.4s,#shift_stage2_idct |
| sqrshrn v17.4h, v24.4s,#shift_stage2_idct |
| sqrshrn v13.4h, v12.4s,#shift_stage2_idct |
| sqrshrn v14.4h, v14.4s,#shift_stage2_idct |
| |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5), c1 == c0 here |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6), c1 == c0 here |
| |
| |
| add v0.4s, v22.4s , v28.4s |
| |
| |
| sub v24.4s, v22.4s , v28.4s |
| |
| |
| add v28.4s, v18.4s , v26.4s |
| |
| |
| sub v26.4s, v18.4s , v26.4s |
| ld1 {v18.8b},[x2],x8 |
| |
| sqrshrn v12.4h, v0.4s,#shift_stage2_idct |
| ld1 {v20.8b},[x2],x5 |
| |
| |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct |
| ld1 {v19.8b},[x2],x8 |
| |
| |
| |
| |
| sqrshrn v11.4h, v28.4s,#shift_stage2_idct |
| ld1 {v22.8b},[x4],x8 |
| |
| |
| |
| |
| sqrshrn v16.4h, v26.4s,#shift_stage2_idct |
| ld1 {v21.8b},[x2],x5 |
| |
| |
| b pred_buff_addition |
| end_skip_last4cols: |
| |
| |
| umov x19,v25.d[0] |
| umov x20,v25.d[1] |
| |
| ///* now the idct of columns is done, transpose all four 4x4 quadrants so that the row idct is done efficiently(step5); v25/v27/v29/v31 are scratch */ |
| trn1 v27.4h, v2.4h, v6.4h |
| trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing |
| trn1 v25.4h, v3.4h, v7.4h |
| trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing |
| |
| trn1 v2.2s, v27.2s, v25.2s |
| trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first quadrant transposing continued..... |
| trn1 v6.2s, v29.2s, v31.2s |
| trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued..... |
| |
| trn1 v27.4h, v4.4h, v8.4h |
| trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second quadrant transposing |
| trn1 v25.4h, v5.4h, v9.4h |
| trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second quadrant transposing |
| |
| trn1 v4.2s, v27.2s, v25.2s |
| trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second quadrant transposing continued..... |
| trn1 v8.2s, v29.2s, v31.2s |
| trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second quadrant transposing continued..... |
| |
| trn1 v27.4h, v10.4h, v14.4h |
| trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing |
| trn1 v25.4h, v11.4h, v15.4h |
| trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing |
| |
| trn1 v10.2s, v27.2s, v25.2s |
| trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third quadrant transposing continued..... |
| trn1 v14.2s, v29.2s, v31.2s |
| trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued..... |
| |
| trn1 v27.4h, v12.4h, v16.4h |
| trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth quadrant transposing |
| trn1 v25.4h, v13.4h, v17.4h |
| trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth quadrant transposing |
| |
| trn1 v12.2s, v27.2s, v25.2s |
| trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth quadrant transposing continued..... |
| trn1 v16.2s, v29.2s, v31.2s |
| trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth quadrant transposing continued..... |
| |
| mov v25.d[0],x19 |
| mov v25.d[1],x20 |
| |
| ////step6 operate on first four rows and find their idct |
| ////register usage - storing and idct of rows ("first four" = rows 0-3, "second four" = rows 4-7) |
| //// cosine constants - v0.4h |
| //// sine constants - v1.4h |
| //// element 0 first four - v2 - y0 |
| //// element 1 first four - v6 - y1 |
| //// element 2 first four - v3 - y2 |
| //// element 3 first four - v7 - y3 |
| //// element 4 first four - v4 - y4 |
| //// element 5 first four - v8 - y5 |
| //// element 6 first four - v5 - y6 |
| //// element 7 first four - v9 - y7 |
| //// element 0 second four - v10 - y0 |
| //// element 1 second four - v14 - y1 |
| //// element 2 second four - v11 - y2 |
| //// element 3 second four - v15 - y3 |
| //// element 4 second four - v12 - y4 |
| //// element 5 second four - v16 - y5 |
| //// element 6 second four - v13 - y6 |
| //// element 7 second four - v17 - y7 |
| |
| //// map between the column kernel's registers and the ones used for rows 0-3 here |
| //// y0: v2 -> v2 |
| //// y1: v6 -> v6 |
| //// y2: v3 -> v3 |
| //// y3: v7 -> v7 |
| //// y4: v10 -> v4 |
| //// y5: v14 -> v8 |
| //// y6: v11 -> v5 |
| //// y7: v15 -> v9 |
| //// d0 accumulator: v6.4s -> v6.4s |
| //// c0: v10.4s -> v2.4s |
| //// a0: v14.4s -> v4.4s |
| |
| smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) |
| smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) |
| |
| smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (part of d1) |
| smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0); v6 is free, all y1 products are issued above |
| |
| |
| smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) |
| smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) |
| smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) |
| smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) |
| |
| sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) |
| add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) |
| |
| |
| add v2.4s, v4.4s , v24.4s |
| |
| sub v6.4s, v4.4s , v24.4s |
| |
| add v8.4s, v22.4s , v30.4s |
| |
| sub v24.4s, v22.4s , v30.4s |
| |
| sqrshrn v5.4h, v8.4s,#shift_stage2_idct |
| sqrshrn v2.4h, v2.4s,#shift_stage2_idct |
| sqrshrn v9.4h, v6.4s,#shift_stage2_idct |
| sqrshrn v6.4h, v24.4s,#shift_stage2_idct |
| |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) |
| |
| |
| add v30.4s, v22.4s , v28.4s |
| |
| sub v24.4s, v22.4s , v28.4s |
| |
| add v28.4s, v18.4s , v26.4s |
| |
| sub v22.4s, v18.4s , v26.4s |
| sqrshrn v4.4h, v30.4s,#shift_stage2_idct |
| sqrshrn v7.4h, v24.4s,#shift_stage2_idct |
| sqrshrn v3.4h, v28.4s,#shift_stage2_idct |
| sqrshrn v8.4h, v22.4s,#shift_stage2_idct |
| |
| |
| |
| umov x19,v25.d[0] |
| umov x20,v25.d[1] |
| |
| trn1 v27.4h, v2.4h, v3.4h |
| trn2 v29.4h, v2.4h, v3.4h |
| trn1 v25.4h, v4.4h, v5.4h |
| trn2 v31.4h, v4.4h, v5.4h |
| |
| trn1 v2.2s, v27.2s, v25.2s |
| trn2 v4.2s, v27.2s, v25.2s |
| trn1 v3.2s, v29.2s, v31.2s |
| trn2 v5.2s, v29.2s, v31.2s |
| |
| trn1 v27.4h, v6.4h, v7.4h |
| trn2 v29.4h, v6.4h, v7.4h |
| trn1 v25.4h, v8.4h, v9.4h |
| trn2 v31.4h, v8.4h, v9.4h |
| |
| trn1 v6.2s, v27.2s, v25.2s |
| trn2 v8.2s, v27.2s, v25.2s |
| trn1 v7.2s, v29.2s, v31.2s |
| trn2 v9.2s, v29.2s, v31.2s |
| |
| mov v25.d[0],x19 |
| mov v25.d[1],x20 |
| |
| |
| |
| smull v24.4s, v14.4h, v0.h[1] //// row idct of rows 4-7 (y0..y7 in v10,v14,v11,v15,v12,v16,v13,v17): y1 * cos1(part of b0) |
| smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) |
| smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) |
| smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) |
| smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (part of d1) |
| smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0); v14 is free, all y1 products are issued above |
| smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| |
| add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data (pred rows 2,3,6,7 are read through x4, rows 0,1,4,5 through x2) |
| smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| |
| add x5,x8,x8, lsl #1 // x5 = 3 * pred_strd |
| smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| |
| add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data (dest rows 2,3,6,7 are stored through x0, rows 0,1,4,5 through x3) |
| smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| add x10,x7,x7, lsl #1 // x10 = 3 * dst_strd |
| smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| |
| |
| smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) |
| |
| // swapping v3 and v6 (through v31): afterwards v2/v3 hold the two halves of output row 0 and v6/v7 those of row 1 |
| mov v31.d[0], v3.d[0] |
| mov v3.d[0], v6.d[0] |
| mov v6.d[0], v31.d[0] |
| |
| smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) |
| // swapping v5 and v8 (through v31): afterwards v4/v5 hold the two halves of output row 2 and v8/v9 those of row 3 |
| mov v31.d[0], v5.d[0] |
| mov v5.d[0], v8.d[0] |
| mov v8.d[0], v31.d[0] |
| |
| smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) |
| smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4); last read of the constants, v0 is reused as a temporary below |
| |
| sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) |
| add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) |
| |
| |
| add v0.4s, v12.4s , v24.4s |
| |
| |
| sub v24.4s, v12.4s , v24.4s |
| |
| |
| add v12.4s, v22.4s , v30.4s |
| |
| |
| sub v14.4s, v22.4s , v30.4s |
| |
| sqrshrn v10.4h, v0.4s,#shift_stage2_idct |
| sqrshrn v17.4h, v24.4s,#shift_stage2_idct |
| sqrshrn v13.4h, v12.4s,#shift_stage2_idct |
| sqrshrn v14.4h, v14.4s,#shift_stage2_idct |
| |
| sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) |
| add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) |
| |
| |
| add v0.4s, v22.4s , v28.4s |
| |
| |
| sub v24.4s, v22.4s , v28.4s |
| |
| |
| add v28.4s, v18.4s , v26.4s |
| |
| |
| sub v26.4s, v18.4s , v26.4s |
| ld1 {v18.8b},[x2],x8 |
| |
| sqrshrn v12.4h, v0.4s,#shift_stage2_idct |
| ld1 {v20.8b},[x2],x5 |
| |
| |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct |
| ld1 {v19.8b},[x2],x8 |
| |
| |
| |
| |
| sqrshrn v11.4h, v28.4s,#shift_stage2_idct |
| ld1 {v22.8b},[x4],x8 |
| |
| |
| |
| |
| sqrshrn v16.4h, v26.4s,#shift_stage2_idct |
| ld1 {v21.8b},[x2],x5 |
| |
| |
| |
| |
| pred_buff_addition: |
| |
| umov x19,v25.d[0] |
| umov x20,v25.d[1] |
| |
| trn1 v27.4h, v10.4h, v11.4h |
| trn2 v29.4h, v10.4h, v11.4h |
| trn1 v25.4h, v12.4h, v13.4h |
| trn2 v31.4h, v12.4h, v13.4h |
| |
| trn1 v10.2s, v27.2s, v25.2s |
| trn2 v12.2s, v27.2s, v25.2s |
| trn1 v11.2s, v29.2s, v31.2s |
| trn2 v13.2s, v29.2s, v31.2s |
| |
| trn1 v27.4h, v14.4h, v15.4h |
| trn2 v29.4h, v14.4h, v15.4h |
| trn1 v25.4h, v16.4h, v17.4h |
| trn2 v31.4h, v16.4h, v17.4h |
| |
| trn1 v14.2s, v27.2s, v25.2s |
| trn2 v16.2s, v27.2s, v25.2s |
| trn1 v15.2s, v29.2s, v31.2s |
| trn2 v17.2s, v29.2s, v31.2s |
| |
| |
| mov v25.d[0],x19 |
| mov v25.d[1],x20 |
| |
| |
| ld1 {v24.8b},[x4],x5 |
| ld1 {v23.8b},[x4],x8 |
| ld1 {v25.8b},[x4],x5 |
| mov v2.d[1], v3.d[0] |
| mov v4.d[1], v5.d[0] |
| mov v6.d[1], v7.d[0] |
| mov v8.d[1], v9.d[0] |
| uaddw v2.8h, v2.8h , v18.8b |
| uaddw v4.8h, v4.8h , v22.8b |
| uaddw v6.8h, v6.8h , v20.8b |
| uaddw v8.8h, v8.8h , v24.8b |
| |
| // swapping v11 and v14 (through v31): afterwards v10/v11 hold the two halves of output row 4 and v14/v15 those of row 5 |
| mov v31.d[0], v11.d[0] |
| mov v11.d[0], v14.d[0] |
| mov v14.d[0], v31.d[0] |
| |
| // swapping v13 and v16 (through v31): afterwards v12/v13 hold the two halves of output row 6 and v16/v17 those of row 7 |
| mov v31.d[0], v13.d[0] |
| mov v13.d[0], v16.d[0] |
| mov v16.d[0], v31.d[0] |
| // full 8-sample rows, one per 128-bit register once the halves are joined (x0..x7 = output rows 0..7): |
| |
| //v2 : x0 |
| //v6 : x1 |
| //v4 : x2 |
| //v8 : x3 |
| //v10: x4 |
| //v14: x5 |
| //v12: x6 |
| //v16: x7 |
| |
| |
| |
| ///// adding the prediction buffer: rows 0-3 were widened-added above, rows 4-7 follow below |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| // prediction rows 0..7 are held in v18, v20, v22, v24, v19, v21, v23, v25 (loaded above and on the way in) |
| |
| |
| |
| |
| |
| //adding recon with prediction: join the row halves, uaddw the prediction bytes, then sqxtun saturates each row to unsigned 8 bit |
| |
| |
| |
| |
| mov v10.d[1], v11.d[0] |
| mov v12.d[1], v13.d[0] |
| mov v14.d[1], v15.d[0] |
| mov v16.d[1], v17.d[0] |
| uaddw v10.8h, v10.8h , v19.8b |
| sqxtun v2.8b, v2.8h |
| uaddw v14.8h, v14.8h , v21.8b |
| sqxtun v4.8b, v4.8h |
| uaddw v12.8h, v12.8h , v23.8b |
| sqxtun v6.8b, v6.8h |
| uaddw v16.8h, v16.8h , v25.8b |
| sqxtun v8.8b, v8.8h |
| |
| |
| |
| |
| |
| |
| |
| st1 {v2.8b},[x3],x7 |
| sqxtun v10.8b, v10.8h |
| st1 {v6.8b},[x3],x10 |
| sqxtun v14.8b, v14.8h |
| st1 {v4.8b},[x0],x7 |
| sqxtun v12.8b, v12.8h |
| st1 {v8.8b},[x0],x10 |
| sqxtun v16.8b, v16.8h |
| |
| |
| |
| |
| |
| |
| |
| st1 {v10.8b},[x3],x7 |
| st1 {v14.8b},[x3],x10 |
| st1 {v12.8b},[x0],x7 |
| st1 {v16.8b},[x0],x10 |
| |
| |
| |
| |
| // epilogue: reload x19/x20 pushed in the prologue, then pop_v_regs (macro from ihevc_neon_macros.s, presumably the inverse of push_v_regs - confirm) and return |
| ldp x19, x20,[sp],#16 |
| pop_v_regs |
| ret |
| |
| |
| |
| |
| |