| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| |
| @/* |
| @//---------------------------------------------------------------------------- |
| @// File Name : impeg2_idct.s |
| @// |
| @// Description : This file has the Idct Implementations for the |
| @// MPEG2 SP decoder on neon platform. |
| @// |
| @// Reference Document : |
| @// |
| @// Revision History : |
| @// Date Author Detail Description |
| @// ------------ ---------------- ---------------------------------- |
| @// Feb 22, 2008 Naveen Kumar T Created |
| @// |
| @//------------------------------------------------------------------------- |
| @*/ |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Include Files |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| |
.text
.p2align 2
@ Fixed-point shift amounts for the two IDCT stages and their matching
@ rounding constants (round = half of the shift divisor, i.e. 1 << (shift-1)).
.equ idct_stg1_shift , 12
.equ idct_stg2_shift , 16
.equ idct_stg1_round , (1 << (idct_stg1_shift - 1))
.equ idct_stg2_round , (1 << (idct_stg2_shift - 1))
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Struct/Union Types and Define |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Static Global Data section variables |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| @//--------------------------- NONE -------------------------------------------- |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Static Prototype Functions |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| @// -------------------------- NONE -------------------------------------------- |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Exported functions |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| |
| .extern gai2_impeg2_idct_q15 |
| .hidden gai2_impeg2_idct_q15 |
| .extern gai2_impeg2_idct_q11 |
| .hidden gai2_impeg2_idct_q11 |
| .extern gai2_impeg2_idct_first_col_q15 |
| .hidden gai2_impeg2_idct_first_col_q15 |
| .extern gai2_impeg2_idct_first_col_q11 |
| .hidden gai2_impeg2_idct_first_col_q11 |
| .extern gai2_impeg2_mismatch_stg2_additive |
| .hidden gai2_impeg2_mismatch_stg2_additive |
| |
@ PC-relative literal pool.  Each entry stores the distance from the
@ label where the offset is consumed to the external table, minus 8,
@ because in ARM state reading PC yields the current instruction
@ address + 8.  The code does:
@     ldr  rX, <addr_label>      @ rX = table - lbl - 8
@ lbl: add  rX, rX, pc           @ rX = table  (pc reads as lbl + 8)
@ Separate entries exist per use site because the "- lbl" term differs.
gai2_impeg2_idct_q15_addr1:
.long gai2_impeg2_idct_q15 - q15lbl1 - 8
gai2_impeg2_idct_q15_addr2:
.long gai2_impeg2_idct_q15 - q15lbl2 - 8
gai2_impeg2_idct_q11_addr1:
.long gai2_impeg2_idct_q11 - q11lbl1 - 8
gai2_impeg2_idct_q11_addr2:
.long gai2_impeg2_idct_q11 - q11lbl2 - 8
gai2_impeg2_idct_first_col_q15_addr1:
.long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8
gai2_impeg2_idct_first_col_q15_addr2:
.long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8
gai2_impeg2_idct_first_col_q15_addr3:
.long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8
gai2_impeg2_mismatch_stg2_additive_addr:
.long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8
gai2_impeg2_idct_first_col_q11_addr1:
.long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8
gai2_impeg2_idct_first_col_q11_addr2:
.long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8
| |
.global impeg2_idct_recon_dc_a9q
@-----------------------------------------------------------------------
@ impeg2_idct_recon_dc_a9q
@
@ Fast path for a block whose only non-zero coefficient is the DC term:
@ the 8x8 IDCT output then collapses to a single constant value, which
@ is added to every pixel of the 8x8 prediction block, saturated to
@ [0, 255] and stored to the destination.
@
@ In:   r0 = pi2_src   (only pi2_src[0], the DC coefficient, is read)
@       r2 = pu1_pred  (8x8 prediction block, 8-bit pixels)
@       r3 = pu1_dst   (8x8 output block, 8-bit pixels)
@       [sp + 84] = pred_strd, [sp + 88] = dst_strd
@       (stack offsets are measured after the 16 bytes of GPRs and
@        64 bytes of d8-d15 pushed below, i.e. args start at sp + 80)
@ Uses: r1 = pred_strd, r4 = scalar DC accumulator, r6 = dst_strd,
@       r12 = scale factor from table, r14 = table address
@-----------------------------------------------------------------------
impeg2_idct_recon_dc_a9q:
stmfd sp!, {r4, r6, r12, lr}
vpush {d8-d15}
@//r0: pi2_src
@//r1: pi2_tmp - not used, used as pred_strd
@//r2: pu1_pred
@//r3: pu1_dst
@//r4: used as scratch
@//r5:

ldr r1, [sp, #84] @//pred_strd
ldr r6, [sp, #88] @//dst_strd

@ Load &gai2_impeg2_idct_q15[0] PC-relatively (stored offset is
@ relative to q15lbl1 + 8; ARM reads pc as current insn + 8).
ldr r14, gai2_impeg2_idct_q15_addr1
q15lbl1:
add r14, r14, pc
ldrsh r12, [r14] @// stage-1 scale factor (first Q15 table entry)
ldrsh r4, [r0] @// DC coefficient

@ Stage 1: r4 = (dc * q15_scale + round) >> idct_stg1_shift.
@ The eight prediction rows (d0-d7) are loaded in parallel with the
@ scalar math to hide the load latency.
vld1.8 d0, [r2], r1
mul r4, r4, r12

vld1.8 d1, [r2], r1
add r4, #idct_stg1_round

vld1.8 d2, [r2], r1
asr r4, r4, #idct_stg1_shift

@ Stage 2: rescale by the first Q11 table entry with its own
@ round-and-shift, yielding the final signed pixel offset.
ldr r14, gai2_impeg2_idct_q11_addr1
q11lbl1:
add r14, r14, pc
ldrsh r12, [r14] @// stage-2 scale factor (first Q11 table entry)

vld1.8 d3, [r2], r1
mul r4, r4, r12

vld1.8 d4, [r2], r1
add r4, #idct_stg2_round

vld1.8 d5, [r2], r1
asr r4, r4, #idct_stg2_shift

vld1.8 d6, [r2], r1
vdup.s16 q15, r4 @// broadcast DC result to all 8 s16 lanes


vld1.8 d7, [r2], r1

@ Per row: widen pred bytes and add the DC offset (vaddw.u8), then
@ saturate back to unsigned 8-bit (vqmovun) and store.  The three-deep
@ software pipeline (add / narrow / store) overlaps the rows.
vaddw.u8 q4, q15, d0

vaddw.u8 q5, q15, d1
vqmovun.s16 d0, q4

vaddw.u8 q6, q15, d2
vqmovun.s16 d1, q5
vst1.8 d0, [r3], r6

vaddw.u8 q7, q15, d3
vqmovun.s16 d2, q6
vst1.8 d1, [r3], r6

vaddw.u8 q8, q15, d4
vqmovun.s16 d3, q7
vst1.8 d2, [r3], r6

vaddw.u8 q9, q15, d5
vqmovun.s16 d4, q8
vst1.8 d3, [r3], r6

vaddw.u8 q10, q15, d6
vqmovun.s16 d5, q9
vst1.8 d4, [r3], r6

vaddw.u8 q11, q15, d7
vqmovun.s16 d6, q10
vst1.8 d5, [r3], r6

vqmovun.s16 d7, q11
vst1.8 d6, [r3], r6


vst1.8 d7, [r3], r6

vpop {d8-d15}
ldmfd sp!, {r4, r6, r12, pc}
| |
| |
| |
| |
.global impeg2_idct_recon_dc_mismatch_a9q
@-----------------------------------------------------------------------
@ impeg2_idct_recon_dc_mismatch_a9q
@
@ DC-only reconstruction path with MPEG-2 mismatch control: the scaled
@ 32-bit DC value is combined per pixel with an additive term from
@ gai2_impeg2_mismatch_stg2_additive before the stage-2 shift, then
@ added to the 8x8 prediction and saturated to [0, 255].
@
@ In:   r0 = pi2_src   (only pi2_src[0], the DC coefficient, is read)
@       r2 = pu1_pred  (8x8 prediction block, 8-bit pixels)
@       r3 = pu1_dst   (8x8 output block, 8-bit pixels)
@       [sp + 108] = pred_strd, [sp + 112] = dst_strd
@       (stack offsets account for the 40 bytes of GPRs and 64 bytes
@        of d8-d15 pushed below, i.e. args start at sp + 104)
@ Uses: r1 = pred_strd, r4 = DC scratch / table ptr, r6 = dst_strd,
@       r12 = scale factor, r14 = table address / row increment
@-----------------------------------------------------------------------
impeg2_idct_recon_dc_mismatch_a9q:
stmfd sp!, {r4-r12, lr}
vpush {d8-d15}

ldr r1, [sp, #108] @//pred_strd
ldr r6, [sp, #112] @//dst_strd

@ Stage 1: r4 = (dc * q15_scale + round) >> idct_stg1_shift, with the
@ table address formed PC-relatively (offset stored minus lbl + 8).
ldr r14, gai2_impeg2_idct_q15_addr2
q15lbl2:
add r14, r14, pc
ldrsh r12, [r14] @// stage-1 scale factor (first Q15 table entry)
ldrsh r4, [r0] @// DC coefficient

mul r4, r4, r12
add r4, #idct_stg1_round
asr r4, r4, #idct_stg1_shift

@ Stage 2 multiply only; the stage-2 round-and-shift is NOT done here.
@ It is folded into vraddhn.s32 below, which narrows each 32-bit sum
@ to its high 16 bits with rounding, i.e. (a + b + (1 << 15)) >> 16 —
@ exactly idct_stg2_round / idct_stg2_shift.
ldr r14, gai2_impeg2_idct_q11_addr2
q11lbl2:
add r14, r14, pc
ldrsh r12, [r14] @// stage-2 scale factor (first Q11 table entry)
mul r4, r4, r12
vdup.s32 q0, r4 @// broadcast 32-bit DC value to all 4 lanes

mov r14, #16 @//Increment for table read
ldr r4, gai2_impeg2_mismatch_stg2_additive_addr
additive_lbl:
add r4, r4, pc

@ Row 0 (the same 9-instruction pattern is unrolled for all 8 rows):
@   q1      = 8 s16 mismatch additives for this row
@   q4, q5  = additives widened to s32
@   d12,d13 = (dc + additive) rounded and shifted down by 16 (vraddhn)
@   q7      = pred row widened and offset added (vaddw.u8)
@   d30     = saturated to u8 (vqmovun) and stored
vld1.16 {q1}, [r4], r14

vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 1
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 2
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 3
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 4
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 5
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 6
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6

@ Row 7
vld1.16 {q1}, [r4], r14
vld1.8 d30, [r2], r1
vmovl.s16 q4, d2
vmovl.s16 q5, d3
vraddhn.s32 d12, q0, q4
vraddhn.s32 d13, q0, q5
vaddw.u8 q7, q6, d30
vqmovun.s16 d30, q7
vst1.8 d30, [r3], r6


vpop {d8-d15}
ldmfd sp!, {r4-r12, pc}
| |
| |
| |
| |
| @/** |
| @ ******************************************************************************* |
| @ * |
| @ * ;brief |
| @ * This function performs Inverse transform and reconstruction for 8x8 |
| @ * input block |
| @ * |
| @ * ;par Description: |
| @ * Performs inverse transform and adds the prediction data and clips output |
| @ * to 8 bit |
| @ * |
| @ * ;param[in] pi2_src |
| @ * Input 8x8 coefficients |
| @ * |
| @ * ;param[in] pi2_tmp |
| @ * Temporary 8x8 buffer for storing inverse |
| @ * |
| @ * transform |
| @ * 1st stage output |
| @ * |
| @ * ;param[in] pu1_pred |
| @ * Prediction 8x8 block |
| @ * |
| @ * ;param[out] pu1_dst |
| @ * Output 8x8 block |
| @ * |
| @ * ;param[in] src_strd |
| @ * Input stride |
| @ * |
| @ * ;param[in] pred_strd |
| @ * Prediction stride |
| @ * |
| @ * ;param[in] dst_strd |
| @ * Output Stride |
| @ * |
| @ * ;param[in] shift |
| @ * Output shift |
| @ * |
| @ * ;param[in] zero_cols |
| @ * Zero columns in pi2_src |
| @ * |
| @ * ;returns Void |
| @ * |
| @ * ;remarks |
| @ * None |
| @ * |
| @ ******************************************************************************* |
| @ */ |
| |
| @void impeg2_itrans_recon_8x8(WORD16 *pi2_src, |
| @ WORD16 *pi2_tmp, |
| @ UWORD8 *pu1_pred, |
| @ UWORD8 *pu1_dst, |
| @ WORD32 src_strd, |
| @ WORD32 pred_strd, |
| @ WORD32 dst_strd, |
@ WORD32 zero_cols,
@ WORD32 zero_rows )
| |
| @**************Variables Vs Registers************************* |
| @ r0 => *pi2_src |
| @ r1 => *pi2_tmp |
| @ r2 => *pu1_pred |
| @ r3 => *pu1_dst |
| @ src_strd |
| @ pred_strd |
| @ dst_strd |
| @ zero_cols |
| |
| |
| |
| .global impeg2_idct_recon_a9q |
| impeg2_idct_recon_a9q: |
| @//Register Usage Reference - loading and Until IDCT of columns |
| @// Cosine Constants - D0 |
| @// Sine Constants - D1 |
| @// Row 0 First Half - D2 - y0 |
| @// Row 1 First Half - D6 - y1 |
| @// Row 2 First Half - D3 - y2 |
| @// Row 3 First Half - D7 - y3 |
| @// Row 4 First Half - D10 - y4 |
| @// Row 5 First Half - D14 - y5 |
| @// Row 6 First Half - D11 - y6 |
| @// Row 7 First Half - D15 - y7 |
| |
| @// Row 0 Second Half - D4 - y0 |
| @// Row 1 Second Half - D8 - y1 |
| @// Row 2 Second Half - D5 - y2 |
| @// Row 3 Second Half - D9 - y3 |
| @// Row 4 Second Half - D12 - y4 |
| @// Row 5 Second Half - D16 - y5 |
| @// Row 6 Second Half - D13 - y6 |
| @// Row 7 Second Half - D17 - y7 |
| |
| @// Copy the input pointer to another register |
| @// Step 1 : load all constants |
| stmfd sp!, {r4-r12, lr} |
| vpush {d8-d15} |
| |
| ldr r8, [sp, #108] @ prediction stride |
| ldr r7, [sp, #112] @ destination stride |
| ldr r6, [sp, #104] @ src stride |
| ldr r12, [sp, #116] |
| ldr r11, [sp, #120] |
| |
| mov r6, r6, lsl #1 @ x sizeof(word16) |
| add r9, r0, r6, lsl #1 @ 2 rows |
| |
| add r10, r6, r6, lsl #1 @ 3 rows |
| |
| sub r10, r10, #8 @ - 4 cols * sizeof(WORD16) |
| sub r5, r6, #8 @ src_strd - 4 cols * sizeof(WORD16) |
| |
| |
| ldr r14, gai2_impeg2_idct_first_col_q15_addr1 |
| fcq15_lbl1: |
| add r14, r14, pc |
| vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data |
| |
| @//Step 2 Load all the input data |
@//Step 3 Operate first 4 columns at a time
| |
| and r11, r11, #0xff |
| and r12, r12, #0xff |
| |
| cmp r11, #0xf0 |
| bge skip_last4_rows |
| |
| |
| vld1.16 d2, [r0]! |
| vld1.16 d3, [r9]! |
| vld1.16 d4, [r0], r5 |
| vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) |
| vld1.16 d5, [r9], r5 |
| vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) |
| vld1.16 d6, [r0]! |
| vld1.16 d7, [r9]! |
| vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) |
| vld1.16 d8, [r0], r10 |
| vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) |
| vld1.16 d9, [r9], r10 |
| vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) |
| vld1.16 d10, [r0]! |
| vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) |
| vld1.16 d11, [r9]! |
| vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vld1.16 d12, [r0], r5 |
| vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vld1.16 d13, [r9], r5 |
| vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vld1.16 d14, [r0]! |
| vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| vld1.16 d15, [r9]! |
| vmull.s16 q11, d10, d0[0] @// y4 * cos4(part of c0 and c1) |
| vld1.16 d16, [r0], r10 |
| vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) |
| vld1.16 d17, [r9], r10 |
| |
| @/* This following was activated when alignment is not there */ |
| @// VLD1.16 D2,[r0]! |
| @// VLD1.16 D3,[r2]! |
| @// VLD1.16 D4,[r0]! |
| @// VLD1.16 D5,[r2]! |
| @// VLD1.16 D6,[r0]! |
| @// VLD1.16 D7,[r2]! |
| @// VLD1.16 D8,[r0],r3 |
| @// VLD1.16 D9,[r2],r3 |
| @// VLD1.16 D10,[r0]! |
| @// VLD1.16 D11,[r2]! |
| @// VLD1.16 D12,[r0]! |
| @// VLD1.16 D13,[r2]! |
| @// VLD1.16 D14,[r0]! |
| @// VLD1.16 D15,[r2]! |
| @// VLD1.16 D16,[r0],r3 |
| @// VLD1.16 D17,[r2],r3 |
| |
| |
| |
| |
| vmlal.s16 q12, d14, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| vmlsl.s16 q13, d14, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| vmlal.s16 q14, d14, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| vmlal.s16 q15, d14, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| vmlsl.s16 q9, d11, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| vmlal.s16 q3, d11, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| vadd.s32 q5, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| vmlal.s16 q12, d15, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) |
| vmlsl.s16 q13, d15, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) |
| vmlal.s16 q14, d15, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) |
| vmlsl.s16 q15, d15, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) |
| |
| vadd.s32 q7, q5, q3 @// a0 = c0 + d0(part of r0,r7) |
| vsub.s32 q5, q5, q3 @// a3 = c0 - d0(part of r3,r4) |
| vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) |
| vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) |
| |
| vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) |
| vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) |
| |
| vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) |
| vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) |
| |
| vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) |
| vsub.s32 q9, q9, q13 @// a1 - b1(part of r6) |
| |
| vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) |
| vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) |
| |
| vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| |
| |
| b last4_cols |
| |
| |
| |
| skip_last4_rows: |
| |
| |
| ldr r14, gai2_impeg2_idct_first_col_q15_addr2 |
| fcq15_lbl2: |
| add r14, r14, pc |
| vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data |
| |
| vld1.16 d2, [r0]! |
| vld1.16 d3, [r9]! |
| vld1.16 d4, [r0], r5 |
| vld1.16 d5, [r9], r5 |
| vld1.16 d6, [r0]! |
| vld1.16 d7, [r9]! |
| vld1.16 d8, [r0], r10 |
| vld1.16 d9, [r9], r10 |
| |
| |
| |
| vmov.s16 q6, #0 |
| vmov.s16 q8, #0 |
| |
| |
| |
| |
| vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) |
| vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) |
| vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) |
| vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) |
| |
| vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| |
| vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) |
| vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) |
| |
| vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) |
| |
| |
| vadd.s32 q7, q10, q3 @// a0 = c0 + d0(part of r0,r7) |
| vsub.s32 q5, q10, q3 @// a3 = c0 - d0(part of r3,r4) |
| vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) |
| vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) |
| |
| vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) |
| vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) |
| |
| vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) |
| vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) |
| |
| vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) |
| vsub.s32 q9, q9, q13 @// a1 - b1(part of r6) |
| |
| vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) |
| vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) |
| |
| vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| |
| |
| last4_cols: |
| |
| |
| cmp r12, #0xf0 |
| bge skip_last4cols |
| |
| ldr r14, gai2_impeg2_idct_first_col_q15_addr3 |
| fcq15_lbl3: |
| add r14, r14, pc |
| vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data |
| |
| vmull.s16 q12, d8, d0[1] @// y1 * cos1(part of b0) |
| vmull.s16 q13, d8, d0[3] @// y1 * cos3(part of b1) |
| vmull.s16 q14, d8, d1[1] @// y1 * sin3(part of b2) |
| vmull.s16 q15, d8, d1[3] @// y1 * sin1(part of b3) |
| |
| vmlal.s16 q12, d9, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vmlsl.s16 q13, d9, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vmlsl.s16 q14, d9, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vmlsl.s16 q15, d9, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| |
| vmull.s16 q9, d5, d1[2] @// y2 * sin2 (Q4 is freed by this time)(part of d1) |
| vmull.s16 q4, d5, d0[2] @// y2 * cos2(part of d0) |
| |
| vmull.s16 q10, d4, d0[0] @// y0 * cos4(part of c0 and c1) |
| vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) |
| |
| vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| vmlal.s16 q4, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
| vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) |
| vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) |
| |
| vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) |
| vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) |
| vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) |
| vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) |
| |
| vadd.s32 q8, q6, q4 @// a0 = c0 + d0(part of e0,e7) |
| vsub.s32 q6, q6, q4 @// a3 = c0 - d0(part of e3,e4) |
| vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of e2,e5) |
| vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of e1,e6) |
| |
| vadd.s32 q10, q8, q12 @// a0 + b0(part of e0) |
| vsub.s32 q4, q8, q12 @// a0 - b0(part of e7) |
| |
| vadd.s32 q12, q11, q14 @// a2 + b2(part of e2) |
| vsub.s32 q11, q11, q14 @// a2 - b2(part of e5) |
| |
| vadd.s32 q14, q9, q13 @// a1 + b1(part of e1) |
| vsub.s32 q9, q9, q13 @// a1 - b1(part of e6) |
| |
| vadd.s32 q13, q6, q15 @// a3 + b3(part of e3) |
| vsub.s32 q15, q6, q15 @// a3 - b3(part of r4) |
| |
| vqrshrn.s32 d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| vqrshrn.s32 d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) |
| b end_skip_last4cols |
| |
| |
| |
| skip_last4cols: |
| |
| |
| |
| ldr r14, gai2_impeg2_idct_first_col_q11_addr1 |
| fcq11_lbl1: |
| add r14, r14, pc |
| vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data |
| |
| |
| |
| vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing |
| |
| vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing |
| |
| |
| vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... |
| vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... |
| |
| vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... |
| vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... |
| |
| |
| vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) |
| vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) |
| vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) |
| vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) |
| |
| vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| |
| vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) |
| @ VMULL.S16 Q11,D4,D0[0] ;// y4 * cos4(part of c0 and c1) |
| |
| vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) |
| vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) |
| |
| |
| |
| |
| vsub.s32 q11, q10, q3 @// a3 = c0 - d0(part of r3,r4) |
| vadd.s32 q2, q10, q3 @// a0 = c0 + d0(part of r0,r7) |
| |
| |
| vadd.s32 q1, q2, q12 |
| |
| vsub.s32 q3, q2, q12 |
| |
| vadd.s32 q4, q11, q15 |
| |
| vsub.s32 q12, q11, q15 |
| |
| vqrshrn.s32 d5, q4, #idct_stg2_shift |
| vqrshrn.s32 d2, q1, #idct_stg2_shift |
| vqrshrn.s32 d9, q3, #idct_stg2_shift |
| vqrshrn.s32 d6, q12, #idct_stg2_shift |
| |
| vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) |
| vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) |
| |
| |
| vadd.s32 q15, q11, q14 |
| |
| vsub.s32 q12, q11, q14 |
| |
| vadd.s32 q14, q9, q13 |
| |
| vsub.s32 q11, q9, q13 |
| vqrshrn.s32 d4, q15, #idct_stg2_shift |
| vqrshrn.s32 d7, q12, #idct_stg2_shift |
| vqrshrn.s32 d3, q14, #idct_stg2_shift |
| vqrshrn.s32 d8, q11, #idct_stg2_shift |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) |
| |
| vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) |
| vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) |
| vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) |
| |
| vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vtrn.16 d2, d3 |
| vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vtrn.16 d4, d5 |
| vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vtrn.16 d6, d7 |
| vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| vtrn.16 d8, d9 |
| vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) |
| vtrn.32 d2, d4 |
| |
| vtrn.32 d3, d5 |
| vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) |
| vtrn.32 d6, d8 |
| vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) |
| vtrn.32 d7, d9 |
| |
| |
| add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data |
| |
| |
| add r5, r8, r8, lsl #1 @ |
| |
| |
| add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data |
| |
| |
| add r10, r7, r7, lsl #1 @ |
| |
| |
| vswp d3, d6 |
| |
| |
| vswp d5, d8 |
| |
| |
| vsub.s32 q11, q10, q7 @// a3 = c0 - d0(part of r3,r4) |
| vadd.s32 q6, q10, q7 @// a0 = c0 + d0(part of r0,r7) |
| |
| |
| vadd.s32 q0, q6, q12 |
| |
| |
| vsub.s32 q12, q6, q12 |
| |
| |
| vadd.s32 q6, q11, q15 |
| |
| |
| vsub.s32 q7, q11, q15 |
| |
| vqrshrn.s32 d10, q0, #idct_stg2_shift |
| vqrshrn.s32 d17, q12, #idct_stg2_shift |
| vqrshrn.s32 d13, q6, #idct_stg2_shift |
| vqrshrn.s32 d14, q7, #idct_stg2_shift |
| |
| vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) |
| vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) |
| |
| |
| vadd.s32 q0, q11, q14 |
| |
| |
| vsub.s32 q12, q11, q14 |
| |
| |
| vadd.s32 q14, q9, q13 |
| |
| |
| vsub.s32 q13, q9, q13 |
| vld1.8 d18, [r2], r8 |
| |
| vqrshrn.s32 d12, q0, #idct_stg2_shift |
| vld1.8 d20, [r2], r5 |
| |
| |
| vqrshrn.s32 d15, q12, #idct_stg2_shift |
| vld1.8 d19, [r2], r8 |
| |
| |
| |
| |
| vqrshrn.s32 d11, q14, #idct_stg2_shift |
| vld1.8 d22, [r4], r8 |
| |
| |
| |
| |
| vqrshrn.s32 d16, q13, #idct_stg2_shift |
| vld1.8 d21, [r2], r5 |
| |
| |
| b pred_buff_addition |
| end_skip_last4cols: |
| |
| ldr r14, gai2_impeg2_idct_first_col_q11_addr2 |
| fcq11_lbl2: |
| add r14, r14, pc |
| vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data |
| |
| |
| @/* Now the Idct of columns is done, transpose so that row idct done efficiently(step5) */ |
| vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing |
| vtrn.16 q2, q4 @//[r3,r1],[r2,r0] second qudrant transposing |
| vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing |
| vtrn.16 q6, q8 @//[r7,r5],[r6,r4] fourth qudrant transposing |
| |
| vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... |
| vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... |
| vtrn.32 d4, d5 @//r0,r1,r2,r3 second qudrant transposing continued..... |
| vtrn.32 d8, d9 @//r0,r1,r2,r3 second qudrant transposing continued..... |
| vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... |
| vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... |
| vtrn.32 d12, d13 @//r4,r5,r6,r7 fourth qudrant transposing continued..... |
| vtrn.32 d16, d17 @//r4,r5,r6,r7 fourth qudrant transposing continued..... |
| |
| @//step6 Operate on first four rows and find their idct |
| @//Register Usage Reference - storing and IDCT of rows |
| @// Cosine Constants - D0 |
| @// Sine Constants - D1 |
| @// Element 0 First four - D2 - y0 |
| @// Element 1 First four - D6 - y1 |
| @// Element 2 First four - D3 - y2 |
| @// Element 3 First four - D7 - y3 |
| @// Element 4 First four - D4 - y4 |
| @// Element 5 First four - D8 - y5 |
| @// Element 6 First four - D5 - y6 |
| @// Element 7 First four - D9 - y7 |
| @// Element 0 Second four - D10 - y0 |
| @// Element 1 Second four - D14 - y1 |
| @// Element 2 Second four - D11 - y2 |
| @// Element 3 Second four - D15 - y3 |
| @// Element 4 Second four - D12 - y4 |
| @// Element 5 Second four - D16 - y5 |
| @// Element 6 Second four - D13 - y6 |
| @// Element 7 Second four - D17 - y7 |
| |
| @// Map between first kernel code seq and current |
| @// D2 -> D2 |
| @// D6 -> D6 |
| @// D3 -> D3 |
| @// D7 -> D7 |
| @// D10 -> D4 |
| @// D14 -> D8 |
| @// D11 -> D5 |
| @// D15 -> D9 |
| @// Q3 -> Q3 |
| @// Q5 -> Q2 |
| @// Q7 -> Q4 |
| |
| vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) |
| vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) |
| vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) |
| vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) |
| |
| vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) |
| vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) |
| vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) |
| vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) |
| |
| vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) |
| vmull.s16 q11, d4, d0[0] @// y4 * cos4(part of c0 and c1) |
| |
| vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) |
| vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) |
| |
| |
| vmlal.s16 q12, d8, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) |
| vmlsl.s16 q13, d8, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) |
| vmlal.s16 q14, d8, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) |
| vmlal.s16 q15, d8, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) |
| |
| vmlsl.s16 q9, d5, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) |
| vmlal.s16 q3, d5, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) |
| |
                @ ------------------------------------------------------------------
                @ (continuation of IDCT routine - entry and coefficient setup are
                @ above this chunk). Stage-2 (column pass) for the first four
                @ transposed rows: combine even part (a0..a3) with odd part (b0..b3),
                @ round-narrow to 16 bits, then begin stage-2 multiplies for the
                @ remaining four rows while transposing the freshly narrowed results.
                @ NOTE(review): d0/d1 are presumed to hold the cos/sin coefficient
                @ table loaded earlier - confirm against the routine prologue.
                @ ------------------------------------------------------------------

                vadd.s32        q1, q10, q11                @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
                vsub.s32        q10, q10, q11               @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

                vmlal.s16       q12, d9, d1[3]              @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
                vmlsl.s16       q13, d9, d1[1]              @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
                vmlal.s16       q14, d9, d0[3]              @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
                vmlsl.s16       q15, d9, d0[1]              @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

                vsub.s32        q11, q1, q3                 @// a3 = c0 - d0(part of r3,r4)
                vadd.s32        q2, q1, q3                  @// a0 = c0 + d0(part of r0,r7)

                @ Butterfly: rN = aK + bK, r(7-N) = aK - bK
                vadd.s32        q1, q2, q12                 @// a0 + b0 (part of r0)

                vsub.s32        q3, q2, q12                 @// a0 - b0 (part of r7)

                vadd.s32        q4, q11, q15                @// a3 + b3 (part of r3)

                vsub.s32        q12, q11, q15               @// a3 - b3 (part of r4)

                @ Round and narrow the 32-bit sums back to 16 bits (stage-2 shift)
                vqrshrn.s32     d5, q4, #idct_stg2_shift
                vqrshrn.s32     d2, q1, #idct_stg2_shift
                vqrshrn.s32     d9, q3, #idct_stg2_shift
                vqrshrn.s32     d6, q12, #idct_stg2_shift

                vsub.s32        q11, q10, q9                @// a2 = c1 - d1(part of r2,r5)
                vadd.s32        q9, q10, q9                 @// a1 = c1 + d1(part of r1,r6)

                vadd.s32        q15, q11, q14               @// a2 + b2 (part of r2)

                vsub.s32        q12, q11, q14               @// a2 - b2 (part of r5)

                vadd.s32        q14, q9, q13                @// a1 + b1 (part of r1)

                vsub.s32        q11, q9, q13                @// a1 - b1 (part of r6)
                vqrshrn.s32     d4, q15, #idct_stg2_shift
                vqrshrn.s32     d7, q12, #idct_stg2_shift
                vqrshrn.s32     d3, q14, #idct_stg2_shift
                vqrshrn.s32     d8, q11, #idct_stg2_shift

                @ ------------------------------------------------------------------
                @ Stage 2 for the remaining four transposed rows: odd coefficients in
                @ d14-d17, even ones in d10-d13. The vtrn/vswp instructions are
                @ interleaved with the multiplies to transpose the d2-d9 results
                @ computed above while the MAC pipeline is busy (dual-issue friendly).
                @ ------------------------------------------------------------------

                vmull.s16       q12, d14, d0[1]             @// y1 * cos1(part of b0)

                vmull.s16       q13, d14, d0[3]             @// y1 * cos3(part of b1)
                vmull.s16       q14, d14, d1[1]             @// y1 * sin3(part of b2)
                vmull.s16       q15, d14, d1[3]             @// y1 * sin1(part of b3)

                vmlal.s16       q12, d15, d0[3]             @// y1 * cos1 + y3 * cos3(part of b0)
                vtrn.16         d2, d3                      @ transpose step for rows 0-3 results
                vmlsl.s16       q13, d15, d1[3]             @// y1 * cos3 - y3 * sin1(part of b1)
                vtrn.16         d4, d5
                vmlsl.s16       q14, d15, d0[1]             @// y1 * sin3 - y3 * cos1(part of b2)
                vtrn.16         d6, d7
                vmlsl.s16       q15, d15, d1[1]             @// y1 * sin1 - y3 * sin3(part of b3)
                vtrn.16         d8, d9
                vmull.s16       q10, d10, d0[0]             @// y0 * cos4(part of c0 and c1)
                vtrn.32         d2, d4
                vmull.s16       q11, d12, d0[0]             @// y4 * cos4(part of c0 and c1)
                vtrn.32         d3, d5
                vmull.s16       q9, d11, d1[2]              @// y2 * sin2 (Q7 is freed by this time)(part of d1)
                vtrn.32         d6, d8
                vmull.s16       q7, d11, d0[2]              @// y2 * cos2(part of d0)
                vtrn.32         d7, d9
                vmlal.s16       q12, d16, d1[1]             @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

                add             r4, r2, r8, lsl #1          @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
                vmlsl.s16       q13, d16, d0[1]             @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

                add             r5, r8, r8, lsl #1          @ r5 = pred_strd * 3 (post-increment for every other pred row)
                vmlal.s16       q14, d16, d1[3]             @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

                add             r0, r3, r7, lsl #1          @ r0 points to 3rd row of dest data
                vmlal.s16       q15, d16, d0[3]             @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

                add             r10, r7, r7, lsl #1         @ r10 = dst_strd * 3 (post-increment for every other dst row)
                vmlsl.s16       q9, d13, d0[2]              @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)

                vmlal.s16       q7, d13, d1[2]              @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

                vadd.s32        q6, q10, q11                @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
                vsub.s32        q10, q10, q11               @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

                vmlal.s16       q12, d17, d1[3]             @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
                vswp            d3, d6                      @ finish 4x4 transpose of rows 0-3 results
                vmlsl.s16       q13, d17, d1[1]             @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)

                vswp            d5, d8
                vmlal.s16       q14, d17, d0[3]             @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
                vmlsl.s16       q15, d17, d0[1]             @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

                vsub.s32        q11, q6, q7                 @// a3 = c0 - d0(part of r3,r4)
                vadd.s32        q6, q6, q7                  @// a0 = c0 + d0(part of r0,r7)

                vadd.s32        q0, q6, q12                 @// a0 + b0 (part of r0)

                vsub.s32        q12, q6, q12                @// a0 - b0 (part of r7)

                vadd.s32        q6, q11, q15                @// a3 + b3 (part of r3)

                vsub.s32        q7, q11, q15                @// a3 - b3 (part of r4)

                @ Round-narrow rows 4-7 results; start loading prediction bytes in
                @ parallel (r2 walks pred rows with strides r8 / r5 = 3*r8)
                vqrshrn.s32     d10, q0, #idct_stg2_shift
                vqrshrn.s32     d17, q12, #idct_stg2_shift
                vqrshrn.s32     d13, q6, #idct_stg2_shift
                vqrshrn.s32     d14, q7, #idct_stg2_shift

                vsub.s32        q11, q10, q9                @// a2 = c1 - d1(part of r2,r5)
                vadd.s32        q9, q10, q9                 @// a1 = c1 + d1(part of r1,r6)

                vadd.s32        q0, q11, q14                @// a2 + b2 (part of r2)

                vsub.s32        q12, q11, q14               @// a2 - b2 (part of r5)

                vadd.s32        q14, q9, q13                @// a1 + b1 (part of r1)

                vsub.s32        q13, q9, q13                @// a1 - b1 (part of r6)
                vld1.8          d18, [r2], r8               @ pred row 0

                vqrshrn.s32     d12, q0, #idct_stg2_shift
                vld1.8          d20, [r2], r5               @ pred row 1 (then skip to row 4's half)

                vqrshrn.s32     d15, q12, #idct_stg2_shift
                vld1.8          d19, [r2], r8               @ pred row

                vqrshrn.s32     d11, q14, #idct_stg2_shift
                vld1.8          d22, [r4], r8               @ pred row (r4 = 3rd-row pointer)

                vqrshrn.s32     d16, q13, #idct_stg2_shift
                vld1.8          d21, [r2], r5               @ pred row
@ ----------------------------------------------------------------------------
@ pred_buff_addition:
@   Final phase of the routine: finish transposing the stage-2 results for
@   rows 4-7, add the 8-bit prediction rows (widening u8 -> s16), saturate
@   back down to u8, and store the eight reconstructed rows.
@   Visible register roles (from the code below):
@     r2/r4 = prediction row pointers, r8 = pred stride, r5 = 3 * pred stride
@     r3/r0 = destination row pointers, r7 = dst stride, r10 = 3 * dst stride
@   NOTE(review): label appears to be a local continuation point, not a
@   separate entry - the registers above are set up earlier in the routine.
@ ----------------------------------------------------------------------------
pred_buff_addition:


                vtrn.16         d10, d11                    @ transpose rows 4-7 results (16-bit lanes)
                vld1.8          d24, [r4], r5               @ pred row

                vtrn.16         d12, d13
                vld1.8          d23, [r4], r8               @ pred row

                vaddw.u8        q1, q1, d18                 @ r0 += pred row 0 (u8 widened to s16)
                vld1.8          d25, [r4], r5               @ pred row (last)

                vtrn.16         d14, d15
                vaddw.u8        q2, q2, d22                 @ r2 += pred

                vtrn.16         d16, d17
                vaddw.u8        q3, q3, d20                 @ r1 += pred

                vtrn.32         d10, d12                    @ transpose rows 4-7 results (32-bit lanes)
                vaddw.u8        q4, q4, d24                 @ r3 += pred

                vtrn.32         d11, d13
                vtrn.32         d14, d16
                vtrn.32         d15, d17

                vswp            d11, d14                    @ finish 4x4 transpose of rows 4-7
                vswp            d13, d16

                @ Row values stored in the q register.

                @Q1 :r0
                @Q3: r1
                @Q2: r2
                @Q4: r3
                @Q5: r4
                @Q7: r5
                @Q6: r6
                @Q8: r7



                @/// Adding the prediction buffer









                @ Load prediction data




                @Adding recon with prediction




                @ Add remaining prediction rows, then saturate each s16 row to u8
                @ (vqmovun clamps to [0,255])
                vaddw.u8        q5, q5, d19                 @ r4 += pred
                vqmovun.s16     d2, q1                      @ r0 -> u8
                vaddw.u8        q7, q7, d21                 @ r5 += pred
                vqmovun.s16     d4, q2                      @ r2 -> u8
                vaddw.u8        q6, q6, d23                 @ r6 += pred
                vqmovun.s16     d6, q3                      @ r1 -> u8
                vaddw.u8        q8, q8, d25                 @ r7 += pred
                vqmovun.s16     d8, q4                      @ r3 -> u8

                @ Store reconstructed rows: r3 walks rows 0,1,4,5 and r0 walks rows
                @ 2,3,6,7 using strides r7 and r10 = 3*r7

                vst1.8          {d2}, [r3], r7              @ store row 0
                vqmovun.s16     d10, q5                     @ r4 -> u8
                vst1.8          {d6}, [r3], r10             @ store row 1
                vqmovun.s16     d14, q7                     @ r5 -> u8
                vst1.8          {d4}, [r0], r7              @ store row 2
                vqmovun.s16     d12, q6                     @ r6 -> u8
                vst1.8          {d8}, [r0], r10             @ store row 3
                vqmovun.s16     d16, q8                     @ r7 -> u8

                vst1.8          {d10}, [r3], r7             @ store row 4
                vst1.8          {d14}, [r3], r10            @ store row 5
                vst1.8          {d12}, [r0], r7             @ store row 6
                vst1.8          {d16}, [r0], r10            @ store row 7

                @ Epilogue: restore callee-saved NEON regs (d8-d15 per AAPCS) and
                @ core regs, returning via pc
                vpop            {d8-d15}
                ldmfd           sp!, {r4-r12, pc}