| /* |
| * Copyright (C) 2008 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <machine/cpu-features.h> |
| |
| // Place the routine in the code section and align its start. |
| .text |
| .align |
| |
| // Export the entry point. The .type directive marks the symbol as a |
| // function in the ELF symbol table, so the linker and symbol-based tools |
| // treat it as code rather than as an untyped label. |
| .global jpeg_idct_ifast |
| .type jpeg_idct_ifast, %function |
| .func jpeg_idct_ifast |
| |
| // NOTE: register aliases used below: sb=r9, fp=r11, ip=r12, sp=r13, lr=r14, pc=r15 |
| |
| // C-level prototype implemented by this routine: |
| // jpeg_idct_ifast (j_decompress_ptr cinfo, |
| // jpeg_component_info * compptr, |
| // short* coef_block, |
| // unsigned char* output_buf, |
| // int output_col) |
| // The first four arguments are read from r0-r3 and output_col from the |
| // caller's stack. The code loads one 4-byte pointer from output_buf per |
| // output row and adds output_col to it, i.e. output_buf is used as an |
| // array of row pointers. |
| |
| // Stack frame, addressed from sp once local_SIZE bytes have been reserved: |
| // bytes 0..15 four spilled words (tmp0..tmp3 of the even part) |
| // bytes 16..31 saved arguments / derived pointers |
| // bytes 32.. 8x8 workspace of 32-bit words |
| #define local_TMP0123 sp |
| #define local_TMP0 [sp, #0] |
| #define local_TMP1 [sp, #4] |
| #define local_TMP2 [sp, #8] |
| #define local_TMP3 [sp, #12] |
| #define local_RANGE_TABLE [sp, #16] |
| #define local_OUTPUT_COL [sp, #20] |
| #define local_OUTPUT_BUF [sp, #24] |
| #define local_UNUSED [sp, #28] |
| #define off_WORKSPACE 32 |
| // Uses off_WORKSPACE (with the underscore); the previous spelling |
| // "offWORKSPACE" named an undefined symbol. |
| #define local_WORKSPACE [sp, #off_WORKSPACE] |
| #define local_SIZE (off_WORKSPACE + 8*8*4) |
| |
| // Byte offsets of the fields read from *cinfo and *compptr. |
| // NOTE(review): these are hard-coded and must match the layout of the C |
| // structures this file is built against - confirm when those change. |
| #define off_DECOMPRESS_range_limit_base 324 |
| #define off_COMPINFO_quanttable 80 |
| |
| // Block geometry. VY/QY give the byte offset of row x within one column |
| // for 2-byte (coefficient) and 4-byte (quant table / workspace) entries. |
| #define DCTSIZE 8 |
| #define VY(x) ((x)*DCTSIZE*2) |
| #define QY(x) ((x)*DCTSIZE*4) |
| |
| // Byte offset of column x for 2-byte and 4-byte entries. |
| #define VX(x) ((x)*2) |
| #define QX(x) ((x)*4) |
| |
| // Fixed-point multipliers scaled by 256 (products are shifted right by 8 |
| // where they are used). The code below builds these values inline with |
| // mov/add or folds the "+1" into an mla, so the macros serve as reference. |
| #define FIX_1_414213562 #362 |
| #define FIX_1_082392200 #277 |
| #define FIX_1_847759065 #473 |
| #define FIX_2_613125930 #669 |
| |
| // Appears only in the C reference comments in the code visible here. |
| #define RANGE_MASK 1023 |
| |
| |
| |
| // Entry: r0 = cinfo, r1 = compptr, r2 = coef_block, r3 = output_buf, |
| // fifth argument (output_col) on the caller's stack. |
| jpeg_idct_ifast: |
| PLD (r2, #0) |
| // save ten registers (40 bytes); the fifth argument then sits at sp + 40 |
| stmfd sp!, {r4-r12, lr} |
| ldr r4, [sp, #(10*4)] |
| sub sp, sp, #local_SIZE |
| |
| // set up the pointers used by the column pass |
| mov fp, r2 // fp = coef_block |
| ldr r10, [r1, #off_COMPINFO_quanttable] // r10 = quanttable |
| ldr r5, [r0, #off_DECOMPRESS_range_limit_base] |
| add ip, sp, #off_WORKSPACE // ip = workspace |
| |
| // stash the values needed later by the row pass |
| str r3, local_OUTPUT_BUF |
| str r4, local_OUTPUT_COL |
| add r5, r5, #128 |
| str r5, local_RANGE_TABLE |
| |
| // ---- Pass 1: process the coefficient block one column per iteration ---- |
| // Loop state on entry to each iteration: |
| // fp -> current column of coef_block (16-bit entries) |
| // r10 -> current column of the quantization table (32-bit entries) |
| // ip -> current column of the workspace (32-bit entries) |
| VLoopTail: |
| // load the eight coefficients of this column, sign-extended to 32 bits |
| ldrsh r0, [fp, #VY(0)] |
| ldrsh r1, [fp, #VY(1)] |
| ldrsh r2, [fp, #VY(2)] |
| ldrsh r3, [fp, #VY(3)] |
| ldrsh r4, [fp, #VY(4)] |
| ldrsh r5, [fp, #VY(5)] |
| ldrsh r6, [fp, #VY(6)] |
| ldrsh r7, [fp, #VY(7)] |
| |
| // Z stays set only when r1..r7 are all zero; then only the first entry |
| // (r0) contributes and the shortcut at VLoopHeadZero is taken. |
| cmp r1, #0 |
| orreqs r8, r2, r3 |
| orreqs r8, r4, r5 |
| orreqs r8, r6, r7 |
| beq VLoopHeadZero |
| |
| VLoopHead: |
| // even part |
| // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]) (r0) |
| // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]) (r4) |
| // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]) (r2) |
| // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]) (r6) |
| // tmp10 = tmp0 + tmp2 (r0) |
| // tmp11 = tmp0 - tmp2 (r4) |
| |
| ldr r9, [r10, #QY(4)] |
| ldr r8, [r10, #QY(0)] |
| // the halfword forms multiply the low 16 bits of each operand and fold |
| // the addition into the second multiply |
| #if __ARM_HAVE_HALFWORD_MULTIPLY |
| smulbb r4, r9, r4 |
| smlabb r0, r8, r0, r4 |
| #else |
| mul r4, r9, r4 |
| mul r0, r8, r0 |
| add r0, r4 |
| #endif |
| ldr r9, [r10, #QY(6)] |
| ldr r8, [r10, #QY(2)] |
| // r4 = (tmp0 + tmp2) - 2*tmp2 = tmp0 - tmp2 |
| sub r4, r0, r4, lsl #1 |
| #if __ARM_HAVE_HALFWORD_MULTIPLY |
| smulbb r6, r9, r6 |
| smlabb r2, r8, r2, r6 |
| #else |
| mul r6, r9, r6 |
| mul r2, r8, r2 |
| add r2, r6 |
| #endif |
| |
| // tmp13 = tmp1 + tmp3 (r2) |
| // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13 (r6) |
| // FIX_1_4142... = 362 = 45*8 + 2 |
| // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3; r9 = that difference times 362 |
| sub r6, r2, r6, lsl #1 |
| mov r8, #360 |
| add r8, r8, #2 |
| mul r9, r6, r8 |
| |
| // tmp0 = tmp10 + tmp13; (r0) |
| // tmp3 = tmp10 - tmp13; (r8) |
| // tmp1 = tmp11 + tmp12; (r4) |
| // tmp2 = tmp11 - tmp12; (r6) |
| // the ">> 8" that undoes the fixed-point scale is folded into the rsb |
| add r0, r0, r2 |
| rsb r6, r2, r9, asr #8 |
| sub r8, r0, r2, lsl #1 |
| add r4, r4, r6 |
| sub r6, r4, r6, lsl #1 |
| |
| // spill tmp0..tmp3 so r0-r8 are free for the odd part |
| stmia local_TMP0123, {r0, r4, r6, r8} |
| |
| // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above |
| |
| // odd part |
| // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] ) (r1) |
| // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] ) (r5) |
| // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] ) (r3) |
| // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] ) (r7) |
| // z13 = tmp6 + tmp5; (r0) |
| // z10 = tmp6 - tmp5; (r2) |
| // z11 = tmp4 + tmp7; (r4) |
| // z12 = tmp4 - tmp7; (r6) |
| |
| ldr r2, [r10, #QY(1)] |
| ldr r9, [r10, #QY(5)] |
| #if __ARM_HAVE_HALFWORD_MULTIPLY |
| smulbb r1, r2, r1 |
| #else |
| mul r1, r2, r1 |
| #endif |
| ldr r2, [r10, #QY(3)] |
| #if __ARM_HAVE_HALFWORD_MULTIPLY |
| smulbb r5, r9, r5 |
| #else |
| mul r5, r9, r5 |
| #endif |
| ldr r9, [r10, #QY(7)] |
| // tmp5 and tmp7 are never materialised: their products are accumulated |
| // straight into the sums z13 (r0) and z11 (r4) |
| #if __ARM_HAVE_HALFWORD_MULTIPLY |
| smlabb r0, r2, r3, r5 |
| smlabb r4, r9, r7, r1 |
| #else |
| mul r0, r2, r3 |
| add r0, r5 |
| mul r4, r9, r7 |
| add r4, r1 |
| #endif |
| // z10 = 2*tmp6 - z13, z12 = 2*tmp4 - z11 |
| rsb r2, r0, r5, lsl #1 |
| rsb r6, r4, r1, lsl #1 |
| |
| // tmp7 = z11 + z13; (r7) |
| // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1) |
| // FIX_... = 360 + 2 |
| add r7, r4, r0 |
| sub r1, r4, r0 |
| mov r8, #360 |
| add r8, r8, #2 |
| mul r1, r8, r1 |
| |
| // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8) |
| // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0) |
| // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2) |
| // FIX_1_8477... = 473 = 472 + 1 |
| // FIX_1_082... = 277 = 276 + 1 |
| // FIX_2_... = 669 = 668 + 1 |
| // each mla computes x*K + x, i.e. x*(K+1); r0, r1, r2 and r8 stay scaled |
| // by 256 until the "asr #8" operands further down |
| add r8, r2, r6 |
| mov r9, #472 |
| mla r8, r9, r8, r8 |
| mov r9, #276 |
| mla r0, r6, r9, r6 |
| mov r9, #668 |
| mla r2, r9, r2, r2 |
| sub r0, r0, r8 |
| rsb r2, r2, r8 |
| |
| // tmp6 = tmp12 - tmp7; (r6) |
| // tmp5 = tmp11 - tmp6; (r5) |
| // tmp4 = tmp10 + tmp5; (r4) |
| rsb r6, r7, r2, asr #8 |
| rsb r5, r6, r1, asr #8 |
| add r4, r5, r0, asr #8 |
| |
| // reload the even-part results: r0..r3 = tmp0..tmp3 |
| ldmia local_TMP0123, {r0, r1, r2, r3} |
| |
| // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); |
| // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); |
| // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); |
| // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); |
| // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5); |
| // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); |
| // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); |
| // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); |
| |
| // each pair forms the sum (or difference) first and then derives the |
| // other value from it with a doubled operand: (a + b) - 2*b = a - b |
| add r0, r0, r7 |
| sub r7, r0, r7, lsl #1 |
| add r1, r1, r6 |
| sub r6, r1, r6, lsl #1 |
| add r2, r2, r5 |
| sub r5, r2, r5, lsl #1 |
| sub r3, r3, r4 |
| add r4, r3, r4, lsl #1 |
| |
| str r0, [ip, #QY(0)] |
| str r1, [ip, #QY(1)] |
| str r2, [ip, #QY(2)] |
| str r3, [ip, #QY(3)] |
| str r4, [ip, #QY(4)] |
| str r5, [ip, #QY(5)] |
| str r6, [ip, #QY(6)] |
| str r7, [ip, #QY(7)] |
| |
| // inptr++; /* advance pointers to next column */ |
| // quantptr++; |
| // wsptr++; |
| // the pass ends once ip has advanced over eight 4-byte columns |
| add fp, fp, #2 |
| add r10, r10, #4 |
| add ip, ip, #4 |
| add r0, sp, #(off_WORKSPACE + 4*8) |
| cmp ip, r0 |
| bne VLoopTail |
| |
| |
| |
| // ---- Pass 2: process the workspace one row per iteration ---- |
| // Each iteration consumes eight 32-bit workspace words through ip and |
| // writes eight output bytes. |
| HLoopStart: |
| // reset pointers |
| // (r10 is loaded here but is not read again in the code visible in this file) |
| PLD (sp, #off_WORKSPACE) |
| add ip, sp, #off_WORKSPACE |
| ldr r10, local_RANGE_TABLE |
| |
| HLoopTail: |
| // output = *output_buf++ + output_col |
| // the advanced output_buf is written back to its stack slot; fp = output |
| ldr r0, local_OUTPUT_BUF |
| ldr r1, local_OUTPUT_COL |
| ldr r2, [r0], #4 |
| str r0, local_OUTPUT_BUF |
| add fp, r2, r1 |
| |
| // load one workspace row into r0..r7 and advance ip to the next row |
| PLD (ip, #32) |
| ldmia ip!, {r0-r7} |
| |
| // Z stays set only when r1..r7 are all zero; such rows take the |
| // constant-fill shortcut at HLoopTailZero |
| cmp r1, #0 |
| orreqs r8, r2, r3 |
| orreqs r8, r4, r5 |
| orreqs r8, r6, r7 |
| beq HLoopTailZero |
| |
| HLoopHead: |
| // Even part |
| // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); (r0) |
| // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); (r4) |
| add r0, r0, r4 |
| sub r4, r0, r4, lsl #1 |
| |
| // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); (r2) |
| // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13; (r6) |
| // FIX_... = 360 + 2 |
| add r2, r2, r6 |
| sub r6, r2, r6, lsl #1 |
| mov r8, #360 |
| add r8, r8, #2 |
| mul r6, r8, r6 |
| |
| // tmp0 = tmp10 + tmp13; (r0) |
| // tmp3 = tmp10 - tmp13; (r8) |
| // tmp1 = tmp11 + tmp12; (r4) |
| // tmp2 = tmp11 - tmp12; (r6) |
| // the ">> 8" that undoes the fixed-point scale is folded into the rsb |
| add r0, r0, r2 |
| rsb r6, r2, r6, asr #8 |
| sub r8, r0, r2, lsl #1 |
| add r4, r4, r6 |
| sub r6, r4, r6, lsl #1 |
| |
| // spill tmp0..tmp3 so the registers can be reused for the odd part |
| stmia local_TMP0123, {r0, r4, r6, r8} |
| |
| // Odd part |
| |
| // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; (r0) |
| // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; (r2) |
| // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; (r4) |
| // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; (r6) |
| add r0, r5, r3 |
| sub r2, r5, r3 |
| add r4, r1, r7 |
| sub r6, r1, r7 |
| |
| // tmp7 = z11 + z13; (r7) |
| // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1) |
| // FIX_... = 360 + 2 |
| add r7, r4, r0 |
| sub r1, r4, r0 |
| mov r8, #360 |
| add r8, r8, #2 |
| mul r1, r8, r1 |
| |
| // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8) |
| // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0) |
| // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2) |
| // FIX_1_8477... = 473 = 472 + 1 |
| // FIX_1_082... = 277 = 276 + 1 |
| // FIX_2_... = 669 = 668 + 1 |
| // each mla computes x*K + x, i.e. x*(K+1); the results stay scaled by 256 |
| // until the "asr #8" operands further down |
| add r8, r2, r6 |
| mov r9, #472 |
| mla r8, r9, r8, r8 |
| mov r9, #276 |
| mla r0, r6, r9, r6 |
| mov r9, #668 |
| mla r2, r9, r2, r2 |
| sub r0, r0, r8 |
| sub r2, r8, r2 |
| |
| // tmp6 = tmp12 - tmp7; (r6) |
| // tmp5 = tmp11 - tmp6; (r5) |
| // tmp4 = tmp10 + tmp5; (r4) |
| rsb r6, r7, r2, asr #8 |
| rsb r5, r6, r1, asr #8 |
| add r4, r5, r0, asr #8 |
| |
| // reload the even-part results: r0..r3 = tmp0..tmp3 |
| ldmia local_TMP0123, {r0, r1, r2, r3} |
| |
| // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK]; |
| // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK]; |
| |
| // No table lookup is performed here: each value is shifted right by 5, |
| // biased by 128 (r8) and then clamped to 0..255 below. After this group |
| // rN holds the value for outptr[N]. |
| mov r8, #128 |
| add r0, r0, r7 |
| sub r7, r0, r7, lsl #1 |
| add r0, r8, r0, asr #5 |
| add r7, r8, r7, asr #5 |
| add r1, r1, r6 |
| sub r6, r1, r6, lsl #1 |
| add r1, r8, r1, asr #5 |
| add r6, r8, r6, asr #5 |
| add r2, r2, r5 |
| sub r5, r2, r5, lsl #1 |
| add r2, r8, r2, asr #5 |
| add r5, r8, r5, asr #5 |
| sub r3, r3, r4 |
| add r4, r3, r4, lsl #1 |
| add r3, r8, r3, asr #5 |
| add r4, r8, r4, asr #5 |
| |
| // clamp every value to 0..255 |
| #if __ARM_ARCH__ >= 6 |
| usat r0, #8, r0 |
| usat r1, #8, r1 |
| usat r2, #8, r2 |
| usat r3, #8, r3 |
| usat r4, #8, r4 |
| usat r5, #8, r5 |
| usat r6, #8, r6 |
| usat r7, #8, r7 |
| #else |
| // Without usat: the unsigned compare is "higher" for anything outside |
| // 0..255; mvn of the sign mask then yields 0 for negative values and all |
| // ones for large positive values, and the and keeps the low byte (255). |
| // r7 and r3 skip the and because they are shifted left by 24 when packed |
| // below, which discards everything above their low byte. |
| cmp r0, #255 |
| mvnhi r0, r0, asr #31 |
| andhi r0, #255 |
| cmp r7, #255 |
| mvnhi r7, r7, asr #31 |
| cmp r1, #255 |
| mvnhi r1, r1, asr #31 |
| andhi r1, #255 |
| cmp r6, #255 |
| mvnhi r6, r6, asr #31 |
| andhi r6, #255 |
| cmp r2, #255 |
| mvnhi r2, r2, asr #31 |
| andhi r2, #255 |
| cmp r5, #255 |
| mvnhi r5, r5, asr #31 |
| andhi r5, #255 |
| cmp r3, #255 |
| mvnhi r3, r3, asr #31 |
| cmp r4, #255 |
| mvnhi r4, r4, asr #31 |
| andhi r4, #255 |
| #endif |
| |
| // pack the eight bytes into two words, r0 in the lowest byte of the first |
| // r3 r2 r1 r0 |
| orr r0, r0, r1, lsl #8 |
| orr r0, r0, r2, lsl #16 |
| orr r0, r0, r3, lsl #24 |
| |
| // r7 r6 r5 r4 |
| orr r1, r4, r5, lsl #8 |
| orr r1, r1, r6, lsl #16 |
| orr r1, r1, r7, lsl #24 |
| // NOTE(review): two word stores - presumably relies on a 4-byte aligned |
| // output address and little-endian byte order; confirm against callers |
| stmia fp, {r0, r1} |
| |
| // the pass ends once ip has consumed all 8 rows of 8 words; control then |
| // falls through to Exit |
| add r0, sp, #(off_WORKSPACE + 8*8*4) |
| cmp ip, r0 |
| bne HLoopTail |
| |
| // Common epilogue: drop the local frame, restore the registers saved on |
| // entry and return to the caller. |
| Exit: |
| add sp, #local_SIZE |
| ldmfd sp!, {r4-r12, lr} |
| bx lr |
| |
| |
| // Column shortcut: entries 1..7 of this column are all zero, so every |
| // workspace entry of the column receives the dequantized first entry. |
| // In: r0 = in[0], r10 -> quant column, ip -> workspace column, fp -> input column. |
| VLoopHeadZero: |
| // quantptr++ is folded into the load (QY(0) is offset 0) |
| ldr r1, [r10], #4 |
| add fp, fp, #2 |
| mul r0, r1, r0 |
| str r0, [ip, #QY(7)] |
| str r0, [ip, #QY(6)] |
| str r0, [ip, #QY(5)] |
| str r0, [ip, #QY(4)] |
| str r0, [ip, #QY(3)] |
| str r0, [ip, #QY(2)] |
| str r0, [ip, #QY(1)] |
| // row 0 is stored last so that wsptr++ can be folded into the store |
| str r0, [ip], #4 |
| // continue with the next column, or start the row pass after the eighth |
| add r0, sp, #(off_WORKSPACE + 4*8) |
| cmp ip, r0 |
| bne VLoopTail |
| b HLoopStart |
| |
| // Row shortcut: entries 1..7 of this workspace row are all zero, so all |
| // eight output bytes of the row take the same value. |
| // In: r0 = wsptr[0], fp = output address, ip -> next workspace row. |
| HLoopTailZero: |
| // r0 = (r0 >> 5) + 128; r1 is free to hold the bias because it is |
| // overwritten before the store |
| mov r1, #128 |
| add r0, r1, r0, asr #5 |
| |
| // clamp to 0..255 |
| #if __ARM_ARCH__ >= 6 |
| usat r0, #8, r0 |
| #else |
| cmp r0, #255 |
| mvnhi r0, r0, asr #31 |
| andhi r0, r0, #255 |
| #endif |
| |
| // replicate the byte into all four lanes of both output words |
| orr r0, r0, r0, lsl #8 |
| orr r1, r0, r0, lsl #16 |
| mov r0, r1 |
| stmia fp, {r0, r1} |
| |
| // continue with the next row, or leave once all 8 rows of 8 words are done |
| add r0, sp, #(off_WORKSPACE + 8*8*4) |
| cmp ip, r0 |
| bne HLoopTail |
| b Exit |
| |
| .endfunc |