| ; |
| ; jfdctflt.asm - floating-point FDCT (non-SIMD) |
| ; |
| ; x86 SIMD extension for IJG JPEG library |
| ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| ; |
| ; This file should be assembled with NASM (Netwide Assembler); it |
| ; can *not* be assembled with Microsoft's MASM or any compatible |
| ; assembler (including Borland's Turbo Assembler). |
| ; NASM is available from http://nasm.sourceforge.net/ or |
| ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| ; |
| ; This file contains a floating-point implementation of the forward DCT |
| ; (Discrete Cosine Transform). The following code is based directly on |
| ; the IJG's original jfdctflt.c; see jfdctflt.c for more details. |
| ; |
| ; Last Modified : October 17, 2004 |
| ; |
| ; [TAB8] |
| |
| %include "jsimdext.inc" |
| %include "jdct.inc" |
| |
| %ifdef DCT_FLOAT_SUPPORTED |
| |
| ; This module is specialized to the case DCTSIZE = 8. |
| ; Both passes below name elements 0 through 7 explicitly, so assembly is stopped with an error for any other size. |
| %if DCTSIZE != 8 |
| %error "Sorry, this code only copes with 8x8 DCTs." |
| %endif |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_CONST |
| |
| %define ROTATOR_TYPE FP32 ; float: the operand type written on every fmul that reads one of the constants below |
| |
| alignz 16 |
| global EXTN(jconst_fdct_float) |
| |
| EXTN(jconst_fdct_float): |
| |
| F_0_382 dd 0.382683432365089771728460 ; cos(PI*3/8); multiplier in the odd part of each pass |
| F_0_707 dd 0.707106781186547524400844 ; cos(PI*1/4); used once in the even part and once in the odd part of each pass |
| F_0_541 dd 0.541196100146196984399723 ; cos(PI*1/8)-cos(PI*3/8); multiplier in the odd part of each pass |
| F_1_306 dd 1.306562964876376527856643 ; cos(PI*1/8)+cos(PI*3/8); multiplier in the odd part of each pass |
| |
| alignz 16 |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_TEXT |
| BITS 32 |
| ; jpeg_fdct_float: forward DCT of one block, computed with x87 FPU instructions. |
| ; Perform the forward DCT on one block of samples: a row pass, then a column pass, both reading and writing data[] in place. |
| ; Input: data, the only argument, read from the stack at [ebp+8]. The C prototype below returns void. |
| ; GLOBAL(void) |
| ; jpeg_fdct_float (FAST_FLOAT * data) |
| ; edx and ecx are scratch. Each loop body holds up to eight x87 values at once (so the FPU stack is presumably expected to be empty on entry) and pops them all. |
| |
| %define data(b) (b)+8 ; FAST_FLOAT * data (first argument: sits above the saved ebp and the return address) |
| |
| align 16 |
| global EXTN(jpeg_fdct_float) |
| |
| EXTN(jpeg_fdct_float): |
| push ebp |
| mov ebp,esp |
| pushpic ebx |
| ; push ecx ; need not be preserved |
| ; push edx ; need not be preserved |
| ; push esi ; unused |
| ; push edi ; unused |
| |
| get_GOT ebx ; get GOT address (ebx is the base given to every GOTOFF constant reference below) |
| |
| ; ---- Pass 1: process rows. Notation: x[k] = ROW(k,edx,SIZEOF_FAST_FLOAT), s_k = x[k]+x[7-k], d_k = x[k]-x[7-k]. |
| |
| mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *) edx = data |
| mov ecx, DCTSIZE |
| alignx 16,7 |
| .rowloop: |
| fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] |
| |
| ; -- Even part (entered with st0..st3 = s2, s3, s0, s1) |
| |
| fld st2 ; st2 = st2 + st1, st1 = st2 - st1 (i.e. s0+s3 and s0-s3) |
| fsub st0,st2 |
| fxch st0,st2 |
| faddp st3,st0 |
| fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (i.e. s1+s2 and s1-s2) |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st4,st0 |
| |
| fadd st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] |
| |
| fld st2 ; st3 = st2 + st3, st2 = st2 - st3 (new x[0] and x[4]; st0 already holds z = ((s1-s2)+(s0-s3))*F_0_707) |
| fsub st0,st4 |
| fxch st0,st3 |
| faddp st4,st0 |
| fld st1 ; st0 = st1 + st0, st1 = st1 - st0 (new x[2] = (s0-s3)+z, new x[6] = (s0-s3)-z) |
| fsub st0,st1 |
| fxch st0,st2 |
| faddp st1,st0 |
| |
| fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| |
| fstp FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)] |
| |
| ; -- Odd part (entered with st0..st3 = d2, d1, d3, d0, loaded above before the even results were stored); a = d3+d2, b = d2+d1, c = d1+d0 |
| |
| fadd st2,st0 |
| fadd st0,st1 |
| fxch st0,st3 |
| fadd st1,st0 |
| fxch st0,st3 |
| |
| fld st2 |
| fxch st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] |
| fxch st0,st1 |
| fsub st0,st2 |
| fxch st0,st3 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] |
| fxch st0,st3 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] |
| fxch st0,st2 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] |
| fxch st0,st2 |
| fadd st3,st0 |
| faddp st2,st0 |
| |
| fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (here st0..st3 = m, q, p, d0: m = b*F_0_707, r = (a-c)*F_0_382, p = a*F_0_541+r, q = c*F_1_306+r) |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st4,st0 |
| |
| fld st2 ; st0 = st0 + st2, st2 = st0 - st2 (new x[5] = (d0-m)+p, new x[3] = (d0-m)-p) |
| fsubr st0,st1 |
| fxch st0,st3 |
| faddp st1,st0 |
| fld st1 ; st3 = st3 + st1, st1 = st3 - st1 (new x[1] = (d0+m)+q, new x[7] = (d0+m)-q) |
| fsubr st0,st4 |
| fxch st0,st2 |
| faddp st4,st0 |
| |
| fstp FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)] |
| |
| add edx, byte DCTSIZE*SIZEOF_FAST_FLOAT |
| dec ecx ; one row done; the add above already advanced edx to the next row |
| jnz near .rowloop |
| |
| ; ---- Pass 2: process columns. Same instruction sequence as pass 1, now with x[k] = COL(k,edx,SIZEOF_FAST_FLOAT). |
| |
| mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *) edx = data again |
| mov ecx, DCTSIZE |
| alignx 16,7 |
| .columnloop: |
| fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)] |
| fadd FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] |
| |
| ; -- Even part (entered with st0..st3 = s2, s3, s0, s1) |
| |
| fld st2 ; st2 = st2 + st1, st1 = st2 - st1 (i.e. s0+s3 and s0-s3) |
| fsub st0,st2 |
| fxch st0,st2 |
| faddp st3,st0 |
| fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (i.e. s1+s2 and s1-s2) |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st4,st0 |
| |
| fadd st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] |
| |
| fld st2 ; st3 = st2 + st3, st2 = st2 - st3 (new x[0] and x[4]; st0 already holds z = ((s1-s2)+(s0-s3))*F_0_707) |
| fsub st0,st4 |
| fxch st0,st3 |
| faddp st4,st0 |
| fld st1 ; st0 = st1 + st0, st1 = st1 - st0 (new x[2] = (s0-s3)+z, new x[6] = (s0-s3)-z) |
| fsub st0,st1 |
| fxch st0,st2 |
| faddp st1,st0 |
| |
| fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)] |
| fsub FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] |
| fxch st0,st4 |
| |
| fstp FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)] |
| |
| ; -- Odd part (entered with st0..st3 = d2, d1, d3, d0, loaded above before the even results were stored); a = d3+d2, b = d2+d1, c = d1+d0 |
| |
| fadd st2,st0 |
| fadd st0,st1 |
| fxch st0,st3 |
| fadd st1,st0 |
| fxch st0,st3 |
| |
| fld st2 |
| fxch st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] |
| fxch st0,st1 |
| fsub st0,st2 |
| fxch st0,st3 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] |
| fxch st0,st3 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] |
| fxch st0,st2 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] |
| fxch st0,st2 |
| fadd st3,st0 |
| faddp st2,st0 |
| |
| fld st3 ; st3 = st3 + st0, st0 = st3 - st0 (here st0..st3 = m, q, p, d0: m = b*F_0_707, r = (a-c)*F_0_382, p = a*F_0_541+r, q = c*F_1_306+r) |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st4,st0 |
| |
| fld st2 ; st0 = st0 + st2, st2 = st0 - st2 (new x[5] = (d0-m)+p, new x[3] = (d0-m)-p) |
| fsubr st0,st1 |
| fxch st0,st3 |
| faddp st1,st0 |
| fld st1 ; st3 = st3 + st1, st1 = st3 - st1 (new x[1] = (d0+m)+q, new x[7] = (d0+m)-q) |
| fsubr st0,st4 |
| fxch st0,st2 |
| faddp st4,st0 |
| |
| fstp FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)] |
| |
| add edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column |
| dec ecx |
| jnz near .columnloop |
| |
| ; pop edi ; unused |
| ; pop esi ; unused |
| ; pop edx ; need not be preserved |
| ; pop ecx ; need not be preserved |
| poppic ebx |
| pop ebp |
| ret |
| |
| %endif ; DCT_FLOAT_SUPPORTED |