;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Combine the LOW 64 bits (two floats) of each operand:
;     %1=(0 1 2 3) / %2=(4 5 6 7)  =>  %1=(0 1 4 5)
;   Implemented with SHUFPS selector 0x44 (binary 01 00 01 00): result
;   elements 0-1 are taken from %1[0..1], elements 2-3 from %2[0..1].
;   Clobbers: %1 only.
%macro  unpcklps2 2
        shufps  %1,%2,0x44
%endmacro

; unpckhps2 dst, src
;   Combine the HIGH 64 bits (two floats) of each operand:
;     %1=(0 1 2 3) / %2=(4 5 6 7)  =>  %1=(2 3 6 7)
;   Implemented with SHUFPS selector 0xEE (binary 11 10 11 10): result
;   elements 0-1 are taken from %1[2..3], elements 2-3 from %2[2..3].
;   Clobbers: %1 only.
%macro  unpckhps2 2
        shufps  %1,%2,0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table used by jsimd_idct_float_sse2.  Every entry is one full
; 16-byte XMM word (4 floats or 16 bytes), so entries stay 16-byte aligned
; as long as the table start is aligned (alignz 16 below); the code reads
; them as aligned memory operands of mulps / movaps / movdqa.
        alignz  16
        global  EXTN(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

; IDCT butterfly multipliers, replicated across all four lanes.
PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
; Added to each output value in pass 2; per the comments there, the low
; bits of the resulting float pattern then hold roundint(value/8).
PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
; Added (paddb) to the signed-saturated bytes in pass 2 to re-centre samples.
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; ABI:    32-bit; all four arguments are read from the caller's stack at
;         [frame+8] .. [frame+20] (see the %defines below).
; In:     dct_table  - dequantization multipliers (read as packed floats)
;         coef_block - input coefficients (read as 16-bit words)
;         output_buf - array of output row pointers
;         output_col - byte offset added to every output row pointer
; Out:    8 bytes are stored to each of the 8 rows output_buf[0..7],
;         starting at output_col.
; Saves:  ebx, esi, edi, ebp (pushed/popped here).
; Clobb:  eax, ecx, edx, xmm0-xmm7, flags.
;
; Structure:
;   Pass 1 (.columnloop, 2 iterations x 4 columns): dequantize, run the
;          column IDCT, transpose, and store into the on-stack workspace.
;   Pass 2 (.rowloop, 2 iterations x 4 rows): run the row IDCT on the
;          workspace, round, saturate to bytes, re-centre, transpose and
;          store to the output rows.
;
; Stack frame (ebp is rounded down to a 16-byte boundary so that wk[] and
; workspace[] can be accessed with aligned movaps):
;   [ebp+0]            saved frame base (esp right after "push ebp")
;   wk(0), wk(1)       two xmmword scratch slots just below ebp
;   workspace          DCTSIZE2 FAST_FLOATs just below wk(0)
;

%define dct_table(b)    (b)+8           ; void *dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push    ebp
        mov     eax,esp                 ; eax = original ebp (frame base: args are at eax+8..)
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax               ; keep the frame base at [original_ebp]
        mov     ebp,esp                 ; ebp = aligned ebp
        lea     esp, [workspace]        ; reserve wk[] and workspace[]
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.
        ;
        ; Register roles inside .columnloop:
        ;   edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter,
        ;   ebx = GOT base for GOTOFF(), eax = scratch.

;       mov     eax, [original_ebp]     ; not needed: eax still holds the frame base
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out: test one dword from each of rows 1 and 2 first.
        ; eax may be overwritten here; the argument pointers are already
        ; loaded and pass 2 reloads the frame base from [original_ebp].
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        ; Full test: OR together rows 1..7 for these four columns.
        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm2
        por     xmm3,xmm4
        por     xmm5,xmm6
        por     xmm1,xmm3
        por     xmm5,xmm7
        por     xmm1,xmm5
        packsswb xmm1,xmm1              ; saturating pack: nonzero word -> nonzero byte
        movd    eax,xmm1                ; eax = the four packed bytes
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
        ; Only row 0 (DC) is nonzero: each column's output is its
        ; dequantized DC value replicated down all eight rows.

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0                     ; xmm0=(00 00 01 01 02 02 03 03)
        psrad   xmm0,(DWORD_BIT-WORD_BIT)       ; xmm0=in0=(00 01 02 03)
        cvtdq2ps xmm0,xmm0                      ; xmm0=in0=(00 01 02 03)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm1,xmm0
        movaps  xmm2,xmm0
        movaps  xmm3,xmm0

        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
        ; Each input row: load 4 words, sign-extend to dwords
        ; (punpcklwd + psrad), convert to float, then dequantize (mulps).

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0                     ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd xmm1,xmm1                     ; xmm1=(20 20 21 21 22 22 23 23)
        psrad   xmm0,(DWORD_BIT-WORD_BIT)       ; xmm0=in0=(00 01 02 03)
        psrad   xmm1,(DWORD_BIT-WORD_BIT)       ; xmm1=in2=(20 21 22 23)
        cvtdq2ps xmm0,xmm0                      ; xmm0=in0=(00 01 02 03)
        cvtdq2ps xmm1,xmm1                      ; xmm1=in2=(20 21 22 23)

        punpcklwd xmm2,xmm2                     ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd xmm3,xmm3                     ; xmm3=(60 60 61 61 62 62 63 63)
        psrad   xmm2,(DWORD_BIT-WORD_BIT)       ; xmm2=in4=(40 41 42 43)
        psrad   xmm3,(DWORD_BIT-WORD_BIT)       ; xmm3=in6=(60 61 62 63)
        cvtdq2ps xmm2,xmm2                      ; xmm2=in4=(40 41 42 43)
        cvtdq2ps xmm3,xmm3                      ; xmm3=in6=(60 61 62 63)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        ; Spill tmp2/tmp3: all eight XMM registers are needed by the odd part.
        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movq    xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm2,xmm2                     ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd xmm3,xmm3                     ; xmm3=(30 30 31 31 32 32 33 33)
        psrad   xmm2,(DWORD_BIT-WORD_BIT)       ; xmm2=in1=(10 11 12 13)
        psrad   xmm3,(DWORD_BIT-WORD_BIT)       ; xmm3=in3=(30 31 32 33)
        cvtdq2ps xmm2,xmm2                      ; xmm2=in1=(10 11 12 13)
        cvtdq2ps xmm3,xmm3                      ; xmm3=in3=(30 31 32 33)

        punpcklwd xmm5,xmm5                     ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd xmm1,xmm1                     ; xmm1=(70 70 71 71 72 72 73 73)
        psrad   xmm5,(DWORD_BIT-WORD_BIT)       ; xmm5=in5=(50 51 52 53)
        psrad   xmm1,(DWORD_BIT-WORD_BIT)       ; xmm1=in7=(70 71 72 73)
        cvtdq2ps xmm5,xmm5                      ; xmm5=in5=(50 51 52 53)
        cvtdq2ps xmm1,xmm1                      ; xmm1=in7=(70 71 72 73)

        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
        subps   xmm2,xmm3               ; xmm2=tmp5

        movaps  xmm1,xmm6               ; transpose coefficients(phase 1)
        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
        unpckhps xmm1,xmm7              ; xmm1=(02 12 03 13)
        movaps  xmm3,xmm0               ; transpose coefficients(phase 1)
        unpcklps xmm0,xmm5              ; xmm0=(60 70 61 71)
        unpckhps xmm3,xmm5              ; xmm3=(62 72 63 73)

        ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
        ; rows 6/7 in the same two scratch slots.
        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm0,xmm7
        movaps  xmm3,xmm5
        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)

        movaps  xmm2,xmm7               ; transpose coefficients(phase 1)
        unpcklps xmm7,xmm3              ; xmm7=(20 30 21 31)
        unpckhps xmm2,xmm3              ; xmm2=(22 32 23 33)
        movaps  xmm4,xmm5               ; transpose coefficients(phase 1)
        unpcklps xmm5,xmm0              ; xmm5=(40 50 41 51)
        unpckhps xmm4,xmm0              ; xmm4=(42 52 43 53)

        movaps  xmm3,xmm6               ; transpose coefficients(phase 2)
        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
        movaps  xmm0,xmm1               ; transpose coefficients(phase 2)
        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)

        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

        movaps  xmm6,xmm5               ; transpose coefficients(phase 2)
        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
        movaps  xmm3,xmm4               ; transpose coefficients(phase 2)
        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)

        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        ; Advance to the next group of four columns.
        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; -- Prefetch the next coefficient block
        ; esi has advanced by 8 coefficients over the two iterations, so
        ; esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte past this block.

        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
        ;
        ; Register roles inside .rowloop:
        ;   esi = wsptr, edi = pointer into output_buf[], eax = output_col,
        ;   ecx = loop counter, ebx = GOT base (temporarily reused as a row
        ;   pointer between pushpic/poppic), edx = row pointer scratch.

        mov     eax, [original_ebp]     ; reload frame base (eax was clobbered in pass 1)
        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.rowloop:

        ; -- Even part

        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
        subps   xmm2,xmm3               ; xmm2=tmp5

        ; Float -> integer conversion: adding PD_RNDINT_MAGIC leaves the
        ; rounded value in the low 16 bits of each dword; the 0x0000FFFF
        ; mask / 16-bit shift then interleave two results per dword.
        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
        pcmpeqd xmm3,xmm3
        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        pand    xmm6,xmm3       ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld   xmm7,WORD_BIT   ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand    xmm0,xmm3       ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld   xmm5,WORD_BIT   ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por     xmm6,xmm7       ; xmm6=(00 01 10 11 20 21 30 31)
        por     xmm0,xmm5       ; xmm0=(06 07 16 17 26 27 36 37)

        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm7,xmm1
        movaps  xmm5,xmm3
        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)

        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
        pcmpeqd xmm4,xmm4
        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand    xmm3,xmm4       ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld   xmm7,WORD_BIT   ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand    xmm1,xmm4       ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld   xmm5,WORD_BIT   ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por     xmm3,xmm7       ; xmm3=(04 05 14 15 24 25 34 35)
        por     xmm1,xmm5       ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa  xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]       ; xmm2=[PB_CENTERJSAMP]

        ; Saturate to signed bytes, then add CENTERJSAMPLE (wrapping byte
        ; add) to obtain the final sample values.
        packsswb xmm6,xmm3      ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb xmm1,xmm0      ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb   xmm6,xmm2
        paddb   xmm1,xmm2

        movdqa  xmm4,xmm6       ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa  xmm7,xmm6       ; transpose coefficients(phase 3)
        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        ; ebx is borrowed as a second row-pointer register for the stores;
        ; its GOT value is saved and restored around them.
        pushpic ebx                     ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

        poppic  ebx                     ; restore GOT address

        ; Advance to the next group of four rows.
        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
        add     edi, byte 4*SIZEOF_JSAMPROW
        dec     ecx                             ; ctr
        jnz     near .rowloop

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp (frame base saved at [ebp])
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16