;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Merge the low 64-bit halves of two 4-float vectors:
;     %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
;   Implemented with shufps and selector 0x44 (elements 0,1 of %1 followed
;   by elements 0,1 of %2).  Only %1 is modified.
%macro  unpcklps2 2
        shufps      %1, %2, 0x44
%endmacro

; unpckhps2 dst, src
;   Merge the high 64-bit halves of two 4-float vectors:
;     %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
;   Implemented with shufps and selector 0xEE (elements 2,3 of %1 followed
;   by elements 2,3 of %2).  Only %1 is modified.
%macro  unpckhps2 2
        shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
; Read-only constants used by jsimd_idct_float_sse2.  Every entry is one
; 16-byte vector (the same value replicated across all lanes) so it can be
; used directly as an aligned memory operand of a packed instruction.
;
        SECTION     SEG_CONST

        alignz      16
        global      EXTN(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

; Multipliers of the IDCT butterflies (see the uses in pass 1 / pass 2):
;   PD_1_414  scales (in2 - in6) and (z11 - z13)
;   PD_1_847  scales (z10 + z12) to form z5
;   PD_1_082  scales z12,  PD_M2_613 scales z10
PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286

; Float-to-int rounding bias: 100663296.0 = 1.5 * 2^26.  A single-precision
; value of that magnitude has a unit-in-the-last-place of 8, so adding it to
; x leaves round(x / 8) in the low mantissa bits, which pass 2 extracts by
; masking / shifting the low 16 bits of each dword.
PD_RNDINT_MAGIC times 4 dd  100663296.0         ; (float)(0x00C00000 << 3)

; Added (as bytes) to the packed signed results to re-center the samples.
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz      16

; --------------------------------------------------------------------------
        SECTION     SEG_TEXT
        BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Register roles after collect_args (macro from jsimdext.inc; the mapping
; below is the one this routine relies on):
;
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
;
; Scratch usage inside the routine: rax, rcx, rdx, rsi, rdi, xmm0-xmm7 and
; flags; rbx is saved/restored with push/pop.
;
; Stack frame (addresses relative to the 16-byte aligned rbp):
;   [rbp+0]                  value of rsp right after "push rbp"
;   wk(0) .. wk(WK_NUM-1)    WK_NUM xmmword spill slots just below rbp
;   workspace                DCTSIZE2 FAST_FLOATs below wk(0); holds the
;                            transposed output of pass 1
;

%define original_rbp    rbp+0
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align       16
        global      EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push        rbp
        mov         rax, rsp                    ; rax = rsp after the push (address of the saved rbp)
        sub         rsp, byte 4                 ; guarantee the aligned slot lies below the saved rbp
        and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov         [rsp], rax                  ; remember the unaligned rsp for the epilogue
        mov         rbp, rsp                    ; rbp = aligned frame base
        lea         rsp, [workspace]            ; reserve wk[] and workspace[]
        collect_args
        push        rbx

        ; ---- Pass 1: process columns from input, store into work array.
        ; Each iteration handles 4 columns (one xmm register of floats per
        ; row) and writes the result transposed into the workspace.

        mov         rdx, r10                    ; quantptr
        mov         rsi, r11                    ; inptr
        lea         rdi, [workspace]            ; FAST_FLOAT *wsptr
        mov         rcx, DCTSIZE/4              ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out: if the first two coefficients of rows 1 and 2
        ; are not all zero, the full test below cannot succeed.
        mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        jnz         near .columnDCT

        ; OR together rows 1..7 of the four columns being processed.
        movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
        movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
        por         xmm1, xmm2
        por         xmm3, xmm4
        por         xmm5, xmm6
        por         xmm1, xmm3
        por         xmm5, xmm7
        por         xmm1, xmm5
        packsswb    xmm1, xmm1                  ; squeeze the 4 words into 4 bytes (nonzero stays nonzero)
        movd        eax, xmm1                   ; 32-bit write also clears the upper half of rax
        test        rax, rax
        jnz         short .columnDCT

        ; -- AC terms all zero
        ; The result for each column is just its dequantized DC term, so
        ; replicate it across the corresponding workspace row.

        movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

        ; punpcklwd + arithmetic shift = sign-extend 16-bit -> 32-bit
        punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
        psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
        cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

        mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm1, xmm0
        movaps      xmm2, xmm0
        movaps      xmm3, xmm0

        shufps      xmm0, xmm0, 0x00            ; xmm0=(00 00 00 00)
        shufps      xmm1, xmm1, 0x55            ; xmm1=(01 01 01 01)
        shufps      xmm2, xmm2, 0xAA            ; xmm2=(02 02 02 02)
        shufps      xmm3, xmm3, 0xFF            ; xmm3=(03 03 03 03)

        movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
        jmp         near .nextcolumn
%endif
.columnDCT:

        ; -- Even part

        movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
        movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

        punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
        psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
        psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
        cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
        cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

        punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
        psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
        psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
        cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
        cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

        ; dequantize
        mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm4, xmm0
        movaps      xmm5, xmm1
        subps       xmm0, xmm2                  ; xmm0=tmp11
        subps       xmm1, xmm3
        addps       xmm4, xmm2                  ; xmm4=tmp10
        addps       xmm5, xmm3                  ; xmm5=tmp13

        mulps       xmm1, [rel PD_1_414]
        subps       xmm1, xmm5                  ; xmm1=tmp12

        movaps      xmm6, xmm4
        movaps      xmm7, xmm0
        subps       xmm4, xmm5                  ; xmm4=tmp3
        subps       xmm0, xmm1                  ; xmm0=tmp2
        addps       xmm6, xmm5                  ; xmm6=tmp0
        addps       xmm7, xmm1                  ; xmm7=tmp1

        ; spill: all eight xmm registers are needed for the odd part
        movaps      XMMWORD [wk(1)], xmm4       ; tmp3
        movaps      XMMWORD [wk(0)], xmm0       ; tmp2

        ; -- Odd part

        movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

        punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
        psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
        psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
        cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
        cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

        punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
        psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
        psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
        cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
        cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

        ; dequantize
        mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm4, xmm2
        movaps      xmm0, xmm5
        addps       xmm2, xmm1                  ; xmm2=z11
        addps       xmm5, xmm3                  ; xmm5=z13
        subps       xmm4, xmm1                  ; xmm4=z12
        subps       xmm0, xmm3                  ; xmm0=z10

        movaps      xmm1, xmm2
        subps       xmm2, xmm5
        addps       xmm1, xmm5                  ; xmm1=tmp7

        mulps       xmm2, [rel PD_1_414]        ; xmm2=tmp11

        movaps      xmm3, xmm0
        addps       xmm0, xmm4
        mulps       xmm0, [rel PD_1_847]        ; xmm0=z5
        mulps       xmm3, [rel PD_M2_613]       ; xmm3=(z10 * -2.613125930)
        mulps       xmm4, [rel PD_1_082]        ; xmm4=(z12 * 1.082392200)
        addps       xmm3, xmm0                  ; xmm3=tmp12
        subps       xmm4, xmm0                  ; xmm4=tmp10

        ; -- Final output stage

        subps       xmm3, xmm1                  ; xmm3=tmp6
        movaps      xmm5, xmm6
        movaps      xmm0, xmm7
        addps       xmm6, xmm1                  ; xmm6=data0=(00 01 02 03)
        addps       xmm7, xmm3                  ; xmm7=data1=(10 11 12 13)
        subps       xmm5, xmm1                  ; xmm5=data7=(70 71 72 73)
        subps       xmm0, xmm3                  ; xmm0=data6=(60 61 62 63)
        subps       xmm2, xmm3                  ; xmm2=tmp5

        movaps      xmm1, xmm6                  ; transpose coefficients(phase 1)
        unpcklps    xmm6, xmm7                  ; xmm6=(00 10 01 11)
        unpckhps    xmm1, xmm7                  ; xmm1=(02 12 03 13)
        movaps      xmm3, xmm0                  ; transpose coefficients(phase 1)
        unpcklps    xmm0, xmm5                  ; xmm0=(60 70 61 71)
        unpckhps    xmm3, xmm5                  ; xmm3=(62 72 63 73)

        movaps      xmm7, XMMWORD [wk(0)]       ; xmm7=tmp2
        movaps      xmm5, XMMWORD [wk(1)]       ; xmm5=tmp3

        movaps      XMMWORD [wk(0)], xmm0       ; wk(0)=(60 70 61 71)
        movaps      XMMWORD [wk(1)], xmm3       ; wk(1)=(62 72 63 73)

        addps       xmm4, xmm2                  ; xmm4=tmp4
        movaps      xmm0, xmm7
        movaps      xmm3, xmm5
        addps       xmm7, xmm2                  ; xmm7=data2=(20 21 22 23)
        addps       xmm5, xmm4                  ; xmm5=data4=(40 41 42 43)
        subps       xmm0, xmm2                  ; xmm0=data5=(50 51 52 53)
        subps       xmm3, xmm4                  ; xmm3=data3=(30 31 32 33)

        movaps      xmm2, xmm7                  ; transpose coefficients(phase 1)
        unpcklps    xmm7, xmm3                  ; xmm7=(20 30 21 31)
        unpckhps    xmm2, xmm3                  ; xmm2=(22 32 23 33)
        movaps      xmm4, xmm5                  ; transpose coefficients(phase 1)
        unpcklps    xmm5, xmm0                  ; xmm5=(40 50 41 51)
        unpckhps    xmm4, xmm0                  ; xmm4=(42 52 43 53)

        movaps      xmm3, xmm6                  ; transpose coefficients(phase 2)
        unpcklps2   xmm6, xmm7                  ; xmm6=(00 10 20 30)
        unpckhps2   xmm3, xmm7                  ; xmm3=(01 11 21 31)
        movaps      xmm0, xmm1                  ; transpose coefficients(phase 2)
        unpcklps2   xmm1, xmm2                  ; xmm1=(02 12 22 32)
        unpckhps2   xmm0, xmm2                  ; xmm0=(03 13 23 33)

        movaps      xmm7, XMMWORD [wk(0)]       ; xmm7=(60 70 61 71)
        movaps      xmm2, XMMWORD [wk(1)]       ; xmm2=(62 72 63 73)

        movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
        movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

        movaps      xmm6, xmm5                  ; transpose coefficients(phase 2)
        unpcklps2   xmm5, xmm7                  ; xmm5=(40 50 60 70)
        unpckhps2   xmm6, xmm7                  ; xmm6=(41 51 61 71)
        movaps      xmm3, xmm4                  ; transpose coefficients(phase 2)
        unpcklps2   xmm4, xmm2                  ; xmm4=(42 52 62 72)
        unpckhps2   xmm3, xmm2                  ; xmm3=(43 53 63 73)

        movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
        movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
        movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        add         rsi, byte 4*SIZEOF_JCOEF            ; coef_block
        add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE  ; quantptr
        add         rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT    ; wsptr
        dec         rcx                                 ; ctr
        jnz         near .columnloop

        ; -- Prefetch the next coefficient block
        ; rsi has advanced by 8 coefficients in total, hence DCTSIZE2-8.

        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
        ; Each iteration produces 4 output rows of 8 samples.

        lea         rsi, [workspace]            ; FAST_FLOAT *wsptr
        mov         rdi, r12                    ; (JSAMPROW *)
        mov         eax, r13d                   ; rax = output_col (zero-extended)
        mov         rcx, DCTSIZE/4              ; ctr
.rowloop:

        ; -- Even part

        movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

        movaps      xmm4, xmm0
        movaps      xmm5, xmm1
        subps       xmm0, xmm2                  ; xmm0=tmp11
        subps       xmm1, xmm3
        addps       xmm4, xmm2                  ; xmm4=tmp10
        addps       xmm5, xmm3                  ; xmm5=tmp13

        mulps       xmm1, [rel PD_1_414]
        subps       xmm1, xmm5                  ; xmm1=tmp12

        movaps      xmm6, xmm4
        movaps      xmm7, xmm0
        subps       xmm4, xmm5                  ; xmm4=tmp3
        subps       xmm0, xmm1                  ; xmm0=tmp2
        addps       xmm6, xmm5                  ; xmm6=tmp0
        addps       xmm7, xmm1                  ; xmm7=tmp1

        movaps      XMMWORD [wk(1)], xmm4       ; tmp3
        movaps      XMMWORD [wk(0)], xmm0       ; tmp2

        ; -- Odd part

        movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

        movaps      xmm4, xmm2
        movaps      xmm0, xmm5
        addps       xmm2, xmm1                  ; xmm2=z11
        addps       xmm5, xmm3                  ; xmm5=z13
        subps       xmm4, xmm1                  ; xmm4=z12
        subps       xmm0, xmm3                  ; xmm0=z10

        movaps      xmm1, xmm2
        subps       xmm2, xmm5
        addps       xmm1, xmm5                  ; xmm1=tmp7

        mulps       xmm2, [rel PD_1_414]        ; xmm2=tmp11

        movaps      xmm3, xmm0
        addps       xmm0, xmm4
        mulps       xmm0, [rel PD_1_847]        ; xmm0=z5
        mulps       xmm3, [rel PD_M2_613]       ; xmm3=(z10 * -2.613125930)
        mulps       xmm4, [rel PD_1_082]        ; xmm4=(z12 * 1.082392200)
        addps       xmm3, xmm0                  ; xmm3=tmp12
        subps       xmm4, xmm0                  ; xmm4=tmp10

        ; -- Final output stage

        subps       xmm3, xmm1                  ; xmm3=tmp6
        movaps      xmm5, xmm6
        movaps      xmm0, xmm7
        addps       xmm6, xmm1                  ; xmm6=data0=(00 10 20 30)
        addps       xmm7, xmm3                  ; xmm7=data1=(01 11 21 31)
        subps       xmm5, xmm1                  ; xmm5=data7=(07 17 27 37)
        subps       xmm0, xmm3                  ; xmm0=data6=(06 16 26 36)
        subps       xmm2, xmm3                  ; xmm2=tmp5

        ; Adding the magic bias leaves round(x/8) in the low 16 bits of each
        ; dword; the mask keeps those bits, the shift moves them to the high
        ; word so that two results can be merged with por.
        movaps      xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
        pcmpeqd     xmm3, xmm3
        psrld       xmm3, WORD_BIT              ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps       xmm6, xmm1                  ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps       xmm7, xmm1                  ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps       xmm0, xmm1                  ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps       xmm5, xmm1                  ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        pand        xmm6, xmm3                  ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld       xmm7, WORD_BIT              ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand        xmm0, xmm3                  ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld       xmm5, WORD_BIT              ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por         xmm6, xmm7                  ; xmm6=(00 01 10 11 20 21 30 31)
        por         xmm0, xmm5                  ; xmm0=(06 07 16 17 26 27 36 37)

        movaps      xmm1, XMMWORD [wk(0)]       ; xmm1=tmp2
        movaps      xmm3, XMMWORD [wk(1)]       ; xmm3=tmp3

        addps       xmm4, xmm2                  ; xmm4=tmp4
        movaps      xmm7, xmm1
        movaps      xmm5, xmm3
        addps       xmm1, xmm2                  ; xmm1=data2=(02 12 22 32)
        addps       xmm3, xmm4                  ; xmm3=data4=(04 14 24 34)
        subps       xmm7, xmm2                  ; xmm7=data5=(05 15 25 35)
        subps       xmm5, xmm4                  ; xmm5=data3=(03 13 23 33)

        movaps      xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
        pcmpeqd     xmm4, xmm4
        psrld       xmm4, WORD_BIT              ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps       xmm3, xmm2                  ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps       xmm7, xmm2                  ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps       xmm1, xmm2                  ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps       xmm5, xmm2                  ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand        xmm3, xmm4                  ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld       xmm7, WORD_BIT              ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand        xmm1, xmm4                  ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld       xmm5, WORD_BIT              ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por         xmm3, xmm7                  ; xmm3=(04 05 14 15 24 25 34 35)
        por         xmm1, xmm5                  ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

        ; signed saturation to bytes, then re-center by adding CENTERJSAMPLE
        packsswb    xmm6, xmm3                  ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb    xmm1, xmm0                  ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb       xmm6, xmm2
        paddb       xmm1, xmm2

        movdqa      xmm4, xmm6                  ; transpose coefficients(phase 2)
        punpcklwd   xmm6, xmm1                  ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd   xmm4, xmm1                  ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa      xmm7, xmm6                  ; transpose coefficients(phase 3)
        punpckldq   xmm6, xmm4                  ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq   xmm7, xmm4                  ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        ; swap 64-bit halves so rows 1 and 3 can be stored with movq
        pshufd      xmm5, xmm6, 0x4E            ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd      xmm3, xmm7, 0x4E            ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
        mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
        movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
        movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
        mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
        mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
        movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
        movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

        add         rsi, byte 4*SIZEOF_FAST_FLOAT       ; wsptr
        add         rdi, byte 4*SIZEOF_JSAMPROW
        dec         rcx                                 ; ctr
        jnz         near .rowloop

        pop         rbx
        uncollect_args
        mov         rsp, rbp                    ; rsp <- aligned rbp
        pop         rsp                         ; rsp <- value saved in the prologue (points at saved rbp)
        pop         rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align       16