blob: 542672dc5498a4cea7b24cdd348ad2e6a81bd182 [file] [log] [blame]
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +00001;
2; jf3dnflt.asm - floating-point FDCT (3DNow!)
3;
Pierre Ossman65d03172009-03-09 13:28:10 +00004; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +00007; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the forward DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
20;
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000021; [TAB8]
22
Pierre Ossman3a65ef42009-03-16 13:34:18 +000023%include "jsimdext.inc"
24%include "jdct.inc"
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000025
; --------------------------------------------------------------------------
; Constant table for the 3DNow! floating-point forward DCT.
;
; Each multiplier is emitted as two identical single-precision values, so
; one 64-bit memory operand supplies the same factor to both lanes of a
; packed multiply (pfmul).  The table starts on a 16-byte boundary and is
; padded back out to one at the end.
; --------------------------------------------------------------------------
	SECTION	SEG_CONST

	alignz	16
	global	EXTN(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382:	dd	0.382683432365089771728460, 0.382683432365089771728460	; ~ cos(3*pi/8)
PD_0_707:	dd	0.707106781186547524400844, 0.707106781186547524400844	; ~ cos(pi/4)
PD_0_541:	dd	0.541196100146196984399723, 0.541196100146196984399723	; ~ cos(pi/8) - cos(3*pi/8)
PD_1_306:	dd	1.306562964876376527856643, 1.306562964876376527856643	; ~ cos(pi/8) + cos(3*pi/8)

	alignz	16
40
41; --------------------------------------------------------------------------
42 SECTION SEG_TEXT
43 BITS 32
44;
45; Perform the forward DCT on one block of samples.
46;
47; GLOBAL(void)
Pierre Ossman65d03172009-03-09 13:28:10 +000048; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000049;
; In:    data - the only argument, taken from the stack as [eax+8], where
;        eax is esp right after "push ebp" (see the data(b) macro below).
; Out:   nothing is returned; the block is transformed in place.  Every
;        load and store of coefficients in both passes goes through edx,
;        which is loaded from `data`.
; Uses:  eax, ecx, edx, mm0-mm7 and the flags.  ebx is handed to
;        pushpic/poppic (macros defined outside this file; presumably they
;        save and restore it -- confirm in the include files).  WK_NUM
;        mmword scratch slots are carved out of the stack below an
;        SIZEOF_MMWORD-aligned ebp.
; Exit:  femms is executed before the epilogue.
;
; Structure: two passes of DCTSIZE/2 iterations each.  One iteration handles
; two rows (pass 1) or two columns (pass 2) at once, one per lane of the
; 64-bit MMX registers, using the 3DNow! packed-float instructions
; pfadd/pfsub/pfmul.
; NOTE(review): each movq is treated as holding two FAST_FLOAT values, which
; presumably means SIZEOF_FAST_FLOAT is 4 -- confirm in the include files.
50
; data(b): address of the argument relative to b, where b is the value esp
; had right after "push ebp" (+0 saved ebp, +4 return address, +8 data).
51%define data(b) (b)+8 ; FAST_FLOAT * data
52
; original_ebp: the slot at [ebp] in which the prologue stores the
; pre-alignment esp; the epilogue reloads esp from it.
; wk(i): i-th scratch mmword; wk(0) is the lowest address, wk(WK_NUM-1) sits
; directly below ebp.
53%define original_ebp ebp+0
54%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
55%define WK_NUM 2
56
57 align 16
Pierre Ossman65d03172009-03-09 13:28:10 +000058 global EXTN(jsimd_fdct_float_3dnow)
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000059
Pierre Ossman65d03172009-03-09 13:28:10 +000060EXTN(jsimd_fdct_float_3dnow):
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000061 push ebp
; eax captures esp as it is right after the push, i.e. the address of the
; saved-ebp slot.  The argument stays reachable through it as [data(eax)].
; eax is read again at the start of pass 2, so nothing in between
; (including the get_GOT macro) may overwrite it.
62 mov eax,esp ; eax = original ebp
; Make room and round esp down to a multiple of SIZEOF_MMWORD.
63 sub esp, byte 4
64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
; Save the pre-alignment esp at the top of the aligned frame (the
; original_ebp slot) so that the epilogue can undo the alignment.
65 mov [esp],eax
66 mov ebp,esp ; ebp = aligned ebp
; Reserve the WK_NUM scratch mmwords directly below the aligned ebp.
67 lea esp, [wk(0)]
68 pushpic ebx
69; push ecx ; need not be preserved
70; push edx ; need not be preserved
71; push esi ; unused
72; push edi ; unused
73
; ebx is used below only as the base register inside GOTOFF(ebx,...) when
; the PD_* constants are fetched.
74 get_GOT ebx ; get GOT address
75
76 ; ---- Pass 1: process rows.
77
; edx = data and steps forward two rows per iteration (see the "add edx"
; at the bottom of the loop); ecx counts the DCTSIZE/2 iterations.
78 mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
79 mov ecx, DCTSIZE/2
80 alignx 16,7
81.rowloop:
82
; Load the first and last element pairs of the two rows handled by this
; iteration.
83 movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
84 movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
85 movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
86 movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
87
88 ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
89
; Regroup so that each register holds the same element index of both rows
; (one row per lane); from here on both rows are processed in parallel.
90 movq mm4,mm0 ; transpose coefficients
91 punpckldq mm0,mm1 ; mm0=(00 10)=data0
92 punpckhdq mm4,mm1 ; mm4=(01 11)=data1
93 movq mm5,mm2 ; transpose coefficients
94 punpckldq mm2,mm3 ; mm2=(06 16)=data6
95 punpckhdq mm5,mm3 ; mm5=(07 17)=data7
96
97 movq mm6,mm4
98 movq mm7,mm0
99 pfsub mm4,mm2 ; mm4=data1-data6=tmp6
100 pfsub mm0,mm5 ; mm0=data0-data7=tmp7
101 pfadd mm6,mm2 ; mm6=data1+data6=tmp1
102 pfadd mm7,mm5 ; mm7=data0+data7=tmp0
103
; Load the four middle element pairs of the same two rows.
104 movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
105 movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
106 movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
107 movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
108
109 ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
110
; mm4 and mm0 are overwritten right below, so tmp6/tmp7 are spilled to the
; stack scratch slots until the odd part needs them.
111 movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
112 movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
113
114 movq mm4,mm1 ; transpose coefficients
115 punpckldq mm1,mm3 ; mm1=(02 12)=data2
116 punpckhdq mm4,mm3 ; mm4=(03 13)=data3
117 movq mm0,mm2 ; transpose coefficients
118 punpckldq mm2,mm5 ; mm2=(04 14)=data4
119 punpckhdq mm0,mm5 ; mm0=(05 15)=data5
120
121 movq mm3,mm4
122 movq mm5,mm1
123 pfadd mm4,mm2 ; mm4=data3+data4=tmp3
124 pfadd mm1,mm0 ; mm1=data2+data5=tmp2
125 pfsub mm3,mm2 ; mm3=data3-data4=tmp4
126 pfsub mm5,mm0 ; mm5=data2-data5=tmp5
127
128 ; -- Even part
129
; Inputs: mm7=tmp0, mm6=tmp1, mm1=tmp2, mm4=tmp3.
130 movq mm2,mm7
131 movq mm0,mm6
132 pfsub mm7,mm4 ; mm7=tmp13
133 pfsub mm6,mm1 ; mm6=tmp12
134 pfadd mm2,mm4 ; mm2=tmp10
135 pfadd mm0,mm1 ; mm0=tmp11
136
; z1 = (tmp12 + tmp13) * PD_0_707
137 pfadd mm6,mm7
138 pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
139
140 movq mm4,mm2
141 movq mm1,mm7
142 pfsub mm2,mm0 ; mm2=data4
143 pfsub mm7,mm6 ; mm7=data6
144 pfadd mm4,mm0 ; mm4=data0
145 pfadd mm1,mm6 ; mm1=data2
146
; The even outputs of both rows are written, as two-lane pairs, into the
; first row of the pair (mmword slots 0..3 = data0, data2, data4, data6).
; NOTE(review): going by the register-content comments, this leaves every
; 2x2 tile transposed in memory after pass 1, and the unpacks at the top of
; pass 2 appear to account for that -- confirm against MMBLOCK's definition.
147 movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
148 movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
149 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
150 movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
151
152 ; -- Odd part
153
; Inputs: mm3=tmp4, mm5=tmp5, plus tmp6/tmp7 reloaded from the scratch slots.
154 movq mm0, MMWORD [wk(0)] ; mm0=tmp6
155 movq mm6, MMWORD [wk(1)] ; mm6=tmp7
156
; tmp10..tmp12 here are new odd-part values (tmp4+tmp5, tmp5+tmp6,
; tmp6+tmp7); they only share their names with the even-part temporaries.
157 pfadd mm3,mm5 ; mm3=tmp10
158 pfadd mm5,mm0 ; mm5=tmp11
159 pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
160
161 pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
162
; z5 = (tmp10 - tmp12) * PD_0_382; it is added into both z2 and z4.
163 movq mm2,mm3 ; mm2=tmp10
164 pfsub mm3,mm0
165 pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
166 pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
167 pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
168 pfadd mm2,mm3 ; mm2=z2
169 pfadd mm0,mm3 ; mm0=z4
170
171 movq mm7,mm6
172 pfsub mm6,mm5 ; mm6=z13
173 pfadd mm7,mm5 ; mm7=z11
174
175 movq mm4,mm6
176 movq mm1,mm7
177 pfsub mm6,mm2 ; mm6=data3
178 pfsub mm7,mm0 ; mm7=data7
179 pfadd mm4,mm2 ; mm4=data5
180 pfadd mm1,mm0 ; mm1=data1
181
; The odd outputs go into the second row of the pair (mmword slots 0..3 =
; data1, data3, data5, data7).
182 movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
183 movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
184 movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
185 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
186
; Advance to the next pair of rows.
187 add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
188 dec ecx
189 jnz near .rowloop
190
191 ; ---- Pass 2: process columns.
192
; Restart from the beginning of the block (eax still holds the frame address
; captured in the prologue).  edx now steps forward two FAST_FLOATs per
; iteration, and the first MMBLOCK argument selects rows 0..7.
193 mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
194 mov ecx, DCTSIZE/2
195 alignx 16,7
196.columnloop:
197
198 movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
199 movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
200 movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
201 movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
202
203 ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
204
; Regroup so that each register holds one row's elements for the two
; columns handled by this iteration (one column per lane).
205 movq mm4,mm0 ; transpose coefficients
206 punpckldq mm0,mm1 ; mm0=(00 01)=data0
207 punpckhdq mm4,mm1 ; mm4=(10 11)=data1
208 movq mm5,mm2 ; transpose coefficients
209 punpckldq mm2,mm3 ; mm2=(60 61)=data6
210 punpckhdq mm5,mm3 ; mm5=(70 71)=data7
211
212 movq mm6,mm4
213 movq mm7,mm0
214 pfsub mm4,mm2 ; mm4=data1-data6=tmp6
215 pfsub mm0,mm5 ; mm0=data0-data7=tmp7
216 pfadd mm6,mm2 ; mm6=data1+data6=tmp1
217 pfadd mm7,mm5 ; mm7=data0+data7=tmp0
218
219 movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
220 movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
221 movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
222 movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
223
224 ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
225
; As in pass 1: mm4 and mm0 are reused below, so tmp6/tmp7 are spilled.
226 movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
227 movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
228
229 movq mm4,mm1 ; transpose coefficients
230 punpckldq mm1,mm3 ; mm1=(20 21)=data2
231 punpckhdq mm4,mm3 ; mm4=(30 31)=data3
232 movq mm0,mm2 ; transpose coefficients
233 punpckldq mm2,mm5 ; mm2=(40 41)=data4
234 punpckhdq mm0,mm5 ; mm0=(50 51)=data5
235
236 movq mm3,mm4
237 movq mm5,mm1
238 pfadd mm4,mm2 ; mm4=data3+data4=tmp3
239 pfadd mm1,mm0 ; mm1=data2+data5=tmp2
240 pfsub mm3,mm2 ; mm3=data3-data4=tmp4
241 pfsub mm5,mm0 ; mm5=data2-data5=tmp5
242
243 ; -- Even part
244
; Same arithmetic as the even part of pass 1.
245 movq mm2,mm7
246 movq mm0,mm6
247 pfsub mm7,mm4 ; mm7=tmp13
248 pfsub mm6,mm1 ; mm6=tmp12
249 pfadd mm2,mm4 ; mm2=tmp10
250 pfadd mm0,mm1 ; mm0=tmp11
251
252 pfadd mm6,mm7
253 pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
254
255 movq mm4,mm2
256 movq mm1,mm7
257 pfsub mm2,mm0 ; mm2=data4
258 pfsub mm7,mm6 ; mm7=data6
259 pfadd mm4,mm0 ; mm4=data0
260 pfadd mm1,mm6 ; mm1=data2
261
; Each even output pair is written to its own row (rows 4, 6, 0, 2) at the
; current two-column position.
262 movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
263 movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
264 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
265 movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
266
267 ; -- Odd part
268
; Same arithmetic as the odd part of pass 1.
269 movq mm0, MMWORD [wk(0)] ; mm0=tmp6
270 movq mm6, MMWORD [wk(1)] ; mm6=tmp7
271
272 pfadd mm3,mm5 ; mm3=tmp10
273 pfadd mm5,mm0 ; mm5=tmp11
274 pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
275
276 pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
277
278 movq mm2,mm3 ; mm2=tmp10
279 pfsub mm3,mm0
280 pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
281 pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
282 pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
283 pfadd mm2,mm3 ; mm2=z2
284 pfadd mm0,mm3 ; mm0=z4
285
286 movq mm7,mm6
287 pfsub mm6,mm5 ; mm6=z13
288 pfadd mm7,mm5 ; mm7=z11
289
290 movq mm4,mm6
291 movq mm1,mm7
292 pfsub mm6,mm2 ; mm6=data3
293 pfsub mm7,mm0 ; mm7=data7
294 pfadd mm4,mm2 ; mm4=data5
295 pfadd mm1,mm0 ; mm1=data1
296
; Each odd output pair is written to its own row (rows 3, 7, 5, 1).
297 movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
298 movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
299 movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
300 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
301
; Advance to the next pair of columns.
302 add edx, byte 2*SIZEOF_FAST_FLOAT
303 dec ecx
304 jnz near .columnloop
305
306 femms ; empty MMX/3DNow! state
307
308; pop edi ; unused
309; pop esi ; unused
310; pop edx ; need not be preserved
311; pop ecx ; need not be preserved
312 poppic ebx
; Unwind the aligned frame: [ebp] holds the esp value stored by
; "mov [esp],eax" in the prologue, so "pop esp" puts esp back on the
; saved-ebp slot and "pop ebp" / "ret" then find the saved ebp and the
; return address.
313 mov esp,ebp ; esp <- aligned ebp
314 pop esp ; esp <- original ebp
315 pop ebp
316 ret
317
DRC132b5fd2009-10-08 09:04:56 +0000318; For some reason, the OS X linker does not honor the request to align the
319; segment unless we do this.
; The directive below pads the end of this section out to a 16-byte
; boundary; it emits no instructions that are ever executed, since the
; function above has already returned.
320 align 16