; vim:filetype=nasm ts=8

; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009  Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33%include "nasm.h"
34
35 data_section
36
Josh Coalsone6499bd2001-06-13 18:11:25 +000037cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalsonf5925df2001-07-16 21:13:19 +000041cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
Josh Coalsone6499bd2001-06-13 18:11:25 +000042cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
43cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
44cglobal FLAC__lpc_restore_signal_asm_ia32
45cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000046
47 code_section
48
49; **********************************************************************
50;
Josh Coalson77e3f312001-06-23 03:03:24 +000051; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +000052; {
Josh Coalson77e3f312001-06-23 03:03:24 +000053; FLAC__real d;
Josh Coalson9a7b5e22001-06-13 18:03:09 +000054; unsigned sample, coeff;
55; const unsigned limit = data_len - lag;
56;
57; FLAC__ASSERT(lag > 0);
58; FLAC__ASSERT(lag <= data_len);
59;
60; for(coeff = 0; coeff < lag; coeff++)
61; autoc[coeff] = 0.0;
62; for(sample = 0; sample <= limit; sample++) {
63; d = data[sample];
64; for(coeff = 0; coeff < lag; coeff++)
65; autoc[coeff] += d * data[sample+coeff];
66; }
67; for(; sample < data_len; sample++) {
68; d = data[sample];
69; for(coeff = 0; coeff < data_len - sample; coeff++)
70; autoc[coeff] += d * data[sample+coeff];
71; }
72; }
73;
74 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +000075cident FLAC__lpc_compute_autocorrelation_asm_ia32
Josh Coalson651d6de2001-12-04 05:36:09 +000076 ;[esp + 28] == autoc[]
77 ;[esp + 24] == lag
78 ;[esp + 20] == data_len
79 ;[esp + 16] == data[]
Josh Coalson9a7b5e22001-06-13 18:03:09 +000080
81 ;ASSERT(lag > 0)
82 ;ASSERT(lag <= 33)
83 ;ASSERT(lag <= data_len)
84
85.begin:
86 push esi
87 push edi
Josh Coalson651d6de2001-12-04 05:36:09 +000088 push ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000089
90 ; for(coeff = 0; coeff < lag; coeff++)
91 ; autoc[coeff] = 0.0;
Josh Coalson651d6de2001-12-04 05:36:09 +000092 mov edi, [esp + 28] ; edi == autoc
93 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
Josh Coalson9a7b5e22001-06-13 18:03:09 +000094 xor eax, eax
95 rep stosd
96
97 ; const unsigned limit = data_len - lag;
Josh Coalson651d6de2001-12-04 05:36:09 +000098 mov eax, [esp + 24] ; eax == lag
99 mov ecx, [esp + 20]
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000100 sub ecx, eax ; ecx == limit
101
Josh Coalson651d6de2001-12-04 05:36:09 +0000102 mov edi, [esp + 28] ; edi == autoc
103 mov esi, [esp + 16] ; esi == data
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000104 inc ecx ; we are looping <= limit so we add one to the counter
105
106 ; for(sample = 0; sample <= limit; sample++) {
107 ; d = data[sample];
108 ; for(coeff = 0; coeff < lag; coeff++)
109 ; autoc[coeff] += d * data[sample+coeff];
110 ; }
111 fld dword [esi] ; ST = d <- data[sample]
112 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
113 lea edx, [eax + eax*2]
114 neg edx
Josh Coalson651d6de2001-12-04 05:36:09 +0000115 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
116 call .get_eip1
117.get_eip1:
118 pop ebx
119 add edx, ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000120 inc edx ; compensate for the shorter opcode on the last iteration
121 inc edx ; compensate for the shorter opcode on the last iteration
122 inc edx ; compensate for the shorter opcode on the last iteration
123 cmp eax, 33
124 jne .loop1_start
125 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
126.loop1_start:
127 jmp edx
128
129 fld st0 ; ST = d d
130 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
131 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
132 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
133 fld st0 ; ST = d d
134 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
135 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
136 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
137 fld st0 ; ST = d d
138 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
139 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
140 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
141 fld st0 ; ST = d d
142 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
143 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
144 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
145 fld st0 ; ST = d d
146 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
147 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
148 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
149 fld st0 ; ST = d d
150 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
151 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
152 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
153 fld st0 ; ST = d d
154 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
155 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
156 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
157 fld st0 ; ST = d d
158 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
159 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
160 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
161 fld st0 ; ST = d d
162 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
163 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
164 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
165 fld st0 ; ST = d d
166 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
167 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
168 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
169 fld st0 ; ST = d d
170 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
171 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
172 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
173 fld st0 ; ST = d d
174 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
175 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
176 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
177 fld st0 ; ST = d d
178 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
179 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
180 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
181 fld st0 ; ST = d d
182 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
183 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
184 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
185 fld st0 ; ST = d d
186 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
187 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
188 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
189 fld st0 ; ST = d d
190 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
191 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
192 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
193 fld st0 ; ST = d d
194 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
195 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
196 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
197 fld st0 ; ST = d d
198 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
199 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
200 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
201 fld st0 ; ST = d d
202 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
203 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
204 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
205 fld st0 ; ST = d d
206 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
207 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
208 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
209 fld st0 ; ST = d d
210 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
211 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
212 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
213 fld st0 ; ST = d d
214 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
215 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
216 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
217 fld st0 ; ST = d d
218 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
219 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
220 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
221 fld st0 ; ST = d d
222 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
223 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
224 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
225 fld st0 ; ST = d d
226 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
227 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
228 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
229 fld st0 ; ST = d d
230 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
231 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
232 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
233 fld st0 ; ST = d d
234 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
235 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
236 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
237 fld st0 ; ST = d d
238 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
239 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
240 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
241 fld st0 ; ST = d d
242 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
243 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
244 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
245 fld st0 ; ST = d d
246 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
247 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
248 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
249 fld st0 ; ST = d d
250 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
251 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
252 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
253 fld st0 ; ST = d d
254 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
255 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
256 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
257 fld st0 ; ST = d d
258 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
259 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
260 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
261.jumper1_0:
262
263 fstp st0 ; pop d, ST = empty
264 add esi, byte 4 ; sample++
265 dec ecx
266 jz .loop1_end
267 fld dword [esi] ; ST = d <- data[sample]
268 jmp edx
269.loop1_end:
270
271 ; for(; sample < data_len; sample++) {
272 ; d = data[sample];
273 ; for(coeff = 0; coeff < data_len - sample; coeff++)
274 ; autoc[coeff] += d * data[sample+coeff];
275 ; }
Josh Coalson651d6de2001-12-04 05:36:09 +0000276 mov ecx, [esp + 24] ; ecx <- lag
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000277 dec ecx ; ecx <- lag - 1
278 jz near .end ; skip loop if 0 (i.e. lag == 1)
279
280 fld dword [esi] ; ST = d <- data[sample]
281 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
282 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
283 lea edx, [eax + eax*2]
284 neg edx
Josh Coalson651d6de2001-12-04 05:36:09 +0000285 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
286 call .get_eip2
287.get_eip2:
288 pop ebx
289 add edx, ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000290 inc edx ; compensate for the shorter opcode on the last iteration
291 inc edx ; compensate for the shorter opcode on the last iteration
292 inc edx ; compensate for the shorter opcode on the last iteration
293 jmp edx
294
295 fld st0 ; ST = d d
296 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
297 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
298 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
299 fld st0 ; ST = d d
300 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
301 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
302 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
303 fld st0 ; ST = d d
304 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
305 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
306 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
307 fld st0 ; ST = d d
308 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
309 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
310 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
311 fld st0 ; ST = d d
312 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
313 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
314 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
315 fld st0 ; ST = d d
316 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
317 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
318 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
319 fld st0 ; ST = d d
320 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
321 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
322 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
323 fld st0 ; ST = d d
324 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
325 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
326 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
327 fld st0 ; ST = d d
328 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
329 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
330 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
331 fld st0 ; ST = d d
332 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
333 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
334 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
335 fld st0 ; ST = d d
336 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
337 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
338 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
339 fld st0 ; ST = d d
340 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
341 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
342 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
343 fld st0 ; ST = d d
344 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
345 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
346 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
347 fld st0 ; ST = d d
348 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
349 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
350 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
351 fld st0 ; ST = d d
352 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
353 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
354 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
355 fld st0 ; ST = d d
356 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
357 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
358 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
359 fld st0 ; ST = d d
360 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
361 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
362 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
363 fld st0 ; ST = d d
364 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
365 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
366 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
367 fld st0 ; ST = d d
368 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
369 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
370 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
371 fld st0 ; ST = d d
372 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
373 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
374 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
375 fld st0 ; ST = d d
376 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
377 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
378 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
379 fld st0 ; ST = d d
380 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
381 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
382 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
383 fld st0 ; ST = d d
384 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
385 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
386 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
387 fld st0 ; ST = d d
388 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
389 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
390 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
391 fld st0 ; ST = d d
392 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
393 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
394 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
395 fld st0 ; ST = d d
396 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
397 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
398 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
399 fld st0 ; ST = d d
400 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
401 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
402 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
403 fld st0 ; ST = d d
404 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
405 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
406 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
407 fld st0 ; ST = d d
408 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
409 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
410 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
411 fld st0 ; ST = d d
412 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
413 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
414 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
415 fld st0 ; ST = d d
416 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
417 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
418 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
419 fld st0 ; ST = d d
420 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
421 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
422 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
423.jumper2_0:
424
425 fstp st0 ; pop d, ST = empty
426 add esi, byte 4 ; sample++
427 dec ecx
428 jz .loop2_end
429 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
430 fld dword [esi] ; ST = d <- data[sample]
431 jmp edx
432.loop2_end:
433
434.end:
Josh Coalson651d6de2001-12-04 05:36:09 +0000435 pop ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000436 pop edi
437 pop esi
438 ret
439
440 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000441cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000442 ;[esp + 16] == autoc[]
443 ;[esp + 12] == lag
444 ;[esp + 8] == data_len
445 ;[esp + 4] == data[]
446
447 ;ASSERT(lag > 0)
448 ;ASSERT(lag <= 4)
449 ;ASSERT(lag <= data_len)
450
451 ; for(coeff = 0; coeff < lag; coeff++)
452 ; autoc[coeff] = 0.0;
453 xorps xmm5, xmm5
454
455 mov edx, [esp + 8] ; edx == data_len
456 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
457
458 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
459 add eax, 4
460 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
461 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
462.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
463 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
464 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
465 dec edx
466 jz .loop_end
467 ALIGN 16
468.loop_start:
469 ; start by reading the next sample
470 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
471 add eax, 4
472 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
473 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
474 movss xmm2, xmm0
475 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
476 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
477 dec edx
478 jnz .loop_start
479.loop_end:
480 ; store autoc
481 mov edx, [esp + 16] ; edx == autoc
482 movups [edx], xmm5
483
484.end:
485 ret
486
487 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000488cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000489 ;[esp + 16] == autoc[]
490 ;[esp + 12] == lag
491 ;[esp + 8] == data_len
492 ;[esp + 4] == data[]
493
494 ;ASSERT(lag > 0)
495 ;ASSERT(lag <= 8)
496 ;ASSERT(lag <= data_len)
497
498 ; for(coeff = 0; coeff < lag; coeff++)
499 ; autoc[coeff] = 0.0;
500 xorps xmm5, xmm5
501 xorps xmm6, xmm6
502
503 mov edx, [esp + 8] ; edx == data_len
504 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
505
506 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
507 add eax, 4
508 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
509 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
510 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
511 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
512.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
513 mulps xmm0, xmm2
514 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
515 addps xmm5, xmm0
516 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
517 dec edx
518 jz .loop_end
519 ALIGN 16
520.loop_start:
521 ; start by reading the next sample
522 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
523 ; here we reorder the instructions; see the (#) indexes for a logical order
524 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
525 add eax, 4 ; (0)
526 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
527 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
528 movss xmm3, xmm2 ; (5)
529 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
530 movss xmm2, xmm0 ; (6)
531 mulps xmm1, xmm3 ; (8)
532 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
533 addps xmm6, xmm1 ; (10)
534 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
535 dec edx
536 jnz .loop_start
537.loop_end:
538 ; store autoc
539 mov edx, [esp + 16] ; edx == autoc
540 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000541 movups [edx + 16], xmm6
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000542
543.end:
544 ret
545
546 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000547cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000548 ;[esp + 16] == autoc[]
549 ;[esp + 12] == lag
550 ;[esp + 8] == data_len
551 ;[esp + 4] == data[]
552
553 ;ASSERT(lag > 0)
554 ;ASSERT(lag <= 12)
555 ;ASSERT(lag <= data_len)
556
557 ; for(coeff = 0; coeff < lag; coeff++)
558 ; autoc[coeff] = 0.0;
559 xorps xmm5, xmm5
560 xorps xmm6, xmm6
561 xorps xmm7, xmm7
562
563 mov edx, [esp + 8] ; edx == data_len
564 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
565
566 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
567 add eax, 4
568 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
569 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
570 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
571 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
572.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
573 movaps xmm1, xmm0
574 mulps xmm1, xmm2
575 addps xmm5, xmm1
576 movaps xmm1, xmm0
577 mulps xmm1, xmm3
578 addps xmm6, xmm1
579 mulps xmm0, xmm4
580 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
581 dec edx
582 jz .loop_end
583 ALIGN 16
584.loop_start:
585 ; start by reading the next sample
586 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
587 add eax, 4
588 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
589
590 ; shift xmm4:xmm3:xmm2 left by one float
591 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
592 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
593 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
594 movss xmm4, xmm3
595 movss xmm3, xmm2
596 movss xmm2, xmm0
597
598 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
599 movaps xmm1, xmm0
600 mulps xmm1, xmm2
601 addps xmm5, xmm1
602 movaps xmm1, xmm0
603 mulps xmm1, xmm3
604 addps xmm6, xmm1
605 mulps xmm0, xmm4
606 addps xmm7, xmm0
607
608 dec edx
609 jnz .loop_start
610.loop_end:
611 ; store autoc
612 mov edx, [esp + 16] ; edx == autoc
613 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000614 movups [edx + 16], xmm6
615 movups [edx + 32], xmm7
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000616
617.end:
618 ret
619
Josh Coalson1b44f7e2007-03-13 06:35:03 +0000620 ALIGN 16
Josh Coalsonf5925df2001-07-16 21:13:19 +0000621cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
622 ;[ebp + 32] autoc
623 ;[ebp + 28] lag
624 ;[ebp + 24] data_len
625 ;[ebp + 20] data
626
627 push ebp
628 push ebx
629 push esi
630 push edi
631 mov ebp, esp
632
633 mov esi, [ebp + 20]
634 mov edi, [ebp + 24]
635 mov edx, [ebp + 28]
Josh Coalson59d84502002-12-30 02:33:19 +0000636 inc edx
637 and edx, byte -2
Josh Coalsonf5925df2001-07-16 21:13:19 +0000638 mov eax, edx
639 neg eax
640 and esp, byte -8
641 lea esp, [esp + 4 * eax]
642 mov ecx, edx
643 xor eax, eax
644.loop0:
645 dec ecx
646 mov [esp + 4 * ecx], eax
647 jnz short .loop0
648
649 mov eax, edi
650 sub eax, edx
651 mov ebx, edx
652 and ebx, byte 1
653 sub eax, ebx
654 lea ecx, [esi + 4 * eax - 12]
655 cmp esi, ecx
656 mov eax, esi
657 ja short .loop2_pre
Josh Coalson1b44f7e2007-03-13 06:35:03 +0000658 ALIGN 16 ;4 nops
Josh Coalsonf5925df2001-07-16 21:13:19 +0000659.loop1_i:
660 movd mm0, [eax]
661 movd mm2, [eax + 4]
662 movd mm4, [eax + 8]
663 movd mm6, [eax + 12]
664 mov ebx, edx
665 punpckldq mm0, mm0
666 punpckldq mm2, mm2
667 punpckldq mm4, mm4
668 punpckldq mm6, mm6
Josh Coalson1b44f7e2007-03-13 06:35:03 +0000669 ALIGN 16 ;3 nops
Josh Coalsonf5925df2001-07-16 21:13:19 +0000670.loop1_j:
671 sub ebx, byte 2
672 movd mm1, [eax + 4 * ebx]
673 movd mm3, [eax + 4 * ebx + 4]
674 movd mm5, [eax + 4 * ebx + 8]
675 movd mm7, [eax + 4 * ebx + 12]
676 punpckldq mm1, mm3
677 punpckldq mm3, mm5
678 pfmul mm1, mm0
679 punpckldq mm5, mm7
680 pfmul mm3, mm2
681 punpckldq mm7, [eax + 4 * ebx + 16]
682 pfmul mm5, mm4
683 pfmul mm7, mm6
684 pfadd mm1, mm3
685 movq mm3, [esp + 4 * ebx]
686 pfadd mm5, mm7
687 pfadd mm1, mm5
688 pfadd mm3, mm1
689 movq [esp + 4 * ebx], mm3
690 jg short .loop1_j
691
692 add eax, byte 16
693 cmp eax, ecx
694 jb short .loop1_i
695
696.loop2_pre:
697 mov ebx, eax
698 sub eax, esi
699 shr eax, 2
700 lea ecx, [esi + 4 * edi]
701 mov esi, ebx
702.loop2_i:
703 movd mm0, [esi]
704 mov ebx, edi
705 sub ebx, eax
706 cmp ebx, edx
707 jbe short .loop2_j
708 mov ebx, edx
709.loop2_j:
710 dec ebx
711 movd mm1, [esi + 4 * ebx]
712 pfmul mm1, mm0
713 movd mm2, [esp + 4 * ebx]
714 pfadd mm1, mm2
715 movd [esp + 4 * ebx], mm1
716
717 jnz short .loop2_j
718
719 add esi, byte 4
720 inc eax
721 cmp esi, ecx
722 jnz short .loop2_i
723
724 mov edi, [ebp + 32]
Josh Coalson59d84502002-12-30 02:33:19 +0000725 mov edx, [ebp + 28]
Josh Coalsonf5925df2001-07-16 21:13:19 +0000726.loop3:
727 dec edx
728 mov eax, [esp + 4 * edx]
729 mov [edi + 4 * edx], eax
730 jnz short .loop3
731
732 femms
733
734 mov esp, ebp
735 pop edi
736 pop esi
737 pop ebx
738 pop ebp
739 ret
740
Josh Coalson7446e182005-01-26 04:04:38 +0000741;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000742;
743; for(i = 0; i < data_len; i++) {
744; sum = 0;
745; for(j = 0; j < order; j++)
746; sum += qlp_coeff[j] * data[i-j-1];
747; residual[i] = data[i] - (sum >> lp_quantization);
748; }
749;
750 ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	; Computes residual[i] = data[i] - (sum >> lp_quantization), where
	; sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order],
	; for i = 0..data_len-1 (see the C pseudocode above).
	;
	; ABI: cdecl, IA-32.  After the four register pushes below, the
	; arguments sit at these stack offsets:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
.begin:
	cmp	eax, byte 1
	jg	short .i_1more		; order > 1 -> general paths below

	; --- order == 1 fast path: sum is a single product, and data[i-1]
	; --- can be carried across iterations in eax.
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [esi - 4]		; eax = data[-1]
	mov	cl, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	neg	eax
	add	eax, [esi]		; eax = data[i] - (sum >> lp_quantization)
	mov	[edi], eax		; residual[i] = eax
	mov	eax, [esi]		; eax = data[i], i.e. data[i-1] for next iteration
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .i_32

	; --- order > 32: plain nested loop.
	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor	ebp, ebp		; ebp = sum = 0
	mov	ecx, [esp + 32]		; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order] (walked downward)
	neg	ecx			; ecx = -order (loop counter, counts up to 0)
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]		; eax = qlp_coeff[j]
	imul	eax, [esi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	cl, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi], ebp		; residual[i] = ebp
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

.i_32:
	; --- 1 < order <= 32: jump into the middle of a fully-unrolled
	; --- inner loop.  Each unrolled step (mov/imul/add) encodes to
	; --- exactly 9 bytes, so the entry point for a given order is
	; --- .jumper_0 - 9*order (eax + eax*8 with eax = -order), adjusted
	; --- by +1 because the final step's `mov ecx, [eax]` has no
	; --- displacement byte and is one byte shorter than the others.
	; --- The call/pop pair fetches EIP so the computed target is
	; --- position-independent.
	sub	edi, esi		; edi = residual - data; store via [edi + esi] so only esi advances
	neg	eax			; eax = -order
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call	.get_eip0
.get_eip0:
	pop	eax			; eax = EIP of .get_eip0
	add	edx, eax		; edx = absolute entry address into the unrolled code
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]		; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	; Unrolled sum: each triple does sum += qlp_coeff[j] * data[i-j-1],
	; from j = 31 (first triple) down to j = 0 (last triple).
	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [esi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]		; ecx = qlp_coeff[0] (NOTE: no displacement byte -> opcode one byte shorter)
	imul	ecx, [esi - 4]		; ecx = qlp_coeff[0] * data[i-1]
	add	ebp, ecx		; sum += qlp_coeff[0] * data[i-1]
.jumper_0:

	mov	cl, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi + esi], ebp	; residual[i] = ebp (edi holds residual-data offset)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp		; sum = 0 for next sample
	jmp	edx			; re-enter unrolled code at the same order-dependent point

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
960
961; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
Josh Coalson1b44f7e2007-03-13 06:35:03 +0000962; the channel and qlp_coeffs must be <= 16. Especially note that this routine
963; cannot be used for side-channel coded 16bps channels since the effective bps
964; is 17.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000965 ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	; MMX version of the routine above; processes two samples per
	; iteration using pmaddwd on 16-bit-packed coefficients/history
	; (see the WATCHOUT above: data and qlp_coeffs must fit in 16 bits).
	;
	; ABI: cdecl, IA-32.  After the four register pushes:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	dec	ebx			; MMX loops handle samples in pairs;
	test	ebx, ebx		; with only one sample left, skip to the
	jz	near .last_one		; scalar routine for the tail

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization (shift count for psrad)
	mov	ebp, esp		; save esp (restored at .mmx_end)

	and	esp, 0xfffffff8		; align stack to 8 for the qword loads below

	; Copy qlp_coeff[] onto the stack as 16-bit words (highest address =
	; coeff 0, since coeffs are pushed in increasing index order).
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Zero-pad the coefficient count up to a multiple of 4; eax becomes
	; the rounded-up order.
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = four most-recent coeffs, packed as words
	movd	mm4, [esi - 16]			; mm4 = last four history samples,
	punpckldq	mm4, [esi - 12]		;       packed down to 16-bit words
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more	; (padded) order > 4 -> inner j loop needed

	; --- order <= 4: whole history fits in mm4; no inner loop.
	ALIGN	16
.mmx_4_loop_i:
	movd	mm1, [esi]		; mm1 = data[i+1]:data[i]
	movq	mm3, mm4
	punpckldq	mm1, [esi + 4]
	psrlq	mm4, 16			; shift history window forward...
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0		; ...and insert data[i] as the newest word
	movq	mm2, mm4		; mm2 = history for sample i+1
	psrlq	mm4, 16			; advance window again for the next pair
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5		; partial sums for sample i
	pmaddwd	mm2, mm5		; partial sums for sample i+1
	psllq	mm0, 16
	por	mm4, mm0		; insert data[i+1] into the window
	movq	mm0, mm3
	punpckldq	mm3, mm2	; combine the two pmaddwd halves
	punpckhdq	mm0, mm2
	paddd	mm3, mm0		; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6		; >> lp_quantization (both lanes)
	psubd	mm1, mm3		; data - (sum >> lp_quantization)
	movd	[edi], mm1		; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1		; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2			; two samples per iteration
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	; --- order > 4: eax = 16 - 4*order = byte offset from esi to the
	; --- oldest history sample batch used by the inner loop.
	shl	eax, 2
	neg	eax
	add	eax, byte 16

	ALIGN	16
.mmx_4more_loop_i:
	; Same windowing as .mmx_4_loop_i: mm3/mm2 get the newest four
	; history words for samples i and i+1 respectively.
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]
	movq	mm3, mm4
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax		; ecx = &data[i - (order rounded to 4)]
	mov	edx, esp		; edx = word-packed coeffs (oldest first)

	ALIGN	16
.mmx_4more_loop_j:
	; Accumulate four coeffs per pass: lanes for sample i into mm3,
	; lanes for sample i+1 (history shifted by one) into mm2.
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, esi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3
	punpckldq	mm3, mm2	; combine partial sums: mm3 = sum(i+1):sum(i)
	punpckhdq	mm0, mm2
	paddd	mm3, mm0
	psrad	mm3, mm6		; >> lp_quantization
	psubd	mm1, mm3		; data - (sum >> lp_quantization)
	movd	[edi], mm1		; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1		; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms
	mov	esp, ebp		; undo the stack realignment/coeff copy
.last_one:
	; If one sample remains (data_len was odd, or was 1), finish it with
	; the plain IA-32 routine; ebx becomes the remaining count.
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1139
1140; **********************************************************************
1141;
Josh Coalson77e3f312001-06-23 03:03:24 +00001142; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001143; {
1144; unsigned i, j;
Josh Coalson77e3f312001-06-23 03:03:24 +00001145; FLAC__int32 sum;
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001146;
1147; FLAC__ASSERT(order > 0);
1148;
1149; for(i = 0; i < data_len; i++) {
1150; sum = 0;
1151; for(j = 0; j < order; j++)
1152; sum += qlp_coeff[j] * data[i-j-1];
1153; data[i] = residual[i] + (sum >> lp_quantization);
1154; }
1155; }
1156 ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	; Inverse of the residual computation:
	; data[i] = residual[i] + (sum >> lp_quantization), with
	; sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order]
	; (see the C pseudocode above).  Note the prediction reads data[]
	; values written by earlier iterations, so samples must be produced
	; strictly in order.
	;
	; ABI: cdecl, IA-32.  After the four register pushes:
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .x87_1more	; order > 1 -> general paths below

	; --- order == 1 fast path: data[i-1] carried across iterations in eax.
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	cl, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.x87_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	add	eax, [esi]		; eax = residual[i] + (sum >> lp_quantization)
	mov	[edi], eax		; data[i] = eax (and data[i-1] for next iteration)
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; --- order > 32: plain nested loop.
	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN	16
.x87_32more_loop_i:
	xor	ebp, ebp		; ebp = sum = 0
	mov	ecx, [esp + 32]		; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order] (walked downward)
	neg	ecx			; ecx = -order (counts up to 0)
	ALIGN	16
.x87_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]		; eax = qlp_coeff[j]
	imul	eax, [edi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	cl, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	add	ebp, [esi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp		; data[i] = ebp
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.x87_32:
	; --- 1 < order <= 32: computed jump into fully-unrolled inner loop.
	; --- Each unrolled step encodes to exactly 9 bytes, so the entry
	; --- point is .jumper_0 - 9*order (eax + eax*8 with eax = -order);
	; --- call/pop fetches EIP for position independence.
	sub	esi, edi		; esi = residual - data; load via [esi + edi] so only edi advances
	neg	eax			; eax = -order
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call	.get_eip0
.get_eip0:
	pop	eax			; eax = EIP of .get_eip0
	add	edx, eax		; edx = absolute entry address into the unrolled code
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]		; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]	; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]	; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx		; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]	; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]	; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx		; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]	; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]	; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx		; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]	; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]	; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx		; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]	; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]	; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx		; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]	; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]	; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx		; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]		; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]	; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx		; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]		; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]		; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx		; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]		; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]		; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx		; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]		; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]		; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx		; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]		; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]		; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx		; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]		; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]		; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx		; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]		; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]		; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx		; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]		; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]		; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx		; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]		; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]		; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx		; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]		; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]		; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx		; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]		; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]		; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx		; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]		; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]		; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx		; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]		; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]		; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx		; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]		; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]		; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx		; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]		; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]		; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx		; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]		; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]		; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx		; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]		; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]		; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx		; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]		; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]		; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx		; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]		; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]		; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx		; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]		; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]		; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx		; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]		; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]		; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx		; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]		; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]		; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx		; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]		; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]		; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx		; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]		; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]		; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx		; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]		; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]		; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx		; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	cl, [esp + 36]
	sar	ebp, cl			; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]	; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp		; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp		; sum = 0 for next sample
	jmp	edx			; re-enter unrolled code at the same order-dependent point

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1363
1364; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
Josh Coalson1b44f7e2007-03-13 06:35:03 +00001365; the channel and qlp_coeffs must be <= 16. Especially note that this routine
1366; cannot be used for side-channel coded 16bps channels since the effective bps
1367; is 17.
Josh Coalsone0a06682001-07-12 21:23:31 +00001368; WATCHOUT: this routine requires that each data array have a buffer of up to
1369; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1370; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001371 ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	; MMX version of FLAC__lpc_restore_signal_asm_ia32.  Because each
	; output sample feeds the next prediction, only one sample is
	; produced per iteration; pmaddwd still computes four
	; coeff*history products at a time (see both WATCHOUTs above:
	; 16-bit data only, and data[-1..-3] must be readable).
	;
	; ABI: cdecl, IA-32.  After the four register pushes:
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: plain IA-32 routine is used instead

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization (shift count for psrad)
	mov	ebp, esp		; save esp (restored at .mmx_end)

	and	esp, 0xfffffff8		; align stack to 8 for the qword loads below

	; Copy qlp_coeff[] onto the stack as 16-bit words (highest address =
	; coeff 0, since coeffs are pushed in increasing index order).
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Zero-pad the coefficient count up to a multiple of 4; eax becomes
	; the rounded-up order.
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = four most-recent coeffs, packed as words
	movd	mm4, [edi - 16]			; mm4 = last four history samples,
	punpckldq	mm4, [edi - 12]		;       packed down to 16-bit words
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more	; (padded) order > 4 -> inner j loop needed

	; --- order == 4: whole history fits in mm4; no inner loop.
	ALIGN	16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5		; four coeff*history products, pairwise summed
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; mm7 low dword = sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]		; mm1 = residual[i]
	paddd	mm7, mm1		; mm7 = residual[i] + (sum >> lp_quantization)
	movd	[edi], mm7		; data[i] = mm7
	psllq	mm7, 48			; shift the new sample into the history
	psrlq	mm4, 16			; window (drop oldest word, append newest)
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	; --- order > 4: eax = 16 - 4*order = byte offset from edi to the
	; --- oldest history sample batch used by the inner loop.
	shl	eax, 2
	neg	eax
	add	eax, byte 16
	ALIGN	16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax		; ecx = &data[i - (order rounded to 4)]
	mov	edx, esp		; edx = word-packed coeffs (oldest first)

	movq	mm7, mm4
	pmaddwd	mm7, mm5		; start with the four newest coeff*history products

	ALIGN	16
.mmx_4more_loop_j:
	; Accumulate four older coeff*history products per pass.
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; mm7 low dword = sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]		; mm1 = residual[i]
	paddd	mm7, mm1		; mm7 = residual[i] + (sum >> lp_quantization)
	movd	[edi], mm7		; data[i] = mm7
	psllq	mm7, 48			; shift the new sample into the history
	psrlq	mm4, 16			; window (drop oldest word, append newest)
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms
	mov	esp, ebp		; undo the stack realignment/coeff copy

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1506
1507end