; vim:filetype=nasm ts=8

; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001-2009 Josh Coalson
; Copyright (C) 2011-2013 Xiph.Org Foundation
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "nasm.h"

	data_section

cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx

	code_section

; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;	FLAC__real d;
;	unsigned sample, coeff;
;	const unsigned limit = data_len - lag;
;
;	FLAC__ASSERT(lag > 0);
;	FLAC__ASSERT(lag <= data_len);
;
;	for(coeff = 0; coeff < lag; coeff++)
;		autoc[coeff] = 0.0;
;	for(sample = 0; sample <= limit; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < lag; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
;	for(; sample < data_len; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < data_len - sample; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
; }
;
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32
	;[esp + 28] == autoc[]
	;[esp + 24] == lag
	;[esp + 20] == data_len
	;[esp + 16] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 33)
	;ASSERT(lag <= data_len)

.begin:
	push esi
	push edi
	push ebx

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	mov edi, [esp + 28]	; edi == autoc
	mov ecx, [esp + 24]	; ecx = # of dwords (=lag) of 0 to write
	xor eax, eax
	rep stosd

	; const unsigned limit = data_len - lag;
	mov eax, [esp + 24]	; eax == lag
	mov ecx, [esp + 20]
	sub ecx, eax	; ecx == limit

	mov edi, [esp + 28]	; edi == autoc
	mov esi, [esp + 16]	; esi == data
	inc ecx	; we are looping <= limit so we add one to the counter

	; for(sample = 0; sample <= limit; sample++) {
	;   d = data[sample];
	;   for(coeff = 0; coeff < lag; coeff++)
	;     autoc[coeff] += d * data[sample+coeff];
	; }
	fld dword [esi]	; ST = d <- data[sample]
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
	call .get_eip1
.get_eip1:
	pop ebx
	add edx, ebx
	inc edx	; compensate for the shorter opcode on the last iteration
	inc edx	; compensate for the shorter opcode on the last iteration
	inc edx	; compensate for the shorter opcode on the last iteration
	cmp eax, 33
	jne .loop1_start
	sub edx, byte 9	; compensate for the longer opcodes on the first iteration
.loop1_start:
	jmp edx

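	; [editor's note: the following sketch is not part of the original source]
	; The setup above implements a computed jump into the unrolled block that
	; follows: eax holds lag, each unrolled group (fld st0/fmul/fadd/fstp) is
	; 11 bytes, and edx ends up at (.jumper1_0 - 11*lag) plus small fixups, so
	; "jmp edx" executes exactly the last `lag` groups, i.e. coeff = lag-1
	; down to 0.  In C this is Duff's-device style (illustrative names only):
	;
	;	switch(lag) {        /* jmp edx                        */
	;	case 33: step(32);   /* falls through ...              */
	;	case 32: step(31);
	;	/* ... */
	;	case  2: step(1);
	;	case  1: step(0);    /* autoc[0] += d * data[sample+0] */
	;	}
	;
	; The three "inc edx" above adjust for the coeff==0 group being 3 bytes
	; shorter (no displacement bytes), and the lag==33 case subtracts 9 more
	; because the coeff==32 group needs 4-byte displacements.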
	fld st0	; ST = d d
	fmul dword [esi + (32*4)]	; ST = d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fadd dword [edi + (32*4)]	; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fstp dword [edi + (32*4)]	; autoc[32]+=d*data[sample+32] ST = d	WATCHOUT: not a byte displacement here!
	fld st0	; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1] ST = d
	fld st0	; ST = d d
	fmul dword [esi]	; ST = d*data[sample] d	WATCHOUT: no displacement byte here!
	fadd dword [edi]	; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]	; autoc[0]+=d*data[sample] ST = d	WATCHOUT: no displacement byte here!
.jumper1_0:

	fstp st0	; pop d, ST = empty
	add esi, byte 4	; sample++
	dec ecx
	jz .loop1_end
	fld dword [esi]	; ST = d <- data[sample]
	jmp edx
.loop1_end:

	; for(; sample < data_len; sample++) {
	;   d = data[sample];
	;   for(coeff = 0; coeff < data_len - sample; coeff++)
	;     autoc[coeff] += d * data[sample+coeff];
	; }
	mov ecx, [esp + 24]	; ecx <- lag
	dec ecx	; ecx <- lag - 1
	jz near .end	; skip loop if 0 (i.e. lag == 1)

	fld dword [esi]	; ST = d <- data[sample]
	mov eax, ecx	; eax <- lag - 1 == data_len - sample the first time through
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
	call .get_eip2
.get_eip2:
	pop ebx
	add edx, ebx
	inc edx	; compensate for the shorter opcode on the last iteration
	inc edx	; compensate for the shorter opcode on the last iteration
	inc edx	; compensate for the shorter opcode on the last iteration
	jmp edx

	fld st0	; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11] ST = d
	fld st0	; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2] ST = d
	fld st0	; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1] ST = d
	fld st0	; ST = d d
	fmul dword [esi]	; ST = d*data[sample] d	WATCHOUT: no displacement byte here!
	fadd dword [edi]	; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]	; autoc[0]+=d*data[sample] ST = d	WATCHOUT: no displacement byte here!
.jumper2_0:

	fstp st0	; pop d, ST = empty
	add esi, byte 4	; sample++
	dec ecx
	jz .loop2_end
	add edx, byte 11	; adjust our inner loop counter by adjusting the jump target
	fld dword [esi]	; ST = d <- data[sample]
	jmp edx
.loop2_end:

.end:
	pop ebx
	pop edi
	pop esi
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 4)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps xmm5, xmm5

	mov edx, [esp + 8]	; edx == data_len
	mov eax, [esp + 4]	; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]	; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0	; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
.warmup:	; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
	mulps xmm0, xmm2	; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0	; xmm5 += xmm0 * xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0	; xmm0 = data[sample],data[sample],data[sample],data[sample]
	shufps xmm2, xmm2, 93h	; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	movss xmm2, xmm0
	mulps xmm0, xmm2	; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0	; xmm5 += xmm0 * xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]	; edx == autoc
	movups [edx], xmm5

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 8)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6

	mov edx, [esp + 8]	; edx == data_len
	mov eax, [esp + 4]	; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]	; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0	; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps xmm1, xmm0	; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3	; xmm3 = 0,0,0,0
.warmup:	; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps xmm0, xmm2
	mulps xmm1, xmm3	; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm5, xmm0
	addps xmm6, xmm1	; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps xmm2, xmm2, 93h	; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add eax, 4	; (0)
	shufps xmm3, xmm3, 93h	; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm0, xmm0, 0	; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss xmm3, xmm2	; (5)
	movaps xmm1, xmm0	; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss xmm2, xmm0	; (6)
	mulps xmm1, xmm3	; (8)
	mulps xmm0, xmm2	; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm6, xmm1	; (10)
	addps xmm5, xmm0	; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]	; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6
	xorps xmm7, xmm7

	mov edx, [esp + 8]	; edx == data_len
	mov eax, [esp + 4]	; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]	; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0	; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3	; xmm3 = 0,0,0,0
	xorps xmm4, xmm4	; xmm4 = 0,0,0,0
.warmup:	; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0	; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2 left by one float
	shufps xmm2, xmm2, 93h	; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps xmm3, xmm3, 93h	; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm4, xmm4, 93h	; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss xmm4, xmm3
	movss xmm3, xmm2
	movss xmm2, xmm0

	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0

	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]	; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
	movups [edx + 32], xmm7

.end:
	ret

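	; [editor's note: the following sketch is not part of the original source]
	; The three _sse_lag_N routines above share one scheme: xmm5 (and xmm6,
	; xmm7 for the larger lags) hold `lag` running sums, while xmm2 (and xmm3,
	; xmm4) hold the last `lag` samples as a window that is rotated by one
	; float per iteration (shufps ...,93h + movss).  A hedged C sketch, with
	; `window`/`sums` as illustrative names:
	;
	;	float window[LAG] = {0.0f}, sums[LAG] = {0.0f};  /* LAG = 4, 8 or 12 */
	;	for(sample = 0; sample < data_len; sample++) {
	;		for(k = LAG-1; k > 0; k--)   /* rotate the window by one   */
	;			window[k] = window[k-1];
	;		window[0] = data[sample];
	;		for(k = 0; k < LAG; k++)     /* one mulps+addps per 4 lags */
	;			sums[k] += data[sample] * window[k];
	;	}
	;	/* afterwards sums[k] == autoc[k]; stored with movups at .loop_end */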
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
	;[ebp + 32] autoc
	;[ebp + 28] lag
	;[ebp + 24] data_len
	;[ebp + 20] data

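	; [editor's note, not in the original source] This routine has no C
	; reference comment; as far as can be read from the code, it rounds lag up
	; to an even count, zeroes that many dwords of temporary accumulators on
	; the 8-byte-aligned stack (.loop0), accumulates the products with 3DNow!
	; pfmul/pfadd while broadcasting four consecutive samples per outer
	; iteration (.loop1_i/.loop1_j), processes the remaining samples one at a
	; time with the inner product length clamped near the end of the buffer
	; (.loop2_i/.loop2_j), copies the accumulators out to autoc[] (.loop3),
	; and finishes with femms.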
	push ebp
	push ebx
	push esi
	push edi
	mov ebp, esp

	mov esi, [ebp + 20]
	mov edi, [ebp + 24]
	mov edx, [ebp + 28]
	inc edx
	and edx, byte -2
	mov eax, edx
	neg eax
	and esp, byte -8
	lea esp, [esp + 4 * eax]
	mov ecx, edx
	xor eax, eax
.loop0:
	dec ecx
	mov [esp + 4 * ecx], eax
	jnz short .loop0

	mov eax, edi
	sub eax, edx
	mov ebx, edx
	and ebx, byte 1
	sub eax, ebx
	lea ecx, [esi + 4 * eax - 12]
	cmp esi, ecx
	mov eax, esi
	ja short .loop2_pre
	ALIGN 16	;4 nops
.loop1_i:
	movd mm0, [eax]
	movd mm2, [eax + 4]
	movd mm4, [eax + 8]
	movd mm6, [eax + 12]
	mov ebx, edx
	punpckldq mm0, mm0
	punpckldq mm2, mm2
	punpckldq mm4, mm4
	punpckldq mm6, mm6
	ALIGN 16	;3 nops
.loop1_j:
	sub ebx, byte 2
	movd mm1, [eax + 4 * ebx]
	movd mm3, [eax + 4 * ebx + 4]
	movd mm5, [eax + 4 * ebx + 8]
	movd mm7, [eax + 4 * ebx + 12]
	punpckldq mm1, mm3
	punpckldq mm3, mm5
	pfmul mm1, mm0
	punpckldq mm5, mm7
	pfmul mm3, mm2
	punpckldq mm7, [eax + 4 * ebx + 16]
	pfmul mm5, mm4
	pfmul mm7, mm6
	pfadd mm1, mm3
	movq mm3, [esp + 4 * ebx]
	pfadd mm5, mm7
	pfadd mm1, mm5
	pfadd mm3, mm1
	movq [esp + 4 * ebx], mm3
	jg short .loop1_j

	add eax, byte 16
	cmp eax, ecx
	jb short .loop1_i

.loop2_pre:
	mov ebx, eax
	sub eax, esi
	shr eax, 2
	lea ecx, [esi + 4 * edi]
	mov esi, ebx
.loop2_i:
	movd mm0, [esi]
	mov ebx, edi
	sub ebx, eax
	cmp ebx, edx
	jbe short .loop2_j
	mov ebx, edx
.loop2_j:
	dec ebx
	movd mm1, [esi + 4 * ebx]
	pfmul mm1, mm0
	movd mm2, [esp + 4 * ebx]
	pfadd mm1, mm2
	movd [esp + 4 * ebx], mm1

	jnz short .loop2_j

	add esi, byte 4
	inc eax
	cmp esi, ecx
	jnz short .loop2_i

	mov edi, [ebp + 32]
	mov edx, [ebp + 28]
.loop3:
	dec edx
	mov eax, [esp + 4 * edx]
	mov [edi + 4 * edx], eax
	jnz short .loop3

	femms

	mov esp, ebp
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		residual[i] = data[i] - (sum >> lp_quantization);
;	}
;
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]	; esi = data[]
	mov edi, [esp + 40]	; edi = residual[]
	mov eax, [esp + 32]	; eax = order
	mov ebx, [esp + 24]	; ebx = data_len

	test ebx, ebx
	jz near .end	; do nothing if data_len == 0
.begin:
	cmp eax, byte 1
	jg short .i_1more

	mov ecx, [esp + 28]
	mov edx, [ecx]	; edx = qlp_coeff[0]
	mov eax, [esi - 4]	; eax = data[-1]
	mov cl, [esp + 36]	; cl = lp_quantization
	ALIGN 16
.i_1_loop_i:
	imul eax, edx
	sar eax, cl
	neg eax
	add eax, [esi]
	mov [edi], eax
	mov eax, [esi]
	add edi, byte 4
	add esi, byte 4
	dec ebx
	jnz .i_1_loop_i

	jmp .end

.i_1more:
	cmp eax, byte 32	; for order <= 32 there is a faster routine
	jbe short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor ebp, ebp
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]
	neg ecx
	ALIGN 16
.i_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [esi + 4 * ecx]
	add ebp, eax
	inc ecx
	jnz short .i_32more_loop_j

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi], ebp
	add esi, byte 4
	add edi, byte 4

	dec ebx
	jnz .i_32more_loop_i

	jmp .end

.i_32:
	sub edi, esi
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call .get_eip0
.get_eip0:
	pop eax
	add edx, eax
	inc edx
	mov eax, [esp + 28]	; eax = qlp_coeff[]
	xor ebp, ebp
	jmp edx

	mov ecx, [eax + 124]
	imul ecx, [esi - 128]
	add ebp, ecx
	mov ecx, [eax + 120]
	imul ecx, [esi - 124]
	add ebp, ecx
	mov ecx, [eax + 116]
	imul ecx, [esi - 120]
	add ebp, ecx
	mov ecx, [eax + 112]
	imul ecx, [esi - 116]
	add ebp, ecx
	mov ecx, [eax + 108]
	imul ecx, [esi - 112]
	add ebp, ecx
	mov ecx, [eax + 104]
	imul ecx, [esi - 108]
	add ebp, ecx
	mov ecx, [eax + 100]
	imul ecx, [esi - 104]
	add ebp, ecx
	mov ecx, [eax + 96]
	imul ecx, [esi - 100]
	add ebp, ecx
	mov ecx, [eax + 92]
	imul ecx, [esi - 96]
	add ebp, ecx
	mov ecx, [eax + 88]
	imul ecx, [esi - 92]
	add ebp, ecx
	mov ecx, [eax + 84]
	imul ecx, [esi - 88]
	add ebp, ecx
	mov ecx, [eax + 80]
	imul ecx, [esi - 84]
	add ebp, ecx
	mov ecx, [eax + 76]
	imul ecx, [esi - 80]
	add ebp, ecx
	mov ecx, [eax + 72]
	imul ecx, [esi - 76]
	add ebp, ecx
	mov ecx, [eax + 68]
	imul ecx, [esi - 72]
	add ebp, ecx
	mov ecx, [eax + 64]
	imul ecx, [esi - 68]
	add ebp, ecx
	mov ecx, [eax + 60]
	imul ecx, [esi - 64]
	add ebp, ecx
	mov ecx, [eax + 56]
	imul ecx, [esi - 60]
	add ebp, ecx
	mov ecx, [eax + 52]
	imul ecx, [esi - 56]
	add ebp, ecx
	mov ecx, [eax + 48]
	imul ecx, [esi - 52]
	add ebp, ecx
	mov ecx, [eax + 44]
	imul ecx, [esi - 48]
	add ebp, ecx
	mov ecx, [eax + 40]
	imul ecx, [esi - 44]
	add ebp, ecx
	mov ecx, [eax + 36]
	imul ecx, [esi - 40]
	add ebp, ecx
	mov ecx, [eax + 32]
	imul ecx, [esi - 36]
	add ebp, ecx
	mov ecx, [eax + 28]
	imul ecx, [esi - 32]
	add ebp, ecx
	mov ecx, [eax + 24]
	imul ecx, [esi - 28]
	add ebp, ecx
	mov ecx, [eax + 20]
	imul ecx, [esi - 24]
	add ebp, ecx
	mov ecx, [eax + 16]
	imul ecx, [esi - 20]
	add ebp, ecx
	mov ecx, [eax + 12]
	imul ecx, [esi - 16]
	add ebp, ecx
	mov ecx, [eax + 8]
	imul ecx, [esi - 12]
	add ebp, ecx
	mov ecx, [eax + 4]
	imul ecx, [esi - 8]
	add ebp, ecx
	mov ecx, [eax]	; (NOTE: one byte missing from instruction: no displacement byte)
	imul ecx, [esi - 4]
	add ebp, ecx
.jumper_0:

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi + esi], ebp
	add esi, byte 4

	dec ebx
	jz short .end
	xor ebp, ebp
	jmp edx

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
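; [editor's note: the sketch below is not part of the original source] The
; 16-bit limit exists because this routine packs qlp_coeff[] (and the data
; history) into signed 16-bit words so that each pmaddwd computes two
; 16x16->32 products plus their sum at once.  Roughly, per pmaddwd lane
; (illustrative C, variable names taken from the reference code above):
;
;	FLAC__int16 c0 = (FLAC__int16)qlp_coeff[j];     /* exact only if it fits in 16 bits */
;	FLAC__int16 c1 = (FLAC__int16)qlp_coeff[j+1];
;	FLAC__int16 d0 = (FLAC__int16)data[i-j-1];      /* 17-bit side-channel samples do not fit */
;	FLAC__int16 d1 = (FLAC__int16)data[i-j-2];
;	sum += (FLAC__int32)c0*d0 + (FLAC__int32)c1*d1; /* what one pmaddwd lane computes */
;
; With wider samples or coefficients the narrowing (push word / packssdw)
; truncates or saturates, hence the restriction above.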
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]	; esi = data[]
	mov edi, [esp + 40]	; edi = residual[]
	mov eax, [esp + 32]	; eax = order
	mov ebx, [esp + 24]	; ebx = data_len

	test ebx, ebx
	jz near .end	; do nothing if data_len == 0
	dec ebx
	test ebx, ebx
	jz near .last_one

	mov edx, [esp + 28]	; edx = qlp_coeff[]
	movd mm6, [esp + 36]	; mm6 = 0:lp_quantization
	mov ebp, esp

	and esp, 0xfffffff8

	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	and ecx, 0x3
	test ecx, ecx
	je short .za_end
	sub ecx, byte 4
.za_loop:
	push word 0
	inc eax
	inc ecx
	jnz short .za_loop
.za_end:

	movq mm5, [esp + 2 * eax - 8]
	movd mm4, [esi - 16]
	punpckldq mm4, [esi - 12]
	movd mm0, [esi - 8]
	punpckldq mm0, [esi - 4]
	packssdw mm4, mm0

	cmp eax, byte 4
	jnbe short .mmx_4more

	ALIGN 16
.mmx_4_loop_i:
	movd mm1, [esi]
	movq mm3, mm4
	punpckldq mm1, [esi + 4]
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5
	pmaddwd mm2, mm5
	psllq mm0, 16
	por mm4, mm0
	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0
	psrad mm3, mm6
	psubd mm1, mm3
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1

	add edi, byte 8
	add esi, byte 8

	sub ebx, 2
	jg .mmx_4_loop_i
	jmp .mmx_end

.mmx_4more:
	shl eax, 2
	neg eax
	add eax, byte 16

	ALIGN 16
.mmx_4more_loop_i:
	movd mm1, [esi]
	punpckldq mm1, [esi + 4]
	movq mm3, mm4
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5
	pmaddwd mm2, mm5
	psllq mm0, 16
	por mm4, mm0

	mov ecx, esi
	add ecx, eax
	mov edx, esp

	ALIGN 16
.mmx_4more_loop_j:
	movd mm0, [ecx - 16]
	movd mm7, [ecx - 8]
	punpckldq mm0, [ecx - 12]
	punpckldq mm7, [ecx - 4]
	packssdw mm0, mm7
	pmaddwd mm0, [edx]
	punpckhdq mm7, mm7
	paddd mm3, mm0
	movd mm0, [ecx - 12]
	punpckldq mm0, [ecx - 8]
	punpckldq mm7, [ecx]
	packssdw mm0, mm7
	pmaddwd mm0, [edx]
	paddd mm2, mm0

	add edx, byte 8
	add ecx, byte 16
	cmp ecx, esi
	jnz .mmx_4more_loop_j

	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0
	psrad mm3, mm6
	psubd mm1, mm3
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1

	add edi, byte 8
	add esi, byte 8

	sub ebx, 2
	jg near .mmx_4more_loop_i

.mmx_end:
	emms
	mov esp, ebp
.last_one:
	mov eax, [esp + 32]
	inc ebx
	jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
;	unsigned i, j;
;	FLAC__int32 sum;
;
;	FLAC__ASSERT(order > 0);
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		data[i] = residual[i] + (sum >> lp_quantization);
;	}
; }
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40] data[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] residual[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]	; esi = residual[]
	mov edi, [esp + 40]	; edi = data[]
	mov eax, [esp + 32]	; eax = order
	mov ebx, [esp + 24]	; ebx = data_len

	test ebx, ebx
	jz near .end	; do nothing if data_len == 0

.begin:
	cmp eax, byte 1
	jg short .x87_1more

	mov ecx, [esp + 28]
	mov edx, [ecx]
	mov eax, [edi - 4]
	mov cl, [esp + 36]
	ALIGN 16
.x87_1_loop_i:
	imul eax, edx
	sar eax, cl
	add eax, [esi]
	mov [edi], eax
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .x87_1_loop_i

	jmp .end

.x87_1more:
	cmp eax, byte 32	; for order <= 32 there is a faster routine
	jbe short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor ebp, ebp
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]
	neg ecx
	ALIGN 16
.x87_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [edi + 4 * ecx]
	add ebp, eax
	inc ecx
	jnz short .x87_32more_loop_j

	mov cl, [esp + 36]
	sar ebp, cl
	add ebp, [esi]
	mov [edi], ebp
	add edi, byte 4
	add esi, byte 4

	dec ebx
	jnz .x87_32more_loop_i

	jmp .end

.x87_32:
	sub esi, edi
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call .get_eip0
.get_eip0:
	pop eax
	add edx, eax
	inc edx	; compensate for the shorter opcode on the last iteration
	mov eax, [esp + 28]	; eax = qlp_coeff[]
	xor ebp, ebp
	jmp edx

	mov ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul ecx, [edi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add ebp, ecx	; sum += qlp_coeff[31] * data[i-32]
	mov ecx, [eax + 120]	; ecx = qlp_coeff[30]
	imul ecx, [edi - 124]	; ecx = qlp_coeff[30] * data[i-31]
	add ebp, ecx	; sum += qlp_coeff[30] * data[i-31]
	mov ecx, [eax + 116]	; ecx = qlp_coeff[29]
	imul ecx, [edi - 120]	; ecx = qlp_coeff[29] * data[i-30]
	add ebp, ecx	; sum += qlp_coeff[29] * data[i-30]
	mov ecx, [eax + 112]	; ecx = qlp_coeff[28]
	imul ecx, [edi - 116]	; ecx = qlp_coeff[28] * data[i-29]
	add ebp, ecx	; sum += qlp_coeff[28] * data[i-29]
	mov ecx, [eax + 108]	; ecx = qlp_coeff[27]
	imul ecx, [edi - 112]	; ecx = qlp_coeff[27] * data[i-28]
	add ebp, ecx	; sum += qlp_coeff[27] * data[i-28]
	mov ecx, [eax + 104]	; ecx = qlp_coeff[26]
	imul ecx, [edi - 108]	; ecx = qlp_coeff[26] * data[i-27]
	add ebp, ecx	; sum += qlp_coeff[26] * data[i-27]
	mov ecx, [eax + 100]	; ecx = qlp_coeff[25]
	imul ecx, [edi - 104]	; ecx = qlp_coeff[25] * data[i-26]
	add ebp, ecx	; sum += qlp_coeff[25] * data[i-26]
	mov ecx, [eax + 96]	; ecx = qlp_coeff[24]
	imul ecx, [edi - 100]	; ecx = qlp_coeff[24] * data[i-25]
	add ebp, ecx	; sum += qlp_coeff[24] * data[i-25]
	mov ecx, [eax + 92]	; ecx = qlp_coeff[23]
	imul ecx, [edi - 96]	; ecx = qlp_coeff[23] * data[i-24]
	add ebp, ecx	; sum += qlp_coeff[23] * data[i-24]
	mov ecx, [eax + 88]	; ecx = qlp_coeff[22]
	imul ecx, [edi - 92]	; ecx = qlp_coeff[22] * data[i-23]
	add ebp, ecx	; sum += qlp_coeff[22] * data[i-23]
	mov ecx, [eax + 84]	; ecx = qlp_coeff[21]
	imul ecx, [edi - 88]	; ecx = qlp_coeff[21] * data[i-22]
	add ebp, ecx	; sum += qlp_coeff[21] * data[i-22]
	mov ecx, [eax + 80]	; ecx = qlp_coeff[20]
	imul ecx, [edi - 84]	; ecx = qlp_coeff[20] * data[i-21]
	add ebp, ecx	; sum += qlp_coeff[20] * data[i-21]
	mov ecx, [eax + 76]	; ecx = qlp_coeff[19]
	imul ecx, [edi - 80]	; ecx = qlp_coeff[19] * data[i-20]
	add ebp, ecx	; sum += qlp_coeff[19] * data[i-20]
	mov ecx, [eax + 72]	; ecx = qlp_coeff[18]
	imul ecx, [edi - 76]	; ecx = qlp_coeff[18] * data[i-19]
	add ebp, ecx	; sum += qlp_coeff[18] * data[i-19]
	mov ecx, [eax + 68]	; ecx = qlp_coeff[17]
	imul ecx, [edi - 72]	; ecx = qlp_coeff[17] * data[i-18]
	add ebp, ecx	; sum += qlp_coeff[17] * data[i-18]
	mov ecx, [eax + 64]	; ecx = qlp_coeff[16]
	imul ecx, [edi - 68]	; ecx = qlp_coeff[16] * data[i-17]
	add ebp, ecx	; sum += qlp_coeff[16] * data[i-17]
	mov ecx, [eax + 60]	; ecx = qlp_coeff[15]
	imul ecx, [edi - 64]	; ecx = qlp_coeff[15] * data[i-16]
	add ebp, ecx	; sum += qlp_coeff[15] * data[i-16]
	mov ecx, [eax + 56]	; ecx = qlp_coeff[14]
	imul ecx, [edi - 60]	; ecx = qlp_coeff[14] * data[i-15]
	add ebp, ecx	; sum += qlp_coeff[14] * data[i-15]
	mov ecx, [eax + 52]	; ecx = qlp_coeff[13]
	imul ecx, [edi - 56]	; ecx = qlp_coeff[13] * data[i-14]
	add ebp, ecx	; sum += qlp_coeff[13] * data[i-14]
	mov ecx, [eax + 48]	; ecx = qlp_coeff[12]
	imul ecx, [edi - 52]	; ecx = qlp_coeff[12] * data[i-13]
	add ebp, ecx	; sum += qlp_coeff[12] * data[i-13]
	mov ecx, [eax + 44]	; ecx = qlp_coeff[11]
	imul ecx, [edi - 48]	; ecx = qlp_coeff[11] * data[i-12]
	add ebp, ecx	; sum += qlp_coeff[11] * data[i-12]
	mov ecx, [eax + 40]	; ecx = qlp_coeff[10]
	imul ecx, [edi - 44]	; ecx = qlp_coeff[10] * data[i-11]
	add ebp, ecx	; sum += qlp_coeff[10] * data[i-11]
	mov ecx, [eax + 36]	; ecx = qlp_coeff[ 9]
	imul ecx, [edi - 40]	; ecx = qlp_coeff[ 9] * data[i-10]
	add ebp, ecx	; sum += qlp_coeff[ 9] * data[i-10]
	mov ecx, [eax + 32]	; ecx = qlp_coeff[ 8]
	imul ecx, [edi - 36]	; ecx = qlp_coeff[ 8] * data[i- 9]
	add ebp, ecx	; sum += qlp_coeff[ 8] * data[i- 9]
	mov ecx, [eax + 28]	; ecx = qlp_coeff[ 7]
	imul ecx, [edi - 32]	; ecx = qlp_coeff[ 7] * data[i- 8]
	add ebp, ecx	; sum += qlp_coeff[ 7] * data[i- 8]
	mov ecx, [eax + 24]	; ecx = qlp_coeff[ 6]
	imul ecx, [edi - 28]	; ecx = qlp_coeff[ 6] * data[i- 7]
	add ebp, ecx	; sum += qlp_coeff[ 6] * data[i- 7]
	mov ecx, [eax + 20]	; ecx = qlp_coeff[ 5]
	imul ecx, [edi - 24]	; ecx = qlp_coeff[ 5] * data[i- 6]
	add ebp, ecx	; sum += qlp_coeff[ 5] * data[i- 6]
	mov ecx, [eax + 16]	; ecx = qlp_coeff[ 4]
	imul ecx, [edi - 20]	; ecx = qlp_coeff[ 4] * data[i- 5]
	add ebp, ecx	; sum += qlp_coeff[ 4] * data[i- 5]
	mov ecx, [eax + 12]	; ecx = qlp_coeff[ 3]
	imul ecx, [edi - 16]	; ecx = qlp_coeff[ 3] * data[i- 4]
	add ebp, ecx	; sum += qlp_coeff[ 3] * data[i- 4]
	mov ecx, [eax + 8]	; ecx = qlp_coeff[ 2]
	imul ecx, [edi - 12]	; ecx = qlp_coeff[ 2] * data[i- 3]
	add ebp, ecx	; sum += qlp_coeff[ 2] * data[i- 3]
	mov ecx, [eax + 4]	; ecx = qlp_coeff[ 1]
	imul ecx, [edi - 8]	; ecx = qlp_coeff[ 1] * data[i- 2]
	add ebp, ecx	; sum += qlp_coeff[ 1] * data[i- 2]
	mov ecx, [eax]	; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul ecx, [edi - 4]	; ecx = qlp_coeff[ 0] * data[i- 1]
	add ebp, ecx	; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov cl, [esp + 36]
	sar ebp, cl	; ebp = (sum >> lp_quantization)
	add ebp, [esi + edi]	; ebp = residual[i] + (sum >> lp_quantization)
	mov [edi], ebp	; data[i] = residual[i] + (sum >> lp_quantization)
	add edi, byte 4

	dec ebx
	jz short .end
	xor ebp, ebp
	jmp edx

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
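; [editor's note, not in the original source] A likely reading of the padding
; requirement above: .za_loop below zero-pads the packed coefficient set up to
; a multiple of four 16-bit words, so the dot product can read up to three
; samples before the nominal warm-up history.  Those samples multiply the
; zero-padded coefficients, so their values should not affect the result, but
; they must be readable (and are required to be zero as a precaution).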
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40] data[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] residual[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]
	mov edi, [esp + 40]
	mov eax, [esp + 32]
	mov ebx, [esp + 24]

	test ebx, ebx
	jz near .end	; do nothing if data_len == 0
	cmp eax, byte 4
	jb near FLAC__lpc_restore_signal_asm_ia32.begin

	mov edx, [esp + 28]
	movd mm6, [esp + 36]
	mov ebp, esp

	and esp, 0xfffffff8

	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	and ecx, 0x3
	test ecx, ecx
	je short .za_end
	sub ecx, byte 4
.za_loop:
	push word 0
	inc eax
	inc ecx
	jnz short .za_loop
.za_end:

	movq mm5, [esp + 2 * eax - 8]
	movd mm4, [edi - 16]
	punpckldq mm4, [edi - 12]
	movd mm0, [edi - 8]
	punpckldq mm0, [edi - 4]
	packssdw mm4, mm0

	cmp eax, byte 4
	jnbe short .mmx_4more

	ALIGN 16
.mmx_4_loop_i:
	movq mm7, mm4
	pmaddwd mm7, mm5
	movq mm0, mm7
	punpckhdq mm7, mm7
	paddd mm7, mm0
	psrad mm7, mm6
	movd mm1, [esi]
	paddd mm7, mm1
	movd [edi], mm7
	psllq mm7, 48
	psrlq mm4, 16
	por mm4, mm7

	add esi, byte 4
	add edi, byte 4

	dec ebx
	jnz .mmx_4_loop_i
	jmp .mmx_end
.mmx_4more:
	shl eax, 2
	neg eax
	add eax, byte 16
	ALIGN 16
.mmx_4more_loop_i:
	mov ecx, edi
	add ecx, eax
	mov edx, esp

	movq mm7, mm4
	pmaddwd mm7, mm5

	ALIGN 16
.mmx_4more_loop_j:
	movd mm0, [ecx - 16]
	punpckldq mm0, [ecx - 12]
	movd mm1, [ecx - 8]
	punpckldq mm1, [ecx - 4]
	packssdw mm0, mm1
	pmaddwd mm0, [edx]
	paddd mm7, mm0

	add edx, byte 8
	add ecx, byte 16
	cmp ecx, edi
	jnz .mmx_4more_loop_j

	movq mm0, mm7
	punpckhdq mm7, mm7
	paddd mm7, mm0
	psrad mm7, mm6
	movd mm1, [esi]
	paddd mm7, mm1
	movd [edi], mm7
	psllq mm7, 48
	psrlq mm4, 16
	por mm4, mm7

	add esi, byte 4
	add edi, byte 4

	dec ebx
	jnz short .mmx_4more_loop_i
.mmx_end:
	emms
	mov esp, ebp

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

end