; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001,2002 Josh Coalson
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA 02111-1307, USA.
18
19%include "nasm.h"
20
; NOTE(review): data_section, code_section and cglobal are not defined in this
; file; presumably they are macros supplied by nasm.h -- confirm there.
21 data_section
22
; One cglobal line per entry point; presumably this declares the symbol as
; global (with any platform-specific name decoration) -- confirm in nasm.h.
Josh Coalsone6499bd2001-06-13 18:11:25 +000023cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
24cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
25cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
26cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalsonf5925df2001-07-16 21:13:19 +000027cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
Josh Coalsone6499bd2001-06-13 18:11:25 +000028cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
29cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
30cglobal FLAC__lpc_restore_signal_asm_ia32
31cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000032
; Everything below this directive is code.
33 code_section
34
; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;     FLAC__real d;
;     unsigned sample, coeff;
;     const unsigned limit = data_len - lag;
;
;     FLAC__ASSERT(lag > 0);
;     FLAC__ASSERT(lag <= data_len);
;
;     for(coeff = 0; coeff < lag; coeff++)
;         autoc[coeff] = 0.0;
;     for(sample = 0; sample <= limit; sample++) {
;         d = data[sample];
;         for(coeff = 0; coeff < lag; coeff++)
;             autoc[coeff] += d * data[sample+coeff];
;     }
;     for(; sample < data_len; sample++) {
;         d = data[sample];
;         for(coeff = 0; coeff < data_len - sample; coeff++)
;             autoc[coeff] += d * data[sample+coeff];
;     }
; }
;
60 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +000061cident FLAC__lpc_compute_autocorrelation_asm_ia32
; x87 version of the autocorrelation loop given as C in the comment preceding
; this routine: autoc[0..lag-1] is zeroed, then for every sample
; autoc[coeff] += data[sample] * data[sample+coeff] is accumulated.
; The inner coeff loop is fully unrolled, highest coeff first; how many
; coefficients are processed is chosen by a computed jump into the unrolled code.
;
; Register roles (as used below):
;   esi = &data[sample]      edi = autoc
;   ecx = samples left in the current loop
;   edx = absolute jump target inside the unrolled block
;   eax = lag (first loop) / lag-1 (second loop)
;   ebx = scratch used to read EIP via call/pop
;
; The argument offsets listed here are valid after the three pushes at .begin.
Josh Coalson651d6de2001-12-04 05:36:09 +000062 ;[esp + 28] == autoc[]
63 ;[esp + 24] == lag
64 ;[esp + 20] == data_len
65 ;[esp + 16] == data[]
Josh Coalson9a7b5e22001-06-13 18:03:09 +000066
67 ;ASSERT(lag > 0)
68 ;ASSERT(lag <= 33)
69 ;ASSERT(lag <= data_len)
70
71.begin:
72 push esi
73 push edi
Josh Coalson651d6de2001-12-04 05:36:09 +000074 push ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000075
76 ; for(coeff = 0; coeff < lag; coeff++)
77 ; autoc[coeff] = 0.0;
Josh Coalson651d6de2001-12-04 05:36:09 +000078 mov edi, [esp + 28] ; edi == autoc
79 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
Josh Coalson9a7b5e22001-06-13 18:03:09 +000080 xor eax, eax
; rep stosd advances edi past the zeroed area, which is why autoc is reloaded below.
; NOTE(review): no cld is issued here, so this assumes the direction flag is clear on entry.
81 rep stosd
82
83 ; const unsigned limit = data_len - lag;
Josh Coalson651d6de2001-12-04 05:36:09 +000084 mov eax, [esp + 24] ; eax == lag
85 mov ecx, [esp + 20]
Josh Coalson9a7b5e22001-06-13 18:03:09 +000086 sub ecx, eax ; ecx == limit
87
Josh Coalson651d6de2001-12-04 05:36:09 +000088 mov edi, [esp + 28] ; edi == autoc
89 mov esi, [esp + 16] ; esi == data
Josh Coalson9a7b5e22001-06-13 18:03:09 +000090 inc ecx ; we are looping <= limit so we add one to the counter
91
92 ; for(sample = 0; sample <= limit; sample++) {
93 ; d = data[sample];
94 ; for(coeff = 0; coeff < lag; coeff++)
95 ; autoc[coeff] += d * data[sample+coeff];
96 ; }
97 fld dword [esi] ; ST = d <- data[sample]
98 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
; After the second lea: edx = -11*lag + (.jumper1_0 - .get_eip1), an assemble-time distance.
; Adding the run-time address of .get_eip1 (fetched with call/pop) turns it into an
; absolute target without needing a relocation.
99 lea edx, [eax + eax*2]
100 neg edx
Josh Coalson651d6de2001-12-04 05:36:09 +0000101 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
; call pushes the address of .get_eip1; the pop that follows reads it into ebx.
102 call .get_eip1
103.get_eip1:
104 pop ebx
105 add edx, ebx
; The coeff-0 group has no displacement byte on its three memory operands, so it is
; 3 bytes shorter than the assumed 11; the three incs correct for that.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000106 inc edx ; compensate for the shorter opcode on the last iteration
107 inc edx ; compensate for the shorter opcode on the last iteration
108 inc edx ; compensate for the shorter opcode on the last iteration
; Only lag == 33 enters at the coeff-32 group, whose three memory operands use a
; long (non-byte) displacement because 32*4 = 128 does not fit in a signed byte.
109 cmp eax, 33
110 jne .loop1_start
111 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
112.loop1_start:
; Enter the unrolled block at the point where exactly `lag` coefficient groups remain.
113 jmp edx
114
; Unrolled inner loop for the full-lag samples: coeff = 32 down to 0.
115 fld st0 ; ST = d d
116 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
117 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
118 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
119 fld st0 ; ST = d d
120 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
121 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
122 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
123 fld st0 ; ST = d d
124 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
125 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
126 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
127 fld st0 ; ST = d d
128 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
129 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
130 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
131 fld st0 ; ST = d d
132 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
133 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
134 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
135 fld st0 ; ST = d d
136 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
137 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
138 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
139 fld st0 ; ST = d d
140 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
141 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
142 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
143 fld st0 ; ST = d d
144 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
145 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
146 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
147 fld st0 ; ST = d d
148 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
149 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
150 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
151 fld st0 ; ST = d d
152 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
153 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
154 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
155 fld st0 ; ST = d d
156 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
157 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
158 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
159 fld st0 ; ST = d d
160 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
161 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
162 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
163 fld st0 ; ST = d d
164 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
165 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
166 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
167 fld st0 ; ST = d d
168 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
169 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
170 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
171 fld st0 ; ST = d d
172 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
173 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
174 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
175 fld st0 ; ST = d d
176 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
177 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
178 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
179 fld st0 ; ST = d d
180 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
181 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
182 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
183 fld st0 ; ST = d d
184 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
185 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
186 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
187 fld st0 ; ST = d d
188 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
189 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
190 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
191 fld st0 ; ST = d d
192 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
193 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
194 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
195 fld st0 ; ST = d d
196 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
197 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
198 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
199 fld st0 ; ST = d d
200 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
201 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
202 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
203 fld st0 ; ST = d d
204 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
205 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
206 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
207 fld st0 ; ST = d d
208 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
209 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
210 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
211 fld st0 ; ST = d d
212 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
213 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
214 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
215 fld st0 ; ST = d d
216 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
217 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
218 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
219 fld st0 ; ST = d d
220 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
221 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
222 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
223 fld st0 ; ST = d d
224 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
225 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
226 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
227 fld st0 ; ST = d d
228 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
229 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
230 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
231 fld st0 ; ST = d d
232 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
233 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
234 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
235 fld st0 ; ST = d d
236 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
237 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
238 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
239 fld st0 ; ST = d d
240 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
241 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
242 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
243 fld st0 ; ST = d d
244 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
245 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
246 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
247.jumper1_0:
248
; Outer-loop tail of the first loop: edx is left unchanged, so every sample
; here is processed with the same number of coefficients (lag).
249 fstp st0 ; pop d, ST = empty
250 add esi, byte 4 ; sample++
251 dec ecx
252 jz .loop1_end
253 fld dword [esi] ; ST = d <- data[sample]
254 jmp edx
255.loop1_end:
256
257 ; for(; sample < data_len; sample++) {
258 ; d = data[sample];
259 ; for(coeff = 0; coeff < data_len - sample; coeff++)
260 ; autoc[coeff] += d * data[sample+coeff];
261 ; }
; Tail samples: lag-1 samples remain, and each one has one fewer coefficient
; than the previous, so ecx serves as the sample counter while edx is moved
; forward by one 11-byte group per sample.
Josh Coalson651d6de2001-12-04 05:36:09 +0000262 mov ecx, [esp + 24] ; ecx <- lag
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000263 dec ecx ; ecx <- lag - 1
264 jz near .end ; skip loop if 0 (i.e. lag == 1)
265
266 fld dword [esi] ; ST = d <- data[sample]
267 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
268 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
; Same target computation as for the first loop, but relative to .jumper2_0/.get_eip2.
269 lea edx, [eax + eax*2]
270 neg edx
Josh Coalson651d6de2001-12-04 05:36:09 +0000271 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
272 call .get_eip2
273.get_eip2:
274 pop ebx
275 add edx, ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000276 inc edx ; compensate for the shorter opcode on the last iteration
277 inc edx ; compensate for the shorter opcode on the last iteration
278 inc edx ; compensate for the shorter opcode on the last iteration
; No lag == 33 correction is needed here: this table starts at coeff 31, so no
; group uses the long displacement form.
279 jmp edx
280
; Unrolled inner loop for the tail samples: coeff = 31 down to 0.
281 fld st0 ; ST = d d
282 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
283 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
284 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
285 fld st0 ; ST = d d
286 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
287 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
288 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
289 fld st0 ; ST = d d
290 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
291 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
292 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
293 fld st0 ; ST = d d
294 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
295 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
296 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
297 fld st0 ; ST = d d
298 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
299 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
300 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
301 fld st0 ; ST = d d
302 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
303 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
304 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
305 fld st0 ; ST = d d
306 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
307 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
308 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
309 fld st0 ; ST = d d
310 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
311 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
312 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
313 fld st0 ; ST = d d
314 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
315 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
316 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
317 fld st0 ; ST = d d
318 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
319 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
320 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
321 fld st0 ; ST = d d
322 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
323 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
324 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
325 fld st0 ; ST = d d
326 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
327 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
328 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
329 fld st0 ; ST = d d
330 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
331 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
332 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
333 fld st0 ; ST = d d
334 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
335 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
336 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
337 fld st0 ; ST = d d
338 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
339 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
340 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
341 fld st0 ; ST = d d
342 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
343 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
344 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
345 fld st0 ; ST = d d
346 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
347 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
348 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
349 fld st0 ; ST = d d
350 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
351 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
352 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
353 fld st0 ; ST = d d
354 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
355 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
356 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
357 fld st0 ; ST = d d
358 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
359 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
360 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
361 fld st0 ; ST = d d
362 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
363 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
364 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
365 fld st0 ; ST = d d
366 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
367 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
368 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
369 fld st0 ; ST = d d
370 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
371 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
372 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
373 fld st0 ; ST = d d
374 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
375 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
376 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
377 fld st0 ; ST = d d
378 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
379 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
380 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
381 fld st0 ; ST = d d
382 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
383 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
384 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
385 fld st0 ; ST = d d
386 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
387 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
388 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
389 fld st0 ; ST = d d
390 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
391 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
392 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
393 fld st0 ; ST = d d
394 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
395 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
396 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
397 fld st0 ; ST = d d
398 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
399 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
400 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
401 fld st0 ; ST = d d
402 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
403 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
404 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
405 fld st0 ; ST = d d
406 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
407 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
408 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
409.jumper2_0:
410
411 fstp st0 ; pop d, ST = empty
412 add esi, byte 4 ; sample++
413 dec ecx
414 jz .loop2_end
415 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
416 fld dword [esi] ; ST = d <- data[sample]
417 jmp edx
418.loop2_end:
419
; Restore the saved registers in reverse push order; the x87 stack is empty here
; because every exit path has executed the fstp st0 that pops d.
420.end:
Josh Coalson651d6de2001-12-04 05:36:09 +0000421 pop ebx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000422 pop edi
423 pop esi
424 ret
425
426 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000427cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
; SSE autocorrelation for up to 4 lags, one sample per iteration.
;   xmm0 = current sample broadcast to all four lanes
;   xmm2 = sliding window of the last four samples, lane k = data[sample-k]
;          (lanes not yet filled hold 0)
;   xmm5 = accumulators, lane k = running sum of data[sample]*data[sample-k]
;   eax  = pointer to the next sample, edx = samples remaining
; The lag argument is never read: four floats are always written to autoc[],
; so the caller must provide room for at least four entries.
; No registers are pushed, so the arguments are addressed directly off esp.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000428 ;[esp + 16] == autoc[]
429 ;[esp + 12] == lag
430 ;[esp + 8] == data_len
431 ;[esp + 4] == data[]
432
433 ;ASSERT(lag > 0)
434 ;ASSERT(lag <= 4)
435 ;ASSERT(lag <= data_len)
436
437 ; for(coeff = 0; coeff < lag; coeff++)
438 ; autoc[coeff] = 0.0;
439 xorps xmm5, xmm5
440
441 mov edx, [esp + 8] ; edx == data_len
442 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
443
; data[0] is read unconditionally, before edx is tested.
444 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
445 add eax, 4
446 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
447 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
; The first sample is handled outside the loop because the window is already
; set up for it; .warmup is not jumped to anywhere in this routine.
448.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
449 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
450 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
451 dec edx
452 jz .loop_end
453 ALIGN 16
454.loop_start:
455 ; start by reading the next sample
456 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
457 add eax, 4
458 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
; Slide the window: the rotate moves every lane up by one (oldest sample wraps
; into lane 0), then movss overwrites lane 0 with the new sample.
459 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
460 movss xmm2, xmm0
461 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
462 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
463 dec edx
464 jnz .loop_start
465.loop_end:
466 ; store autoc
; movups: autoc is not assumed to be 16-byte aligned.
467 mov edx, [esp + 16] ; edx == autoc
468 movups [edx], xmm5
469
470.end:
471 ret
472
473 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000474cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
; SSE autocorrelation for up to 8 lags, one sample per iteration.
;   xmm1:xmm0 = current sample broadcast to all eight lanes
;   xmm3:xmm2 = sliding window of the last eight samples (xmm2 lane 0 = newest);
;               lanes not yet filled hold 0
;   xmm6:xmm5 = accumulators for autoc[4..7] : autoc[0..3]
;   eax = pointer to the next sample, edx = samples remaining
; The lag argument is never read: eight floats are always written to autoc[],
; so the caller must provide room for at least eight entries.
; No registers are pushed, so the arguments are addressed directly off esp.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000475 ;[esp + 16] == autoc[]
476 ;[esp + 12] == lag
477 ;[esp + 8] == data_len
478 ;[esp + 4] == data[]
479
480 ;ASSERT(lag > 0)
481 ;ASSERT(lag <= 8)
482 ;ASSERT(lag <= data_len)
483
484 ; for(coeff = 0; coeff < lag; coeff++)
485 ; autoc[coeff] = 0.0;
486 xorps xmm5, xmm5
487 xorps xmm6, xmm6
488
489 mov edx, [esp + 8] ; edx == data_len
490 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
491
; data[0] is read unconditionally, before edx is tested.
492 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
493 add eax, 4
494 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
495 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
496 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
497 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
; The first sample is handled outside the loop because the window is already
; set up for it; .warmup is not jumped to anywhere in this routine.
498.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
499 mulps xmm0, xmm2
500 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
501 addps xmm5, xmm0
502 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
503 dec edx
504 jz .loop_end
505 ALIGN 16
506.loop_start:
507 ; start by reading the next sample
508 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
509 ; here we reorder the instructions; see the (#) indexes for a logical order
; Logical effect of steps (3)-(6): both halves of the window are rotated up one
; lane, the sample leaving xmm2 (now in its lane 0) is carried into lane 0 of
; xmm3, and the new sample is written to lane 0 of xmm2.
510 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
511 add eax, 4 ; (0)
512 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
513 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
514 movss xmm3, xmm2 ; (5)
515 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
516 movss xmm2, xmm0 ; (6)
517 mulps xmm1, xmm3 ; (8)
518 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
519 addps xmm6, xmm1 ; (10)
520 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
521 dec edx
522 jnz .loop_start
523.loop_end:
524 ; store autoc
; movups: autoc is not assumed to be 16-byte aligned.
525 mov edx, [esp + 16] ; edx == autoc
526 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000527 movups [edx + 16], xmm6
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000528
529.end:
530 ret
531
532 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000533cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
; SSE autocorrelation for up to 12 lags, one sample per iteration.
;   xmm0 = current sample broadcast to all four lanes (xmm1 = working copy)
;   xmm4:xmm3:xmm2 = sliding window of the last twelve samples
;                    (xmm2 lane 0 = newest); lanes not yet filled hold 0
;   xmm7:xmm6:xmm5 = accumulators for autoc[8..11] : autoc[4..7] : autoc[0..3]
;   eax = pointer to the next sample, edx = samples remaining
; The lag argument is never read: twelve floats are always written to autoc[],
; so the caller must provide room for at least twelve entries.
; No registers are pushed, so the arguments are addressed directly off esp.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000534 ;[esp + 16] == autoc[]
535 ;[esp + 12] == lag
536 ;[esp + 8] == data_len
537 ;[esp + 4] == data[]
538
539 ;ASSERT(lag > 0)
540 ;ASSERT(lag <= 12)
541 ;ASSERT(lag <= data_len)
542
543 ; for(coeff = 0; coeff < lag; coeff++)
544 ; autoc[coeff] = 0.0;
545 xorps xmm5, xmm5
546 xorps xmm6, xmm6
547 xorps xmm7, xmm7
548
549 mov edx, [esp + 8] ; edx == data_len
550 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
551
; data[0] is read unconditionally, before edx is tested.
552 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
553 add eax, 4
554 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
555 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
556 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
557 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
; The first sample is handled outside the loop because the window is already
; set up for it; .warmup is not jumped to anywhere in this routine.
558.warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
559 movaps xmm1, xmm0
560 mulps xmm1, xmm2
561 addps xmm5, xmm1
562 movaps xmm1, xmm0
563 mulps xmm1, xmm3
564 addps xmm6, xmm1
565 mulps xmm0, xmm4
566 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
567 dec edx
568 jz .loop_end
569 ALIGN 16
570.loop_start:
571 ; start by reading the next sample
572 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
573 add eax, 4
574 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
575
576 ; shift xmm4:xmm3:xmm2 left by one float
; After the three rotates, lane 0 of each register holds the sample that must
; move on; the movss chain carries it into the next register (highest first,
; so nothing is overwritten before it is read) and inserts the new sample.
577 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
578 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
579 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
580 movss xmm4, xmm3
581 movss xmm3, xmm2
582 movss xmm2, xmm0
583
584 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
; xmm1 is a scratch copy so that xmm0 stays intact until its last use.
585 movaps xmm1, xmm0
586 mulps xmm1, xmm2
587 addps xmm5, xmm1
588 movaps xmm1, xmm0
589 mulps xmm1, xmm3
590 addps xmm6, xmm1
591 mulps xmm0, xmm4
592 addps xmm7, xmm0
593
594 dec edx
595 jnz .loop_start
596.loop_end:
597 ; store autoc
; movups: autoc is not assumed to be 16-byte aligned.
598 mov edx, [esp + 16] ; edx == autoc
599 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000600 movups [edx + 16], xmm6
601 movups [edx + 32], xmm7
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000602
603.end:
604 ret
605
Josh Coalsonf5925df2001-07-16 21:13:19 +0000606 align 16
607cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
; 3DNow! autocorrelation. Sums are accumulated in a scratch array on the stack
; and copied to autoc[] at the end.
;   esi = data (later: pointer to the current sample)   edi = data_len
;   edx = lag rounded up to an even count (size of the scratch array in floats)
;   ebp = frame pointer, also used to restore esp on exit
; Phase 1 (.loop1_i/.loop1_j) consumes four samples per outer iteration and
; updates two accumulators per inner iteration; phase 2 (.loop2_i/.loop2_j)
; handles the remaining samples one product at a time.
; The offsets below are relative to ebp after the four pushes.
608 ;[ebp + 32] autoc
609 ;[ebp + 28] lag
610 ;[ebp + 24] data_len
611 ;[ebp + 20] data
612
613 push ebp
614 push ebx
615 push esi
616 push edi
617 mov ebp, esp
618
619 mov esi, [ebp + 20]
620 mov edi, [ebp + 24]
621 mov edx, [ebp + 28]
; edx = (lag + 1) & ~1: round lag up to even so accumulators can be handled in pairs.
Josh Coalson59d84502002-12-30 02:33:19 +0000622 inc edx
623 and edx, byte -2
; Reserve edx dwords of scratch below an 8-byte-aligned esp (eax = -edx).
Josh Coalsonf5925df2001-07-16 21:13:19 +0000624 mov eax, edx
625 neg eax
626 and esp, byte -8
627 lea esp, [esp + 4 * eax]
; Zero the scratch accumulators; jnz tests the flags from dec ecx (mov leaves them alone).
628 mov ecx, edx
629 xor eax, eax
630.loop0:
631 dec ecx
632 mov [esp + 4 * ecx], eax
633 jnz short .loop0
634
; ecx = data + 4*(data_len - edx - (edx & 1)) - 12: upper bound on the sample
; pointer for phase 1. edx is even after the rounding above, so (edx & 1) is 0 here.
635 mov eax, edi
636 sub eax, edx
637 mov ebx, edx
638 and ebx, byte 1
639 sub eax, ebx
640 lea ecx, [esi + 4 * eax - 12]
; The ja below still tests this cmp; the mov in between does not modify flags.
641 cmp esi, ecx
642 mov eax, esi
643 ja short .loop2_pre
Josh Coalson59d84502002-12-30 02:33:19 +0000644 align 16 ;4 nops
; Phase 1 outer loop: load four consecutive samples at eax and duplicate each
; into both halves of its MMX register.
Josh Coalsonf5925df2001-07-16 21:13:19 +0000645.loop1_i:
646 movd mm0, [eax]
647 movd mm2, [eax + 4]
648 movd mm4, [eax + 8]
649 movd mm6, [eax + 12]
650 mov ebx, edx
651 punpckldq mm0, mm0
652 punpckldq mm2, mm2
653 punpckldq mm4, mm4
654 punpckldq mm6, mm6
655 align 16 ;3 nops
; Phase 1 inner loop: ebx steps from edx-2 down to 0 in twos. Each pass adds,
; for the four samples loaded above, their products at lags ebx and ebx+1 into
; scratch[ebx] and scratch[ebx+1].
; The jg at the bottom consumes the flags of this sub; it relies on the
; MMX/3DNow! instructions in between leaving EFLAGS untouched.
656.loop1_j:
657 sub ebx, byte 2
658 movd mm1, [eax + 4 * ebx]
659 movd mm3, [eax + 4 * ebx + 4]
660 movd mm5, [eax + 4 * ebx + 8]
661 movd mm7, [eax + 4 * ebx + 12]
; Build overlapping pairs {d[k], d[k+1]} for k = ebx .. ebx+3.
662 punpckldq mm1, mm3
663 punpckldq mm3, mm5
664 pfmul mm1, mm0
665 punpckldq mm5, mm7
666 pfmul mm3, mm2
667 punpckldq mm7, [eax + 4 * ebx + 16]
668 pfmul mm5, mm4
669 pfmul mm7, mm6
; Sum the four partial products and add them to the two scratch accumulators.
670 pfadd mm1, mm3
671 movq mm3, [esp + 4 * ebx]
672 pfadd mm5, mm7
673 pfadd mm1, mm5
674 pfadd mm3, mm1
675 movq [esp + 4 * ebx], mm3
676 jg short .loop1_j
677
678 add eax, byte 16
679 cmp eax, ecx
680 jb short .loop1_i
681
; Phase 2 setup: eax = index of the next unprocessed sample, esi = pointer to
; it, ecx = &data[data_len] (end pointer).
682.loop2_pre:
683 mov ebx, eax
684 sub eax, esi
685 shr eax, 2
686 lea ecx, [esi + 4 * edi]
687 mov esi, ebx
; Phase 2 outer loop: one sample per iteration.
; ebx = min(data_len - sample, edx) = number of lags that stay inside data[]
; (unsigned comparison).
688.loop2_i:
689 movd mm0, [esi]
690 mov ebx, edi
691 sub ebx, eax
692 cmp ebx, edx
693 jbe short .loop2_j
694 mov ebx, edx
; Phase 2 inner loop: scratch[ebx] += data[sample] * data[sample+ebx] for
; ebx = count-1 down to 0. Only the low dword of each MMX register is loaded
; and stored. jnz tests the flags from dec ebx.
695.loop2_j:
696 dec ebx
697 movd mm1, [esi + 4 * ebx]
698 pfmul mm1, mm0
699 movd mm2, [esp + 4 * ebx]
700 pfadd mm1, mm2
701 movd [esp + 4 * ebx], mm1
702
703 jnz short .loop2_j
704
705 add esi, byte 4
706 inc eax
707 cmp esi, ecx
708 jnz short .loop2_i
709
; Copy the first lag (original, unrounded) accumulators from the scratch array to autoc[].
710 mov edi, [ebp + 32]
Josh Coalson59d84502002-12-30 02:33:19 +0000711 mov edx, [ebp + 28]
Josh Coalsonf5925df2001-07-16 21:13:19 +0000712.loop3:
713 dec edx
714 mov eax, [esp + 4 * edx]
715 mov [edi + 4 * edx], eax
716 jnz short .loop3
717
; Leave MMX/3DNow! state before returning to code that may use the x87 FPU.
718 femms
719
; Restoring esp from ebp releases the scratch array and undoes the alignment adjustment.
720 mov esp, ebp
721 pop edi
722 pop esi
723 pop ebx
724 pop ebp
725 ret
726
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
;     for(i = 0; i < data_len; i++) {
;         sum = 0;
;         for(j = 0; j < order; j++)
;             sum += qlp_coeff[j] * data[i-j-1];
;         residual[i] = data[i] - (sum >> lp_quantization);
;     }
;
736 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000737cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
; Scalar (integer-only) LPC residual computation. For every i in [0, data_len):
;     sum         = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order]
;     residual[i] = data[i] - (sum >> lp_quantization)      (arithmetic shift)
; Arguments are read from the stack; the offsets below are valid after the four
; pushes in the prologue. ebp/ebx/esi/edi are saved and restored; eax, ecx and
; edx are overwritten. Three code paths: order <= 1, order <= 32 (computed jump
; into an unrolled chain) and order > 32 (generic nested loop).
; The local label .begin is also entered from the _mmx variant, which expects
; esi = data[], edi = residual[], eax = order, ebx = remaining count and the
; same four registers already pushed.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000738 ;[esp + 40] residual[]
739 ;[esp + 36] lp_quantization
740 ;[esp + 32] order
741 ;[esp + 28] qlp_coeff[]
742 ;[esp + 24] data_len
743 ;[esp + 20] data[]
744
745 ;ASSERT(order > 0)
746
747 push ebp
748 push ebx
749 push esi
750 push edi
751
752 mov esi, [esp + 20] ; esi = data[]
753 mov edi, [esp + 40] ; edi = residual[]
754 mov eax, [esp + 32] ; eax = order
755 mov ebx, [esp + 24] ; ebx = data_len
756
757 test ebx, ebx
758 jz near .end ; do nothing if data_len == 0
759.begin:
760 cmp eax, byte 1
761 jg short .i_1more ; order > 1 (signed compare): use one of the general paths
762
; ---- order == 1: a single coefficient, previous sample carried in eax ----
763 mov ecx, [esp + 28]
764 mov edx, [ecx] ; edx = qlp_coeff[0]
765 mov eax, [esi - 4] ; eax = data[-1]
766 mov cl, [esp + 36] ; cl = lp_quantization
767 ALIGN 16
768.i_1_loop_i:
769 imul eax, edx ; eax = qlp_coeff[0] * data[i-1]
770 sar eax, cl ; eax = sum >> lp_quantization
771 neg eax
772 add eax, [esi] ; eax = data[i] - (sum >> lp_quantization)
773 mov [edi], eax ; residual[i] = eax
774 mov eax, [esi] ; eax = data[i], the "previous sample" of the next iteration
775 add edi, byte 4
776 add esi, byte 4
777 dec ebx
778 jnz .i_1_loop_i
779
780 jmp .end
781
782.i_1more:
783 cmp eax, byte 32 ; for order <= 32 there is a faster routine
784 jbe short .i_32
785
786 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
; ---- order > 32: plain nested loop, ebp accumulates the sum ----
787 ALIGN 16
788.i_32more_loop_i:
789 xor ebp, ebp ; sum = 0
790 mov ecx, [esp + 32] ; ecx = order
791 mov edx, ecx
792 shl edx, 2
793 add edx, [esp + 28] ; edx = &qlp_coeff[order]
794 neg ecx ; ecx = -order, counts up to 0
795 ALIGN 16
796.i_32more_loop_j:
797 sub edx, byte 4 ; walk the coefficients downwards ...
798 mov eax, [edx]
799 imul eax, [esi + 4 * ecx] ; ... while the data index walks from i-order up to i-1
800 add ebp, eax
801 inc ecx
802 jnz short .i_32more_loop_j
803
804 mov cl, [esp + 36] ; cl = lp_quantization
805 sar ebp, cl
806 neg ebp
807 add ebp, [esi] ; ebp = data[i] - (sum >> lp_quantization)
808 mov [edi], ebp ; residual[i] = ebp
809 add esi, byte 4
810 add edi, byte 4
811
812 dec ebx
813 jnz .i_32more_loop_i
814
815 jmp .end
816
; ---- 1 < order <= 32: jump into the middle of the unrolled chain below ----
; edx is set to (.jumper_0 - 9*order + 1) so that exactly the last `order`
; mov/imul/add terms execute. The factor 9 comes from the lea (eax + eax*8);
; this relies on every term assembling to 9 bytes and the final one
; (mov ecx, [eax], no displacement) to 8, hence the inc edx.
817.i_32:
818 sub edi, esi ; edi = residual[] - data[], so [edi + esi] addresses residual[i]
819 neg eax ; eax = -order
Josh Coalson651d6de2001-12-04 05:36:09 +0000820 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
821 call .get_eip0 ; call/pop pair fetches the run-time address of .get_eip0
822.get_eip0:
823 pop eax
824 add edx, eax ; edx = run-time address of .jumper_0 - 9 * order
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000825 inc edx ; compensate for the shorter opcode on the last term
826 mov eax, [esp + 28] ; eax = qlp_coeff[]
827 xor ebp, ebp
828 jmp edx
829
; Unrolled terms, highest coefficient first: sum += qlp_coeff[k] * data[i-k-1]
; for k = 31 down to 0. Do not change operand forms or insert instructions
; here without re-checking the byte counts used by the jump computation above.
830 mov ecx, [eax + 124]
831 imul ecx, [esi - 128]
832 add ebp, ecx
833 mov ecx, [eax + 120]
834 imul ecx, [esi - 124]
835 add ebp, ecx
836 mov ecx, [eax + 116]
837 imul ecx, [esi - 120]
838 add ebp, ecx
839 mov ecx, [eax + 112]
840 imul ecx, [esi - 116]
841 add ebp, ecx
842 mov ecx, [eax + 108]
843 imul ecx, [esi - 112]
844 add ebp, ecx
845 mov ecx, [eax + 104]
846 imul ecx, [esi - 108]
847 add ebp, ecx
848 mov ecx, [eax + 100]
849 imul ecx, [esi - 104]
850 add ebp, ecx
851 mov ecx, [eax + 96]
852 imul ecx, [esi - 100]
853 add ebp, ecx
854 mov ecx, [eax + 92]
855 imul ecx, [esi - 96]
856 add ebp, ecx
857 mov ecx, [eax + 88]
858 imul ecx, [esi - 92]
859 add ebp, ecx
860 mov ecx, [eax + 84]
861 imul ecx, [esi - 88]
862 add ebp, ecx
863 mov ecx, [eax + 80]
864 imul ecx, [esi - 84]
865 add ebp, ecx
866 mov ecx, [eax + 76]
867 imul ecx, [esi - 80]
868 add ebp, ecx
869 mov ecx, [eax + 72]
870 imul ecx, [esi - 76]
871 add ebp, ecx
872 mov ecx, [eax + 68]
873 imul ecx, [esi - 72]
874 add ebp, ecx
875 mov ecx, [eax + 64]
876 imul ecx, [esi - 68]
877 add ebp, ecx
878 mov ecx, [eax + 60]
879 imul ecx, [esi - 64]
880 add ebp, ecx
881 mov ecx, [eax + 56]
882 imul ecx, [esi - 60]
883 add ebp, ecx
884 mov ecx, [eax + 52]
885 imul ecx, [esi - 56]
886 add ebp, ecx
887 mov ecx, [eax + 48]
888 imul ecx, [esi - 52]
889 add ebp, ecx
890 mov ecx, [eax + 44]
891 imul ecx, [esi - 48]
892 add ebp, ecx
893 mov ecx, [eax + 40]
894 imul ecx, [esi - 44]
895 add ebp, ecx
896 mov ecx, [eax + 36]
897 imul ecx, [esi - 40]
898 add ebp, ecx
899 mov ecx, [eax + 32]
900 imul ecx, [esi - 36]
901 add ebp, ecx
902 mov ecx, [eax + 28]
903 imul ecx, [esi - 32]
904 add ebp, ecx
905 mov ecx, [eax + 24]
906 imul ecx, [esi - 28]
907 add ebp, ecx
908 mov ecx, [eax + 20]
909 imul ecx, [esi - 24]
910 add ebp, ecx
911 mov ecx, [eax + 16]
912 imul ecx, [esi - 20]
913 add ebp, ecx
914 mov ecx, [eax + 12]
915 imul ecx, [esi - 16]
916 add ebp, ecx
917 mov ecx, [eax + 8]
918 imul ecx, [esi - 12]
919 add ebp, ecx
920 mov ecx, [eax + 4]
921 imul ecx, [esi - 8]
922 add ebp, ecx
923 mov ecx, [eax] ; no displacement byte here: this instruction is one byte shorter than the others
924 imul ecx, [esi - 4]
925 add ebp, ecx
926.jumper_0:
927
928 mov cl, [esp + 36] ; cl = lp_quantization (ecx was used as scratch above)
929 sar ebp, cl
930 neg ebp
931 add ebp, [esi] ; ebp = data[i] - (sum >> lp_quantization)
932 mov [edi + esi], ebp ; residual[i] = ebp (edi holds residual[] - data[])
933 add esi, byte 4
934
935 dec ebx
936 jz short .end
937 xor ebp, ebp ; sum = 0 for the next sample
938 jmp edx ; re-enter the unrolled chain at the same entry point
939
940.end:
941 pop edi
942 pop esi
943 pop ebx
944 pop ebp
945 ret
946
947; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
948; the channel must be <= 16. Especially note that this routine cannot be used
949; for side-channel coded 16bps channels since the effective bps is 17.
950 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000951cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
; MMX LPC residual computation on 16-bit packed samples, two residuals per
; loop iteration. Same stack arguments as the scalar routine. The low 16 bits
; of each coefficient are copied to an 8-byte aligned table on the stack
; (zero-padded to a multiple of four entries); ebp keeps the original esp
; while that table is live. When one sample is left over, or data_len == 1,
; control is transferred to the scalar routine's .begin to finish it.
; Note that the history load below reads data[-4] .. data[-1] unconditionally,
; whatever the order is.
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000952 ;[esp + 40] residual[]
953 ;[esp + 36] lp_quantization
954 ;[esp + 32] order
955 ;[esp + 28] qlp_coeff[]
956 ;[esp + 24] data_len
957 ;[esp + 20] data[]
958
959 ;ASSERT(order > 0)
960
961 push ebp
962 push ebx
963 push esi
964 push edi
965
966 mov esi, [esp + 20] ; esi = data[]
967 mov edi, [esp + 40] ; edi = residual[]
968 mov eax, [esp + 32] ; eax = order
969 mov ebx, [esp + 24] ; ebx = data_len
970
971 test ebx, ebx
972 jz near .end ; do nothing if data_len == 0
973 dec ebx ; ebx = data_len - 1; the pair loop runs while ebx > 0
974 test ebx, ebx
975 jz near .last_one ; a single sample: let the scalar routine handle it
976
977 mov edx, [esp + 28] ; edx = qlp_coeff[]
978 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
979 mov ebp, esp ; remember esp; restored at .mmx_end
980
981 and esp, 0xfffffff8 ; 8-byte align the coefficient table built below
982
; Push qlp_coeff[0] .. qlp_coeff[order-1] as 16-bit words, so qlp_coeff[0]
; ends up at the highest address of the table.
983 xor ecx, ecx
984.copy_qlp_loop:
985 push word [edx + 4 * ecx] ; low 16 bits of qlp_coeff[ecx]
986 inc ecx
987 cmp ecx, eax
988 jnz short .copy_qlp_loop
989
; Pad with zero words until the entry count is a multiple of 4; eax is bumped
; along so that it becomes the padded order.
990 and ecx, 0x3
991 test ecx, ecx
992 je short .za_end
993 sub ecx, byte 4 ; ecx = -(number of zero words still needed)
994.za_loop:
995 push word 0
996 inc eax
997 inc ecx
998 jnz short .za_loop
999.za_end:
1000
1001 movq mm5, [esp + 2 * eax - 8] ; mm5 = the four words at the top of the table (coefficients 3,2,1,0, low word first)
1002 movd mm4, [esi - 16]
1003 punpckldq mm4, [esi - 12] ; mm4 = data[-3]:data[-4]
1004 movd mm0, [esi - 8]
1005 punpckldq mm0, [esi - 4] ; mm0 = data[-1]:data[-2]
1006 packssdw mm4, mm0 ; mm4 = data[-4..-1] as four signed-saturated words (the history)
1007
1008 cmp eax, byte 4
1009 jnbe short .mmx_4more ; padded order > 4 needs the extra inner loop
1010
; ---- padded order == 4: everything fits in mm4 (history) * mm5 (coeffs) ----
1011 align 16
1012.mmx_4_loop_i:
1013 movd mm1, [esi]
1014 movq mm3, mm4 ; mm3 = history for sample i
1015 punpckldq mm1, [esi + 4] ; mm1 = data[i+1]:data[i]
1016 psrlq mm4, 16 ; drop the oldest history word
1017 movq mm0, mm1
1018 psllq mm0, 48 ; low 16 bits of data[i] moved to the top word
1019 por mm4, mm0
1020 movq mm2, mm4 ; mm2 = history for sample i+1
1021 psrlq mm4, 16
1022 pxor mm0, mm0
1023 punpckhdq mm0, mm1 ; mm0 = data[i+1]:0
1024 pmaddwd mm3, mm5 ; two partial sums for sample i
1025 pmaddwd mm2, mm5 ; two partial sums for sample i+1
1026 psllq mm0, 16 ; low 16 bits of data[i+1] moved to the top word
1027 por mm4, mm0 ; mm4 = history for sample i+2
1028 movq mm0, mm3
1029 punpckldq mm3, mm2
1030 punpckhdq mm0, mm2
1031 paddd mm3, mm0 ; mm3 = sum(i+1):sum(i)
1032 psrad mm3, mm6 ; >> lp_quantization (arithmetic)
1033 psubd mm1, mm3 ; mm1 = residual[i+1]:residual[i]
1034 movd [edi], mm1
1035 punpckhdq mm1, mm1
1036 movd [edi + 4], mm1
1037
1038 add edi, byte 8
1039 add esi, byte 8
1040
1041 sub ebx, 2
1042 jg .mmx_4_loop_i
1043 jmp .mmx_end
Josh Coalsoncd66fc02001-06-18 02:34:09 +00001044
; ---- padded order > 4: mm5 covers coefficients 0..3, the inner loop the rest ----
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001045.mmx_4more:
1046 shl eax, 2
1047 neg eax
1048 add eax, byte 16 ; eax = 16 - 4 * padded order (a negative byte offset from &data[i])
1049
1050 align 16
1051.mmx_4more_loop_i:
1052 movd mm1, [esi]
1053 punpckldq mm1, [esi + 4] ; mm1 = data[i+1]:data[i]
1054 movq mm3, mm4 ; history for sample i
1055 psrlq mm4, 16
1056 movq mm0, mm1
1057 psllq mm0, 48
1058 por mm4, mm0
1059 movq mm2, mm4 ; history for sample i+1
1060 psrlq mm4, 16
1061 pxor mm0, mm0
1062 punpckhdq mm0, mm1
1063 pmaddwd mm3, mm5 ; partial sums for sample i (coefficients 0..3)
1064 pmaddwd mm2, mm5 ; partial sums for sample i+1 (coefficients 0..3)
1065 psllq mm0, 16
1066 por mm4, mm0 ; history for sample i+2
1067
1068 mov ecx, esi
1069 add ecx, eax ; ecx = &data[i] - 4 * (padded order - 4)
1070 mov edx, esp ; edx = start of the coefficient table (highest-index group first)
1071
; Inner loop: one group of four older samples per pass, repacked from dwords
; to saturated words and multiplied by the matching table entry; mm3
; accumulates for sample i, mm2 (same group shifted by one sample) for i+1.
1072 align 16
1073.mmx_4more_loop_j:
1074 movd mm0, [ecx - 16]
1075 movd mm7, [ecx - 8]
1076 punpckldq mm0, [ecx - 12]
1077 punpckldq mm7, [ecx - 4]
1078 packssdw mm0, mm7
1079 pmaddwd mm0, [edx]
1080 punpckhdq mm7, mm7
1081 paddd mm3, mm0
1082 movd mm0, [ecx - 12]
1083 punpckldq mm0, [ecx - 8]
1084 punpckldq mm7, [ecx]
1085 packssdw mm0, mm7
1086 pmaddwd mm0, [edx]
1087 paddd mm2, mm0
1088
1089 add edx, byte 8
1090 add ecx, byte 16
1091 cmp ecx, esi
1092 jnz .mmx_4more_loop_j ; stop once the window reaches data[i]
1093
1094 movq mm0, mm3
1095 punpckldq mm3, mm2
1096 punpckhdq mm0, mm2
1097 paddd mm3, mm0 ; mm3 = sum(i+1):sum(i)
1098 psrad mm3, mm6
1099 psubd mm1, mm3 ; mm1 = residual[i+1]:residual[i]
1100 movd [edi], mm1
1101 punpckhdq mm1, mm1
1102 movd [edi + 4], mm1
1103
1104 add edi, byte 8
1105 add esi, byte 8
1106
1107 sub ebx, 2
1108 jg near .mmx_4more_loop_i
1109
1110.mmx_end:
1111 emms ; leave MMX state before returning / entering the scalar code
1112 mov esp, ebp ; discard the coefficient table
1113.last_one:
1114 mov eax, [esp + 32] ; eax = order (reloaded; eax was modified above)
1115 inc ebx ; ebx becomes 1 if one sample is left, 0 if none
Josh Coalsone6499bd2001-06-13 18:11:25 +00001116 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001117
1118.end:
1119 pop edi
1120 pop esi
1121 pop ebx
1122 pop ebp
1123 ret
1124
1125; **********************************************************************
1126;
Josh Coalson77e3f312001-06-23 03:03:24 +00001127; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001128; {
1129; unsigned i, j;
Josh Coalson77e3f312001-06-23 03:03:24 +00001130; FLAC__int32 sum;
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001131;
1132; FLAC__ASSERT(order > 0);
1133;
1134; for(i = 0; i < data_len; i++) {
1135; sum = 0;
1136; for(j = 0; j < order; j++)
1137; sum += qlp_coeff[j] * data[i-j-1];
1138; data[i] = residual[i] + (sum >> lp_quantization);
1139; }
1140; }
1141 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001142cident FLAC__lpc_restore_signal_asm_ia32
; Scalar (integer-only) LPC signal restoration:
;     data[i] = residual[i] + (sum >> lp_quantization)
; with sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order].
; Arguments are read from the stack; the offsets below are valid after the four
; pushes in the prologue. ebp/ebx/esi/edi are saved and restored; eax, ecx and
; edx are overwritten. Three code paths: order <= 1, order <= 32 (computed jump
; into an unrolled chain) and order > 32 (generic nested loop).
; The local label .begin is also entered from the _mmx variant with
; esi = residual[], edi = data[], eax = order, ebx = data_len and the same
; four registers already pushed.
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001143 ;[esp + 40] data[]
1144 ;[esp + 36] lp_quantization
1145 ;[esp + 32] order
1146 ;[esp + 28] qlp_coeff[]
1147 ;[esp + 24] data_len
1148 ;[esp + 20] residual[]
1149
1150 ;ASSERT(order > 0)
1151
1152 push ebp
1153 push ebx
1154 push esi
1155 push edi
1156
1157 mov esi, [esp + 20] ; esi = residual[]
1158 mov edi, [esp + 40] ; edi = data[]
1159 mov eax, [esp + 32] ; eax = order
1160 mov ebx, [esp + 24] ; ebx = data_len
1161
1162 test ebx, ebx
1163 jz near .end ; do nothing if data_len == 0
1164
1165.begin:
1166 cmp eax, byte 1
1167 jg short .x87_1more ; order > 1 (signed compare): use one of the general paths
1168
; ---- order == 1: the freshly restored sample stays in eax for the next step ----
1169 mov ecx, [esp + 28]
1170 mov edx, [ecx] ; edx = qlp_coeff[0]
1171 mov eax, [edi - 4] ; eax = data[-1]
1172 mov cl, [esp + 36] ; cl = lp_quantization
1173 ALIGN 16
1174.x87_1_loop_i:
1175 imul eax, edx ; eax = qlp_coeff[0] * data[i-1]
1176 sar eax, cl ; eax = sum >> lp_quantization
1177 add eax, [esi] ; eax = residual[i] + (sum >> lp_quantization)
1178 mov [edi], eax ; data[i] = eax, which is also data[i-1] of the next iteration
1179 add esi, byte 4
1180 add edi, byte 4
1181 dec ebx
1182 jnz .x87_1_loop_i
1183
1184 jmp .end
1185
1186.x87_1more:
1187 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1188 jbe short .x87_32
1189
1190 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
; ---- order > 32: plain nested loop, ebp accumulates the sum ----
1191 ALIGN 16
1192.x87_32more_loop_i:
1193 xor ebp, ebp ; sum = 0
1194 mov ecx, [esp + 32] ; ecx = order
1195 mov edx, ecx
1196 shl edx, 2
1197 add edx, [esp + 28] ; edx = &qlp_coeff[order]
1198 neg ecx ; ecx = -order, counts up to 0
1199 ALIGN 16
1200.x87_32more_loop_j:
1201 sub edx, byte 4 ; walk the coefficients downwards ...
1202 mov eax, [edx]
1203 imul eax, [edi + 4 * ecx] ; ... while the data index walks from i-order up to i-1
1204 add ebp, eax
1205 inc ecx
1206 jnz short .x87_32more_loop_j
1207
1208 mov cl, [esp + 36] ; cl = lp_quantization
1209 sar ebp, cl
1210 add ebp, [esi] ; ebp = residual[i] + (sum >> lp_quantization)
1211 mov [edi], ebp ; data[i] = ebp
1212 add edi, byte 4
1213 add esi, byte 4
1214
1215 dec ebx
1216 jnz .x87_32more_loop_i
1217
1218 jmp .end
1219
; ---- 1 < order <= 32: jump into the middle of the unrolled chain below ----
; edx is set to (.jumper_0 - 9*order + 1) so that exactly the last `order`
; mov/imul/add terms execute. The factor 9 comes from the lea (eax + eax*8);
; this relies on every term assembling to 9 bytes and the final one to 8.
1220.x87_32:
1221 sub esi, edi ; esi = residual[] - data[], so [esi + edi] addresses residual[i]
1222 neg eax ; eax = -order
Josh Coalson651d6de2001-12-04 05:36:09 +00001223 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1224 call .get_eip0 ; call/pop pair fetches the run-time address of .get_eip0
1225.get_eip0:
1226 pop eax
1227 add edx, eax ; edx = run-time address of .jumper_0 - 9 * order
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001228 inc edx ; compensate for the shorter opcode on the last iteration
1229 mov eax, [esp + 28] ; eax = qlp_coeff[]
1230 xor ebp, ebp
1231 jmp edx
1232
; Unrolled terms, highest coefficient first. Do not change operand forms or
; insert instructions here without re-checking the byte counts used by the
; jump computation above.
1233 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1234 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1235 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1236 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1237 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1238 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1239 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1240 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1241 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1242 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1243 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1244 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1245 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1246 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1247 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1248 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1249 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1250 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1251 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1252 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1253 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1254 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1255 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1256 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1257 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1258 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1259 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1260 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1261 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1262 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1263 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1264 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1265 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1266 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1267 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1268 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1269 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1270 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1271 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1272 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1273 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1274 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1275 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1276 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1277 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1278 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1279 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1280 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1281 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1282 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1283 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1284 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1285 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1286 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1287 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1288 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1289 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1290 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1291 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1292 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1293 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1294 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1295 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1296 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1297 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1298 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1299 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1300 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1301 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1302 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1303 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1304 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1305 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1306 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1307 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1308 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1309 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1310 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1311 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1312 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1313 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1314 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1315 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1316 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1317 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1318 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1319 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1320 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1321 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1322 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1323 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1324 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1325 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1326 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1327 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1328 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1329.jumper_0:
1330
1331 mov cl, [esp + 36] ; cl = lp_quantization (ecx was used as scratch above)
1332 sar ebp, cl ; ebp = (sum >> lp_quantization)
1333 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1334 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1335 add edi, byte 4
1336
1337 dec ebx
1338 jz short .end
1339 xor ebp, ebp ; sum = 0 for the next sample
1340 jmp edx ; re-enter the unrolled chain at the same entry point
1341
1342.end:
1343 pop edi
1344 pop esi
1345 pop ebx
1346 pop ebp
1347 ret
1348
1349; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1350; the channel must be <= 16. Especially note that this routine cannot be used
1351; for side-channel coded 16bps channels since the effective bps is 17.
Josh Coalsone0a06682001-07-12 21:23:31 +00001352; WATCHOUT: this routine requires that each data array have a buffer of up to
1353; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1354; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001355 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001356cident FLAC__lpc_restore_signal_asm_ia32_mmx
; MMX LPC signal restoration on 16-bit packed samples, one sample per loop
; iteration (each output feeds the history used for the next one). Same stack
; arguments as the scalar routine. Orders below 4 are forwarded to the scalar
; routine's .begin. Otherwise the low 16 bits of each coefficient are copied to
; an 8-byte aligned table on the stack (zero-padded to a multiple of four
; entries); ebp keeps the original esp while that table is live.
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001357 ;[esp + 40] data[]
1358 ;[esp + 36] lp_quantization
1359 ;[esp + 32] order
1360 ;[esp + 28] qlp_coeff[]
1361 ;[esp + 24] data_len
1362 ;[esp + 20] residual[]
1363
1364 ;ASSERT(order > 0)
1365
1366 push ebp
1367 push ebx
1368 push esi
1369 push edi
1370
1371 mov esi, [esp + 20] ; esi = residual[]
1372 mov edi, [esp + 40] ; edi = data[]
1373 mov eax, [esp + 32] ; eax = order
1374 mov ebx, [esp + 24] ; ebx = data_len
1375
1376 test ebx, ebx
1377 jz near .end ; do nothing if data_len == 0
1378 cmp eax, byte 4
Josh Coalsone6499bd2001-06-13 18:11:25 +00001379 jb near FLAC__lpc_restore_signal_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001380
1381 mov edx, [esp + 28] ; edx = qlp_coeff[]
1382 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
1383 mov ebp, esp ; remember esp; restored at .mmx_end
1384
1385 and esp, 0xfffffff8 ; 8-byte align the coefficient table built below
1386
; Push qlp_coeff[0] .. qlp_coeff[order-1] as 16-bit words, so qlp_coeff[0]
; ends up at the highest address of the table.
1387 xor ecx, ecx
1388.copy_qlp_loop:
1389 push word [edx + 4 * ecx] ; low 16 bits of qlp_coeff[ecx]
1390 inc ecx
1391 cmp ecx, eax
1392 jnz short .copy_qlp_loop
1393
; Pad with zero words until the entry count is a multiple of 4; eax is bumped
; along so that it becomes the padded order.
1394 and ecx, 0x3
1395 test ecx, ecx
1396 je short .za_end
1397 sub ecx, byte 4 ; ecx = -(number of zero words still needed)
1398.za_loop:
1399 push word 0
1400 inc eax
1401 inc ecx
1402 jnz short .za_loop
1403.za_end:
1404
1405 movq mm5, [esp + 2 * eax - 8] ; mm5 = the four words at the top of the table (coefficients 3,2,1,0, low word first)
1406 movd mm4, [edi - 16]
1407 punpckldq mm4, [edi - 12] ; mm4 = data[-3]:data[-4]
1408 movd mm0, [edi - 8]
1409 punpckldq mm0, [edi - 4] ; mm0 = data[-1]:data[-2]
1410 packssdw mm4, mm0 ; mm4 = data[-4..-1] as four signed-saturated words (the history)
1411
1412 cmp eax, byte 4
1413 jnbe short .mmx_4more ; padded order > 4 needs the extra inner loop
1414
; ---- padded order == 4: everything fits in mm4 (history) * mm5 (coeffs) ----
1415 align 16
1416.mmx_4_loop_i:
1417 movq mm7, mm4
1418 pmaddwd mm7, mm5 ; two partial sums
1419 movq mm0, mm7
1420 punpckhdq mm7, mm7
1421 paddd mm7, mm0 ; low dword of mm7 = full sum
1422 psrad mm7, mm6 ; >> lp_quantization (arithmetic)
1423 movd mm1, [esi] ; mm1 = residual[i]
1424 paddd mm7, mm1
1425 movd [edi], mm7 ; data[i] = residual[i] + (sum >> lp_quantization)
1426 psllq mm7, 48 ; low 16 bits of data[i] moved to the top word
1427 psrlq mm4, 16 ; drop the oldest history word
1428 por mm4, mm7 ; history now ends with data[i]
1429
1430 add esi, byte 4
1431 add edi, byte 4
1432
1433 dec ebx
1434 jnz .mmx_4_loop_i
1435 jmp .mmx_end
; ---- padded order > 4: mm5 covers coefficients 0..3, the inner loop the rest ----
1436.mmx_4more:
1437 shl eax, 2
1438 neg eax
1439 add eax, byte 16 ; eax = 16 - 4 * padded order (a negative byte offset from &data[i])
1440 align 16
1441.mmx_4more_loop_i:
1442 mov ecx, edi
1443 add ecx, eax ; ecx = &data[i] - 4 * (padded order - 4)
1444 mov edx, esp ; edx = start of the coefficient table (highest-index group first)
1445
1446 movq mm7, mm4
1447 pmaddwd mm7, mm5 ; partial sums for coefficients 0..3
1448
; Inner loop: one group of four older samples per pass, repacked from dwords
; to saturated words and multiplied by the matching table entry.
1449 align 16
1450.mmx_4more_loop_j:
1451 movd mm0, [ecx - 16]
1452 punpckldq mm0, [ecx - 12]
1453 movd mm1, [ecx - 8]
1454 punpckldq mm1, [ecx - 4]
1455 packssdw mm0, mm1
1456 pmaddwd mm0, [edx]
1457 paddd mm7, mm0
1458
1459 add edx, byte 8
1460 add ecx, byte 16
1461 cmp ecx, edi
1462 jnz .mmx_4more_loop_j ; stop once the window reaches data[i]
1463
1464 movq mm0, mm7
1465 punpckhdq mm7, mm7
1466 paddd mm7, mm0 ; low dword of mm7 = full sum
1467 psrad mm7, mm6 ; >> lp_quantization (arithmetic)
1468 movd mm1, [esi] ; mm1 = residual[i]
1469 paddd mm7, mm1
1470 movd [edi], mm7 ; data[i] = residual[i] + (sum >> lp_quantization)
1471 psllq mm7, 48 ; low 16 bits of data[i] moved to the top word
1472 psrlq mm4, 16 ; drop the oldest history word
1473 por mm4, mm7 ; history now ends with data[i]
1474
1475 add esi, byte 4
1476 add edi, byte 4
1477
1478 dec ebx
1479 jnz short .mmx_4more_loop_i
1480.mmx_end:
1481 emms ; leave MMX state before returning
1482 mov esp, ebp ; discard the coefficient table
1483
1484.end:
1485 pop edi
1486 pop esi
1487 pop ebx
1488 pop ebp
1489 ret
1490
1491end