; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001 Josh Coalson
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA  02111-1307, USA.
18
19%include "nasm.h"
20
21 data_section
22
Josh Coalsone6499bd2001-06-13 18:11:25 +000023cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
24cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
25cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
26cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalsonf5925df2001-07-16 21:13:19 +000027cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
Josh Coalsone6499bd2001-06-13 18:11:25 +000028cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
29cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
30cglobal FLAC__lpc_restore_signal_asm_ia32
31cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000032
33 code_section
34
35; **********************************************************************
36;
Josh Coalson77e3f312001-06-23 03:03:24 +000037; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +000038; {
Josh Coalson77e3f312001-06-23 03:03:24 +000039; FLAC__real d;
Josh Coalson9a7b5e22001-06-13 18:03:09 +000040; unsigned sample, coeff;
41; const unsigned limit = data_len - lag;
42;
43; FLAC__ASSERT(lag > 0);
44; FLAC__ASSERT(lag <= data_len);
45;
46; for(coeff = 0; coeff < lag; coeff++)
47; autoc[coeff] = 0.0;
48; for(sample = 0; sample <= limit; sample++) {
49; d = data[sample];
50; for(coeff = 0; coeff < lag; coeff++)
51; autoc[coeff] += d * data[sample+coeff];
52; }
53; for(; sample < data_len; sample++) {
54; d = data[sample];
55; for(coeff = 0; coeff < data_len - sample; coeff++)
56; autoc[coeff] += d * data[sample+coeff];
57; }
58; }
59;
60 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +000061cident FLAC__lpc_compute_autocorrelation_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +000062 ;[esp + 24] == autoc[]
63 ;[esp + 20] == lag
64 ;[esp + 16] == data_len
65 ;[esp + 12] == data[]
66
67 ;ASSERT(lag > 0)
68 ;ASSERT(lag <= 33)
69 ;ASSERT(lag <= data_len)
70
71.begin:
72 push esi
73 push edi
74
75 ; for(coeff = 0; coeff < lag; coeff++)
76 ; autoc[coeff] = 0.0;
77 mov edi, [esp + 24] ; edi == autoc
78 mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write
79 xor eax, eax
80 rep stosd
81
82 ; const unsigned limit = data_len - lag;
83 mov eax, [esp + 20] ; eax == lag
84 mov ecx, [esp + 16]
85 sub ecx, eax ; ecx == limit
86
87 mov edi, [esp + 24] ; edi == autoc
88 mov esi, [esp + 12] ; esi == data
89 inc ecx ; we are looping <= limit so we add one to the counter
90
91 ; for(sample = 0; sample <= limit; sample++) {
92 ; d = data[sample];
93 ; for(coeff = 0; coeff < lag; coeff++)
94 ; autoc[coeff] += d * data[sample+coeff];
95 ; }
96 fld dword [esi] ; ST = d <- data[sample]
97 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
98 lea edx, [eax + eax*2]
99 neg edx
100 lea edx, [eax + edx*4 + .jumper1_0]
101 inc edx ; compensate for the shorter opcode on the last iteration
102 inc edx ; compensate for the shorter opcode on the last iteration
103 inc edx ; compensate for the shorter opcode on the last iteration
104 cmp eax, 33
105 jne .loop1_start
106 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
107.loop1_start:
108 jmp edx
109
110 fld st0 ; ST = d d
111 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
112 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
113 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
114 fld st0 ; ST = d d
115 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
116 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
117 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
118 fld st0 ; ST = d d
119 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
120 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
121 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
122 fld st0 ; ST = d d
123 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
124 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
125 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
126 fld st0 ; ST = d d
127 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
128 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
129 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
130 fld st0 ; ST = d d
131 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
132 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
133 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
134 fld st0 ; ST = d d
135 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
136 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
137 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
138 fld st0 ; ST = d d
139 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
140 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
141 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
142 fld st0 ; ST = d d
143 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
144 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
145 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
146 fld st0 ; ST = d d
147 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
148 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
149 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
150 fld st0 ; ST = d d
151 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
152 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
153 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
154 fld st0 ; ST = d d
155 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
156 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
157 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
158 fld st0 ; ST = d d
159 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
160 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
161 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
162 fld st0 ; ST = d d
163 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
164 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
165 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
166 fld st0 ; ST = d d
167 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
168 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
169 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
170 fld st0 ; ST = d d
171 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
172 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
173 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
174 fld st0 ; ST = d d
175 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
176 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
177 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
178 fld st0 ; ST = d d
179 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
180 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
181 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
182 fld st0 ; ST = d d
183 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
184 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
185 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
186 fld st0 ; ST = d d
187 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
188 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
189 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
190 fld st0 ; ST = d d
191 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
192 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
193 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
194 fld st0 ; ST = d d
195 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
196 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
197 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
198 fld st0 ; ST = d d
199 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
200 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
201 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
202 fld st0 ; ST = d d
203 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
204 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
205 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
206 fld st0 ; ST = d d
207 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
208 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
209 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
210 fld st0 ; ST = d d
211 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
212 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
213 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
214 fld st0 ; ST = d d
215 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
216 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
217 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
218 fld st0 ; ST = d d
219 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
220 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
221 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
222 fld st0 ; ST = d d
223 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
224 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
225 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
226 fld st0 ; ST = d d
227 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
228 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
229 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
230 fld st0 ; ST = d d
231 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
232 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
233 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
234 fld st0 ; ST = d d
235 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
236 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
237 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
238 fld st0 ; ST = d d
239 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
240 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
241 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
242.jumper1_0:
243
244 fstp st0 ; pop d, ST = empty
245 add esi, byte 4 ; sample++
246 dec ecx
247 jz .loop1_end
248 fld dword [esi] ; ST = d <- data[sample]
249 jmp edx
250.loop1_end:
251
252 ; for(; sample < data_len; sample++) {
253 ; d = data[sample];
254 ; for(coeff = 0; coeff < data_len - sample; coeff++)
255 ; autoc[coeff] += d * data[sample+coeff];
256 ; }
257 mov ecx, [esp + 20] ; ecx <- lag
258 dec ecx ; ecx <- lag - 1
259 jz near .end ; skip loop if 0 (i.e. lag == 1)
260
261 fld dword [esi] ; ST = d <- data[sample]
262 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
263 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
264 lea edx, [eax + eax*2]
265 neg edx
266 lea edx, [eax + edx*4 + .jumper2_0]
267 inc edx ; compensate for the shorter opcode on the last iteration
268 inc edx ; compensate for the shorter opcode on the last iteration
269 inc edx ; compensate for the shorter opcode on the last iteration
270 jmp edx
271
272 fld st0 ; ST = d d
273 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
274 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
275 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
276 fld st0 ; ST = d d
277 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
278 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
279 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
280 fld st0 ; ST = d d
281 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
282 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
283 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
284 fld st0 ; ST = d d
285 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
286 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
287 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
288 fld st0 ; ST = d d
289 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
290 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
291 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
292 fld st0 ; ST = d d
293 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
294 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
295 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
296 fld st0 ; ST = d d
297 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
298 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
299 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
300 fld st0 ; ST = d d
301 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
302 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
303 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
304 fld st0 ; ST = d d
305 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
306 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
307 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
308 fld st0 ; ST = d d
309 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
310 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
311 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
312 fld st0 ; ST = d d
313 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
314 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
315 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
316 fld st0 ; ST = d d
317 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
318 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
319 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
320 fld st0 ; ST = d d
321 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
322 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
323 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
324 fld st0 ; ST = d d
325 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
326 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
327 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
328 fld st0 ; ST = d d
329 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
330 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
331 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
332 fld st0 ; ST = d d
333 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
334 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
335 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
336 fld st0 ; ST = d d
337 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
338 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
339 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
340 fld st0 ; ST = d d
341 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
342 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
343 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
344 fld st0 ; ST = d d
345 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
346 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
347 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
348 fld st0 ; ST = d d
349 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
350 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
351 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
352 fld st0 ; ST = d d
353 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
354 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
355 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
356 fld st0 ; ST = d d
357 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
358 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
359 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
360 fld st0 ; ST = d d
361 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
362 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
363 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
364 fld st0 ; ST = d d
365 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
366 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
367 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
368 fld st0 ; ST = d d
369 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
370 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
371 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
372 fld st0 ; ST = d d
373 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
374 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
375 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
376 fld st0 ; ST = d d
377 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
378 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
379 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
380 fld st0 ; ST = d d
381 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
382 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
383 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
384 fld st0 ; ST = d d
385 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
386 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
387 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
388 fld st0 ; ST = d d
389 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
390 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
391 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
392 fld st0 ; ST = d d
393 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
394 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
395 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
396 fld st0 ; ST = d d
397 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
398 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
399 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
400.jumper2_0:
401
402 fstp st0 ; pop d, ST = empty
403 add esi, byte 4 ; sample++
404 dec ecx
405 jz .loop2_end
406 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
407 fld dword [esi] ; ST = d <- data[sample]
408 jmp edx
409.loop2_end:
410
411.end:
412 pop edi
413 pop esi
414 ret
415
416 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000417cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000418 ;[esp + 16] == autoc[]
419 ;[esp + 12] == lag
420 ;[esp + 8] == data_len
421 ;[esp + 4] == data[]
422
423 ;ASSERT(lag > 0)
424 ;ASSERT(lag <= 4)
425 ;ASSERT(lag <= data_len)
426
427 ; for(coeff = 0; coeff < lag; coeff++)
428 ; autoc[coeff] = 0.0;
429 xorps xmm5, xmm5
430
431 mov edx, [esp + 8] ; edx == data_len
432 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
433
434 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
435 add eax, 4
436 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
437 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
438.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
439 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
440 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
441 dec edx
442 jz .loop_end
443 ALIGN 16
444.loop_start:
445 ; start by reading the next sample
446 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
447 add eax, 4
448 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
449 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
450 movss xmm2, xmm0
451 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
452 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
453 dec edx
454 jnz .loop_start
455.loop_end:
456 ; store autoc
457 mov edx, [esp + 16] ; edx == autoc
458 movups [edx], xmm5
459
460.end:
461 ret
462
463 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000464cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000465 ;[esp + 16] == autoc[]
466 ;[esp + 12] == lag
467 ;[esp + 8] == data_len
468 ;[esp + 4] == data[]
469
470 ;ASSERT(lag > 0)
471 ;ASSERT(lag <= 8)
472 ;ASSERT(lag <= data_len)
473
474 ; for(coeff = 0; coeff < lag; coeff++)
475 ; autoc[coeff] = 0.0;
476 xorps xmm5, xmm5
477 xorps xmm6, xmm6
478
479 mov edx, [esp + 8] ; edx == data_len
480 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
481
482 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
483 add eax, 4
484 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
485 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
486 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
487 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
488.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
489 mulps xmm0, xmm2
490 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
491 addps xmm5, xmm0
492 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
493 dec edx
494 jz .loop_end
495 ALIGN 16
496.loop_start:
497 ; start by reading the next sample
498 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
499 ; here we reorder the instructions; see the (#) indexes for a logical order
500 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
501 add eax, 4 ; (0)
502 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
503 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
504 movss xmm3, xmm2 ; (5)
505 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
506 movss xmm2, xmm0 ; (6)
507 mulps xmm1, xmm3 ; (8)
508 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
509 addps xmm6, xmm1 ; (10)
510 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
511 dec edx
512 jnz .loop_start
513.loop_end:
514 ; store autoc
515 mov edx, [esp + 16] ; edx == autoc
516 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000517 movups [edx + 16], xmm6
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000518
519.end:
520 ret
521
522 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000523cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000524 ;[esp + 16] == autoc[]
525 ;[esp + 12] == lag
526 ;[esp + 8] == data_len
527 ;[esp + 4] == data[]
528
529 ;ASSERT(lag > 0)
530 ;ASSERT(lag <= 12)
531 ;ASSERT(lag <= data_len)
532
533 ; for(coeff = 0; coeff < lag; coeff++)
534 ; autoc[coeff] = 0.0;
535 xorps xmm5, xmm5
536 xorps xmm6, xmm6
537 xorps xmm7, xmm7
538
539 mov edx, [esp + 8] ; edx == data_len
540 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
541
542 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
543 add eax, 4
544 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
545 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
546 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
547 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
548.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
549 movaps xmm1, xmm0
550 mulps xmm1, xmm2
551 addps xmm5, xmm1
552 movaps xmm1, xmm0
553 mulps xmm1, xmm3
554 addps xmm6, xmm1
555 mulps xmm0, xmm4
556 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
557 dec edx
558 jz .loop_end
559 ALIGN 16
560.loop_start:
561 ; start by reading the next sample
562 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
563 add eax, 4
564 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
565
566 ; shift xmm4:xmm3:xmm2 left by one float
567 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
568 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
569 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
570 movss xmm4, xmm3
571 movss xmm3, xmm2
572 movss xmm2, xmm0
573
574 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
575 movaps xmm1, xmm0
576 mulps xmm1, xmm2
577 addps xmm5, xmm1
578 movaps xmm1, xmm0
579 mulps xmm1, xmm3
580 addps xmm6, xmm1
581 mulps xmm0, xmm4
582 addps xmm7, xmm0
583
584 dec edx
585 jnz .loop_start
586.loop_end:
587 ; store autoc
588 mov edx, [esp + 16] ; edx == autoc
589 movups [edx], xmm5
Josh Coalsona52270e2001-07-18 00:23:40 +0000590 movups [edx + 16], xmm6
591 movups [edx + 32], xmm7
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000592
593.end:
594 ret
595
Josh Coalsonf5925df2001-07-16 21:13:19 +0000596 align 16
597cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
598 ;[ebp + 32] autoc
599 ;[ebp + 28] lag
600 ;[ebp + 24] data_len
601 ;[ebp + 20] data
602
603 push ebp
604 push ebx
605 push esi
606 push edi
607 mov ebp, esp
608
609 mov esi, [ebp + 20]
610 mov edi, [ebp + 24]
611 mov edx, [ebp + 28]
612 mov eax, edx
613 neg eax
614 and esp, byte -8
615 lea esp, [esp + 4 * eax]
616 mov ecx, edx
617 xor eax, eax
618.loop0:
619 dec ecx
620 mov [esp + 4 * ecx], eax
621 jnz short .loop0
622
623 mov eax, edi
624 sub eax, edx
625 mov ebx, edx
626 and ebx, byte 1
627 sub eax, ebx
628 lea ecx, [esi + 4 * eax - 12]
629 cmp esi, ecx
630 mov eax, esi
631 ja short .loop2_pre
632 align 16 ;8 nops
633.loop1_i:
634 movd mm0, [eax]
635 movd mm2, [eax + 4]
636 movd mm4, [eax + 8]
637 movd mm6, [eax + 12]
638 mov ebx, edx
639 punpckldq mm0, mm0
640 punpckldq mm2, mm2
641 punpckldq mm4, mm4
642 punpckldq mm6, mm6
643 align 16 ;3 nops
644.loop1_j:
645 sub ebx, byte 2
646 movd mm1, [eax + 4 * ebx]
647 movd mm3, [eax + 4 * ebx + 4]
648 movd mm5, [eax + 4 * ebx + 8]
649 movd mm7, [eax + 4 * ebx + 12]
650 punpckldq mm1, mm3
651 punpckldq mm3, mm5
652 pfmul mm1, mm0
653 punpckldq mm5, mm7
654 pfmul mm3, mm2
655 punpckldq mm7, [eax + 4 * ebx + 16]
656 pfmul mm5, mm4
657 pfmul mm7, mm6
658 pfadd mm1, mm3
659 movq mm3, [esp + 4 * ebx]
660 pfadd mm5, mm7
661 pfadd mm1, mm5
662 pfadd mm3, mm1
663 movq [esp + 4 * ebx], mm3
664 jg short .loop1_j
665
666 add eax, byte 16
667 cmp eax, ecx
668 jb short .loop1_i
669
670.loop2_pre:
671 mov ebx, eax
672 sub eax, esi
673 shr eax, 2
674 lea ecx, [esi + 4 * edi]
675 mov esi, ebx
676.loop2_i:
677 movd mm0, [esi]
678 mov ebx, edi
679 sub ebx, eax
680 cmp ebx, edx
681 jbe short .loop2_j
682 mov ebx, edx
683.loop2_j:
684 dec ebx
685 movd mm1, [esi + 4 * ebx]
686 pfmul mm1, mm0
687 movd mm2, [esp + 4 * ebx]
688 pfadd mm1, mm2
689 movd [esp + 4 * ebx], mm1
690
691 jnz short .loop2_j
692
693 add esi, byte 4
694 inc eax
695 cmp esi, ecx
696 jnz short .loop2_i
697
698 mov edi, [ebp + 32]
699.loop3:
700 dec edx
701 mov eax, [esp + 4 * edx]
702 mov [edi + 4 * edx], eax
703 jnz short .loop3
704
705 femms
706
707 mov esp, ebp
708 pop edi
709 pop esi
710 pop ebx
711 pop ebp
712 ret
713
Josh Coalson77e3f312001-06-23 03:03:24 +0000714;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000715;
716; for(i = 0; i < data_len; i++) {
717; sum = 0;
718; for(j = 0; j < order; j++)
719; sum += qlp_coeff[j] * data[i-j-1];
720; residual[i] = data[i] - (sum >> lp_quantization);
721; }
722;
723 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000724cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000725 ;[esp + 40] residual[]
726 ;[esp + 36] lp_quantization
727 ;[esp + 32] order
728 ;[esp + 28] qlp_coeff[]
729 ;[esp + 24] data_len
730 ;[esp + 20] data[]
731
732 ;ASSERT(order > 0)
733
734 push ebp
735 push ebx
736 push esi
737 push edi
738
739 mov esi, [esp + 20] ; esi = data[]
740 mov edi, [esp + 40] ; edi = residual[]
741 mov eax, [esp + 32] ; eax = order
742 mov ebx, [esp + 24] ; ebx = data_len
743
744 test ebx, ebx
745 jz near .end ; do nothing if data_len == 0
746.begin:
747 cmp eax, byte 1
748 jg short .i_1more
749
750 mov ecx, [esp + 28]
751 mov edx, [ecx] ; edx = qlp_coeff[0]
752 mov eax, [esi - 4] ; eax = data[-1]
753 mov cl, [esp + 36] ; cl = lp_quantization
754 ALIGN 16
755.i_1_loop_i:
756 imul eax, edx
757 sar eax, cl
758 neg eax
759 add eax, [esi]
760 mov [edi], eax
761 mov eax, [esi]
762 add edi, byte 4
763 add esi, byte 4
764 dec ebx
765 jnz .i_1_loop_i
766
767 jmp .end
768
769.i_1more:
770 cmp eax, byte 32 ; for order <= 32 there is a faster routine
771 jbe short .i_32
772
773 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
774 ALIGN 16
775.i_32more_loop_i:
776 xor ebp, ebp
777 mov ecx, [esp + 32]
778 mov edx, ecx
779 shl edx, 2
780 add edx, [esp + 28]
781 neg ecx
782 ALIGN 16
783.i_32more_loop_j:
784 sub edx, byte 4
785 mov eax, [edx]
786 imul eax, [esi + 4 * ecx]
787 add ebp, eax
788 inc ecx
789 jnz short .i_32more_loop_j
790
791 mov cl, [esp + 36]
792 sar ebp, cl
793 neg ebp
794 add ebp, [esi]
795 mov [edi], ebp
796 add esi, byte 4
797 add edi, byte 4
798
799 dec ebx
800 jnz .i_32more_loop_i
801
802 jmp .end
803
804.i_32:
805 sub edi, esi
806 neg eax
807 lea edx, [eax + eax * 8 + .jumper_0]
808 inc edx
809 mov eax, [esp + 28] ; eax = qlp_coeff[]
810 xor ebp, ebp
811 jmp edx
812
813 mov ecx, [eax + 124]
814 imul ecx, [esi - 128]
815 add ebp, ecx
816 mov ecx, [eax + 120]
817 imul ecx, [esi - 124]
818 add ebp, ecx
819 mov ecx, [eax + 116]
820 imul ecx, [esi - 120]
821 add ebp, ecx
822 mov ecx, [eax + 112]
823 imul ecx, [esi - 116]
824 add ebp, ecx
825 mov ecx, [eax + 108]
826 imul ecx, [esi - 112]
827 add ebp, ecx
828 mov ecx, [eax + 104]
829 imul ecx, [esi - 108]
830 add ebp, ecx
831 mov ecx, [eax + 100]
832 imul ecx, [esi - 104]
833 add ebp, ecx
834 mov ecx, [eax + 96]
835 imul ecx, [esi - 100]
836 add ebp, ecx
837 mov ecx, [eax + 92]
838 imul ecx, [esi - 96]
839 add ebp, ecx
840 mov ecx, [eax + 88]
841 imul ecx, [esi - 92]
842 add ebp, ecx
843 mov ecx, [eax + 84]
844 imul ecx, [esi - 88]
845 add ebp, ecx
846 mov ecx, [eax + 80]
847 imul ecx, [esi - 84]
848 add ebp, ecx
849 mov ecx, [eax + 76]
850 imul ecx, [esi - 80]
851 add ebp, ecx
852 mov ecx, [eax + 72]
853 imul ecx, [esi - 76]
854 add ebp, ecx
855 mov ecx, [eax + 68]
856 imul ecx, [esi - 72]
857 add ebp, ecx
858 mov ecx, [eax + 64]
859 imul ecx, [esi - 68]
860 add ebp, ecx
861 mov ecx, [eax + 60]
862 imul ecx, [esi - 64]
863 add ebp, ecx
864 mov ecx, [eax + 56]
865 imul ecx, [esi - 60]
866 add ebp, ecx
867 mov ecx, [eax + 52]
868 imul ecx, [esi - 56]
869 add ebp, ecx
870 mov ecx, [eax + 48]
871 imul ecx, [esi - 52]
872 add ebp, ecx
873 mov ecx, [eax + 44]
874 imul ecx, [esi - 48]
875 add ebp, ecx
876 mov ecx, [eax + 40]
877 imul ecx, [esi - 44]
878 add ebp, ecx
879 mov ecx, [eax + 36]
880 imul ecx, [esi - 40]
881 add ebp, ecx
882 mov ecx, [eax + 32]
883 imul ecx, [esi - 36]
884 add ebp, ecx
885 mov ecx, [eax + 28]
886 imul ecx, [esi - 32]
887 add ebp, ecx
888 mov ecx, [eax + 24]
889 imul ecx, [esi - 28]
890 add ebp, ecx
891 mov ecx, [eax + 20]
892 imul ecx, [esi - 24]
893 add ebp, ecx
894 mov ecx, [eax + 16]
895 imul ecx, [esi - 20]
896 add ebp, ecx
897 mov ecx, [eax + 12]
898 imul ecx, [esi - 16]
899 add ebp, ecx
900 mov ecx, [eax + 8]
901 imul ecx, [esi - 12]
902 add ebp, ecx
903 mov ecx, [eax + 4]
904 imul ecx, [esi - 8]
905 add ebp, ecx
906 mov ecx, [eax] ; there is one byte missing
907 imul ecx, [esi - 4]
908 add ebp, ecx
909.jumper_0:
910
911 mov cl, [esp + 36]
912 sar ebp, cl
913 neg ebp
914 add ebp, [esi]
915 mov [edi + esi], ebp
916 add esi, byte 4
917
918 dec ebx
919 jz short .end
920 xor ebp, ebp
921 jmp edx
922
923.end:
924 pop edi
925 pop esi
926 pop ebx
927 pop ebp
928 ret
929
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16. Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	; MMX version of:
	;   for(i = 0; i < data_len; i++) {
	;       sum = 0;
	;       for(j = 0; j < order; j++)
	;           sum += qlp_coeff[j] * data[i-j-1];
	;       residual[i] = data[i] - (sum >> lp_quantization);
	;   }
	; The 32-bit coefficients are repacked as 16-bit words on an 8-byte
	; aligned stack area so PMADDWD can do 4 multiply-accumulates at once;
	; two residual samples are produced per loop iteration.  The final
	; sample of an odd-length buffer is handed off to the plain ia32
	; routine (see .last_one).
	;
	; cdecl stack frame, offsets valid after the 4 register pushes below:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp				; save callee-saved registers
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	dec	ebx
	test	ebx, ebx
	jz	near .last_one			; data_len == 1: use the plain ia32 routine

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization (PSRAD shift count)
	mov	ebp, esp			; remember esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-byte align the coefficient scratch area

	; push the coefficients as 16-bit words (low halves; assumes they fit
	; in 16 bits per the WATCHOUT above); qlp_coeff[0] ends up at the
	; highest address since the stack grows down
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad the packed array up to a multiple of 4 words, bumping
	; eax (order) to the padded count; the pad taps multiply the extra
	; history samples by zero
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = coeff[0]:coeff[1]:coeff[2]:coeff[3] (words, coeff[3] in the low word)
	; prime the sliding history window:
	; mm4 = data[i-1]:data[i-2]:data[i-3]:data[i-4] packed to words
	movd	mm4, [esi - 16]
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4			; padded order == 4 has a dedicated loop
	jnbe	short .mmx_4more

	align 16
.mmx_4_loop_i:
	; two samples per iteration; mm3/mm2 get the dot products for
	; samples i and i+1, mm4 is slid down a word each time with the
	; newly loaded sample inserted at the top
	movd	mm1, [esi]			; mm1 = 0:data[i]
	movq	mm3, mm4
	punpckldq	mm1, [esi + 4]		; mm1 = data[i+1]:data[i]
	psrlq	mm4, 16				; drop the oldest history word...
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0			; ...and insert data[i] as the newest
	movq	mm2, mm4
	psrlq	mm4, 16				; slide again for sample i+1
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5			; partial sums for sample i
	pmaddwd	mm2, mm5			; partial sums for sample i+1
	psllq	mm0, 16
	por	mm4, mm0			; insert data[i+1] into the window
	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; sum >>= lp_quantization (arithmetic)
	psubd	mm1, mm3			; residual = data - (sum >> lp_quantization)
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2				; two samples consumed
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	shl	eax, 2				; eax = 16 - 4*(padded order):
	neg	eax				; byte offset from data[i] back to 4 entries
	add	eax, byte 16			; past the oldest tap, for the j loop below

	align 16
.mmx_4more_loop_i:
	; same window update as .mmx_4_loop_i; mm3/mm2 start with the 4
	; newest taps, the j loop accumulates the older taps 4 at a time
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]
	movq	mm3, mm4
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax			; ecx = &data[i - order + 4] (oldest taps first)
	mov	edx, esp			; edx = packed 16-bit coefficients

	align 16
.mmx_4more_loop_j:
	; pack two overlapping 4-sample windows (offset by one sample) and
	; accumulate 4 taps for sample i (mm3) and 4 for sample i+1 (mm2)
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0

	add	edx, byte 8			; next 4 coefficients
	add	ecx, byte 16			; next 4 history samples
	cmp	ecx, esi			; until the window reaches data[i]
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; sum >>= lp_quantization
	psubd	mm1, mm3			; residual = data - (sum >> lp_quantization)
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms					; make the FPU usable again
	mov	esp, ebp			; drop the packed-coefficient scratch area
.last_one:
	; ebx here is -1 (even data_len: done) or 0 (one sample left);
	; esi/edi already point at the remaining sample, so the plain ia32
	; routine can finish the job with data_len (ebx) == 1
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1107
1108; **********************************************************************
1109;
Josh Coalson77e3f312001-06-23 03:03:24 +00001110; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001111; {
1112; unsigned i, j;
Josh Coalson77e3f312001-06-23 03:03:24 +00001113; FLAC__int32 sum;
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001114;
1115; FLAC__ASSERT(order > 0);
1116;
1117; for(i = 0; i < data_len; i++) {
1118; sum = 0;
1119; for(j = 0; j < order; j++)
1120; sum += qlp_coeff[j] * data[i-j-1];
1121; data[i] = residual[i] + (sum >> lp_quantization);
1122; }
1123; }
1124 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001125cident FLAC__lpc_restore_signal_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001126 ;[esp + 40] data[]
1127 ;[esp + 36] lp_quantization
1128 ;[esp + 32] order
1129 ;[esp + 28] qlp_coeff[]
1130 ;[esp + 24] data_len
1131 ;[esp + 20] residual[]
1132
1133 ;ASSERT(order > 0)
1134
1135 push ebp
1136 push ebx
1137 push esi
1138 push edi
1139
1140 mov esi, [esp + 20] ; esi = residual[]
1141 mov edi, [esp + 40] ; edi = data[]
1142 mov eax, [esp + 32] ; eax = order
1143 mov ebx, [esp + 24] ; ebx = data_len
1144
1145 test ebx, ebx
1146 jz near .end ; do nothing if data_len == 0
1147
1148.begin:
1149 cmp eax, byte 1
1150 jg short .x87_1more
1151
1152 mov ecx, [esp + 28]
1153 mov edx, [ecx]
1154 mov eax, [edi - 4]
1155 mov cl, [esp + 36]
1156 ALIGN 16
1157.x87_1_loop_i:
1158 imul eax, edx
1159 sar eax, cl
1160 add eax, [esi]
1161 mov [edi], eax
1162 add esi, byte 4
1163 add edi, byte 4
1164 dec ebx
1165 jnz .x87_1_loop_i
1166
1167 jmp .end
1168
1169.x87_1more:
1170 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1171 jbe short .x87_32
1172
1173 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1174 ALIGN 16
1175.x87_32more_loop_i:
1176 xor ebp, ebp
1177 mov ecx, [esp + 32]
1178 mov edx, ecx
1179 shl edx, 2
1180 add edx, [esp + 28]
1181 neg ecx
1182 ALIGN 16
1183.x87_32more_loop_j:
1184 sub edx, byte 4
1185 mov eax, [edx]
1186 imul eax, [edi + 4 * ecx]
1187 add ebp, eax
1188 inc ecx
1189 jnz short .x87_32more_loop_j
1190
1191 mov cl, [esp + 36]
1192 sar ebp, cl
1193 add ebp, [esi]
1194 mov [edi], ebp
1195 add edi, byte 4
1196 add esi, byte 4
1197
1198 dec ebx
1199 jnz .x87_32more_loop_i
1200
1201 jmp .end
1202
1203.x87_32:
1204 sub esi, edi
1205 neg eax
1206 lea edx, [eax + eax * 8 + .jumper_0]
1207 inc edx ; compensate for the shorter opcode on the last iteration
1208 mov eax, [esp + 28] ; eax = qlp_coeff[]
1209 xor ebp, ebp
1210 jmp edx
1211
1212 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1213 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1214 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1215 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1216 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1217 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1218 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1219 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1220 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1221 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1222 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1223 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1224 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1225 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1226 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1227 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1228 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1229 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1230 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1231 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1232 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1233 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1234 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1235 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1236 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1237 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1238 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1239 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1240 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1241 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1242 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1243 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1244 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1245 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1246 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1247 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1248 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1249 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1250 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1251 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1252 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1253 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1254 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1255 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1256 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1257 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1258 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1259 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1260 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1261 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1262 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1263 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1264 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1265 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1266 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1267 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1268 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1269 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1270 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1271 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1272 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1273 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1274 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1275 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1276 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1277 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1278 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1279 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1280 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1281 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1282 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1283 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1284 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1285 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1286 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1287 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1288 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1289 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1290 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1291 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1292 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1293 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1294 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1295 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1296 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1297 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1298 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1299 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1300 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1301 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1302 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1303 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1304 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1305 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1306 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1307 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1308.jumper_0:
1309
1310 mov cl, [esp + 36]
1311 sar ebp, cl ; ebp = (sum >> lp_quantization)
1312 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1313 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1314 add edi, byte 4
1315
1316 dec ebx
1317 jz short .end
1318 xor ebp, ebp
1319 jmp edx
1320
1321.end:
1322 pop edi
1323 pop esi
1324 pop ebx
1325 pop ebp
1326 ret
1327
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16. Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	; MMX version of FLAC__lpc_restore_signal() for order >= 4; smaller
	; orders fall back to the plain ia32 routine.  As in the residual MMX
	; routine above, the coefficients are repacked as 16-bit words on an
	; 8-byte aligned stack area (zero-padded to a multiple of 4, which is
	; why data[-1..-3] must be readable zeroes), and mm4 keeps the last 4
	; output samples as a sliding history window.  One sample is restored
	; per loop iteration since each output feeds the next prediction.
	;
	; cdecl stack frame, offsets valid after the 4 register pushes below:
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp				; save callee-saved registers
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end ; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: plain ia32 routine

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization (PSRAD shift count)
	mov	ebp, esp			; remember esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-byte align the coefficient scratch area

	; push the coefficients as 16-bit words; qlp_coeff[0] ends up at the
	; highest address since the stack grows down
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad the packed array up to a multiple of 4 words, bumping
	; eax (order) to the padded count
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = coeff[0]:coeff[1]:coeff[2]:coeff[3] (words, coeff[3] in the low word)
	; prime the sliding history window:
	; mm4 = data[i-1]:data[i-2]:data[i-3]:data[i-4] packed to words
	movd	mm4, [edi - 16]
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4			; padded order == 4 has a dedicated loop
	jnbe	short .mmx_4more

	align 16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; 4 partial products coeff[j]*data[i-j-1]
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; horizontal add -> mm7 low dword = sum
	psrad	mm7, mm6			; sum >>= lp_quantization (arithmetic)
	movd	mm1, [esi]
	paddd	mm7, mm1			; += residual[i]
	movd	[edi], mm7			; data[i] = restored sample
	psllq	mm7, 48				; insert the new sample at the top of the
	psrlq	mm4, 16				; window, dropping the oldest word
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	shl	eax, 2				; eax = 16 - 4*(padded order):
	neg	eax				; byte offset from data[i] back to 4 entries
	add	eax, byte 16			; past the oldest tap, for the j loop below
	align 16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax			; ecx = &data[i - order + 4] (oldest taps first)
	mov	edx, esp			; edx = packed 16-bit coefficients

	movq	mm7, mm4
	pmaddwd	mm7, mm5			; start the sum with the 4 newest taps

	align 16
.mmx_4more_loop_j:
	; accumulate 4 older taps per pass: pack data[i-j-4..i-j-1] to words
	; and multiply by the matching packed coefficients
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8			; next 4 coefficients
	add	ecx, byte 16			; next 4 history samples
	cmp	ecx, edi			; until the window reaches data[i]
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; horizontal add -> mm7 low dword = sum
	psrad	mm7, mm6			; sum >>= lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1			; += residual[i]
	movd	[edi], mm7			; data[i] = restored sample
	psllq	mm7, 48				; slide the window and insert the new
	psrlq	mm4, 16				; sample as the newest word
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms					; make the FPU usable again
	mov	esp, ebp			; drop the packed-coefficient scratch area

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1469
1470end