blob: de2d1ff1f5c0124c18e29503de9d6eaf3a6dbde0 [file] [log] [blame]
; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001  Josh Coalson
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA  02111-1307, USA.
18
19%include "nasm.h"
20
21 data_section
22
Josh Coalsone6499bd2001-06-13 18:11:25 +000023cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
24cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
25cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
26cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
27cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
28cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
29cglobal FLAC__lpc_restore_signal_asm_ia32
30cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000031
32 code_section
33
34; **********************************************************************
35;
36; void FLAC__lpc_compute_autocorrelation_asm(const real data[], unsigned data_len, unsigned lag, real autoc[])
37; {
38; real d;
39; unsigned sample, coeff;
40; const unsigned limit = data_len - lag;
41;
42; FLAC__ASSERT(lag > 0);
43; FLAC__ASSERT(lag <= data_len);
44;
45; for(coeff = 0; coeff < lag; coeff++)
46; autoc[coeff] = 0.0;
47; for(sample = 0; sample <= limit; sample++) {
48; d = data[sample];
49; for(coeff = 0; coeff < lag; coeff++)
50; autoc[coeff] += d * data[sample+coeff];
51; }
52; for(; sample < data_len; sample++) {
53; d = data[sample];
54; for(coeff = 0; coeff < data_len - sample; coeff++)
55; autoc[coeff] += d * data[sample+coeff];
56; }
57; }
58;
59 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +000060cident FLAC__lpc_compute_autocorrelation_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +000061 ;[esp + 24] == autoc[]
62 ;[esp + 20] == lag
63 ;[esp + 16] == data_len
64 ;[esp + 12] == data[]
65
66 ;ASSERT(lag > 0)
67 ;ASSERT(lag <= 33)
68 ;ASSERT(lag <= data_len)
69
70.begin:
71 push esi
72 push edi
73
74 ; for(coeff = 0; coeff < lag; coeff++)
75 ; autoc[coeff] = 0.0;
76 mov edi, [esp + 24] ; edi == autoc
77 mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write
78 xor eax, eax
79 rep stosd
80
81 ; const unsigned limit = data_len - lag;
82 mov eax, [esp + 20] ; eax == lag
83 mov ecx, [esp + 16]
84 sub ecx, eax ; ecx == limit
85
86 mov edi, [esp + 24] ; edi == autoc
87 mov esi, [esp + 12] ; esi == data
88 inc ecx ; we are looping <= limit so we add one to the counter
89
90 ; for(sample = 0; sample <= limit; sample++) {
91 ; d = data[sample];
92 ; for(coeff = 0; coeff < lag; coeff++)
93 ; autoc[coeff] += d * data[sample+coeff];
94 ; }
95 fld dword [esi] ; ST = d <- data[sample]
96 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
97 lea edx, [eax + eax*2]
98 neg edx
99 lea edx, [eax + edx*4 + .jumper1_0]
100 inc edx ; compensate for the shorter opcode on the last iteration
101 inc edx ; compensate for the shorter opcode on the last iteration
102 inc edx ; compensate for the shorter opcode on the last iteration
103 cmp eax, 33
104 jne .loop1_start
105 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
106.loop1_start:
107 jmp edx
108
109 fld st0 ; ST = d d
110 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
111 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
112 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
113 fld st0 ; ST = d d
114 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
115 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
116 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
117 fld st0 ; ST = d d
118 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
119 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
120 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
121 fld st0 ; ST = d d
122 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
123 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
124 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
125 fld st0 ; ST = d d
126 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
127 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
128 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
129 fld st0 ; ST = d d
130 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
131 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
132 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
133 fld st0 ; ST = d d
134 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
135 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
136 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
137 fld st0 ; ST = d d
138 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
139 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
140 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
141 fld st0 ; ST = d d
142 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
143 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
144 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
145 fld st0 ; ST = d d
146 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
147 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
148 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
149 fld st0 ; ST = d d
150 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
151 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
152 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
153 fld st0 ; ST = d d
154 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
155 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
156 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
157 fld st0 ; ST = d d
158 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
159 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
160 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
161 fld st0 ; ST = d d
162 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
163 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
164 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
165 fld st0 ; ST = d d
166 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
167 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
168 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
169 fld st0 ; ST = d d
170 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
171 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
172 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
173 fld st0 ; ST = d d
174 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
175 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
176 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
177 fld st0 ; ST = d d
178 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
179 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
180 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
181 fld st0 ; ST = d d
182 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
183 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
184 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
185 fld st0 ; ST = d d
186 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
187 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
188 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
189 fld st0 ; ST = d d
190 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
191 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
192 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
193 fld st0 ; ST = d d
194 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
195 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
196 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
197 fld st0 ; ST = d d
198 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
199 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
200 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
201 fld st0 ; ST = d d
202 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
203 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
204 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
205 fld st0 ; ST = d d
206 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
207 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
208 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
209 fld st0 ; ST = d d
210 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
211 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
212 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
213 fld st0 ; ST = d d
214 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
215 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
216 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
217 fld st0 ; ST = d d
218 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
219 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
220 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
221 fld st0 ; ST = d d
222 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
223 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
224 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
225 fld st0 ; ST = d d
226 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
227 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
228 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
229 fld st0 ; ST = d d
230 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
231 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
232 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
233 fld st0 ; ST = d d
234 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
235 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
236 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
237 fld st0 ; ST = d d
238 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
239 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
240 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
241.jumper1_0:
242
243 fstp st0 ; pop d, ST = empty
244 add esi, byte 4 ; sample++
245 dec ecx
246 jz .loop1_end
247 fld dword [esi] ; ST = d <- data[sample]
248 jmp edx
249.loop1_end:
250
251 ; for(; sample < data_len; sample++) {
252 ; d = data[sample];
253 ; for(coeff = 0; coeff < data_len - sample; coeff++)
254 ; autoc[coeff] += d * data[sample+coeff];
255 ; }
256 mov ecx, [esp + 20] ; ecx <- lag
257 dec ecx ; ecx <- lag - 1
258 jz near .end ; skip loop if 0 (i.e. lag == 1)
259
260 fld dword [esi] ; ST = d <- data[sample]
261 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
262 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
263 lea edx, [eax + eax*2]
264 neg edx
265 lea edx, [eax + edx*4 + .jumper2_0]
266 inc edx ; compensate for the shorter opcode on the last iteration
267 inc edx ; compensate for the shorter opcode on the last iteration
268 inc edx ; compensate for the shorter opcode on the last iteration
269 jmp edx
270
271 fld st0 ; ST = d d
272 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
273 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
274 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
275 fld st0 ; ST = d d
276 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
277 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
278 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
279 fld st0 ; ST = d d
280 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
281 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
282 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
283 fld st0 ; ST = d d
284 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
285 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
286 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
287 fld st0 ; ST = d d
288 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
289 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
290 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
291 fld st0 ; ST = d d
292 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
293 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
294 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
295 fld st0 ; ST = d d
296 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
297 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
298 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
299 fld st0 ; ST = d d
300 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
301 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
302 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
303 fld st0 ; ST = d d
304 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
305 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
306 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
307 fld st0 ; ST = d d
308 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
309 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
310 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
311 fld st0 ; ST = d d
312 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
313 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
314 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
315 fld st0 ; ST = d d
316 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
317 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
318 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
319 fld st0 ; ST = d d
320 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
321 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
322 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
323 fld st0 ; ST = d d
324 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
325 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
326 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
327 fld st0 ; ST = d d
328 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
329 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
330 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
331 fld st0 ; ST = d d
332 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
333 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
334 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
335 fld st0 ; ST = d d
336 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
337 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
338 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
339 fld st0 ; ST = d d
340 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
341 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
342 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
343 fld st0 ; ST = d d
344 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
345 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
346 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
347 fld st0 ; ST = d d
348 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
349 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
350 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
351 fld st0 ; ST = d d
352 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
353 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
354 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
355 fld st0 ; ST = d d
356 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
357 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
358 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
359 fld st0 ; ST = d d
360 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
361 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
362 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
363 fld st0 ; ST = d d
364 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
365 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
366 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
367 fld st0 ; ST = d d
368 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
369 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
370 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
371 fld st0 ; ST = d d
372 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
373 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
374 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
375 fld st0 ; ST = d d
376 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
377 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
378 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
379 fld st0 ; ST = d d
380 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
381 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
382 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
383 fld st0 ; ST = d d
384 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
385 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
386 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
387 fld st0 ; ST = d d
388 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
389 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
390 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
391 fld st0 ; ST = d d
392 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
393 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
394 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
395 fld st0 ; ST = d d
396 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
397 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
398 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
399.jumper2_0:
400
401 fstp st0 ; pop d, ST = empty
402 add esi, byte 4 ; sample++
403 dec ecx
404 jz .loop2_end
405 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
406 fld dword [esi] ; ST = d <- data[sample]
407 jmp edx
408.loop2_end:
409
410.end:
411 pop edi
412 pop esi
413 ret
414
415 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000416cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000417 ;[esp + 16] == autoc[]
418 ;[esp + 12] == lag
419 ;[esp + 8] == data_len
420 ;[esp + 4] == data[]
421
422 ;ASSERT(lag > 0)
423 ;ASSERT(lag <= 4)
424 ;ASSERT(lag <= data_len)
425
426 ; for(coeff = 0; coeff < lag; coeff++)
427 ; autoc[coeff] = 0.0;
428 xorps xmm5, xmm5
429
430 mov edx, [esp + 8] ; edx == data_len
431 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
432
433 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
434 add eax, 4
435 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
436 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
437.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
438 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
439 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
440 dec edx
441 jz .loop_end
442 ALIGN 16
443.loop_start:
444 ; start by reading the next sample
445 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
446 add eax, 4
447 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
448 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
449 movss xmm2, xmm0
450 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
451 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
452 dec edx
453 jnz .loop_start
454.loop_end:
455 ; store autoc
456 mov edx, [esp + 16] ; edx == autoc
457 movups [edx], xmm5
458
459.end:
460 ret
461
462 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000463cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000464 ;[esp + 16] == autoc[]
465 ;[esp + 12] == lag
466 ;[esp + 8] == data_len
467 ;[esp + 4] == data[]
468
469 ;ASSERT(lag > 0)
470 ;ASSERT(lag <= 8)
471 ;ASSERT(lag <= data_len)
472
473 ; for(coeff = 0; coeff < lag; coeff++)
474 ; autoc[coeff] = 0.0;
475 xorps xmm5, xmm5
476 xorps xmm6, xmm6
477
478 mov edx, [esp + 8] ; edx == data_len
479 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
480
481 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
482 add eax, 4
483 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
484 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
485 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
486 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
487.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
488 mulps xmm0, xmm2
489 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
490 addps xmm5, xmm0
491 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
492 dec edx
493 jz .loop_end
494 ALIGN 16
495.loop_start:
496 ; start by reading the next sample
497 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
498 ; here we reorder the instructions; see the (#) indexes for a logical order
499 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
500 add eax, 4 ; (0)
501 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
502 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
503 movss xmm3, xmm2 ; (5)
504 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
505 movss xmm2, xmm0 ; (6)
506 mulps xmm1, xmm3 ; (8)
507 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
508 addps xmm6, xmm1 ; (10)
509 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
510 dec edx
511 jnz .loop_start
512.loop_end:
513 ; store autoc
514 mov edx, [esp + 16] ; edx == autoc
515 movups [edx], xmm5
516 movups [edx + 4], xmm6
517
518.end:
519 ret
520
521 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000522cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000523 ;[esp + 16] == autoc[]
524 ;[esp + 12] == lag
525 ;[esp + 8] == data_len
526 ;[esp + 4] == data[]
527
528 ;ASSERT(lag > 0)
529 ;ASSERT(lag <= 12)
530 ;ASSERT(lag <= data_len)
531
532 ; for(coeff = 0; coeff < lag; coeff++)
533 ; autoc[coeff] = 0.0;
534 xorps xmm5, xmm5
535 xorps xmm6, xmm6
536 xorps xmm7, xmm7
537
538 mov edx, [esp + 8] ; edx == data_len
539 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
540
541 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
542 add eax, 4
543 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
544 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
545 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
546 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
547.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
548 movaps xmm1, xmm0
549 mulps xmm1, xmm2
550 addps xmm5, xmm1
551 movaps xmm1, xmm0
552 mulps xmm1, xmm3
553 addps xmm6, xmm1
554 mulps xmm0, xmm4
555 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
556 dec edx
557 jz .loop_end
558 ALIGN 16
559.loop_start:
560 ; start by reading the next sample
561 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
562 add eax, 4
563 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
564
565 ; shift xmm4:xmm3:xmm2 left by one float
566 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
567 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
568 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
569 movss xmm4, xmm3
570 movss xmm3, xmm2
571 movss xmm2, xmm0
572
573 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
574 movaps xmm1, xmm0
575 mulps xmm1, xmm2
576 addps xmm5, xmm1
577 movaps xmm1, xmm0
578 mulps xmm1, xmm3
579 addps xmm6, xmm1
580 mulps xmm0, xmm4
581 addps xmm7, xmm0
582
583 dec edx
584 jnz .loop_start
585.loop_end:
586 ; store autoc
587 mov edx, [esp + 16] ; edx == autoc
588 movups [edx], xmm5
589 movups [edx + 4], xmm6
590 movups [edx + 8], xmm7
591
592.end:
593 ret
594
595;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
596;
597; for(i = 0; i < data_len; i++) {
598; sum = 0;
599; for(j = 0; j < order; j++)
600; sum += qlp_coeff[j] * data[i-j-1];
601; residual[i] = data[i] - (sum >> lp_quantization);
602; }
603;
604 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000605cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000606 ;[esp + 40] residual[]
607 ;[esp + 36] lp_quantization
608 ;[esp + 32] order
609 ;[esp + 28] qlp_coeff[]
610 ;[esp + 24] data_len
611 ;[esp + 20] data[]
612
613 ;ASSERT(order > 0)
614
615 push ebp
616 push ebx
617 push esi
618 push edi
619
620 mov esi, [esp + 20] ; esi = data[]
621 mov edi, [esp + 40] ; edi = residual[]
622 mov eax, [esp + 32] ; eax = order
623 mov ebx, [esp + 24] ; ebx = data_len
624
625 test ebx, ebx
626 jz near .end ; do nothing if data_len == 0
627.begin:
628 cmp eax, byte 1
629 jg short .i_1more
630
631 mov ecx, [esp + 28]
632 mov edx, [ecx] ; edx = qlp_coeff[0]
633 mov eax, [esi - 4] ; eax = data[-1]
634 mov cl, [esp + 36] ; cl = lp_quantization
635 ALIGN 16
636.i_1_loop_i:
637 imul eax, edx
638 sar eax, cl
639 neg eax
640 add eax, [esi]
641 mov [edi], eax
642 mov eax, [esi]
643 add edi, byte 4
644 add esi, byte 4
645 dec ebx
646 jnz .i_1_loop_i
647
648 jmp .end
649
650.i_1more:
651 cmp eax, byte 32 ; for order <= 32 there is a faster routine
652 jbe short .i_32
653
654 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
655 ALIGN 16
656.i_32more_loop_i:
657 xor ebp, ebp
658 mov ecx, [esp + 32]
659 mov edx, ecx
660 shl edx, 2
661 add edx, [esp + 28]
662 neg ecx
663 ALIGN 16
664.i_32more_loop_j:
665 sub edx, byte 4
666 mov eax, [edx]
667 imul eax, [esi + 4 * ecx]
668 add ebp, eax
669 inc ecx
670 jnz short .i_32more_loop_j
671
672 mov cl, [esp + 36]
673 sar ebp, cl
674 neg ebp
675 add ebp, [esi]
676 mov [edi], ebp
677 add esi, byte 4
678 add edi, byte 4
679
680 dec ebx
681 jnz .i_32more_loop_i
682
683 jmp .end
684
685.i_32:
686 sub edi, esi
687 neg eax
688 lea edx, [eax + eax * 8 + .jumper_0]
689 inc edx
690 mov eax, [esp + 28] ; eax = qlp_coeff[]
691 xor ebp, ebp
692 jmp edx
693
694 mov ecx, [eax + 124]
695 imul ecx, [esi - 128]
696 add ebp, ecx
697 mov ecx, [eax + 120]
698 imul ecx, [esi - 124]
699 add ebp, ecx
700 mov ecx, [eax + 116]
701 imul ecx, [esi - 120]
702 add ebp, ecx
703 mov ecx, [eax + 112]
704 imul ecx, [esi - 116]
705 add ebp, ecx
706 mov ecx, [eax + 108]
707 imul ecx, [esi - 112]
708 add ebp, ecx
709 mov ecx, [eax + 104]
710 imul ecx, [esi - 108]
711 add ebp, ecx
712 mov ecx, [eax + 100]
713 imul ecx, [esi - 104]
714 add ebp, ecx
715 mov ecx, [eax + 96]
716 imul ecx, [esi - 100]
717 add ebp, ecx
718 mov ecx, [eax + 92]
719 imul ecx, [esi - 96]
720 add ebp, ecx
721 mov ecx, [eax + 88]
722 imul ecx, [esi - 92]
723 add ebp, ecx
724 mov ecx, [eax + 84]
725 imul ecx, [esi - 88]
726 add ebp, ecx
727 mov ecx, [eax + 80]
728 imul ecx, [esi - 84]
729 add ebp, ecx
730 mov ecx, [eax + 76]
731 imul ecx, [esi - 80]
732 add ebp, ecx
733 mov ecx, [eax + 72]
734 imul ecx, [esi - 76]
735 add ebp, ecx
736 mov ecx, [eax + 68]
737 imul ecx, [esi - 72]
738 add ebp, ecx
739 mov ecx, [eax + 64]
740 imul ecx, [esi - 68]
741 add ebp, ecx
742 mov ecx, [eax + 60]
743 imul ecx, [esi - 64]
744 add ebp, ecx
745 mov ecx, [eax + 56]
746 imul ecx, [esi - 60]
747 add ebp, ecx
748 mov ecx, [eax + 52]
749 imul ecx, [esi - 56]
750 add ebp, ecx
751 mov ecx, [eax + 48]
752 imul ecx, [esi - 52]
753 add ebp, ecx
754 mov ecx, [eax + 44]
755 imul ecx, [esi - 48]
756 add ebp, ecx
757 mov ecx, [eax + 40]
758 imul ecx, [esi - 44]
759 add ebp, ecx
760 mov ecx, [eax + 36]
761 imul ecx, [esi - 40]
762 add ebp, ecx
763 mov ecx, [eax + 32]
764 imul ecx, [esi - 36]
765 add ebp, ecx
766 mov ecx, [eax + 28]
767 imul ecx, [esi - 32]
768 add ebp, ecx
769 mov ecx, [eax + 24]
770 imul ecx, [esi - 28]
771 add ebp, ecx
772 mov ecx, [eax + 20]
773 imul ecx, [esi - 24]
774 add ebp, ecx
775 mov ecx, [eax + 16]
776 imul ecx, [esi - 20]
777 add ebp, ecx
778 mov ecx, [eax + 12]
779 imul ecx, [esi - 16]
780 add ebp, ecx
781 mov ecx, [eax + 8]
782 imul ecx, [esi - 12]
783 add ebp, ecx
784 mov ecx, [eax + 4]
785 imul ecx, [esi - 8]
786 add ebp, ecx
787 mov ecx, [eax] ; there is one byte missing
788 imul ecx, [esi - 4]
789 add ebp, ecx
790.jumper_0:
791
792 mov cl, [esp + 36]
793 sar ebp, cl
794 neg ebp
795 add ebp, [esi]
796 mov [edi + esi], ebp
797 add esi, byte 4
798
799 dec ebx
800 jz short .end
801 xor ebp, ebp
802 jmp edx
803
804.end:
805 pop edi
806 pop esi
807 pop ebx
808 pop ebp
809 ret
810
811; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
812; the channel must be <= 16. Especially note that this routine cannot be used
813; for side-channel coded 16bps channels since the effective bps is 17.
;
; MMX version of FLAC__lpc_compute_residual_from_qlp_coefficients.
; NOTE(review): argument layout inferred from the [esp + N] comments below
; (cdecl; four registers pushed, so the first argument sits at [esp + 20]) --
; confirm against the C prototype:
;   (data[], data_len, qlp_coeff[], order, lp_quantization, residual[])
;
; Plan: copy qlp_coeff[] onto the stack as 16-bit words (zero-padded up to a
; multiple of 4), keep the last four samples packed as 16-bit words in mm4,
; and use pmaddwd to do four multiply-adds per instruction, emitting two
; residual samples per loop pass.
814 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000815cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000816 ;[esp + 40] residual[]
817 ;[esp + 36] lp_quantization
818 ;[esp + 32] order
819 ;[esp + 28] qlp_coeff[]
820 ;[esp + 24] data_len
821 ;[esp + 20] data[]
822
823 ;ASSERT(order > 0)
824
825 push ebp
826 push ebx
827 push esi
828 push edi
829
830 mov esi, [esp + 20] ; esi = data[]
831 mov edi, [esp + 40] ; edi = residual[]
832 mov eax, [esp + 32] ; eax = order
833 mov ebx, [esp + 24] ; ebx = data_len
834
835 test ebx, ebx
836 jz near .end ; do nothing if data_len == 0
; ebx = data_len - 1; if that is zero only one sample remains, which the
; scalar (non-MMX) routine handles via .last_one below
837 dec ebx
838 test ebx, ebx
839 jz near .last_one
840
841 mov edx, [esp + 28] ; edx = qlp_coeff[]
842 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
; save esp in ebp (restored at .mmx_end); esp is about to be realigned
843 mov ebp, esp
844
; align the stack to 8 so the movq reads of the word buffer below are aligned
845 and esp, 0xfffffff8
846
; push each 32-bit qlp_coeff[] entry as one 16-bit word; the stack grows
; down, so in memory qlp_coeff[0] ends up at the highest address
847 xor ecx, ecx
848.copy_qlp_loop:
849 push word [edx + 4 * ecx]
850 inc ecx
851 cmp ecx, eax
852 jnz short .copy_qlp_loop
853
; zero-pad the word buffer so the coefficient count (eax) becomes a
; multiple of 4 (here ecx == order, so ecx & 3 is the remainder)
854 and ecx, 0x3
855 test ecx, ecx
856 je short .za_end
857 sub ecx, byte 4
858.za_loop:
859 push word 0
860 inc eax
861 inc ecx
862 jnz short .za_loop
863.za_end:
864
; mm5 = the four words at the top of the buffer: {q[3],q[2],q[1],q[0]}
; low-to-high word (zero-filled when order < 4)
865 movq mm5, [esp + 2 * eax - 8]
; mm4 = previous four samples {data[i-4]..data[i-1]} packed to words;
; packssdw saturates to 16 bits -- hence the WATCHOUT at the top
866 movd mm4, [esi - 16]
867 punpckldq mm4, [esi - 12]
868 movd mm0, [esi - 8]
869 punpckldq mm0, [esi - 4]
870 packssdw mm4, mm0
871
; orders 1..4 fit entirely in the single packed quad mm5
872 cmp eax, byte 4
873 jnbe short .mmx_4more
874
; loop invariant: mm4 = {data[i-4]..data[i-1]} (low to high word); each
; pass consumes data[i], data[i+1] and emits residual[i], residual[i+1]
875 align 16
876.mmx_4_loop_i:
877 movd mm1, [esi]
; mm3 = window for sample i
878 movq mm3, mm4
879 punpckldq mm1, [esi + 4]
880 psrlq mm4, 16
881 movq mm0, mm1
882 psllq mm0, 48
883 por mm4, mm0
; mm2 = window for sample i+1 (window just advanced by one sample)
884 movq mm2, mm4
885 psrlq mm4, 16
886 pxor mm0, mm0
887 punpckhdq mm0, mm1
; pmaddwd: each register now holds two partial sums of two products
888 pmaddwd mm3, mm5
889 pmaddwd mm2, mm5
890 psllq mm0, 16
891 por mm4, mm0
; combine the partials: mm3 = {sum for sample i, sum for sample i+1}
892 movq mm0, mm3
893 punpckldq mm3, mm2
894 punpckhdq mm0, mm2
895 paddd mm3, mm0
; residual = data - (sum >> lp_quantization)
896 psrad mm3, mm6
897 psubd mm1, mm3
898 movd [edi], mm1
899 punpckhdq mm1, mm1
900 movd [edi + 4], mm1
901
902 add edi, byte 8
903 add esi, byte 8
904
905 sub ebx, 2
906 jg .mmx_4_loop_i
907 jmp .mmx_end
908
; order > 4: mm4/mm5 cover only the newest four taps; older history is
; re-read from memory four samples at a time in .mmx_4more_loop_j
909.mmx_4more:
; eax = 16 - 4*padded_order = byte offset from &data[i] back to just past
; the oldest history quad
910 shl eax, 2
911 neg eax
912 add eax, byte 16
913
914 align 16
915.mmx_4more_loop_i:
916 movd mm1, [esi]
917 punpckldq mm1, [esi + 4]
918 movq mm3, mm4
919 psrlq mm4, 16
920 movq mm0, mm1
921 psllq mm0, 48
922 por mm4, mm0
923 movq mm2, mm4
924 psrlq mm4, 16
925 pxor mm0, mm0
926 punpckhdq mm0, mm1
927 pmaddwd mm3, mm5
928 pmaddwd mm2, mm5
929 psllq mm0, 16
930 por mm4, mm0
931
; ecx walks the older history from the oldest quad up to &data[i];
; edx walks the 16-bit coefficient buffer (lowest address = highest-index
; coefficients, matching the oldest samples)
932 mov ecx, esi
933 add ecx, eax
934 mov edx, esp
935
; accumulate the remaining taps: mm3 accrues pairs for sample i, mm2 for
; sample i+1 (whose source window is staggered one sample later)
936 align 16
937.mmx_4more_loop_j:
938 movd mm0, [ecx - 16]
939 movd mm7, [ecx - 8]
940 punpckldq mm0, [ecx - 12]
941 punpckldq mm7, [ecx - 4]
942 packssdw mm0, mm7
943 pmaddwd mm0, [edx]
944 punpckhdq mm7, mm7
945 paddd mm3, mm0
946 movd mm0, [ecx - 12]
947 punpckldq mm0, [ecx - 8]
948 punpckldq mm7, [ecx]
949 packssdw mm0, mm7
950 pmaddwd mm0, [edx]
951 paddd mm2, mm0
952
953 add edx, byte 8
954 add ecx, byte 16
955 cmp ecx, esi
956 jnz .mmx_4more_loop_j
957
; combine partials exactly as in the order<=4 loop: mm3 = {sum_i, sum_i+1}
958 movq mm0, mm3
959 punpckldq mm3, mm2
960 punpckhdq mm0, mm2
961 paddd mm3, mm0
962 psrad mm3, mm6
963 psubd mm1, mm3
964 movd [edi], mm1
965 punpckhdq mm1, mm1
966 movd [edi + 4], mm1
967
968 add edi, byte 8
969 add esi, byte 8
970
971 sub ebx, 2
972 jg near .mmx_4more_loop_i
973
974.mmx_end:
975 emms
976 mov esp, ebp
; entry for the final sample when data_len is odd (ebx+1 = samples left);
; reload eax = order and reuse the plain ia32 routine's loop at .begin
; (esi/edi have already been advanced past the processed samples)
977.last_one:
978 mov eax, [esp + 32]
979 inc ebx
Josh Coalsone6499bd2001-06-13 18:11:25 +0000980 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000981
982.end:
983 pop edi
984 pop esi
985 pop ebx
986 pop ebp
987 ret
988
989; **********************************************************************
990;
991; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])
992; {
993; unsigned i, j;
994; int32 sum;
995;
996; FLAC__ASSERT(order > 0);
997;
998; for(i = 0; i < data_len; i++) {
999; sum = 0;
1000; for(j = 0; j < order; j++)
1001; sum += qlp_coeff[j] * data[i-j-1];
1002; data[i] = residual[i] + (sum >> lp_quantization);
1003; }
1004; }
1005 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001006cident FLAC__lpc_restore_signal_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001007 ;[esp + 40] data[]
1008 ;[esp + 36] lp_quantization
1009 ;[esp + 32] order
1010 ;[esp + 28] qlp_coeff[]
1011 ;[esp + 24] data_len
1012 ;[esp + 20] residual[]
1013
1014 ;ASSERT(order > 0)
1015
1016 push ebp
1017 push ebx
1018 push esi
1019 push edi
1020
1021 mov esi, [esp + 20] ; esi = residual[]
1022 mov edi, [esp + 40] ; edi = data[]
1023 mov eax, [esp + 32] ; eax = order
1024 mov ebx, [esp + 24] ; ebx = data_len
1025
1026 test ebx, ebx
1027 jz near .end ; do nothing if data_len == 0
1028
; .begin is also the tail-call entry used by the MMX routines for samples
; they cannot handle (esi/edi/eax/ebx are expected to be set up as above)
1029.begin:
1030 cmp eax, byte 1
1031 jg short .x87_1more
1032
; special case order == 1: keep qlp_coeff[0] in edx and carry the previous
; output sample across iterations in eax (the value just stored)
1033 mov ecx, [esp + 28]
1034 mov edx, [ecx]
1035 mov eax, [edi - 4]
; only the low byte of lp_quantization is needed as a shift count
1036 mov cl, [esp + 36]
1037 ALIGN 16
1038.x87_1_loop_i:
1039 imul eax, edx
1040 sar eax, cl
1041 add eax, [esi]
1042 mov [edi], eax
1043 add esi, byte 4
1044 add edi, byte 4
1045 dec ebx
1046 jnz .x87_1_loop_i
1047
1048 jmp .end
1049
1050.x87_1more:
1051 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1052 jbe short .x87_32
1053
1054 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
; generic order > 32 path: edx walks qlp_coeff[] backwards while ecx counts
; -order..-1, so [edi + 4*ecx] addresses data[i-j-1]; ebp accumulates sum
1055 ALIGN 16
1056.x87_32more_loop_i:
1057 xor ebp, ebp
1058 mov ecx, [esp + 32]
1059 mov edx, ecx
1060 shl edx, 2
1061 add edx, [esp + 28]
1062 neg ecx
1063 ALIGN 16
1064.x87_32more_loop_j:
1065 sub edx, byte 4
1066 mov eax, [edx]
1067 imul eax, [edi + 4 * ecx]
1068 add ebp, eax
1069 inc ecx
1070 jnz short .x87_32more_loop_j
1071
1072 mov cl, [esp + 36]
1073 sar ebp, cl
1074 add ebp, [esi]
1075 mov [edi], ebp
1076 add edi, byte 4
1077 add esi, byte 4
1078
1079 dec ebx
1080 jnz .x87_32more_loop_i
1081
1082 jmp .end
1083
; order <= 32: computed jump ("Duff's device") into a fully unrolled dot
; product. Each unrolled tap below is 3 instructions / 9 bytes
; (mov ecx,[eax+k] / imul ecx,[edi-k'] / add ebp,ecx), so jumping to
; .jumper_0 - 9*order executes exactly 'order' taps. The very last tap's
; 'mov ecx, [eax]' encoding is one byte shorter (no displacement byte),
; which the 'inc edx' below compensates for.
1084.x87_32:
; esi = residual - data, so [esi + edi] = &residual[i] while edi tracks &data[i]
1085 sub esi, edi
1086 neg eax
1087 lea edx, [eax + eax * 8 + .jumper_0]
1088 inc edx ; compensate for the shorter opcode on the last iteration
1089 mov eax, [esp + 28] ; eax = qlp_coeff[]
; ebp = sum = 0
1090 xor ebp, ebp
1091 jmp edx
1092
1093 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1094 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1095 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1096 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1097 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1098 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1099 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1100 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1101 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1102 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1103 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1104 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1105 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1106 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1107 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1108 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1109 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1110 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1111 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1112 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1113 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1114 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1115 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1116 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1117 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1118 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1119 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1120 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1121 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1122 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1123 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1124 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1125 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1126 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1127 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1128 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1129 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1130 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1131 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1132 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1133 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1134 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1135 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1136 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1137 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1138 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1139 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1140 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1141 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1142 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1143 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1144 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1145 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1146 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1147 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1148 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1149 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1150 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1151 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1152 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1153 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1154 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1155 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1156 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1157 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1158 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1159 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1160 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1161 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1162 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1163 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1164 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1165 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1166 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1167 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1168 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1169 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1170 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1171 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1172 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1173 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1174 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1175 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1176 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1177 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1178 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1179 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1180 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1181 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1182 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1183 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1184 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1185 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1186 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1187 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1188 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1189.jumper_0:
1190
1191 mov cl, [esp + 36]
1192 sar ebp, cl ; ebp = (sum >> lp_quantization)
1193 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1194 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1195 add edi, byte 4
1196
1197 dec ebx
1198 jz short .end
; reset sum and re-enter the unrolled taps at the same computed target
1199 xor ebp, ebp
1200 jmp edx
1201
1202.end:
1203 pop edi
1204 pop esi
1205 pop ebx
1206 pop ebp
1207 ret
1208
1209; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1210; the channel must be <= 16. Especially note that this routine cannot be used
1211; for side-channel coded 16bps channels since the effective bps is 17.
;
; MMX version of FLAC__lpc_restore_signal_asm_ia32 (same argument layout,
; see the [esp + N] comments below). Requires order >= 4; smaller orders
; tail-call the plain ia32 routine. Coefficients and the reconstructed
; sample history are packed to 16-bit words so pmaddwd does 4 taps at once.
1212 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001213cident FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001214 ;[esp + 40] data[]
1215 ;[esp + 36] lp_quantization
1216 ;[esp + 32] order
1217 ;[esp + 28] qlp_coeff[]
1218 ;[esp + 24] data_len
1219 ;[esp + 20] residual[]
1220
1221 ;ASSERT(order > 0)
1222
1223 push ebp
1224 push ebx
1225 push esi
1226 push edi
1227
1228 mov esi, [esp + 20]
1229 mov edi, [esp + 40]
1230 mov eax, [esp + 32]
1231 mov ebx, [esp + 24]
1232
1233 test ebx, ebx
1234 jz near .end ; do nothing if data_len == 0
; orders < 4 cannot fill a packed quad of coefficients; use the general routine
1235 cmp eax, byte 4
Josh Coalsone6499bd2001-06-13 18:11:25 +00001236 jb near FLAC__lpc_restore_signal_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001237
1238 mov edx, [esp + 28]
; mm6 = 0:lp_quantization (used as the psrad shift count)
1239 movd mm6, [esp + 36]
; save esp in ebp (restored at .mmx_end), then 8-byte-align it for movq
1240 mov ebp, esp
1241
1242 and esp, 0xfffffff8
1243
; as in the residual MMX routine above: push each qlp_coeff[] entry as a
; 16-bit word (stack grows down, so qlp_coeff[0] lands at the highest address)
1244 xor ecx, ecx
1245.copy_qlp_loop:
1246 push word [edx + 4 * ecx]
1247 inc ecx
1248 cmp ecx, eax
1249 jnz short .copy_qlp_loop
1250
; zero-pad the word buffer and bump eax (order) up to a multiple of 4
1251 and ecx, 0x3
1252 test ecx, ecx
1253 je short .za_end
1254 sub ecx, byte 4
1255.za_loop:
1256 push word 0
1257 inc eax
1258 inc ecx
1259 jnz short .za_loop
1260.za_end:
1261
; mm5 = top four words of the buffer = {q[3],q[2],q[1],q[0]} low-to-high;
; mm4 = previous four outputs {data[i-4]..data[i-1]} packed to words
; (packssdw saturates to 16 bits -- hence the WATCHOUT at the top)
1262 movq mm5, [esp + 2 * eax - 8]
1263 movd mm4, [edi - 16]
1264 punpckldq mm4, [edi - 12]
1265 movd mm0, [edi - 8]
1266 punpckldq mm0, [edi - 4]
1267 packssdw mm4, mm0
1268
1269 cmp eax, byte 4
1270 jnbe short .mmx_4more
1271
; order == 4: one pmaddwd + horizontal add per output sample
1272 align 16
1273.mmx_4_loop_i:
1274 movq mm7, mm4
1275 pmaddwd mm7, mm5
; horizontal add of the two pmaddwd partials, then >> lp_quantization
1276 movq mm0, mm7
1277 punpckhdq mm7, mm7
1278 paddd mm7, mm0
1279 psrad mm7, mm6
; data[i] = residual[i] + (sum >> lp_quantization)
1280 movd mm1, [esi]
1281 paddd mm7, mm1
1282 movd [edi], mm7
; slide the history window: drop the oldest word, insert the new sample's
; low 16 bits at the top of mm4
1283 psllq mm7, 48
1284 psrlq mm4, 16
1285 por mm4, mm7
1286
1287 add esi, byte 4
1288 add edi, byte 4
1289
1290 dec ebx
1291 jnz .mmx_4_loop_i
1292 jmp .mmx_end
; order > 4: mm4/mm5 cover the newest four taps; older taps are re-read
; from the already-reconstructed data[] four samples at a time
1293.mmx_4more:
; eax = 16 - 4*padded_order = byte offset from &data[i] back to just past
; the oldest history quad
1294 shl eax, 2
1295 neg eax
1296 add eax, byte 16
1297 align 16
1298.mmx_4more_loop_i:
; ecx walks the older history up to &data[i]; edx walks the coefficient
; word buffer (lowest address = highest-index coefficients)
1299 mov ecx, edi
1300 add ecx, eax
1301 mov edx, esp
1302
; start the sum with the newest four taps
1303 movq mm7, mm4
1304 pmaddwd mm7, mm5
1305
1306 align 16
1307.mmx_4more_loop_j:
1308 movd mm0, [ecx - 16]
1309 punpckldq mm0, [ecx - 12]
1310 movd mm1, [ecx - 8]
1311 punpckldq mm1, [ecx - 4]
1312 packssdw mm0, mm1
1313 pmaddwd mm0, [edx]
1314 paddd mm7, mm0
1315
1316 add edx, byte 8
1317 add ecx, byte 16
1318 cmp ecx, edi
1319 jnz .mmx_4more_loop_j
1320
; horizontal add, shift, add residual, store, and slide the window --
; identical tail to the order==4 loop above
1321 movq mm0, mm7
1322 punpckhdq mm7, mm7
1323 paddd mm7, mm0
1324 psrad mm7, mm6
1325 movd mm1, [esi]
1326 paddd mm7, mm1
1327 movd [edi], mm7
1328 psllq mm7, 48
1329 psrlq mm4, 16
1330 por mm4, mm7
1331
1332 add esi, byte 4
1333 add edi, byte 4
1334
1335 dec ebx
1336 jnz short .mmx_4more_loop_i
1337.mmx_end:
1338 emms
1339 mov esp, ebp
1340
1341.end:
1342 pop edi
1343 pop esi
1344 pop ebx
1345 pop ebp
1346 ret
1347
1348end