blob: de2d1ff1f5c0124c18e29503de9d6eaf3a6dbde0 [file] [log] [blame]
; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001  Josh Coalson
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA  02111-1307, USA.
18
19%include "nasm.h"
20
21 data_section
22
Josh Coalsone6499bd2001-06-13 18:11:25 +000023cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
24cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
25cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
26cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
27cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
28cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
29cglobal FLAC__lpc_restore_signal_asm_ia32
30cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +000031
32 code_section
33
34; **********************************************************************
35;
36; void FLAC__lpc_compute_autocorrelation_asm(const real data[], unsigned data_len, unsigned lag, real autoc[])
37; {
38; real d;
39; unsigned sample, coeff;
40; const unsigned limit = data_len - lag;
41;
42; FLAC__ASSERT(lag > 0);
43; FLAC__ASSERT(lag <= data_len);
44;
45; for(coeff = 0; coeff < lag; coeff++)
46; autoc[coeff] = 0.0;
47; for(sample = 0; sample <= limit; sample++) {
48; d = data[sample];
49; for(coeff = 0; coeff < lag; coeff++)
50; autoc[coeff] += d * data[sample+coeff];
51; }
52; for(; sample < data_len; sample++) {
53; d = data[sample];
54; for(coeff = 0; coeff < data_len - sample; coeff++)
55; autoc[coeff] += d * data[sample+coeff];
56; }
57; }
58;
59 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +000060cident FLAC__lpc_compute_autocorrelation_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +000061 ;[esp + 24] == autoc[]
62 ;[esp + 20] == lag
63 ;[esp + 16] == data_len
64 ;[esp + 12] == data[]
65
66 ;ASSERT(lag > 0)
67 ;ASSERT(lag <= 33)
68 ;ASSERT(lag <= data_len)
69
70.begin:
71 push esi
72 push edi
73
74 ; for(coeff = 0; coeff < lag; coeff++)
75 ; autoc[coeff] = 0.0;
76 mov edi, [esp + 24] ; edi == autoc
77 mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write
78 xor eax, eax
79 rep stosd
80
81 ; const unsigned limit = data_len - lag;
82 mov eax, [esp + 20] ; eax == lag
83 mov ecx, [esp + 16]
84 sub ecx, eax ; ecx == limit
85
86 mov edi, [esp + 24] ; edi == autoc
87 mov esi, [esp + 12] ; esi == data
88 inc ecx ; we are looping <= limit so we add one to the counter
89
90 ; for(sample = 0; sample <= limit; sample++) {
91 ; d = data[sample];
92 ; for(coeff = 0; coeff < lag; coeff++)
93 ; autoc[coeff] += d * data[sample+coeff];
94 ; }
95 fld dword [esi] ; ST = d <- data[sample]
96 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
97 lea edx, [eax + eax*2]
98 neg edx
99 lea edx, [eax + edx*4 + .jumper1_0]
100 inc edx ; compensate for the shorter opcode on the last iteration
101 inc edx ; compensate for the shorter opcode on the last iteration
102 inc edx ; compensate for the shorter opcode on the last iteration
103 cmp eax, 33
104 jne .loop1_start
105 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
106.loop1_start:
107 jmp edx
108
109 fld st0 ; ST = d d
110 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
111 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
112 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
113 fld st0 ; ST = d d
114 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
115 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
116 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
117 fld st0 ; ST = d d
118 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
119 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
120 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
121 fld st0 ; ST = d d
122 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
123 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
124 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
125 fld st0 ; ST = d d
126 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
127 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
128 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
129 fld st0 ; ST = d d
130 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
131 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
132 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
133 fld st0 ; ST = d d
134 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
135 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
136 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
137 fld st0 ; ST = d d
138 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
139 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
140 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
141 fld st0 ; ST = d d
142 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
143 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
144 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
145 fld st0 ; ST = d d
146 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
147 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
148 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
149 fld st0 ; ST = d d
150 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
151 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
152 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
153 fld st0 ; ST = d d
154 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
155 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
156 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
157 fld st0 ; ST = d d
158 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
159 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
160 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
161 fld st0 ; ST = d d
162 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
163 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
164 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
165 fld st0 ; ST = d d
166 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
167 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
168 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
169 fld st0 ; ST = d d
170 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
171 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
172 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
173 fld st0 ; ST = d d
174 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
175 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
176 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
177 fld st0 ; ST = d d
178 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
179 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
180 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
181 fld st0 ; ST = d d
182 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
183 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
184 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
185 fld st0 ; ST = d d
186 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
187 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
188 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
189 fld st0 ; ST = d d
190 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
191 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
192 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
193 fld st0 ; ST = d d
194 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
195 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
196 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
197 fld st0 ; ST = d d
198 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
199 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
200 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
201 fld st0 ; ST = d d
202 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
203 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
204 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
205 fld st0 ; ST = d d
206 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
207 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
208 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
209 fld st0 ; ST = d d
210 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
211 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
212 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
213 fld st0 ; ST = d d
214 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
215 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
216 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
217 fld st0 ; ST = d d
218 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
219 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
220 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
221 fld st0 ; ST = d d
222 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
223 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
224 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
225 fld st0 ; ST = d d
226 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
227 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
228 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
229 fld st0 ; ST = d d
230 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
231 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
232 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
233 fld st0 ; ST = d d
234 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
235 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
236 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
237 fld st0 ; ST = d d
238 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
239 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
240 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
241.jumper1_0:
242
243 fstp st0 ; pop d, ST = empty
244 add esi, byte 4 ; sample++
245 dec ecx
246 jz .loop1_end
247 fld dword [esi] ; ST = d <- data[sample]
248 jmp edx
249.loop1_end:
250
251 ; for(; sample < data_len; sample++) {
252 ; d = data[sample];
253 ; for(coeff = 0; coeff < data_len - sample; coeff++)
254 ; autoc[coeff] += d * data[sample+coeff];
255 ; }
256 mov ecx, [esp + 20] ; ecx <- lag
257 dec ecx ; ecx <- lag - 1
258 jz near .end ; skip loop if 0 (i.e. lag == 1)
259
260 fld dword [esi] ; ST = d <- data[sample]
261 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
262 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
263 lea edx, [eax + eax*2]
264 neg edx
265 lea edx, [eax + edx*4 + .jumper2_0]
266 inc edx ; compensate for the shorter opcode on the last iteration
267 inc edx ; compensate for the shorter opcode on the last iteration
268 inc edx ; compensate for the shorter opcode on the last iteration
269 jmp edx
270
271 fld st0 ; ST = d d
272 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
273 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
274 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
275 fld st0 ; ST = d d
276 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
277 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
278 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
279 fld st0 ; ST = d d
280 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
281 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
282 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
283 fld st0 ; ST = d d
284 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
285 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
286 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
287 fld st0 ; ST = d d
288 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
289 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
290 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
291 fld st0 ; ST = d d
292 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
293 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
294 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
295 fld st0 ; ST = d d
296 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
297 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
298 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
299 fld st0 ; ST = d d
300 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
301 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
302 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
303 fld st0 ; ST = d d
304 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
305 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
306 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
307 fld st0 ; ST = d d
308 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
309 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
310 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
311 fld st0 ; ST = d d
312 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
313 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
314 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
315 fld st0 ; ST = d d
316 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
317 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
318 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
319 fld st0 ; ST = d d
320 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
321 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
322 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
323 fld st0 ; ST = d d
324 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
325 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
326 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
327 fld st0 ; ST = d d
328 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
329 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
330 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
331 fld st0 ; ST = d d
332 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
333 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
334 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
335 fld st0 ; ST = d d
336 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
337 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
338 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
339 fld st0 ; ST = d d
340 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
341 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
342 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
343 fld st0 ; ST = d d
344 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
345 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
346 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
347 fld st0 ; ST = d d
348 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
349 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
350 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
351 fld st0 ; ST = d d
352 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
353 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
354 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
355 fld st0 ; ST = d d
356 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
357 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
358 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
359 fld st0 ; ST = d d
360 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
361 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
362 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
363 fld st0 ; ST = d d
364 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
365 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
366 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
367 fld st0 ; ST = d d
368 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
369 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
370 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
371 fld st0 ; ST = d d
372 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
373 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
374 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
375 fld st0 ; ST = d d
376 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
377 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
378 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
379 fld st0 ; ST = d d
380 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
381 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
382 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
383 fld st0 ; ST = d d
384 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
385 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
386 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
387 fld st0 ; ST = d d
388 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
389 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
390 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
391 fld st0 ; ST = d d
392 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
393 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
394 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
395 fld st0 ; ST = d d
396 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
397 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
398 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
399.jumper2_0:
400
401 fstp st0 ; pop d, ST = empty
402 add esi, byte 4 ; sample++
403 dec ecx
404 jz .loop2_end
405 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
406 fld dword [esi] ; ST = d <- data[sample]
407 jmp edx
408.loop2_end:
409
410.end:
411 pop edi
412 pop esi
413 ret
414
415 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000416cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000417 ;[esp + 16] == autoc[]
418 ;[esp + 12] == lag
419 ;[esp + 8] == data_len
420 ;[esp + 4] == data[]
421
422 ;ASSERT(lag > 0)
423 ;ASSERT(lag <= 4)
424 ;ASSERT(lag <= data_len)
425
426 ; for(coeff = 0; coeff < lag; coeff++)
427 ; autoc[coeff] = 0.0;
428 xorps xmm5, xmm5
429
430 mov edx, [esp + 8] ; edx == data_len
431 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
432
433 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
434 add eax, 4
435 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
436 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
437.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
438 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
439 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
440 dec edx
441 jz .loop_end
442 ALIGN 16
443.loop_start:
444 ; start by reading the next sample
445 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
446 add eax, 4
447 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
448 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
449 movss xmm2, xmm0
450 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
451 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
452 dec edx
453 jnz .loop_start
454.loop_end:
455 ; store autoc
456 mov edx, [esp + 16] ; edx == autoc
457 movups [edx], xmm5
458
459.end:
460 ret
461
462 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000463cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000464 ;[esp + 16] == autoc[]
465 ;[esp + 12] == lag
466 ;[esp + 8] == data_len
467 ;[esp + 4] == data[]
468
469 ;ASSERT(lag > 0)
470 ;ASSERT(lag <= 8)
471 ;ASSERT(lag <= data_len)
472
473 ; for(coeff = 0; coeff < lag; coeff++)
474 ; autoc[coeff] = 0.0;
475 xorps xmm5, xmm5
476 xorps xmm6, xmm6
477
478 mov edx, [esp + 8] ; edx == data_len
479 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
480
481 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
482 add eax, 4
483 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
484 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
485 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
486 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
487.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
488 mulps xmm0, xmm2
489 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
490 addps xmm5, xmm0
491 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
492 dec edx
493 jz .loop_end
494 ALIGN 16
495.loop_start:
496 ; start by reading the next sample
497 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
498 ; here we reorder the instructions; see the (#) indexes for a logical order
499 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
500 add eax, 4 ; (0)
501 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
502 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
503 movss xmm3, xmm2 ; (5)
504 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
505 movss xmm2, xmm0 ; (6)
506 mulps xmm1, xmm3 ; (8)
507 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
508 addps xmm6, xmm1 ; (10)
509 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
510 dec edx
511 jnz .loop_start
512.loop_end:
513 ; store autoc
514 mov edx, [esp + 16] ; edx == autoc
515 movups [edx], xmm5
516 movups [edx + 4], xmm6
517
518.end:
519 ret
520
521 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000522cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000523 ;[esp + 16] == autoc[]
524 ;[esp + 12] == lag
525 ;[esp + 8] == data_len
526 ;[esp + 4] == data[]
527
528 ;ASSERT(lag > 0)
529 ;ASSERT(lag <= 12)
530 ;ASSERT(lag <= data_len)
531
532 ; for(coeff = 0; coeff < lag; coeff++)
533 ; autoc[coeff] = 0.0;
534 xorps xmm5, xmm5
535 xorps xmm6, xmm6
536 xorps xmm7, xmm7
537
538 mov edx, [esp + 8] ; edx == data_len
539 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
540
541 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
542 add eax, 4
543 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
544 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
545 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
546 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
547.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
548 movaps xmm1, xmm0
549 mulps xmm1, xmm2
550 addps xmm5, xmm1
551 movaps xmm1, xmm0
552 mulps xmm1, xmm3
553 addps xmm6, xmm1
554 mulps xmm0, xmm4
555 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
556 dec edx
557 jz .loop_end
558 ALIGN 16
559.loop_start:
560 ; start by reading the next sample
561 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
562 add eax, 4
563 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
564
565 ; shift xmm4:xmm3:xmm2 left by one float
566 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
567 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
568 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
569 movss xmm4, xmm3
570 movss xmm3, xmm2
571 movss xmm2, xmm0
572
573 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
574 movaps xmm1, xmm0
575 mulps xmm1, xmm2
576 addps xmm5, xmm1
577 movaps xmm1, xmm0
578 mulps xmm1, xmm3
579 addps xmm6, xmm1
580 mulps xmm0, xmm4
581 addps xmm7, xmm0
582
583 dec edx
584 jnz .loop_start
585.loop_end:
586 ; store autoc
587 mov edx, [esp + 16] ; edx == autoc
588 movups [edx], xmm5
589 movups [edx + 4], xmm6
590 movups [edx + 8], xmm7
591
592.end:
593 ret
594
595;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
596;
597; for(i = 0; i < data_len; i++) {
598; sum = 0;
599; for(j = 0; j < order; j++)
600; sum += qlp_coeff[j] * data[i-j-1];
601; residual[i] = data[i] - (sum >> lp_quantization);
602; }
603;
604 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000605cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000606 ;[esp + 40] residual[]
607 ;[esp + 36] lp_quantization
608 ;[esp + 32] order
609 ;[esp + 28] qlp_coeff[]
610 ;[esp + 24] data_len
611 ;[esp + 20] data[]
612
613 ;ASSERT(order > 0)
614
615 push ebp
616 push ebx
617 push esi
618 push edi
619
620 mov esi, [esp + 20] ; esi = data[]
621 mov edi, [esp + 40] ; edi = residual[]
622 mov eax, [esp + 32] ; eax = order
623 mov ebx, [esp + 24] ; ebx = data_len
624
625 test ebx, ebx
626 jz near .end ; do nothing if data_len == 0
627.begin:
628 cmp eax, byte 1
629 jg short .i_1more
630
631 mov ecx, [esp + 28]
632 mov edx, [ecx] ; edx = qlp_coeff[0]
633 mov eax, [esi - 4] ; eax = data[-1]
634 mov cl, [esp + 36] ; cl = lp_quantization
635 ALIGN 16
636.i_1_loop_i:
637 imul eax, edx
638 sar eax, cl
639 neg eax
640 add eax, [esi]
641 mov [edi], eax
642 mov eax, [esi]
643 add edi, byte 4
644 add esi, byte 4
645 dec ebx
646 jnz .i_1_loop_i
647
648 jmp .end
649
650.i_1more:
651 cmp eax, byte 32 ; for order <= 32 there is a faster routine
652 jbe short .i_32
653
654 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
655 ALIGN 16
656.i_32more_loop_i:
657 xor ebp, ebp
658 mov ecx, [esp + 32]
659 mov edx, ecx
660 shl edx, 2
661 add edx, [esp + 28]
662 neg ecx
663 ALIGN 16
664.i_32more_loop_j:
665 sub edx, byte 4
666 mov eax, [edx]
667 imul eax, [esi + 4 * ecx]
668 add ebp, eax
669 inc ecx
670 jnz short .i_32more_loop_j
671
672 mov cl, [esp + 36]
673 sar ebp, cl
674 neg ebp
675 add ebp, [esi]
676 mov [edi], ebp
677 add esi, byte 4
678 add edi, byte 4
679
680 dec ebx
681 jnz .i_32more_loop_i
682
683 jmp .end
684
685.i_32:
686 sub edi, esi
687 neg eax
688 lea edx, [eax + eax * 8 + .jumper_0]
689 inc edx
690 mov eax, [esp + 28] ; eax = qlp_coeff[]
691 xor ebp, ebp
692 jmp edx
693
694 mov ecx, [eax + 124]
695 imul ecx, [esi - 128]
696 add ebp, ecx
697 mov ecx, [eax + 120]
698 imul ecx, [esi - 124]
699 add ebp, ecx
700 mov ecx, [eax + 116]
701 imul ecx, [esi - 120]
702 add ebp, ecx
703 mov ecx, [eax + 112]
704 imul ecx, [esi - 116]
705 add ebp, ecx
706 mov ecx, [eax + 108]
707 imul ecx, [esi - 112]
708 add ebp, ecx
709 mov ecx, [eax + 104]
710 imul ecx, [esi - 108]
711 add ebp, ecx
712 mov ecx, [eax + 100]
713 imul ecx, [esi - 104]
714 add ebp, ecx
715 mov ecx, [eax + 96]
716 imul ecx, [esi - 100]
717 add ebp, ecx
718 mov ecx, [eax + 92]
719 imul ecx, [esi - 96]
720 add ebp, ecx
721 mov ecx, [eax + 88]
722 imul ecx, [esi - 92]
723 add ebp, ecx
724 mov ecx, [eax + 84]
725 imul ecx, [esi - 88]
726 add ebp, ecx
727 mov ecx, [eax + 80]
728 imul ecx, [esi - 84]
729 add ebp, ecx
730 mov ecx, [eax + 76]
731 imul ecx, [esi - 80]
732 add ebp, ecx
733 mov ecx, [eax + 72]
734 imul ecx, [esi - 76]
735 add ebp, ecx
736 mov ecx, [eax + 68]
737 imul ecx, [esi - 72]
738 add ebp, ecx
739 mov ecx, [eax + 64]
740 imul ecx, [esi - 68]
741 add ebp, ecx
742 mov ecx, [eax + 60]
743 imul ecx, [esi - 64]
744 add ebp, ecx
745 mov ecx, [eax + 56]
746 imul ecx, [esi - 60]
747 add ebp, ecx
748 mov ecx, [eax + 52]
749 imul ecx, [esi - 56]
750 add ebp, ecx
751 mov ecx, [eax + 48]
752 imul ecx, [esi - 52]
753 add ebp, ecx
754 mov ecx, [eax + 44]
755 imul ecx, [esi - 48]
756 add ebp, ecx
757 mov ecx, [eax + 40]
758 imul ecx, [esi - 44]
759 add ebp, ecx
760 mov ecx, [eax + 36]
761 imul ecx, [esi - 40]
762 add ebp, ecx
763 mov ecx, [eax + 32]
764 imul ecx, [esi - 36]
765 add ebp, ecx
766 mov ecx, [eax + 28]
767 imul ecx, [esi - 32]
768 add ebp, ecx
769 mov ecx, [eax + 24]
770 imul ecx, [esi - 28]
771 add ebp, ecx
772 mov ecx, [eax + 20]
773 imul ecx, [esi - 24]
774 add ebp, ecx
775 mov ecx, [eax + 16]
776 imul ecx, [esi - 20]
777 add ebp, ecx
778 mov ecx, [eax + 12]
779 imul ecx, [esi - 16]
780 add ebp, ecx
781 mov ecx, [eax + 8]
782 imul ecx, [esi - 12]
783 add ebp, ecx
784 mov ecx, [eax + 4]
785 imul ecx, [esi - 8]
786 add ebp, ecx
787 mov ecx, [eax] ; there is one byte missing
788 imul ecx, [esi - 4]
789 add ebp, ecx
790.jumper_0:
791
792 mov cl, [esp + 36]
793 sar ebp, cl
794 neg ebp
795 add ebp, [esi]
796 mov [edi + esi], ebp
797 add esi, byte 4
798
799 dec ebx
800 jz short .end
801 xor ebp, ebp
802 jmp edx
803
804.end:
805 pop edi
806 pop esi
807 pop ebx
808 pop ebp
809 ret
810
811; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
812; the channel must be <= 16. Especially note that this routine cannot be used
813; for side-channel coded 16bps channels since the effective bps is 17.
;
; MMX version of FLAC__lpc_compute_residual_from_qlp_coefficients.
; NOTE(review): argument layout inferred from the [esp + N] comments below
; (cdecl; four registers pushed, so the first argument sits at [esp + 20]) --
; confirm against the C prototype:
;   (data[], data_len, qlp_coeff[], order, lp_quantization, residual[])
;
; Plan: copy qlp_coeff[] onto the stack as 16-bit words (zero-padded up to a
; multiple of 4), keep the last four samples packed as 16-bit words in mm4,
; and use pmaddwd to do four multiply-adds per instruction, emitting two
; residual samples per loop pass.
814 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +0000815cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000816 ;[esp + 40] residual[]
817 ;[esp + 36] lp_quantization
818 ;[esp + 32] order
819 ;[esp + 28] qlp_coeff[]
820 ;[esp + 24] data_len
821 ;[esp + 20] data[]
822
823 ;ASSERT(order > 0)
824
825 push ebp
826 push ebx
827 push esi
828 push edi
829
830 mov esi, [esp + 20] ; esi = data[]
831 mov edi, [esp + 40] ; edi = residual[]
832 mov eax, [esp + 32] ; eax = order
833 mov ebx, [esp + 24] ; ebx = data_len
834
835 test ebx, ebx
836 jz near .end ; do nothing if data_len == 0
; ebx = data_len - 1; if that is zero only one sample remains, which the
; scalar (non-MMX) routine handles via .last_one below
837 dec ebx
838 test ebx, ebx
839 jz near .last_one
840
841 mov edx, [esp + 28] ; edx = qlp_coeff[]
842 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
; save esp in ebp (restored at .mmx_end); esp is about to be realigned
843 mov ebp, esp
844
; align the stack to 8 so the movq reads of the word buffer below are aligned
845 and esp, 0xfffffff8
846
; push each 32-bit qlp_coeff[] entry as one 16-bit word; the stack grows
; down, so in memory qlp_coeff[0] ends up at the highest address
847 xor ecx, ecx
848.copy_qlp_loop:
849 push word [edx + 4 * ecx]
850 inc ecx
851 cmp ecx, eax
852 jnz short .copy_qlp_loop
853
; zero-pad the word buffer so the coefficient count (eax) becomes a
; multiple of 4 (here ecx == order, so ecx & 3 is the remainder)
854 and ecx, 0x3
855 test ecx, ecx
856 je short .za_end
857 sub ecx, byte 4
858.za_loop:
859 push word 0
860 inc eax
861 inc ecx
862 jnz short .za_loop
863.za_end:
864
; mm5 = the four words at the top of the buffer: {q[3],q[2],q[1],q[0]}
; low-to-high word (zero-filled when order < 4)
865 movq mm5, [esp + 2 * eax - 8]
; mm4 = previous four samples {data[i-4]..data[i-1]} packed to words;
; packssdw saturates to 16 bits -- hence the WATCHOUT at the top
866 movd mm4, [esi - 16]
867 punpckldq mm4, [esi - 12]
868 movd mm0, [esi - 8]
869 punpckldq mm0, [esi - 4]
870 packssdw mm4, mm0
871
; orders 1..4 fit entirely in the single packed quad mm5
872 cmp eax, byte 4
873 jnbe short .mmx_4more
874
; loop invariant: mm4 = {data[i-4]..data[i-1]} (low to high word); each
; pass consumes data[i], data[i+1] and emits residual[i], residual[i+1]
875 align 16
876.mmx_4_loop_i:
877 movd mm1, [esi]
; mm3 = window for sample i
878 movq mm3, mm4
879 punpckldq mm1, [esi + 4]
880 psrlq mm4, 16
881 movq mm0, mm1
882 psllq mm0, 48
883 por mm4, mm0
; mm2 = window for sample i+1 (window just advanced by one sample)
884 movq mm2, mm4
885 psrlq mm4, 16
886 pxor mm0, mm0
887 punpckhdq mm0, mm1
; pmaddwd: each register now holds two partial sums of two products
888 pmaddwd mm3, mm5
889 pmaddwd mm2, mm5
890 psllq mm0, 16
891 por mm4, mm0
; combine the partials: mm3 = {sum for sample i, sum for sample i+1}
892 movq mm0, mm3
893 punpckldq mm3, mm2
894 punpckhdq mm0, mm2
895 paddd mm3, mm0
; residual = data - (sum >> lp_quantization)
896 psrad mm3, mm6
897 psubd mm1, mm3
898 movd [edi], mm1
899 punpckhdq mm1, mm1
900 movd [edi + 4], mm1
901
902 add edi, byte 8
903 add esi, byte 8
904
905 sub ebx, 2
906 jg .mmx_4_loop_i
907 jmp .mmx_end
908
; order > 4: mm4/mm5 cover only the newest four taps; older history is
; re-read from memory four samples at a time in .mmx_4more_loop_j
909.mmx_4more:
; eax = 16 - 4*padded_order = byte offset from &data[i] back to just past
; the oldest history quad
910 shl eax, 2
911 neg eax
912 add eax, byte 16
913
914 align 16
915.mmx_4more_loop_i:
916 movd mm1, [esi]
917 punpckldq mm1, [esi + 4]
918 movq mm3, mm4
919 psrlq mm4, 16
920 movq mm0, mm1
921 psllq mm0, 48
922 por mm4, mm0
923 movq mm2, mm4
924 psrlq mm4, 16
925 pxor mm0, mm0
926 punpckhdq mm0, mm1
927 pmaddwd mm3, mm5
928 pmaddwd mm2, mm5
929 psllq mm0, 16
930 por mm4, mm0
931
; ecx walks the older history from the oldest quad up to &data[i];
; edx walks the 16-bit coefficient buffer (lowest address = highest-index
; coefficients, matching the oldest samples)
932 mov ecx, esi
933 add ecx, eax
934 mov edx, esp
935
; accumulate the remaining taps: mm3 accrues pairs for sample i, mm2 for
; sample i+1 (whose source window is staggered one sample later)
936 align 16
937.mmx_4more_loop_j:
938 movd mm0, [ecx - 16]
939 movd mm7, [ecx - 8]
940 punpckldq mm0, [ecx - 12]
941 punpckldq mm7, [ecx - 4]
942 packssdw mm0, mm7
943 pmaddwd mm0, [edx]
944 punpckhdq mm7, mm7
945 paddd mm3, mm0
946 movd mm0, [ecx - 12]
947 punpckldq mm0, [ecx - 8]
948 punpckldq mm7, [ecx]
949 packssdw mm0, mm7
950 pmaddwd mm0, [edx]
951 paddd mm2, mm0
952
953 add edx, byte 8
954 add ecx, byte 16
955 cmp ecx, esi
956 jnz .mmx_4more_loop_j
957
; combine partials exactly as in the order<=4 loop: mm3 = {sum_i, sum_i+1}
958 movq mm0, mm3
959 punpckldq mm3, mm2
960 punpckhdq mm0, mm2
961 paddd mm3, mm0
962 psrad mm3, mm6
963 psubd mm1, mm3
964 movd [edi], mm1
965 punpckhdq mm1, mm1
966 movd [edi + 4], mm1
967
968 add edi, byte 8
969 add esi, byte 8
970
971 sub ebx, 2
972 jg near .mmx_4more_loop_i
973
974.mmx_end:
975 emms
976 mov esp, ebp
; entry for the final sample when data_len is odd (ebx+1 = samples left);
; reload eax = order and reuse the plain ia32 routine's loop at .begin
; (esi/edi have already been advanced past the processed samples)
977.last_one:
978 mov eax, [esp + 32]
979 inc ebx
Josh Coalsone6499bd2001-06-13 18:11:25 +0000980 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +0000981
982.end:
983 pop edi
984 pop esi
985 pop ebx
986 pop ebp
987 ret
988
989; **********************************************************************
990;
991; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])
992; {
993; unsigned i, j;
994; int32 sum;
995;
996; FLAC__ASSERT(order > 0);
997;
998; for(i = 0; i < data_len; i++) {
999; sum = 0;
1000; for(j = 0; j < order; j++)
1001; sum += qlp_coeff[j] * data[i-j-1];
1002; data[i] = residual[i] + (sum >> lp_quantization);
1003; }
1004; }
1005 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001006cident FLAC__lpc_restore_signal_asm_ia32
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001007 ;[esp + 40] data[]
1008 ;[esp + 36] lp_quantization
1009 ;[esp + 32] order
1010 ;[esp + 28] qlp_coeff[]
1011 ;[esp + 24] data_len
1012 ;[esp + 20] residual[]
1013
1014 ;ASSERT(order > 0)
1015
1016 push ebp
1017 push ebx
1018 push esi
1019 push edi
1020
1021 mov esi, [esp + 20] ; esi = residual[]
1022 mov edi, [esp + 40] ; edi = data[]
1023 mov eax, [esp + 32] ; eax = order
1024 mov ebx, [esp + 24] ; ebx = data_len
1025
1026 test ebx, ebx
1027 jz near .end ; do nothing if data_len == 0
1028
; .begin is also the tail-call entry used by the MMX routines for samples
; they cannot handle (esi/edi/eax/ebx are expected to be set up as above)
1029.begin:
1030 cmp eax, byte 1
1031 jg short .x87_1more
1032
; special case order == 1: keep qlp_coeff[0] in edx and carry the previous
; output sample across iterations in eax (the value just stored)
1033 mov ecx, [esp + 28]
1034 mov edx, [ecx]
1035 mov eax, [edi - 4]
; only the low byte of lp_quantization is needed as a shift count
1036 mov cl, [esp + 36]
1037 ALIGN 16
1038.x87_1_loop_i:
1039 imul eax, edx
1040 sar eax, cl
1041 add eax, [esi]
1042 mov [edi], eax
1043 add esi, byte 4
1044 add edi, byte 4
1045 dec ebx
1046 jnz .x87_1_loop_i
1047
1048 jmp .end
1049
1050.x87_1more:
1051 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1052 jbe short .x87_32
1053
1054 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
; generic order > 32 path: edx walks qlp_coeff[] backwards while ecx counts
; -order..-1, so [edi + 4*ecx] addresses data[i-j-1]; ebp accumulates sum
1055 ALIGN 16
1056.x87_32more_loop_i:
1057 xor ebp, ebp
1058 mov ecx, [esp + 32]
1059 mov edx, ecx
1060 shl edx, 2
1061 add edx, [esp + 28]
1062 neg ecx
1063 ALIGN 16
1064.x87_32more_loop_j:
1065 sub edx, byte 4
1066 mov eax, [edx]
1067 imul eax, [edi + 4 * ecx]
1068 add ebp, eax
1069 inc ecx
1070 jnz short .x87_32more_loop_j
1071
1072 mov cl, [esp + 36]
1073 sar ebp, cl
1074 add ebp, [esi]
1075 mov [edi], ebp
1076 add edi, byte 4
1077 add esi, byte 4
1078
1079 dec ebx
1080 jnz .x87_32more_loop_i
1081
1082 jmp .end
1083
; order <= 32: computed jump ("Duff's device") into a fully unrolled dot
; product. Each unrolled tap below is 3 instructions / 9 bytes
; (mov ecx,[eax+k] / imul ecx,[edi-k'] / add ebp,ecx), so jumping to
; .jumper_0 - 9*order executes exactly 'order' taps. The very last tap's
; 'mov ecx, [eax]' encoding is one byte shorter (no displacement byte),
; which the 'inc edx' below compensates for.
1084.x87_32:
; esi = residual - data, so [esi + edi] = &residual[i] while edi tracks &data[i]
1085 sub esi, edi
1086 neg eax
1087 lea edx, [eax + eax * 8 + .jumper_0]
1088 inc edx ; compensate for the shorter opcode on the last iteration
1089 mov eax, [esp + 28] ; eax = qlp_coeff[]
; ebp = sum = 0
1090 xor ebp, ebp
1091 jmp edx
1092
1093 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1094 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1095 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1096 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1097 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1098 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1099 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1100 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1101 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1102 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1103 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1104 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1105 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1106 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1107 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1108 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1109 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1110 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1111 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1112 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1113 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1114 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1115 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1116 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1117 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1118 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1119 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1120 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1121 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1122 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1123 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1124 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1125 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1126 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1127 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1128 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1129 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1130 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1131 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1132 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1133 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1134 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1135 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1136 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1137 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1138 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1139 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1140 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1141 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1142 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1143 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1144 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1145 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1146 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1147 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1148 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1149 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1150 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1151 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1152 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1153 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1154 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1155 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1156 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1157 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1158 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1159 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1160 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1161 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1162 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1163 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1164 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1165 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1166 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1167 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1168 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1169 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1170 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1171 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1172 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1173 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1174 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1175 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1176 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1177 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1178 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1179 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1180 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1181 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1182 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1183 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1184 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1185 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1186 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1187 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1188 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1189.jumper_0:
1190
1191 mov cl, [esp + 36]
1192 sar ebp, cl ; ebp = (sum >> lp_quantization)
1193 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1194 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1195 add edi, byte 4
1196
1197 dec ebx
1198 jz short .end
; reset sum and re-enter the unrolled taps at the same computed target
1199 xor ebp, ebp
1200 jmp edx
1201
1202.end:
1203 pop edi
1204 pop esi
1205 pop ebx
1206 pop ebp
1207 ret
1208
1209; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1210; the channel must be <= 16. Especially note that this routine cannot be used
1211; for side-channel coded 16bps channels since the effective bps is 17.
;
; MMX version of FLAC__lpc_restore_signal_asm_ia32 (same argument layout,
; see the [esp + N] comments below). Requires order >= 4; smaller orders
; tail-call the plain ia32 routine. Coefficients and the reconstructed
; sample history are packed to 16-bit words so pmaddwd does 4 taps at once.
1212 ALIGN 16
Josh Coalsone6499bd2001-06-13 18:11:25 +00001213cident FLAC__lpc_restore_signal_asm_ia32_mmx
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001214 ;[esp + 40] data[]
1215 ;[esp + 36] lp_quantization
1216 ;[esp + 32] order
1217 ;[esp + 28] qlp_coeff[]
1218 ;[esp + 24] data_len
1219 ;[esp + 20] residual[]
1220
1221 ;ASSERT(order > 0)
1222
1223 push ebp
1224 push ebx
1225 push esi
1226 push edi
1227
1228 mov esi, [esp + 20]
1229 mov edi, [esp + 40]
1230 mov eax, [esp + 32]
1231 mov ebx, [esp + 24]
1232
1233 test ebx, ebx
1234 jz near .end ; do nothing if data_len == 0
; orders < 4 cannot fill a packed quad of coefficients; use the general routine
1235 cmp eax, byte 4
Josh Coalsone6499bd2001-06-13 18:11:25 +00001236 jb near FLAC__lpc_restore_signal_asm_ia32.begin
Josh Coalson9a7b5e22001-06-13 18:03:09 +00001237
1238 mov edx, [esp + 28]
; mm6 = 0:lp_quantization (used as the psrad shift count)
1239 movd mm6, [esp + 36]
; save esp in ebp (restored at .mmx_end), then 8-byte-align it for movq
1240 mov ebp, esp
1241
1242 and esp, 0xfffffff8
1243
; as in the residual MMX routine above: push each qlp_coeff[] entry as a
; 16-bit word (stack grows down, so qlp_coeff[0] lands at the highest address)
1244 xor ecx, ecx
1245.copy_qlp_loop:
1246 push word [edx + 4 * ecx]
1247 inc ecx
1248 cmp ecx, eax
1249 jnz short .copy_qlp_loop
1250
; zero-pad the word buffer and bump eax (order) up to a multiple of 4
1251 and ecx, 0x3
1252 test ecx, ecx
1253 je short .za_end
1254 sub ecx, byte 4
1255.za_loop:
1256 push word 0
1257 inc eax
1258 inc ecx
1259 jnz short .za_loop
1260.za_end:
1261
; mm5 = top four words of the buffer = {q[3],q[2],q[1],q[0]} low-to-high;
; mm4 = previous four outputs {data[i-4]..data[i-1]} packed to words
; (packssdw saturates to 16 bits -- hence the WATCHOUT at the top)
1262 movq mm5, [esp + 2 * eax - 8]
1263 movd mm4, [edi - 16]
1264 punpckldq mm4, [edi - 12]
1265 movd mm0, [edi - 8]
1266 punpckldq mm0, [edi - 4]
1267 packssdw mm4, mm0
1268
1269 cmp eax, byte 4
1270 jnbe short .mmx_4more
1271
; order == 4: one pmaddwd + horizontal add per output sample
1272 align 16
1273.mmx_4_loop_i:
1274 movq mm7, mm4
1275 pmaddwd mm7, mm5
; horizontal add of the two pmaddwd partials, then >> lp_quantization
1276 movq mm0, mm7
1277 punpckhdq mm7, mm7
1278 paddd mm7, mm0
1279 psrad mm7, mm6
; data[i] = residual[i] + (sum >> lp_quantization)
1280 movd mm1, [esi]
1281 paddd mm7, mm1
1282 movd [edi], mm7
; slide the history window: drop the oldest word, insert the new sample's
; low 16 bits at the top of mm4
1283 psllq mm7, 48
1284 psrlq mm4, 16
1285 por mm4, mm7
1286
1287 add esi, byte 4
1288 add edi, byte 4
1289
1290 dec ebx
1291 jnz .mmx_4_loop_i
1292 jmp .mmx_end
; order > 4: mm4/mm5 cover the newest four taps; older taps are re-read
; from the already-reconstructed data[] four samples at a time
1293.mmx_4more:
; eax = 16 - 4*padded_order = byte offset from &data[i] back to just past
; the oldest history quad
1294 shl eax, 2
1295 neg eax
1296 add eax, byte 16
1297 align 16
1298.mmx_4more_loop_i:
; ecx walks the older history up to &data[i]; edx walks the coefficient
; word buffer (lowest address = highest-index coefficients)
1299 mov ecx, edi
1300 add ecx, eax
1301 mov edx, esp
1302
; start the sum with the newest four taps
1303 movq mm7, mm4
1304 pmaddwd mm7, mm5
1305
1306 align 16
1307.mmx_4more_loop_j:
1308 movd mm0, [ecx - 16]
1309 punpckldq mm0, [ecx - 12]
1310 movd mm1, [ecx - 8]
1311 punpckldq mm1, [ecx - 4]
1312 packssdw mm0, mm1
1313 pmaddwd mm0, [edx]
1314 paddd mm7, mm0
1315
1316 add edx, byte 8
1317 add ecx, byte 16
1318 cmp ecx, edi
1319 jnz .mmx_4more_loop_j
1320
; horizontal add, shift, add residual, store, and slide the window --
; identical tail to the order==4 loop above
1321 movq mm0, mm7
1322 punpckhdq mm7, mm7
1323 paddd mm7, mm0
1324 psrad mm7, mm6
1325 movd mm1, [esi]
1326 paddd mm7, mm1
1327 movd [edi], mm7
1328 psllq mm7, 48
1329 psrlq mm4, 16
1330 por mm4, mm7
1331
1332 add esi, byte 4
1333 add edi, byte 4
1334
1335 dec ebx
1336 jnz short .mmx_4more_loop_i
1337.mmx_end:
1338 emms
1339 mov esp, ebp
1340
1341.end:
1342 pop edi
1343 pop esi
1344 pop ebx
1345 pop ebp
1346 ret
1347
1348end