 ;  src/libFLAC/ia32/lpc_asm-unrolled.nasm - platform/external/flac - Gitiles

 ;  vim:filetype=nasm ts=8

 ;  libFLAC - Free Lossless Audio Codec library
 ;  Copyright (C) 2001-2009  Josh Coalson
 ;  Copyright (C) 2011-2014  Xiph.Org Foundation
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
 ;  are met:
 ;
 ;  - Redistributions of source code must retain the above copyright
 ;  notice, this list of conditions and the following disclaimer.
 ;
 ;  - Redistributions in binary form must reproduce the above copyright
 ;  notice, this list of conditions and the following disclaimer in the
 ;  documentation and/or other materials provided with the distribution.
 ;
 ;  - Neither the name of the Xiph.org Foundation nor the names of its
 ;  contributors may be used to endorse or promote products derived from
 ;  this software without specific prior written permission.
 ;
 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ; [CR] is a note to flag that the instruction can be easily reordered

 %include "nasm.h"

 	data_section

 cglobal FLAC__lpc_compute_autocorrelation_asm

 	code_section

 ; **********************************************************************
 ;
 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 ; {
 ;	FLAC__real d;
 ;	unsigned sample, coeff;
 ;	const unsigned limit = data_len - lag;
 ;
 ;	assert(lag > 0);
 ;	assert(lag <= data_len);
 ;
 ;	for(coeff = 0; coeff < lag; coeff++)
 ;		autoc[coeff] = 0.0;
 ;	for(sample = 0; sample <= limit; sample++){
 ;		d = data[sample];
 ;		for(coeff = 0; coeff < lag; coeff++)
 ;			autoc[coeff] += d * data[sample+coeff];
 ;	}
 ;	for(; sample < data_len; sample++){
 ;		d = data[sample];
 ;		for(coeff = 0; coeff < data_len - sample; coeff++)
 ;			autoc[coeff] += d * data[sample+coeff];
 ;	}
 ; }
 ;
 FLAC__lpc_compute_autocorrelation_asm:

 	; Entry point.  All four arguments are read from the stack through ebp,
 	; which is pointed at the first argument slot (just above the saved ebp
 	; and the return address):
 	;	[ebp]      == data		[ebp + 4]  == data_len
 	;	[ebp + 8]  == lag		[ebp + 12] == autoc
 	; ebp, ebx, esi and edi are saved here and restored at .end.
 	push	ebp
 	lea	ebp, [esp + 8]			; step over saved ebp + return address
 	push	ebx
 	push	esi
 	push	edi

 	mov	esi, [ebp]			; esi == data
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	edx, [ebp + 8]			; edx == lag
 	mov	edi, [ebp + 12]			; edi == autoc

 	; dispatch on lag: every unrolled case tests for one value and hands
 	; larger lags on to the next test
 	cmp	edx, 1
 	ja	short .lag_above_1
 .lag_eq_1:
 	; lag == 1: autoc[0] = sum of data[i]*data[i] over all data_len samples
 	fldz					; ST = acc (autoc[0])
 	ALIGN 16
 .lag_1_next:
 	fld	dword [esi]			; ST = d acc
 	fmul	st0, st0			; ST = d*d acc
 	lea	esi, [esi + 4]			; sample++ (leaves flags alone)
 	faddp	st1, st0			; ST = acc+d*d
 	dec	ecx
 	jnz	.lag_1_next
 	fstp	dword [edi]			; autoc[0] <- acc
 	jmp	.end

 .lag_above_1:
 	cmp	edx, 2
 	ja	short .lag_above_2
 .lag_eq_2:
 	; lag == 2.  Both sums are kept on the x87 stack.  At the top of the
 	; loop the stack is (top first): d, autoc[0], autoc[1], where d is the
 	; current sample; the sample loaded for the autoc[1] product is kept
 	; and becomes d for the next iteration.  The loop runs data_len - 1
 	; times.
 	fldz					; will accumulate autoc[1]
 	dec	ecx
 	fldz					; will accumulate autoc[0]
 	fld	dword [esi]
 	ALIGN 16
 .lag_2_loop:
 	add	esi, byte 4			; [CR] sample++
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi]
 	fmul	st1, st0
 	fxch
 	faddp	st3, st0			; add to autoc[1]
 	dec	ecx
 	jnz	.lag_2_loop
 	; clean up the leftovers
 	; st0 still holds the last sample: only its square remains to be added
 	fmul	st0, st0
 	faddp	st1, st0			; add to autoc[0]
 	; pop the accumulators into autoc[] in stack order
 	fstp	dword [edi]
 	fstp	dword [edi + 4]
 	jmp	.end

 .lag_above_2:
 	cmp	edx, 3
 	ja	short .lag_above_3
 .lag_eq_3:
 	; lag == 3.  Between iterations the x87 stack is (top first):
 	; autoc[0], autoc[1], autoc[2].  The two dec's leave ecx at
 	; data_len - 2, the number of samples that have all three products.
 	fldz					; will accumulate autoc[2]
 	dec	ecx
 	fldz					; will accumulate autoc[1]
 	dec	ecx
 	fldz					; will accumulate autoc[0]
 	ALIGN 16
 .lag_3_loop:
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[2]
 	dec	ecx
 	jnz	.lag_3_loop
 	; clean up the leftovers
 	; the last two samples: the first still has an autoc[1] product, the
 	; second contributes only its square
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st1, st0
 	fxch
 	faddp	st3, st0			; add to autoc[1]
 	fmul	st0, st0
 	faddp	st1, st0			; add to autoc[0]
 	; pop the accumulators into autoc[] in stack order
 	fstp	dword [edi]
 	fstp	dword [edi + 4]
 	fstp	dword [edi + 8]
 	jmp	.end

 .lag_above_3:
 	cmp	edx, 4
 	ja	near .lag_above_4
 .lag_eq_4:
 	; lag == 4.  Between iterations the x87 stack is (top first):
 	; autoc[0] .. autoc[3].  The three dec's leave ecx at data_len - 3,
 	; the number of samples that have all four products.
 	fldz					; will accumulate autoc[3]
 	dec	ecx
 	fldz					; will accumulate autoc[2]
 	dec	ecx
 	fldz					; will accumulate autoc[1]
 	dec	ecx
 	fldz					; will accumulate autoc[0]
 	ALIGN 16
 .lag_4_loop:
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[3]
 	dec	ecx
 	jnz	.lag_4_loop
 	; clean up the leftovers
 	; three samples remain; each has one product fewer than the one before
 	; (3 products, then 2, then the square only)
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[2]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st1, st0
 	fxch
 	faddp	st3, st0			; add to autoc[1]
 	fmul	st0, st0
 	faddp	st1, st0			; add to autoc[0]
 	; pop the accumulators into autoc[] in stack order
 	fstp	dword [edi]
 	fstp	dword [edi + 4]
 	fstp	dword [edi + 8]
 	fstp	dword [edi + 12]
 	jmp	.end

 .lag_above_4:
 	cmp	edx, 5
 	ja	near .lag_above_5
 .lag_eq_5:
 	; lag == 5.  Between iterations the x87 stack is (top first):
 	; autoc[0] .. autoc[4].  ecx becomes data_len - 4, the number of
 	; samples that have all five products.
 	fldz					; will accumulate autoc[4]
 	fldz					; will accumulate autoc[3]
 	fldz					; will accumulate autoc[2]
 	fldz					; will accumulate autoc[1]
 	fldz					; will accumulate autoc[0]
 	sub	ecx, byte 4
 	ALIGN 16
 .lag_5_loop:
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[3]
 	fld	dword [esi + 16]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st5, st0			; add to autoc[4]
 	dec	ecx
 	jnz	.lag_5_loop
 	; clean up the leftovers
 	; four samples remain; each has one product fewer than the one before
 	; (4 products, then 3, then 2, then the square only)
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[3]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[2]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st1, st0
 	fxch
 	faddp	st3, st0			; add to autoc[1]
 	fmul	st0, st0
 	faddp	st1, st0			; add to autoc[0]
 	; pop the accumulators into autoc[] in stack order
 	fstp	dword [edi]
 	fstp	dword [edi + 4]
 	fstp	dword [edi + 8]
 	fstp	dword [edi + 12]
 	fstp	dword [edi + 16]
 	jmp	.end

 .lag_above_5:
 	cmp	edx, 6
 	; the target lies far beyond the reach of a short (8-bit) jump, so
 	; ask for the near form explicitly, like the other long dispatch jumps
 	ja	near .lag_above_6
 .lag_eq_6:
 	; lag == 6.  Between iterations the x87 stack is (top first):
 	; autoc[0] .. autoc[5].  Together with the sample loaded in the loop
 	; and one product this uses all eight x87 registers.  ecx becomes
 	; data_len - 5, the number of samples that have all six products.
 	fldz					; will accumulate autoc[5]
 	fldz					; will accumulate autoc[4]
 	fldz					; will accumulate autoc[3]
 	fldz					; will accumulate autoc[2]
 	fldz					; will accumulate autoc[1]
 	fldz					; will accumulate autoc[0]
 	sub	ecx, byte 5
 	ALIGN 16
 .lag_6_loop:
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[3]
 	fld	dword [esi + 16]
 	fmul	st0, st1
 	faddp	st6, st0			; add to autoc[4]
 	fld	dword [esi + 20]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st6, st0			; add to autoc[5]
 	dec	ecx
 	jnz	.lag_6_loop
 	; clean up the leftovers
 	; five samples remain; each has one product fewer than the one before
 	; (5 products, then 4, 3, 2, and finally the square only)
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[3]
 	fld	dword [esi + 16]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st5, st0			; add to autoc[4]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[2]
 	fld	dword [esi + 12]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[3]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[1]
 	fld	dword [esi + 8]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[2]
 	fld	dword [esi]
 	fld	st0
 	fmul	st0, st0
 	faddp	st2, st0			; add to autoc[0]
 	fld	dword [esi + 4]
 	fmul	st1, st0
 	fxch
 	faddp	st3, st0			; add to autoc[1]
 	fmul	st0, st0
 	faddp	st1, st0			; add to autoc[0]
 	; pop the accumulators into autoc[] in stack order
 	fstp	dword [edi]
 	fstp	dword [edi + 4]
 	fstp	dword [edi + 8]
 	fstp	dword [edi + 12]
 	fstp	dword [edi + 16]
 	fstp	dword [edi + 20]
 	jmp	.end

 .lag_above_6:
 	; General path (lag > 6): a direct translation of the C reference.
 	; On entry edx == lag, esi == data, edi == autoc.  Unlike the unrolled
 	; cases, every partial sum is written back to autoc[] as a 32-bit
 	; float after each addition.
 	;
 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	; autoc[] holds lag dword-sized entries (see the dword accesses below),
 	; so exactly lag dwords are cleared; clearing more would write past
 	; the end of the array.
 	mov	ecx, edx			; ecx = lag = # of dwords of 0 to write
 	xor	eax, eax			; all-zero bits == 0.0
 	rep	stosd				; advances edi, which is reloaded below
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	edi, [ebp + 12]			; edi == autoc
 	;	const unsigned limit = data_len - lag;
 	sub	ecx, edx
 	inc	ecx				; we are looping <= limit so we add one to the counter
 	;	for(sample = 0; sample <= limit; sample++){
 	;		d = data[sample];
 	;		for(coeff = 0; coeff < lag; coeff++)
 	;			autoc[coeff] += d * data[sample+coeff];
 	;	}
 	xor	eax, eax			; eax == sample <- 0
 	ALIGN 16
 .outer_loop:
 	push	eax				; save sample
 	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
 	mov	ebx, eax			; ebx == sample+coeff <- sample
 	mov	edx, [ebp + 8]			; edx <- lag
 	xor	eax, eax			; eax == coeff <- 0
 	ALIGN 16
 .inner_loop:
 	fld	st0				; ST = d d
 	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
 	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
 	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
 	inc	ebx				; (sample+coeff)++
 	inc	eax				; coeff++
 	dec	edx
 	jnz	.inner_loop
 	pop	eax				; restore sample
 	fstp	st0				; pop d, ST = empty
 	inc	eax				; sample++
 	dec	ecx				; dec/jnz instead of the slow legacy 'loop'
 	jnz	.outer_loop
 	;	for(; sample < data_len; sample++){
 	;		d = data[sample];
 	;		for(coeff = 0; coeff < data_len - sample; coeff++)
 	;			autoc[coeff] += d * data[sample+coeff];
 	;	}
 	; eax == sample carries over from the loop above; lag - 1 samples are
 	; left, each with one product fewer than the one before
 	mov	ecx, [ebp + 8]			; ecx <- lag
 	dec	ecx				; ecx <- lag - 1
 	jz	.outer_end			; skip loop if 0
 .outer_loop2:
 	push	eax				; save sample
 	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
 	mov	ebx, eax			; ebx == sample+coeff <- sample
 	mov	edx, [ebp + 4]			; edx <- data_len
 	sub	edx, eax			; edx <- data_len-sample
 	xor	eax, eax			; eax == coeff <- 0
 .inner_loop2:
 	fld	st0				; ST = d d
 	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
 	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
 	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
 	inc	ebx				; (sample+coeff)++
 	inc	eax				; coeff++
 	dec	edx
 	jnz	.inner_loop2
 	pop	eax				; restore sample
 	fstp	st0				; pop d, ST = empty
 	inc	eax				; sample++
 	dec	ecx				; dec/jnz instead of the slow legacy 'loop'
 	jnz	.outer_loop2
 .outer_end:
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates only autoc[6], the sum of
 ; data[i]*data[i+6], over data_len - 6 iterations.
 .lag_eq_6_plus_1:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 6
 	ALIGN 16
 .lag_6_1_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st1, st0			; add to autoc[6]
 	dec	ecx
 	jnz	.lag_6_1_loop
 	fstp	dword [edi + 24]
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates autoc[6] and autoc[7] only
 ; (products of data[i] with data[i+6] and data[i+7]).
 .lag_eq_6_plus_2:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[7]
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 7
 	ALIGN 16
 .lag_6_2_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st2, st0			; add to autoc[7]
 	dec	ecx
 	jnz	.lag_6_2_loop
 	; clean up the leftovers
 	; one more sample still has a partner 6 positions ahead
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	faddp	st1, st0			; add to autoc[6]
 	fstp	dword [edi + 24]
 	fstp	dword [edi + 28]
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates autoc[6] .. autoc[8] only
 ; (products of data[i] with data[i+6] .. data[i+8]).
 .lag_eq_6_plus_3:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[8]
 	fldz					; will accumulate autoc[7]
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 8
 	ALIGN 16
 .lag_6_3_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[8]
 	dec	ecx
 	jnz	.lag_6_3_loop
 	; clean up the leftovers
 	; two more samples, with 2 products and then 1 product left
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st2, st0			; add to autoc[7]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	faddp	st1, st0			; add to autoc[6]
 	fstp	dword [edi + 24]
 	fstp	dword [edi + 28]
 	fstp	dword [edi + 32]
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates autoc[6] .. autoc[9] only
 ; (products of data[i] with data[i+6] .. data[i+9]).
 .lag_eq_6_plus_4:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[9]
 	fldz					; will accumulate autoc[8]
 	fldz					; will accumulate autoc[7]
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 9
 	ALIGN 16
 .lag_6_4_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[9]
 	dec	ecx
 	jnz	.lag_6_4_loop
 	; clean up the leftovers
 	; three more samples, with 3, 2 and then 1 product left
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[8]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st2, st0			; add to autoc[7]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	faddp	st1, st0			; add to autoc[6]
 	fstp	dword [edi + 24]
 	fstp	dword [edi + 28]
 	fstp	dword [edi + 32]
 	fstp	dword [edi + 36]
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates autoc[6] .. autoc[10] only
 ; (products of data[i] with data[i+6] .. data[i+10]).
 .lag_eq_6_plus_5:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[10]
 	fldz					; will accumulate autoc[9]
 	fldz					; will accumulate autoc[8]
 	fldz					; will accumulate autoc[7]
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 10
 	ALIGN 16
 .lag_6_5_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[9]
 	fld	dword [esi + 40]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st5, st0			; add to autoc[10]
 	dec	ecx
 	jnz	.lag_6_5_loop
 	; clean up the leftovers
 	; four more samples, with 4, 3, 2 and then 1 product left
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[9]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[8]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st2, st0			; add to autoc[7]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	faddp	st1, st0			; add to autoc[6]
 	fstp	dword [edi + 24]
 	fstp	dword [edi + 28]
 	fstp	dword [edi + 32]
 	fstp	dword [edi + 36]
 	fstp	dword [edi + 40]
 	jmp	.end

 ; NOTE(review): no jump to this label is visible in this file, so this
 ; code looks unreachable as it stands -- confirm before relying on it.
 ; It reloads its arguments and accumulates autoc[6] .. autoc[11] only
 ; (products of data[i] with data[i+6] .. data[i+11]).
 .lag_eq_6_plus_6:
 	mov	ecx, [ebp + 4]			; ecx == data_len
 	mov	esi, [ebp]			; esi == data
 	mov	edi, [ebp + 12]			; edi == autoc
 	fldz					; will accumulate autoc[11]
 	fldz					; will accumulate autoc[10]
 	fldz					; will accumulate autoc[9]
 	fldz					; will accumulate autoc[8]
 	fldz					; will accumulate autoc[7]
 	fldz					; will accumulate autoc[6]
 	sub	ecx, byte 11
 	ALIGN 16
 .lag_6_6_loop:
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[9]
 	fld	dword [esi + 40]
 	fmul	st0, st1
 	faddp	st6, st0			; add to autoc[10]
 	fld	dword [esi + 44]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st6, st0			; add to autoc[11]
 	dec	ecx
 	jnz	.lag_6_6_loop
 	; clean up the leftovers
 	; five more samples, with 5, 4, 3, 2 and then 1 product left
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmul	st0, st1
 	faddp	st5, st0			; add to autoc[9]
 	fld	dword [esi + 40]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st5, st0			; add to autoc[10]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmul	st0, st1
 	faddp	st4, st0			; add to autoc[8]
 	fld	dword [esi + 36]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st4, st0			; add to autoc[9]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmul	st0, st1
 	faddp	st3, st0			; add to autoc[7]
 	fld	dword [esi + 32]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st3, st0			; add to autoc[8]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmul	st0, st1
 	faddp	st2, st0			; add to autoc[6]
 	fld	dword [esi + 28]
 	fmulp	st1, st0
 	add	esi, byte 4			; [CR] sample++
 	faddp	st2, st0			; add to autoc[7]
 	fld	dword [esi]
 	fld	dword [esi + 24]
 	fmulp	st1, st0
 	faddp	st1, st0			; add to autoc[6]
 	fstp	dword [edi + 24]
 	fstp	dword [edi + 28]
 	fstp	dword [edi + 32]
 	fstp	dword [edi + 36]
 	fstp	dword [edi + 40]
 	fstp	dword [edi + 44]
 	jmp	.end

 ; Common exit for every lag case: restore the registers pushed at function
 ; entry, in reverse push order, and return.  Nothing is loaded into eax
 ; here (the C prototype returns void).
 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; end
	; vim:filetype=nasm ts=8

	; libFLAC - Free Lossless Audio Codec library
	; Copyright (C) 2001-2009 Josh Coalson
	; Copyright (C) 2011-2014 Xiph.Org Foundation
	;
	; Redistribution and use in source and binary forms, with or without
	; modification, are permitted provided that the following conditions
	; are met:
	;
	; - Redistributions of source code must retain the above copyright
	; notice, this list of conditions and the following disclaimer.
	;
	; - Redistributions in binary form must reproduce the above copyright
	; notice, this list of conditions and the following disclaimer in the
	; documentation and/or other materials provided with the distribution.
	;
	; - Neither the name of the Xiph.org Foundation nor the names of its
	; contributors may be used to endorse or promote products derived from
	; this software without specific prior written permission.
	;
	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
	; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	; [CR] is a note to flag that the instruction can be easily reordered

	%include "nasm.h"

	data_section

	cglobal FLAC__lpc_compute_autocorrelation_asm

	code_section

	; **********************************************************************
	;
	; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
	; {
	; FLAC__real d;
	; unsigned sample, coeff;
	; const unsigned limit = data_len - lag;
	;
	; assert(lag > 0);
	; assert(lag <= data_len);
	;
	; for(coeff = 0; coeff < lag; coeff++)
	; autoc[coeff] = 0.0;
	; for(sample = 0; sample <= limit; sample++){
	; d = data[sample];
	; for(coeff = 0; coeff < lag; coeff++)
	; autoc[coeff] += d * data[sample+coeff];
	; }
	; for(; sample < data_len; sample++){
	; d = data[sample];
	; for(coeff = 0; coeff < data_len - sample; coeff++)
	; autoc[coeff] += d * data[sample+coeff];
	; }
	; }
	;
	FLAC__lpc_compute_autocorrelation_asm:

	push ebp
	lea ebp, [esp + 8]
	push ebx
	push esi
	push edi

	mov edx, [ebp + 8] ; edx == lag
	mov ecx, [ebp + 4] ; ecx == data_len
	mov esi, [ebp] ; esi == data
	mov edi, [ebp + 12] ; edi == autoc

	cmp edx, 1
	ja short .lag_above_1
	.lag_eq_1:
	fldz ; will accumulate autoc[0]
	ALIGN 16
	.lag_1_loop:
	fld dword [esi]
	add esi, byte 4 ; sample++
	fmul st0, st0
	faddp st1, st0
	dec ecx
	jnz .lag_1_loop
	fstp dword [edi]
	jmp .end

	.lag_above_1:
	cmp edx, 2
	ja short .lag_above_2
	.lag_eq_2:
	fldz ; will accumulate autoc[1]
	dec ecx
	fldz ; will accumulate autoc[0]
	fld dword [esi]
	ALIGN 16
	.lag_2_loop:
	add esi, byte 4 ; [CR] sample++
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi]
	fmul st1, st0
	fxch
	faddp st3, st0 ; add to autoc[1]
	dec ecx
	jnz .lag_2_loop
	; clean up the leftovers
	fmul st0, st0
	faddp st1, st0 ; add to autoc[0]
	fstp dword [edi]
	fstp dword [edi + 4]
	jmp .end

	.lag_above_2:
	cmp edx, 3
	ja short .lag_above_3
	.lag_eq_3:
	fldz ; will accumulate autoc[2]
	dec ecx
	fldz ; will accumulate autoc[1]
	dec ecx
	fldz ; will accumulate autoc[0]
	ALIGN 16
	.lag_3_loop:
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st3, st0 ; add to autoc[2]
	dec ecx
	jnz .lag_3_loop
	; clean up the leftovers
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st1, st0
	fxch
	faddp st3, st0 ; add to autoc[1]
	fmul st0, st0
	faddp st1, st0 ; add to autoc[0]
	fstp dword [edi]
	fstp dword [edi + 4]
	fstp dword [edi + 8]
	jmp .end

	.lag_above_3:
	cmp edx, 4
	ja near .lag_above_4
	.lag_eq_4:
	fldz ; will accumulate autoc[3]
	dec ecx
	fldz ; will accumulate autoc[2]
	dec ecx
	fldz ; will accumulate autoc[1]
	dec ecx
	fldz ; will accumulate autoc[0]
	ALIGN 16
	.lag_4_loop:
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st4, st0 ; add to autoc[3]
	dec ecx
	jnz .lag_4_loop
	; clean up the leftovers
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st3, st0 ; add to autoc[2]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st1, st0
	fxch
	faddp st3, st0 ; add to autoc[1]
	fmul st0, st0
	faddp st1, st0 ; add to autoc[0]
	fstp dword [edi]
	fstp dword [edi + 4]
	fstp dword [edi + 8]
	fstp dword [edi + 12]
	jmp .end

	.lag_above_4:
	cmp edx, 5
	ja near .lag_above_5
	.lag_eq_5:
	fldz ; will accumulate autoc[4]
	fldz ; will accumulate autoc[3]
	fldz ; will accumulate autoc[2]
	fldz ; will accumulate autoc[1]
	fldz ; will accumulate autoc[0]
	sub ecx, byte 4
	ALIGN 16
	.lag_5_loop:
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmul st0, st1
	faddp st5, st0 ; add to autoc[3]
	fld dword [esi + 16]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st5, st0 ; add to autoc[4]
	dec ecx
	jnz .lag_5_loop
	; clean up the leftovers
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st4, st0 ; add to autoc[3]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st3, st0 ; add to autoc[2]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st1, st0
	fxch
	faddp st3, st0 ; add to autoc[1]
	fmul st0, st0
	faddp st1, st0 ; add to autoc[0]
	fstp dword [edi]
	fstp dword [edi + 4]
	fstp dword [edi + 8]
	fstp dword [edi + 12]
	fstp dword [edi + 16]
	jmp .end

	.lag_above_5:
	cmp edx, 6
	ja .lag_above_6
	.lag_eq_6:
	fldz ; will accumulate autoc[5]
	fldz ; will accumulate autoc[4]
	fldz ; will accumulate autoc[3]
	fldz ; will accumulate autoc[2]
	fldz ; will accumulate autoc[1]
	fldz ; will accumulate autoc[0]
	sub ecx, byte 5
	ALIGN 16
	.lag_6_loop:
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmul st0, st1
	faddp st5, st0 ; add to autoc[3]
	fld dword [esi + 16]
	fmul st0, st1
	faddp st6, st0 ; add to autoc[4]
	fld dword [esi + 20]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st6, st0 ; add to autoc[5]
	dec ecx
	jnz .lag_6_loop
	; clean up the leftovers
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmul st0, st1
	faddp st5, st0 ; add to autoc[3]
	fld dword [esi + 16]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st5, st0 ; add to autoc[4]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmul st0, st1
	faddp st4, st0 ; add to autoc[2]
	fld dword [esi + 12]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st4, st0 ; add to autoc[3]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st0, st1
	faddp st3, st0 ; add to autoc[1]
	fld dword [esi + 8]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st3, st0 ; add to autoc[2]
	fld dword [esi]
	fld st0
	fmul st0, st0
	faddp st2, st0 ; add to autoc[0]
	fld dword [esi + 4]
	fmul st1, st0
	fxch
	faddp st3, st0 ; add to autoc[1]
	fmul st0, st0
	faddp st1, st0 ; add to autoc[0]
	fstp dword [edi]
	fstp dword [edi + 4]
	fstp dword [edi + 8]
	fstp dword [edi + 12]
	fstp dword [edi + 16]
	fstp dword [edi + 20]
	jmp .end

	.lag_above_6:
	; for(coeff = 0; coeff < lag; coeff++)
	; autoc[coeff] = 0.0;
	lea ecx, [edx * 2] ; ecx = # of dwords of 0 to write
	xor eax, eax
	rep stosd
	mov ecx, [ebp + 4] ; ecx == data_len
	mov edi, [ebp + 12] ; edi == autoc
	; const unsigned limit = data_len - lag;
	sub ecx, edx
	inc ecx ; we are looping <= limit so we add one to the counter
	; for(sample = 0; sample <= limit; sample++){
	; d = data[sample];
	; for(coeff = 0; coeff < lag; coeff++)
	; autoc[coeff] += d * data[sample+coeff];
	; }
	xor eax, eax ; eax == sample <- 0
	ALIGN 16
	.outer_loop:
	push eax ; save sample
	fld dword [esi + eax * 4] ; ST = d <- data[sample]
	mov ebx, eax ; ebx == sample+coeff <- sample
	mov edx, [ebp + 8] ; edx <- lag
	xor eax, eax ; eax == coeff <- 0
	ALIGN 16
	.inner_loop:
	fld st0 ; ST = d d
	fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d
	fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d
	inc ebx ; (sample+coeff)++
	inc eax ; coeff++
	dec edx
	jnz .inner_loop
	pop eax ; restore sample
	fstp st0 ; pop d, ST = empty
	inc eax ; sample++
	loop .outer_loop
	; for(; sample < data_len; sample++){
	; d = data[sample];
	; for(coeff = 0; coeff < data_len - sample; coeff++)
	; autoc[coeff] += d * data[sample+coeff];
	; }
	mov ecx, [ebp + 8] ; ecx <- lag
	dec ecx ; ecx <- lag - 1
	jz .outer_end ; skip loop if 0
	.outer_loop2:
	push eax ; save sample
	fld dword [esi + eax * 4] ; ST = d <- data[sample]
	mov ebx, eax ; ebx == sample+coeff <- sample
	mov edx, [ebp + 4] ; edx <- data_len
	sub edx, eax ; edx <- data_len-sample
	xor eax, eax ; eax == coeff <- 0
	.inner_loop2:
	fld st0 ; ST = d d
	fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d
	fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d
	inc ebx ; (sample+coeff)++
	inc eax ; coeff++
	dec edx
	jnz .inner_loop2
	pop eax ; restore sample
	fstp st0 ; pop d, ST = empty
	inc eax ; sample++
	loop .outer_loop2
	.outer_end:
	jmp .end

	.lag_eq_6_plus_1:
	mov ecx, [ebp + 4] ; ecx == data_len
	mov esi, [ebp] ; esi == data
	mov edi, [ebp + 12] ; edi == autoc
	fldz ; will accumulate autoc[6]
	sub ecx, byte 6
	ALIGN 16
	.lag_6_1_loop:
	fld dword [esi]
	fld dword [esi + 24]
	fmulp st1, st0
	add esi, byte 4 ; [CR] sample++
	faddp st1, st0 ; add to autoc[6]
	dec ecx
	jnz .lag_6_1_loop
	fstp dword [edi + 24]
	jmp .end

	.lag_eq_6_plus_2:
	; Two extra coefficients: autoc[6] and autoc[7] are accumulated on the
	; x87 stack (a6 on top of a7) and written out once at the end.
	mov edi, [ebp + 12] ; edi <- autoc
	mov esi, [ebp] ; esi <- data, advanced one sample per pass
	mov ecx, [ebp + 4] ; ecx <- data_len
	sub ecx, byte 7 ; ecx <- data_len - 7 full passes
	fldz ; a7
	fldz ; a6, ST = a6 a7
	ALIGN 16
	.lag_6_2_loop:
	fld dword [esi] ; ST = d a6 a7
	fld st0 ; ST = d d a6 a7
	fmul dword [esi + 24] ; ST = d*data[s+6] d a6 a7
	faddp st2, st0 ; a6 += product, ST = d a6 a7
	fmul dword [esi + 28] ; ST = d*data[s+7] a6 a7
	faddp st2, st0 ; a7 += product, ST = a6 a7
	add esi, byte 4 ; s++
	dec ecx
	jnz .lag_6_2_loop
	; last pass: only the lag-6 product remains
	fld dword [esi] ; ST = d a6 a7
	fmul dword [esi + 24] ; ST = d*data[s+6] a6 a7
	faddp st1, st0 ; a6 += product, ST = a6 a7
	fstp dword [edi + 24] ; autoc[6] <- a6
	fstp dword [edi + 28] ; autoc[7] <- a7, ST = empty
	jmp .end

	.lag_eq_6_plus_3:
	; Three extra coefficients: autoc[6..8] are accumulated on the x87
	; stack (a6 on top) and written out once at the end.
	mov edi, [ebp + 12] ; edi <- autoc
	mov esi, [ebp] ; esi <- data, advanced one sample per pass
	mov ecx, [ebp + 4] ; ecx <- data_len
	sub ecx, byte 8 ; ecx <- data_len - 8 full passes
	fldz ; a8
	fldz ; a7
	fldz ; a6, ST = a6 a7 a8
	ALIGN 16
	.lag_6_3_loop:
	fld dword [esi] ; ST = d a6 a7 a8
	fld st0 ; ST = d d a6 a7 a8
	fmul dword [esi + 24] ; d*data[s+6]
	faddp st2, st0 ; a6 += product, ST = d a6 a7 a8
	fld st0
	fmul dword [esi + 28] ; d*data[s+7]
	faddp st3, st0 ; a7 += product, ST = d a6 a7 a8
	fmul dword [esi + 32] ; ST = d*data[s+8] a6 a7 a8
	faddp st3, st0 ; a8 += product, ST = a6 a7 a8
	add esi, byte 4 ; s++
	dec ecx
	jnz .lag_6_3_loop
	; remaining passes: one product fewer each time
	; -- two products (lags 6, 7)
	fld dword [esi] ; ST = d a6 a7 a8
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fmul dword [esi + 28]
	faddp st2, st0 ; a7 += d*data[s+7], ST = a6 a7 a8
	add esi, byte 4 ; s++
	; -- one product (lag 6)
	fld dword [esi]
	fmul dword [esi + 24]
	faddp st1, st0 ; a6 += d*data[s+6], ST = a6 a7 a8
	fstp dword [edi + 24] ; autoc[6] <- a6
	fstp dword [edi + 28] ; autoc[7] <- a7
	fstp dword [edi + 32] ; autoc[8] <- a8, ST = empty
	jmp .end

	.lag_eq_6_plus_4:
	; Four extra coefficients: autoc[6..9] are accumulated on the x87
	; stack (a6 on top) and written out once at the end.
	mov edi, [ebp + 12] ; edi <- autoc
	mov esi, [ebp] ; esi <- data, advanced one sample per pass
	mov ecx, [ebp + 4] ; ecx <- data_len
	sub ecx, byte 9 ; ecx <- data_len - 9 full passes
	fldz ; a9
	fldz ; a8
	fldz ; a7
	fldz ; a6, ST = a6 a7 a8 a9
	ALIGN 16
	.lag_6_4_loop:
	fld dword [esi] ; ST = d a6 a7 a8 a9
	fld st0 ; ST = d d a6 a7 a8 a9
	fmul dword [esi + 24] ; d*data[s+6]
	faddp st2, st0 ; a6 += product, ST = d a6 a7 a8 a9
	fld st0
	fmul dword [esi + 28] ; d*data[s+7]
	faddp st3, st0 ; a7 += product
	fld st0
	fmul dword [esi + 32] ; d*data[s+8]
	faddp st4, st0 ; a8 += product, ST = d a6 a7 a8 a9
	fmul dword [esi + 36] ; ST = d*data[s+9] a6 a7 a8 a9
	faddp st4, st0 ; a9 += product, ST = a6 a7 a8 a9
	add esi, byte 4 ; s++
	dec ecx
	jnz .lag_6_4_loop
	; remaining passes: one product fewer each time
	; -- three products (lags 6..8)
	fld dword [esi] ; ST = d a6 a7 a8 a9
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fmul dword [esi + 32]
	faddp st3, st0 ; a8 += d*data[s+8], ST = a6 a7 a8 a9
	add esi, byte 4 ; s++
	; -- two products (lags 6, 7)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fmul dword [esi + 28]
	faddp st2, st0 ; a7 += d*data[s+7], ST = a6 a7 a8 a9
	add esi, byte 4 ; s++
	; -- one product (lag 6)
	fld dword [esi]
	fmul dword [esi + 24]
	faddp st1, st0 ; a6 += d*data[s+6], ST = a6 a7 a8 a9
	fstp dword [edi + 24] ; autoc[6] <- a6
	fstp dword [edi + 28] ; autoc[7] <- a7
	fstp dword [edi + 32] ; autoc[8] <- a8
	fstp dword [edi + 36] ; autoc[9] <- a9, ST = empty
	jmp .end

	.lag_eq_6_plus_5:
	; Five extra coefficients: autoc[6..10] are accumulated on the x87
	; stack (a6 on top) and written out once at the end.
	mov edi, [ebp + 12] ; edi <- autoc
	mov esi, [ebp] ; esi <- data, advanced one sample per pass
	mov ecx, [ebp + 4] ; ecx <- data_len
	sub ecx, byte 10 ; ecx <- data_len - 10 full passes
	fldz ; a10
	fldz ; a9
	fldz ; a8
	fldz ; a7
	fldz ; a6, ST = a6 a7 a8 a9 a10
	ALIGN 16
	.lag_6_5_loop:
	fld dword [esi] ; ST = d a6 a7 a8 a9 a10
	fld st0 ; ST = d d a6 a7 a8 a9 a10
	fmul dword [esi + 24] ; d*data[s+6]
	faddp st2, st0 ; a6 += product, ST = d a6 ... a10
	fld st0
	fmul dword [esi + 28] ; d*data[s+7]
	faddp st3, st0 ; a7 += product
	fld st0
	fmul dword [esi + 32] ; d*data[s+8]
	faddp st4, st0 ; a8 += product
	fld st0
	fmul dword [esi + 36] ; d*data[s+9]
	faddp st5, st0 ; a9 += product, ST = d a6 ... a10
	fmul dword [esi + 40] ; ST = d*data[s+10] a6 ... a10
	faddp st5, st0 ; a10 += product, ST = a6 ... a10
	add esi, byte 4 ; s++
	dec ecx
	jnz .lag_6_5_loop
	; remaining passes: one product fewer each time
	; -- four products (lags 6..9)
	fld dword [esi] ; ST = d a6 ... a10
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fld st0
	fmul dword [esi + 32]
	faddp st4, st0 ; a8 += d*data[s+8]
	fmul dword [esi + 36]
	faddp st4, st0 ; a9 += d*data[s+9], ST = a6 ... a10
	add esi, byte 4 ; s++
	; -- three products (lags 6..8)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fmul dword [esi + 32]
	faddp st3, st0 ; a8 += d*data[s+8], ST = a6 ... a10
	add esi, byte 4 ; s++
	; -- two products (lags 6, 7)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fmul dword [esi + 28]
	faddp st2, st0 ; a7 += d*data[s+7], ST = a6 ... a10
	add esi, byte 4 ; s++
	; -- one product (lag 6)
	fld dword [esi]
	fmul dword [esi + 24]
	faddp st1, st0 ; a6 += d*data[s+6], ST = a6 ... a10
	fstp dword [edi + 24] ; autoc[6] <- a6
	fstp dword [edi + 28] ; autoc[7] <- a7
	fstp dword [edi + 32] ; autoc[8] <- a8
	fstp dword [edi + 36] ; autoc[9] <- a9
	fstp dword [edi + 40] ; autoc[10] <- a10, ST = empty
	jmp .end

	.lag_eq_6_plus_6:
	; Six extra coefficients: autoc[6..11] are accumulated on the x87
	; stack (a6 on top) and written out once at the end. With six
	; accumulators plus d plus one product, all eight x87 registers are
	; in use at the deepest point of the loop.
	mov edi, [ebp + 12] ; edi <- autoc
	mov esi, [ebp] ; esi <- data, advanced one sample per pass
	mov ecx, [ebp + 4] ; ecx <- data_len
	sub ecx, byte 11 ; ecx <- data_len - 11 full passes
	fldz ; a11
	fldz ; a10
	fldz ; a9
	fldz ; a8
	fldz ; a7
	fldz ; a6, ST = a6 a7 a8 a9 a10 a11
	ALIGN 16
	.lag_6_6_loop:
	fld dword [esi] ; ST = d a6 ... a11
	fld st0 ; ST = d d a6 ... a11
	fmul dword [esi + 24] ; d*data[s+6]
	faddp st2, st0 ; a6 += product, ST = d a6 ... a11
	fld st0
	fmul dword [esi + 28] ; d*data[s+7]
	faddp st3, st0 ; a7 += product
	fld st0
	fmul dword [esi + 32] ; d*data[s+8]
	faddp st4, st0 ; a8 += product
	fld st0
	fmul dword [esi + 36] ; d*data[s+9]
	faddp st5, st0 ; a9 += product
	fld st0
	fmul dword [esi + 40] ; d*data[s+10]
	faddp st6, st0 ; a10 += product, ST = d a6 ... a11
	fmul dword [esi + 44] ; ST = d*data[s+11] a6 ... a11
	faddp st6, st0 ; a11 += product, ST = a6 ... a11
	add esi, byte 4 ; s++
	dec ecx
	jnz .lag_6_6_loop
	; remaining passes: one product fewer each time
	; -- five products (lags 6..10)
	fld dword [esi] ; ST = d a6 ... a11
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fld st0
	fmul dword [esi + 32]
	faddp st4, st0 ; a8 += d*data[s+8]
	fld st0
	fmul dword [esi + 36]
	faddp st5, st0 ; a9 += d*data[s+9]
	fmul dword [esi + 40]
	faddp st5, st0 ; a10 += d*data[s+10], ST = a6 ... a11
	add esi, byte 4 ; s++
	; -- four products (lags 6..9)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fld st0
	fmul dword [esi + 32]
	faddp st4, st0 ; a8 += d*data[s+8]
	fmul dword [esi + 36]
	faddp st4, st0 ; a9 += d*data[s+9], ST = a6 ... a11
	add esi, byte 4 ; s++
	; -- three products (lags 6..8)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fld st0
	fmul dword [esi + 28]
	faddp st3, st0 ; a7 += d*data[s+7]
	fmul dword [esi + 32]
	faddp st3, st0 ; a8 += d*data[s+8], ST = a6 ... a11
	add esi, byte 4 ; s++
	; -- two products (lags 6, 7)
	fld dword [esi]
	fld st0
	fmul dword [esi + 24]
	faddp st2, st0 ; a6 += d*data[s+6]
	fmul dword [esi + 28]
	faddp st2, st0 ; a7 += d*data[s+7], ST = a6 ... a11
	add esi, byte 4 ; s++
	; -- one product (lag 6)
	fld dword [esi]
	fmul dword [esi + 24]
	faddp st1, st0 ; a6 += d*data[s+6], ST = a6 ... a11
	fstp dword [edi + 24] ; autoc[6] <- a6
	fstp dword [edi + 28] ; autoc[7] <- a7
	fstp dword [edi + 32] ; autoc[8] <- a8
	fstp dword [edi + 36] ; autoc[9] <- a9
	fstp dword [edi + 40] ; autoc[10] <- a10
	fstp dword [edi + 44] ; autoc[11] <- a11, ST = empty
	jmp .end

	.end:
	; Shared exit: every lag-specific path above finishes with `jmp .end`.
	; NOTE(review): the four pops presumably undo pushes made in the
	; prologue, which is outside this view -- confirm the order matches.
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

	; end