# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

	.text
# Evaluate sum of squares of signed 16-bit input samples
#   long long sumsq_sse2_assist(signed short *in, int cnt);
#
# ABI:     i386 cdecl; 64-bit result returned in edx:eax
# In:      8(%ebp)  = in   -- sample pointer; MUST be 16-byte aligned,
#                            since movaps faults on unaligned addresses
#          12(%ebp) = cnt  -- sample count; only floor(cnt/8)*8 samples are
#                            processed (an "assist" routine: the C caller
#                            presumably handles any remainder -- TODO confirm)
# Out:     edx:eax  = sum of in[i]*in[i] over the processed samples
# Clobbers: xmm0-xmm3, flags (esi and ecx are saved and restored)
	.global sumsq_sse2_assist
	.type sumsq_sse2_assist,@function
	.align 16
sumsq_sse2_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %ecx			# NOTE(review): ecx is caller-saved in
					# cdecl; this save is harmless but not
					# required by the ABI

	movl 8(%ebp),%esi		# esi = in (current read pointer)
	movl 12(%ebp),%ecx		# ecx = cnt (samples remaining)
	pxor %xmm2,%xmm2		# zero sum: xmm2 holds two 64-bit
					# partial sums, one per qword lane
	movaps low,%xmm3		# load mask selecting the low dword of
					# each qword lane

# Main loop: consume 8 samples (16 bytes) per iteration.
1:	subl $8,%ecx
	jl 2f				# fewer than 8 samples left -> finish
	movaps (%esi),%xmm0		# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd %xmm0,%xmm0		# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
					# each pair-sum is at most 2*32768^2 =
					# 2^31, so it fits in a 32-bit lane
	movaps %xmm0,%xmm1
	pand %xmm3,%xmm1		# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	paddq %xmm1,%xmm2		# sum even-numbered dwords (64-bit adds)
	psrlq $32,%xmm0			# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq %xmm0,%xmm2		# sum odd-numbered dwords
	addl $16,%esi
	jmp 1b

# Fold the two 64-bit lane sums, then split into edx:eax for the
# long long return value.
2:	movaps %xmm2,%xmm0
	psrldq $8,%xmm0			# bring the high qword lane down
	paddq %xmm2,%xmm0		# combine 64-bit sums

	movd %xmm0,%eax			# low 32 bits of sum
	psrldq $4,%xmm0			# shift next dword into position
	movd %xmm0,%edx			# high 32 bits of sum

	popl %ecx
	popl %esi
	popl %ebp
	ret
46
# Mask that keeps the low dword of each qword lane (little-endian
# 0x00000000ffffffff, twice); used by sumsq_sse2_assist to isolate the
# even-numbered pmaddwd results before the 64-bit paddq accumulation.
# This is a read-only constant, so it belongs in .rodata rather than
# writable .data (W^X hygiene; bytes and label are unchanged).
# Must stay 16-byte aligned: it is loaded with movaps.
	.section .rodata
	.align 16
low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0