Blame - dotprod_mmx_assist.s - platform/external/fec

blob: 25deffd998f8bb67c1ae90e34d543564af0e0452 [file] [log] [blame]

Bill Yi	4e213d5	2015-06-23 13:53:11 -0700	[diff] [blame]	1	# SIMD MMX dot product
				2	# Equivalent to the following C code:
				3	# long dotprod(signed short *a,signed short *b,int cnt)
				4	# {
				5	# long sum = 0;
				6	# cnt *= 4;
				7	# while(cnt--)
				8	# sum += *a++ * *b++;
				9	# return sum;
				10	# }
				11	# a and b should also be 64-bit aligned, or speed will suffer greatly
				12	# Copyright 1999, Phil Karn KA9Q
				13	# May be used under the terms of the GNU Lesser General Public License (LGPL)
				14
				.text
#-----------------------------------------------------------------------
# long dotprod_mmx_assist(signed short *a, signed short *b, int cnt)
#
# Syntax: GNU as, AT&T operand order, 32-bit x86, ELF.
# ABI:    arguments are read from the stack relative to the frame pointer.
# In:     8(%ebp)  = a    pointer to the first vector of signed 16-bit values
#         12(%ebp) = b    pointer to the second vector of signed 16-bit values
#         16(%ebp) = cnt  number of 4-element (8-byte) groups to process;
#                         cnt <= 0 reads no memory and returns 0
# Out:    %eax = a[0]*b[0] + a[1]*b[1] + ... over 4*cnt elements.
#         Partial sums are kept in two 32-bit lanes and added with
#         wrapping (non-saturating) 32-bit arithmetic.
# Saved:  %ebx, %ecx, %esi, %edi, %ebp are restored before returning.
# Clobb:  %eax, flags, %mm0, %mm1; emms is executed before returning,
#         so the x87/MMX state is left empty.
# Note:   a and b are only read. 8-byte alignment is not required by the
#         instructions used, but unaligned vectors are slower.
#-----------------------------------------------------------------------
        .globl  dotprod_mmx_assist
        .type   dotprod_mmx_assist, @function
dotprod_mmx_assist:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi
        pushl   %edi
        pushl   %ecx                    # kept so %ecx is still preserved for callers
        pushl   %ebx

        movl    8(%ebp), %esi           # esi = a
        movl    12(%ebp), %edi          # edi = b
        movl    16(%ebp), %ecx          # ecx = cnt (groups of 4 elements left)
        pxor    %mm0, %mm0              # mm0 = two 32-bit partial sums, both 0

        # Unrolled loop: 4 groups (16 elements, 32 bytes per vector) per pass.
        # Invariant inside the loop: ecx = groups still left after this pass, >= 0.
        subl    $4, %ecx
        jl      .Lunrolled_done         # fewer than 4 groups: skip to the tail loop
        .p2align 4
.Lunrolled:
        movq    (%esi), %mm1            # mm1 = a[3],a[2],a[1],a[0]
        pmaddwd (%edi), %mm1            # mm1 = a[3]b[3]+a[2]b[2], a[1]b[1]+a[0]b[0]
        paddd   %mm1, %mm0

        movq    8(%esi), %mm1
        pmaddwd 8(%edi), %mm1
        paddd   %mm1, %mm0

        movq    16(%esi), %mm1
        pmaddwd 16(%edi), %mm1
        paddd   %mm1, %mm0

        movq    24(%esi), %mm1
        addl    $32, %esi               # pointer bumps interleaved with the MMX ops
        pmaddwd 24(%edi), %mm1          # edi not advanced yet, so offset 24 is still right
        addl    $32, %edi
        paddd   %mm1, %mm0

        subl    $4, %ecx
        jge     .Lunrolled              # at least 4 more groups remain
.Lunrolled_done:
        addl    $4, %ecx                # undo the last subtract: ecx = leftover groups

        # Tail loop: one group (4 elements, 8 bytes per vector) per pass.
        subl    $1, %ecx
        jl      .Ltail_done
.Ltail:
        movq    (%esi), %mm1
        addl    $8, %esi
        pmaddwd (%edi), %mm1
        addl    $8, %edi
        paddd   %mm1, %mm0
        subl    $1, %ecx
        jge     .Ltail
.Ltail_done:

        # Horizontal add of the two 32-bit lanes of mm0 into eax.
        movd    %mm0, %ebx              # ebx = low lane
        punpckhdq %mm0, %mm0            # copy high lane into the low lane
        movd    %mm0, %eax              # eax = high lane
        addl    %ebx, %eax              # eax = total
        emms                            # leave MMX state so x87 code can run afterwards

        popl    %ebx
        popl    %ecx
        popl    %edi
        popl    %esi
        leave                           # movl %ebp,%esp ; popl %ebp
        ret
        .size   dotprod_mmx_assist, .-dotprod_mmx_assist

        # Declare that this object does not need an executable stack. The
        # push/pop pair leaves the current section unchanged for any code
        # that follows.
        .pushsection .note.GNU-stack,"",@progbits
        .popsection