# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

	.text
# Evaluate sum of squares of signed 16-bit input samples
#   long long sumsq_sse2_assist(signed short *in, int cnt);
#
# ABI:     i386 cdecl; 64-bit result returned in edx:eax
# In:      8(%ebp)  = in   -- sample pointer; MUST be 16-byte aligned,
#                            since movaps faults on unaligned addresses
#          12(%ebp) = cnt  -- sample count; only floor(cnt/8)*8 samples are
#                            processed (an "assist" routine: the C caller
#                            presumably handles any remainder -- TODO confirm)
# Out:     edx:eax  = sum of in[i]*in[i] over the processed samples
# Clobbers: xmm0-xmm3, flags (esi and ecx are saved and restored)
	.global sumsq_sse2_assist
	.type sumsq_sse2_assist,@function
	.align 16
sumsq_sse2_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %ecx			# NOTE(review): ecx is caller-saved in
					# cdecl; this save is harmless but not
					# required by the ABI

	movl 8(%ebp),%esi		# esi = in (current read pointer)
	movl 12(%ebp),%ecx		# ecx = cnt (samples remaining)
	pxor %xmm2,%xmm2		# zero sum: xmm2 holds two 64-bit
					# partial sums, one per qword lane
	movaps low,%xmm3		# load mask selecting the low dword of
					# each qword lane

# Main loop: consume 8 samples (16 bytes) per iteration.
1:	subl $8,%ecx
	jl 2f				# fewer than 8 samples left -> finish
	movaps (%esi),%xmm0		# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd %xmm0,%xmm0		# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
					# each pair-sum is at most 2*32768^2 =
					# 2^31, so it fits in a 32-bit lane
	movaps %xmm0,%xmm1
	pand %xmm3,%xmm1		# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	paddq %xmm1,%xmm2		# sum even-numbered dwords (64-bit adds)
	psrlq $32,%xmm0			# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq %xmm0,%xmm2		# sum odd-numbered dwords
	addl $16,%esi
	jmp 1b

# Fold the two 64-bit lane sums, then split into edx:eax for the
# long long return value.
2:	movaps %xmm2,%xmm0
	psrldq $8,%xmm0			# bring the high qword lane down
	paddq %xmm2,%xmm0		# combine 64-bit sums

	movd %xmm0,%eax			# low 32 bits of sum
	psrldq $4,%xmm0			# shift next dword into position
	movd %xmm0,%edx			# high 32 bits of sum

	popl %ecx
	popl %esi
	popl %ebp
	ret
46
# Mask that keeps the low dword of each qword lane (little-endian
# 0x00000000ffffffff, twice); used by sumsq_sse2_assist to isolate the
# even-numbered pmaddwd results before the 64-bit paddq accumulation.
# This is a read-only constant, so it belongs in .rodata rather than
# writable .data (W^X hygiene; bytes and label are unchanged).
# Must stay 16-byte aligned: it is loaded with movaps.
	.section .rodata
	.align 16
low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0