# MMX assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)
#
# Provenance (repository-viewer residue, kept for reference):
#   blob b3bac6633b18a305871510edb87b2a7b76629c53
#   last change 4e213d5, Bill Yi, 2015-06-23 13:53:11 -0700
4
# Target: 32-bit x86 with MMX.  Syntax: GNU as, AT&T operand order (src, dst).
        .text

#-----------------------------------------------------------------------
# long long sumsq_mmx_assist(signed short *in, int cnt);
#
# Evaluate the sum of squares of signed 16-bit input samples, accumulated
# in 64 bits.
#
# In:    8(%ebp)  = in    pointer to the first sample
#        12(%ebp) = cnt   number of samples available at 'in'
# Out:   %edx:%eax = sum of in[i]*in[i] over the complete groups of 8
#                    samples that fit in cnt
# Clobb: %mm0, %mm6, flags.  %esi, %ecx and %ebx are saved and restored.
#        emms is executed before returning.
#
# Only whole groups of 8 samples are consumed: the loop stops as soon as
# fewer than 8 remain, so a cnt below 8 (including zero or a negative
# value) yields 0 and up to 7 trailing samples are never read.
# NOTE(review): presumably the C caller sums those leftover samples
# itself -- confirm against the caller before adding a tail loop here.
#
# Overflow: pmaddwd produces 32-bit sums of two squares, each at most
# 2 * 32768^2 = 2^31, which is exact when read as an unsigned dword.
# Adding two such dwords with paddd can reach 2^32 and wrap to zero when
# the samples are -32768, so every dword is folded into the 64-bit total
# individually with add/adc instead of being pre-summed in MMX.
#-----------------------------------------------------------------------
        .globl  sumsq_mmx_assist
        .type   sumsq_mmx_assist,@function
        .p2align 4
sumsq_mmx_assist:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        pushl   %ecx                    # scratch under cdecl; still saved so the
                                        # register contract of this routine is unchanged
        pushl   %ebx

        movl    8(%ebp),%esi            # esi = in
        movl    12(%ebp),%ecx           # ecx = samples remaining
        xorl    %eax,%eax               # edx:eax = 64-bit running total
        xorl    %edx,%edx

.Lsumsq_next8:
        subl    $8,%ecx                 # claim the next 8 samples
        jl      .Lsumsq_done            # fewer than 8 left: stop
        movq    (%esi),%mm0             # S0 S1 S2 S3
        pmaddwd %mm0,%mm0               # (S0^2+S1^2) (S2^2+S3^2)
        movq    8(%esi),%mm6            # S4 S5 S6 S7
        pmaddwd %mm6,%mm6               # (S4^2+S5^2) (S6^2+S7^2)

        movd    %mm0,%ebx               # S0^2+S1^2
        addl    %ebx,%eax
        adcl    $0,%edx
        psrlq   $32,%mm0
        movd    %mm0,%ebx               # S2^2+S3^2
        addl    %ebx,%eax
        adcl    $0,%edx

        movd    %mm6,%ebx               # S4^2+S5^2
        addl    %ebx,%eax
        adcl    $0,%edx
        psrlq   $32,%mm6
        movd    %mm6,%ebx               # S6^2+S7^2
        addl    %ebx,%eax
        adcl    $0,%edx

        addl    $16,%esi                # advance past 8 samples (16 bytes)
        jmp     .Lsumsq_next8

.Lsumsq_done:
        emms                            # leave MMX state so x87 code can run
        popl    %ebx
        popl    %ecx
        popl    %esi
        popl    %ebp
        ret
        .size   sumsq_mmx_assist, .-sumsq_mmx_assist
48
#-----------------------------------------------------------------------
# long sumsq_wd_mmx_assist(signed short *in, int cnt);
#
# Quick sum of squares of signed 16-bit input samples, kept entirely in
# 32-bit arithmetic.
#
# In:    8(%ebp)  = in    pointer to the first sample
#        12(%ebp) = cnt   number of samples available at 'in'
# Out:   %eax = sum of in[i]*in[i] over the complete groups of 8 samples
#               that fit in cnt, reduced modulo 2^32
# Clobb: %ecx, %mm0, %mm1, %mm2, flags.  %esi is saved and restored.
#        emms is executed before returning.
#
# Nothing here detects overflow: both paddd and the final reduction wrap
# silently, so this version is only safe for small numbers of small
# input values.  As in the loop test below, fewer than 8 remaining
# samples ends the loop, so a cnt below 8 returns 0.
#-----------------------------------------------------------------------
        .globl  sumsq_wd_mmx_assist
        .type   sumsq_wd_mmx_assist,@function
        .p2align 4
sumsq_wd_mmx_assist:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi

        movl    12(%ebp),%ecx           # ecx = samples remaining
        movl    8(%ebp),%esi            # esi = in
        pxor    %mm2,%mm2               # mm2 = two 32-bit partial sums, both zero
        jmp     .Lsumsq_wd_check

.Lsumsq_wd_body:
        movq    (%esi),%mm0             # S0 S1 S2 S3
        movq    8(%esi),%mm1            # S4 S5 S6 S7
        pmaddwd %mm0,%mm0               # (S0*S0+S1*S1) (S2*S2+S3*S3)
        pmaddwd %mm1,%mm1               # (S4*S4+S5*S5) (S6*S6+S7*S7)
        addl    $16,%esi                # advance past 8 samples (16 bytes)
        paddd   %mm0,%mm2               # accumulate both lanes
        paddd   %mm1,%mm2

.Lsumsq_wd_check:
        subl    $8,%ecx                 # claim the next 8 samples
        jge     .Lsumsq_wd_body         # at least 8 were left: process them

        movq    %mm2,%mm0               # copy so the high lane can be shifted down
        psrlq   $32,%mm0                # mm0 low dword = high-lane partial sum
        paddd   %mm0,%mm2               # mm2 low dword = low lane + high lane
        movd    %mm2,%eax               # return the combined 32-bit sum
        emms                            # leave MMX state so x87 code can run
        popl    %esi
        popl    %ebp
        ret