# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

| 5 | .text |
| 6 | # Evaluate sum of squares of signed 16-bit input samples |
| 7 | # long long sumsq_sse2_assist(signed short *in,int cnt); |
| 8 | .global sumsq_sse2_assist |
| 9 | .type sumsq_sse2_assist,@function |
| 10 | .align 16 |
| 11 | sumsq_sse2_assist: |
| 12 | pushl %ebp |
| 13 | movl %esp,%ebp |
| 14 | pushl %esi |
| 15 | pushl %ecx |
| 16 | |
| 17 | movl 8(%ebp),%esi |
| 18 | movl 12(%ebp),%ecx |
| 19 | pxor %xmm2,%xmm2 # zero sum |
| 20 | movaps low,%xmm3 # load mask |
| 21 | |
| 22 | 1: subl $8,%ecx |
| 23 | jl 2f |
| 24 | movaps (%esi),%xmm0 # S0 S1 S2 S3 S4 S5 S6 S7 |
| 25 | pmaddwd %xmm0,%xmm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7) |
| 26 | movaps %xmm0,%xmm1 |
| 27 | pand %xmm3,%xmm1 # (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0 |
| 28 | paddq %xmm1,%xmm2 # sum even-numbered dwords |
| 29 | psrlq $32,%xmm0 # (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0 |
| 30 | paddq %xmm0,%xmm2 # sum odd-numbered dwords |
| 31 | addl $16,%esi |
| 32 | jmp 1b |
| 33 | |
| 34 | 2: movaps %xmm2,%xmm0 |
| 35 | psrldq $8,%xmm0 |
| 36 | paddq %xmm2,%xmm0 # combine 64-bit sums |
| 37 | |
| 38 | movd %xmm0,%eax # low 32 bits of sum |
| 39 | psrldq $4,%xmm0 |
| 40 | movd %xmm0,%edx # high 32 bits of sum |
| 41 | |
| 42 | popl %ecx |
| 43 | popl %esi |
| 44 | popl %ebp |
| 45 | ret |
| 46 | |
| 47 | .data |
| 48 | .align 16 |
| 49 | low: .byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0 |