# MMX assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

	.text

# Evaluate sum of squares of signed 16-bit input samples
# long long sumsq_mmx_assist(signed short *in,int cnt);
#
# ABI:   i386 cdecl. Stack args: 8(%ebp) = in, 12(%ebp) = cnt (count in
#        samples, not bytes). 64-bit result returned in %edx:%eax.
# Clobb: %eax, %edx, %mm0, %mm6, flags (%ebx/%ecx/%esi saved & restored).
# Note:  processes 8 samples (16 bytes) per pass; a tail of fewer than 8
#        samples is ignored here -- presumably handled by the C caller
#        (TODO confirm against the C-side sumsq wrapper).
	.global sumsq_mmx_assist
	.type sumsq_mmx_assist,@function
	.align 16
sumsq_mmx_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi		# callee-saved: input pointer
	pushl %ecx		# caller-saved under cdecl; saved anyway (defensive)
	pushl %ebx		# callee-saved: movd scratch

	movl 8(%ebp),%esi	# esi = in
	movl 12(%ebp),%ecx	# ecx = samples remaining
	xor %eax,%eax		# edx:eax = 64-bit accumulator = 0
	xor %edx,%edx

# Since 4 * 32767**2 < 2**32, we can accumulate two at a time
# (each pmaddwd lane holds at most 2*32767**2, so a packed paddd of two
# lanes cannot wrap a 32-bit dword).
1:	subl $8,%ecx		# need a full group of 8 samples
	jl 2f			# signed: also exits on negative leftover
	movq (%esi),%mm0	# S0 S1 S2 S3
	pmaddwd %mm0,%mm0	# (S0^2+S1^2) (S2^2+S3^2)
	movq 8(%esi),%mm6	# S4 S5 S6 S7
	pmaddwd %mm6,%mm6	# (S4^2+S5^2) (S6^2+S7^2)
	paddd %mm6,%mm0		# (S0^2+S1^2+S4^2+S5^2)(S2^2+S3^2+S6^2+S7^2)
	movd %mm0,%ebx		# low packed dword
	addl %ebx,%eax		# fold into 64-bit accumulator...
	adcl $0,%edx		# ...propagating carry into the high half
	psrlq $32,%mm0		# bring high packed dword down
	movd %mm0,%ebx
	addl %ebx,%eax
	adcl $0,%edx
	addl $16,%esi		# advance 8 samples (16 bytes)
	jmp 1b

2:	emms			# clear MMX state so caller may use x87 FP
	popl %ebx
	popl %ecx
	popl %esi
	popl %ebp
	ret
| 48 | |
# Evaluate sum of squares of signed 16-bit input samples
# long sumsq_wd_mmx_assist(signed short *in,int cnt);
# Quick version, only safe for small numbers of small input values...
#
# ABI:   i386 cdecl. Stack args: 8(%ebp) = in, 12(%ebp) = cnt (samples).
#        32-bit result returned in %eax.
# Clobb: %eax, %ecx, %edx, %mm0, %mm1, %mm2, flags (%esi saved & restored).
# Note:  unlike sumsq_mmx_assist, the two packed partial sums in %mm2 are
#        plain 32-bit lanes with no carry widening -- they silently wrap if
#        the running sums exceed 2**31, hence the "only safe for small
#        inputs" warning above. Tail of <8 samples is ignored here.
	.global sumsq_wd_mmx_assist
	.type sumsq_wd_mmx_assist,@function
	.align 16
sumsq_wd_mmx_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi		# callee-saved: input pointer

	movl 8(%ebp),%esi	# esi = in
	movl 12(%ebp),%ecx	# ecx = samples remaining (clobbered; legal in cdecl)
	pxor %mm2,%mm2		# zero sum (two packed 32-bit accumulators)

1:	subl $8,%ecx		# need a full group of 8 samples
	jl 2f
	movq (%esi),%mm0	# S0 S1 S2 S3
	pmaddwd %mm0,%mm0	# (S0*S0+S1*S1) (S2*S2+S3*S3)
	movq 8(%esi),%mm1	# S4 S5 S6 S7
	pmaddwd %mm1,%mm1	# (S4*S4+S5*S5) (S6*S6+S7*S7)
	paddd %mm1,%mm2
	paddd %mm0,%mm2		# accumulate into the two packed lanes

	addl $16,%esi		# advance 8 samples (16 bytes)
	jmp 1b

2:	movd %mm2,%eax		# even sum (low lane)
	psrlq $32,%mm2
	movd %mm2,%edx		# odd sum (high lane)
	addl %edx,%eax		# combined 32-bit result
	emms			# clear MMX state so caller may use x87 FP
	popl %esi
	popl %ebp
	ret