# SIMD SSE2 dot product
# Equivalent to the following C code:
# long dotprod(signed short *a,signed short *b,int cnt)
# {
#	long sum = 0;
#	cnt *= 8;
#	while(cnt--)
#		sum += *a++ * *b++;
#	return sum;
# }
# a and b must be 128-bit aligned
# Copyright 2001, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)

# ----------------------------------------------------------------------
# long dotprod_sse2_assist(signed short *a, signed short *b, int cnt)
#
# Syntax: GNU as, AT&T operand order (src, dst)
# ABI:    i386, arguments passed on the stack
# In:     8(%ebp)  = a    cnt*8 signed 16-bit values, 16-byte aligned
#         12(%ebp) = b    cnt*8 signed 16-bit values, 16-byte aligned
#         16(%ebp) = cnt  number of 8-element (128-bit) groups;
#                         cnt <= 0 processes nothing and returns 0
# Out:    %eax = sum of a[i]*b[i] over all cnt*8 elements; the four
#                32-bit partial sums and the final total wrap modulo 2^32
# Clobb:  %eax, %xmm0, %xmm1, flags
#         %esi, %edi and %ecx are saved and restored; no other integer
#         register is touched.
#
# Alignment matters: movdqa and the memory operand of pmaddwd both
# fault on addresses that are not 16-byte aligned.
# ----------------------------------------------------------------------
	.text
	.global	dotprod_sse2_assist
	.type	dotprod_sse2_assist,@function
dotprod_sse2_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	pushl	%ecx			# not required by the ABI; kept so callers that
					# relied on %ecx surviving still work
	movl	8(%ebp),%esi		# esi = a
	movl	12(%ebp),%edi		# edi = b
	movl	16(%ebp),%ecx		# ecx = groups of 8 terms still to process
	pxor	%xmm0,%xmm0		# xmm0 = four 32-bit partial sums, all zero

	cmpl	$4,%ecx			# fewer than 4 groups (or cnt <= 0)?
	jl	.Ldp_tail		# then skip the unrolled loop entirely

# Unrolled loop: 4 groups = 64 bytes from each input = 32 terms per pass.
# pmaddwd multiplies eight 16-bit pairs and adds adjacent products,
# leaving four 32-bit sums that paddd folds into the accumulator.
	.p2align 4
.Ldp_unrolled:
	movdqa	(%esi),%xmm1
	pmaddwd	(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	16(%esi),%xmm1
	pmaddwd	16(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	32(%esi),%xmm1
	pmaddwd	32(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	48(%esi),%xmm1
	pmaddwd	48(%edi),%xmm1
	paddd	%xmm1,%xmm0

	addl	$64,%esi		# a += 32 terms
	addl	$64,%edi		# b += 32 terms
	subl	$4,%ecx
	cmpl	$4,%ecx			# at least 4 groups left?
	jge	.Ldp_unrolled

# Tail loop: one group = 16 bytes = 8 terms per pass, for the 0..3
# groups the unrolled loop did not cover.
.Ldp_tail:
	testl	%ecx,%ecx
	jle	.Ldp_reduce		# nothing left (also catches negative cnt)
.Ldp_single:
	movdqa	(%esi),%xmm1
	pmaddwd	(%edi),%xmm1
	paddd	%xmm1,%xmm0
	addl	$16,%esi		# a += 8 terms
	addl	$16,%edi		# b += 8 terms
	subl	$1,%ecx
	jnz	.Ldp_single

# Horizontal add of the four 32-bit lanes of xmm0 into lane 0.
.Ldp_reduce:
	pshufd	$0x4e,%xmm0,%xmm1	# xmm1 = xmm0 with its 64-bit halves swapped
	paddd	%xmm1,%xmm0		# lane0 = s0+s2, lane1 = s1+s3
	pshufd	$0xb1,%xmm0,%xmm1	# xmm1 = xmm0 with adjacent lanes swapped
	paddd	%xmm1,%xmm0		# lane0 = s0+s1+s2+s3
	movd	%xmm0,%eax		# return value

	popl	%ecx
	popl	%edi
	popl	%esi
	popl	%ebp			# esp == ebp here, so no "movl %ebp,%esp" needed
	ret
	.size	dotprod_sse2_assist, .-dotprod_sse2_assist

# Mark the object as not needing an executable stack. pushsection and
# popsection leave the current section unchanged for anything that follows.
	.pushsection .note.GNU-stack,"",@progbits
	.popsection