| /* 16-bit signed integer dot product |
| * MMX assisted version; also for SSE |
| * |
| * Copyright 2004 Phil Karn |
| * May be used under the terms of the GNU Lesser General Public License (LGPL) |
| */ |
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
| |
/* Descriptor handed out (as an opaque pointer) by the dot product
 * set-up routine and consumed by the dot product routine itself.
 */
struct dotprod {
  /* Coefficient count */
  int len;

  /* Four copies of the coefficient set, used on MMX or SSE machines.
   * Copy n is preshifted by n words (n = 0,1,2,3) so that every possible
   * alignment of the input data has a matching copy (see Intel ap559 on
   * MMX dot products).
   */
  signed short *coeffs[4];
};
/* Assembler routine that does the actual dot product work. Note that
 * cnt is a count of 4-word blocks, not of individual words.
 */
extern long dotprod_mmx_assist(signed short *a, signed short *b, int cnt);
| |
| /* Create and return a descriptor for use with the dot product function */ |
| void *initdp_mmx(signed short coeffs[],int len){ |
| struct dotprod *dp; |
| int i,j; |
| |
| |
| if(len == 0) |
| return NULL; |
| |
| dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); |
| dp->len = len; |
| |
| /* Make 4 copies of coefficients, one for each data alignment */ |
| for(i=0;i<4;i++){ |
| dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4, |
| 4*sizeof(signed short)); |
| for(j=0;j<len;j++) |
| dp->coeffs[i][j+i] = coeffs[j]; |
| } |
| return (void *)dp; |
| } |
| |
| |
| /* Free a dot product descriptor created earlier */ |
| void freedp_mmx(void *p){ |
| struct dotprod *dp = (struct dotprod *)p; |
| int i; |
| |
| for(i=0;i<4;i++) |
| if(dp->coeffs[i] != NULL) |
| free(dp->coeffs[i]); |
| free(dp); |
| } |
| |
| /* Compute a dot product given a descriptor and an input array |
| * The length is taken from the descriptor |
| */ |
| long dotprod_mmx(void *p,signed short a[]){ |
| struct dotprod *dp = (struct dotprod *)p; |
| int al; |
| signed short *ar; |
| |
| /* Round input data address down to 8 byte boundary |
| * NB: depending on the alignment of a[], memory |
| * before a[] will be accessed. The contents don't matter since they'll |
| * be multiplied by zero coefficients. I can't conceive of any |
| * situation where this could cause a segfault since memory protection |
| * in the x86 machines is done on much larger boundaries |
| */ |
| ar = (signed short *)((int)a & ~7); |
| |
| /* Choose one of 4 sets of pre-shifted coefficients. al is both the |
| * index into dp->coeffs[] and the number of 0 words padded onto |
| * that coefficients array for alignment purposes |
| */ |
| al = a - ar; |
| |
| /* Call assembler routine to do the work, passing number of 4-word blocks */ |
| return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1); |
| } |
| |