Blame - dotprod_av.c - platform/external/fec

blob: 1f70471abe583253be7dfe04c7b6879c393a16ed [file] [log] [blame]

Bill Yi	4e213d5	2015-06-23 13:53:11 -0700	[diff] [blame]	1	/* 16-bit signed integer dot product
				2	* Altivec-assisted version
				3	* Copyright 2004 Phil Karn
				4	* May be used under the terms of the GNU Lesser General Public License (LGPL)
				5	*/
				6	#include <stdlib.h>
				7	#include "fec.h"
				8
				9	struct dotprod {
				10	int len; /* Number of coefficients */
				11
				12	/* On an Altivec machine, these hold 8 copies of the coefficients,
				13	* preshifted by 0,1,..7 words to meet all possible input data
				14	*/
				15	signed short *coeffs[8];
				16	};
				17
				18	/* Create and return a descriptor for use with the dot product function */
				19	void *initdp_av(signed short coeffs[],int len){
				20	struct dotprod *dp;
				21	int i,j;
				22
				23	if(len == 0)
				24	return NULL;
				25
				26	dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
				27	dp->len = len;
				28
				29	/* Make 8 copies of coefficients, one for each data alignment,
				30	* each aligned to 16-byte boundary
				31	*/
				32	for(i=0;i<8;i++){
				33	dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
				34	for(j=0;j<len;j++)
				35	dp->coeffs[i][j+i] = coeffs[j];
				36	}
				37	return (void *)dp;
				38	}
				39
				40
				41	/* Free a dot product descriptor created earlier */
				42	void freedp_av(void *p){
				43	struct dotprod dp = (struct dotprod )p;
				44	int i;
				45
				46	for(i=0;i<8;i++)
				47	if(dp->coeffs[i] != NULL)
				48	free(dp->coeffs[i]);
				49	free(dp);
				50	}
				51
				52	/* Compute a dot product given a descriptor and an input array
				53	* The length is taken from the descriptor
				54	*/
				55	long dotprod_av(void *p,signed short a[]){
				56	struct dotprod dp = (struct dotprod )p;
				57	int al;
				58	vector signed short ar,d;
				59	vector signed int sums0,sums1,sums2,sums3;
				60	union { vector signed int v; signed int w[4];} s;
				61	int nblocks;
				62
				63	/* round ar down to beginning of 16-byte block containing 0th element of
				64	* input buffer. Then set d to one of 8 sets of shifted coefficients
				65	*/
				66	ar = (vector signed short *)((int)a & ~15);
				67	al = ((int)a & 15)/sizeof(signed short);
				68	d = (vector signed short *)dp->coeffs[al];
				69
				70	nblocks = (dp->len+al-1)/8+1;
				71
				72	/* Sum into four vectors each holding four 32-bit partial sums */
				73	sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
				74	while(nblocks >= 4){
				75	sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
				76	sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
				77	sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
				78	sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
				79	nblocks -= 4;
				80	}
				81	sums0 = vec_adds(sums0,sums1);
				82	sums2 = vec_adds(sums2,sums3);
				83	sums0 = vec_adds(sums0,sums2);
				84	while(nblocks-- > 0){
				85	sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
				86	}
				87	/* Sum 4 partial sums into final result */
				88	s.v = vec_sums(sums0,(vector signed int)(0));
				89
				90	return s.w[3];
				91	}
				92
				93