Blame - sumsq_av.c - platform/external/fec

blob: 53c6acfd3b6d1493ec01e7940fd5b0d88ff67ee8 [file] [log] [blame]

Bill Yi	4e213d5	2015-06-23 13:53:11 -0700	[diff] [blame]	1	/* Compute the sum of the squares of a vector of signed shorts
				2
				3	* This is the Altivec SIMD version. It's a little hairy because Altivec
				4	* does not do 64-bit operations directly, so we have to accumulate separate
				5	* 32-bit sums and carries
				6
				7	* Copyright 2004 Phil Karn, KA9Q
				8	* May be used under the terms of the GNU Lesser General Public License (LGPL)
				9	*/
				10
				11	#include "fec.h"
				12
				13	unsigned long long sumsq_av(signed short *in,int cnt){
				14	long long sum;
				15	vector signed short x;
				16	vector unsigned int sums,carries,s1,s2;
				17	int pad;
				18	union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s;
				19
				20	carries = sums = (vector unsigned int)(0);
				21	if((pad = (int)in & 15)!=0){
				22	/* Load unaligned leading word */
				23	x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
				24	if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
				25	s.c[15] = (8-cnt)<<4;
				26	x = vec_sro(x,s.cv);
				27	}
				28	sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
				29	in += 8-pad/2;
				30	cnt -= 8-pad/2;
				31	}
				32	/* Everything is now aligned, rip through most of the block */
				33	while(cnt >= 8){
				34	x = vec_ld(0,in);
				35	/* A single vec_msum cannot overflow, but we have to sum it with
				36	* the earlier terms separately to handle the carries
				37	* The cast to unsigned is OK because squares are always positive
				38	*/
				39	s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
				40	carries = vec_add(carries,vec_addc(sums,s1));
				41	sums = vec_add(sums,s1);
				42	in += 8;
				43	cnt -= 8;
				44	}
				45	/* Handle trailing fragment, if any */
				46	if(cnt > 0){
				47	x = vec_ld(0,in);
				48	s.c[15] = (8-cnt)<<4;
				49	x = vec_sro(x,s.cv);
				50	s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
				51	carries = vec_add(carries,vec_addc(sums,s1));
				52	sums = vec_add(sums,s1);
				53	}
				54	/* Combine 4 sub-sums and carries */
				55	s.c[15] = 64; /* Shift right two 32-bit words */
				56	s1 = vec_sro(sums,s.cv);
				57	s2 = vec_sro(carries,s.cv);
				58	carries = vec_add(carries,vec_addc(sums,s1));
				59	sums = vec_add(sums,s1);
				60	carries = vec_add(carries,s2);
				61
				62	s.c[15] = 32; /* Shift right one 32-bit word */
				63	s1 = vec_sro(sums,s.cv);
				64	s2 = vec_sro(carries,s.cv);
				65	carries = vec_add(carries,vec_addc(sums,s1));
				66	sums = vec_add(sums,s1);
				67	carries = vec_add(carries,s2);
				68
				69	/* Extract sum and carries from right-hand words and combine into result */
				70	s.iv = sums;
				71	sum = s.w[3];
				72
				73	s.iv = carries;
				74	sum += (long long)s.w[3] << 32;
				75
				76	return sum;
				77	}
				78