Blame - viterbi39_sse2.c - platform/external/fec

blob: f13794e7608f7476d771461ed74a0bfc1244ecdc [file] [log] [blame]

Bill Yi	4e213d5	2015-06-23 13:53:11 -0700	[diff] [blame]	1	/* K=15 r=1/6 Viterbi decoder for x86 SSE2
				2	* Copyright Mar 2004, Phil Karn, KA9Q
				3	* May be used under the terms of the GNU Lesser General Public License (LGPL)
				4	*/
				5	#include <emmintrin.h>
				6	#include <stdio.h>
				7	#include <stdlib.h>
				8	#include <memory.h>
				9	#include <limits.h>
				10	#include "fec.h"
				11
				12	typedef union { unsigned long w[8]; unsigned short s[16];} decision_t;
				13	typedef union { signed short s[256]; __m128i v[32];} metric_t;
				14
				15	static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3];
				16	static int Init = 0;
				17
				18	/* State info for instance of Viterbi decoder */
				19	struct v39 {
				20	metric_t metrics1; /* path metric buffer 1 */
				21	metric_t metrics2; /* path metric buffer 2 */
				22	void dp; / Pointer to current decision */
				23	metric_t old_metrics,new_metrics; /* Pointers to path metrics, swapped on every bit */
				24	void decisions; / Beginning of decisions for block */
				25	};
				26
				27	/* Initialize Viterbi decoder for start of new frame */
				28	int init_viterbi39_sse2(void *p,int starting_state){
				29	struct v39 *vp = p;
				30	int i;
				31
				32	for(i=0;i<256;i++)
				33	vp->metrics1.s[i] = (SHRT_MIN+1000);
				34
				35	vp->old_metrics = &vp->metrics1;
				36	vp->new_metrics = &vp->metrics2;
				37	vp->dp = vp->decisions;
				38	vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
				39	return 0;
				40	}
				41
				42	/* Create a new instance of a Viterbi decoder */
				43	void *create_viterbi39_sse2(int len){
				44	void *p;
				45	struct v39 *vp;
				46
				47	if(!Init){
				48	int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
				49
				50	set_viterbi39_polynomial_sse2(polys);
				51	}
				52	/* Ordinary malloc() only returns 8-byte alignment, we need 16 */
				53	if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39)))
				54	return NULL;
				55
				56	vp = (struct v39 *)p;
				57	if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
				58	free(vp);
				59	return NULL;
				60	}
				61	vp->decisions = (decision_t *)p;
				62	init_viterbi39_sse2(vp,0);
				63	return vp;
				64	}
				65
				66	void set_viterbi39_polynomial_sse2(int polys[3]){
				67	int state;
				68
				69	for(state=0;state < 128;state++){
				70	Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0;
				71	Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0;
				72	Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0;
				73	}
				74	Init++;
				75	}
				76
				77	/* Viterbi chainback */
				78	int chainback_viterbi39_sse2(
				79	void *p,
				80	unsigned char data, / Decoded output data */
				81	unsigned int nbits, /* Number of data bits */
				82	unsigned int endstate){ /* Terminal encoder state */
				83	struct v39 *vp = p;
				84	decision_t d = (decision_t )vp->decisions;
				85	int path_metric;
				86
				87	endstate %= 256;
				88
				89	path_metric = vp->old_metrics->s[endstate];
				90
				91	/* The store into data[] only needs to be done every 8 bits.
				92	* But this avoids a conditional branch, and the writes will
				93	* combine in the cache anyway
				94	*/
				95	d += 8; /* Look past tail */
				96	while(nbits-- != 0){
				97	int k;
				98
				99	k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;
				100	endstate = (k << 7) \| (endstate >> 1);
				101	data[nbits>>3] = endstate;
				102	}
				103	return path_metric;
				104	}
				105
				106	/* Delete instance of a Viterbi decoder */
				107	void delete_viterbi39_sse2(void *p){
				108	struct v39 *vp = p;
				109
				110	if(vp != NULL){
				111	free(vp->decisions);
				112	free(vp);
				113	}
				114	}
				115
				116
				117	int update_viterbi39_blk_sse2(void p,unsigned char syms,int nbits){
				118	struct v39 *vp = p;
				119	decision_t d = (decision_t )vp->dp;
				120	int path_metric = 0;
				121
				122	while(nbits--){
				123	__m128i sym0v,sym1v,sym2v;
				124	void *tmp;
				125	int i;
				126
				127	/* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
				128	sym0v = _mm_set1_epi16(syms[0]);
				129	sym1v = _mm_set1_epi16(syms[1]);
				130	sym2v = _mm_set1_epi16(syms[2]);
				131	syms += 3;
				132
				133	/* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
				134	for(i=0;i<16;i++){
				135	__m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
				136
				137	/* Form branch metrics
				138	* Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
				139	* the XOR operations constitute conditional negation.
				140	* metric and m_metric (-metric) are in the range 0-765
				141	*/
				142	m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
				143	metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
				144	m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
				145
				146	/* Add branch metrics to path metrics */
				147	m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
				148	m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
				149	m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
				150	m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
				151
				152	/* Compare and select */
				153	survivor0 = _mm_min_epi16(m0,m1);
				154	survivor1 = _mm_min_epi16(m2,m3);
				155	decision0 = _mm_cmpeq_epi16(survivor0,m1);
				156	decision1 = _mm_cmpeq_epi16(survivor1,m3);
				157
				158	/* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
				159	d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
				160
				161	/* Store surviving metrics */
				162	vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
				163	vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
				164	}
				165	/* See if we need to renormalize */
				166	if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
				167	int i,adjust;
				168	__m128i adjustv;
				169	union { __m128i v; signed short w[8]; } t;
				170
				171	/* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
				172	adjustv = vp->new_metrics->v[0];
				173	for(i=1;i<32;i++)
				174	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
				175
				176	adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
				177	adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
				178	adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
				179	t.v = adjustv;
				180	adjust = t.w[0] - SHRT_MIN;
				181	path_metric += adjust;
				182	adjustv = _mm_set1_epi16(adjust);
				183
				184	/* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
				185	* This is okay since it can't overflow anyway
				186	*/
				187	for(i=0;i<32;i++)
				188	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
				189	}
				190	d++;
				191	/* Swap pointers to old and new metrics */
				192	tmp = vp->old_metrics;
				193	vp->old_metrics = vp->new_metrics;
				194	vp->new_metrics = tmp;
				195	}
				196	vp->dp = d;
				197	return path_metric;
				198	}
				199
				200