Blame - pf_mixer.cpp - platform/external/pffft

blob: 0f2c31005fc64b8336751cbdeaecb5c5e4e7a3be [file] [log] [blame]

hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1	/*
				2	This software is part of pffft/pfdsp, a set of simple DSP routines.
				3
				4	Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
				5	Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
				6	All rights reserved.
				7
				8	Redistribution and use in source and binary forms, with or without
				9	modification, are permitted provided that the following conditions are met:
				10	* Redistributions of source code must retain the above copyright
				11	notice, this list of conditions and the following disclaimer.
				12	* Redistributions in binary form must reproduce the above copyright
				13	notice, this list of conditions and the following disclaimer in the
				14	documentation and/or other materials provided with the distribution.
				15	* Neither the name of the copyright holder nor the
				16	names of its contributors may be used to endorse or promote products
				17	derived from this software without specific prior written permission.
				18
				19	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
				20	ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
				21	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
				22	DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
				23	DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
				24	(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
				25	LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
				26	ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				28	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29	*/
				30
				31	/* include own header first, to see missing includes */
				32	#include "pf_mixer.h"
				33	#include "fmv.h"
				34
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	35	#include <math.h>
				36	#include <stdlib.h>
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	37	#include <assert.h>
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	38
				39	//they dropped M_PI in C99, so we define it:
				40	#define PI ((float)3.14159265358979323846)
				41
				42	//apply to pointers:
				43	#define iof(complexf_input_p,i) ((((float)complexf_input_p)+2*(i)))
				44	#define qof(complexf_input_p,i) ((((float)complexf_input_p)+2*(i)+1))
				45
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	46	#define USE_ALIGNED_ADDRESSES 0
				47
				48
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	49
				50	/*
				51	_____ _____ _____ __ _ _
				52	\| __ \ / ____\| __ \ / _\| \| \| (_)
				53	\| \| \| \| (___ \| \|__) \| \| \|_ _ _ _ __ ___\| \|_ _ ___ _ __ ___
				54	\| \| \| \|\___ \\| ___/ \| _\| \| \| \| '_ \ / __\| __\| \|/ _ \\| '_ \/ __\|
				55	\| \|__\| \|____) \| \| \| \| \| \|_\| \| \| \| \| (__\| \|_\| \| (_) \| \| \| \__ \
				56	\|_____/\|_____/\|_\| \|_\| \__,_\|_\| \|_\|\___\|\__\|_\|\___/\|_\| \|_\|___/
				57
				58	*/
				59
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	60
				61	#if defined(__GNUC__)
				62	# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
				63	# define RESTRICT __restrict
				64	#elif defined(_MSC_VER)
				65	# define ALWAYS_INLINE(return_type) __forceinline return_type
				66	# define RESTRICT __restrict
				67	#endif
				68
				69
				70	#ifndef PFFFT_SIMD_DISABLE
				71	#if (defined(__x86_64__) \|\| defined(_M_X64) \|\| defined(i386) \|\| defined(_M_IX86))
				72	#pragma message "Manual SSE x86/x64 optimizations are ON"
				73	#include <xmmintrin.h>
				74	#define HAVE_SSE_INTRINSICS 1
				75
				76	#elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
				77	#pragma message "Manual NEON (arm32) optimizations are ON"
				78	#include "sse2neon.h"
				79	#define HAVE_SSE_INTRINSICS 1
				80
				81	#elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
				82	#pragma message "Manual NEON (aarch64) optimizations are ON"
				83	#include "sse2neon.h"
				84	#define HAVE_SSE_INTRINSICS 1
				85
				86	#endif
				87	#endif
				88
				89	#ifdef HAVE_SSE_INTRINSICS
				90
				91	typedef __m128 v4sf;
				92	# define SIMD_SZ 4
				93
				94	typedef union v4_union {
				95	__m128 v;
				96	float f[4];
				97	} v4_union;
				98
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	99	#define VMUL(a,b) _mm_mul_ps(a,b)
				100	#define VDIV(a,b) _mm_div_ps(a,b)
				101	#define VADD(a,b) _mm_add_ps(a,b)
				102	#define VSUB(a,b) _mm_sub_ps(a,b)
				103	#define LD_PS1(s) _mm_set1_ps(s)
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	104	#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr))
				105	#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr))
				106	#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v)
				107	#define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v)
				108	#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
				109	#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
				110
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	111	#if USE_ALIGNED_ADDRESSES
				112	#define VLOAD(ptr) _mm_load_ps((const float *)(ptr))
				113	#define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v)
				114	#else
				115	#define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr))
				116	#define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v)
				117	#endif
				118
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	119
				120	int have_sse_shift_mixer_impl()
				121	{
				122	return 1;
				123	}
				124
				125	#else
				126
				127	int have_sse_shift_mixer_impl()
				128	{
				129	return 0;
				130	}
				131
				132	#endif
				133
				134
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	135	/*********************************************************************/
				136
				137	/**************/
				138	/* ALGO A */
				139	/**************/
				140
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	141	PF_TARGET_CLONES
				142	float shift_math_cc(complexf input, complexf output, int input_size, float rate, float starting_phase)
				143	{
				144	rate*=2;
				145	//Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
				146	float phase=starting_phase;
				147	float phase_increment=rate*PI;
				148	float cosval, sinval;
				149	for(int i=0;i<input_size; i++)
				150	{
				151	cosval=cos(phase);
				152	sinval=sin(phase);
				153	//we multiply two complex numbers.
				154	//how? enter this to maxima (software) for explanation:
				155	// (a+b%i)(c+d*%i), rectform;
				156	iof(output,i)=cosvaliof(input,i)-sinvalqof(input,i);
				157	qof(output,i)=sinvaliof(input,i)+cosvalqof(input,i);
				158	phase+=phase_increment;
				159	while(phase>2PI) phase-=2PI; //@shift_math_cc: normalize phase
				160	while(phase<0) phase+=2*PI;
				161	}
				162	return phase;
				163	}
				164
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	165	/*********************************************************************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	166
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	167	/**************/
				168	/* ALGO B */
				169	/**************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	170
				171	shift_table_data_t shift_table_init(int table_size)
				172	{
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	173	shift_table_data_t output;
				174	output.table=(float)malloc(sizeof(float)table_size);
				175	output.table_size=table_size;
				176	for(int i=0;i<table_size;i++)
				177	{
				178	output.table[i]=sin(((float)i/table_size)*(PI/2));
				179	}
				180	return output;
				181	}
				182
				183	void shift_table_deinit(shift_table_data_t table_data)
				184	{
				185	free(table_data.table);
				186	}
				187
				188
				189	PF_TARGET_CLONES
				190	float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
				191	{
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	192	rate*=2;
				193	//Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
				194	float phase=starting_phase;
				195	float phase_increment=rate*PI;
				196	float cosval, sinval;
				197	for(int i=0;i<input_size; i++) //@shift_math_cc
				198	{
				199	int sin_index, cos_index, temp_index, sin_sign, cos_sign;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	200	int quadrant=phase/(PI/2); //between 0 and 3
				201	float vphase=phase-quadrant*(PI/2);
				202	sin_index=(vphase/(PI/2))*table_data.table_size;
				203	cos_index=table_data.table_size-1-sin_index;
				204	if(quadrant&1) //in quadrant 1 and 3
				205	{
				206	temp_index=sin_index;
				207	sin_index=cos_index;
				208	cos_index=temp_index;
				209	}
				210	sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
				211	cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
				212	sinval=sin_sign*table_data.table[sin_index];
				213	cosval=cos_sign*table_data.table[cos_index];
				214	//we multiply two complex numbers.
				215	//how? enter this to maxima (software) for explanation:
				216	// (a+b%i)(c+d*%i), rectform;
				217	iof(output,i)=cosvaliof(input,i)-sinvalqof(input,i);
				218	qof(output,i)=sinvaliof(input,i)+cosvalqof(input,i);
				219	phase+=phase_increment;
				220	while(phase>2PI) phase-=2PI; //@shift_math_cc: normalize phase
				221	while(phase<0) phase+=2*PI;
				222	}
				223	return phase;
				224	}
				225
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	226	/*********************************************************************/
				227
				228	/**************/
				229	/* ALGO C */
				230	/**************/
				231
				232	shift_addfast_data_t shift_addfast_init(float rate)
				233	{
				234	shift_addfast_data_t output;
				235	output.phase_increment=2ratePI;
				236	for(int i=0;i<4;i++)
				237	{
				238	output.dsin[i]=sin(output.phase_increment*(i+1));
				239	output.dcos[i]=cos(output.phase_increment*(i+1));
				240	}
				241	return output;
				242	}
				243
				244	#define SADF_L1(j) \
				245	cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
				246	sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
				247	#define SADF_L2(j) \
				248	iof(output,4i+j)=(cos_vals_ ## j)iof(input,4i+j)-(sin_vals_ ## j)qof(input,4*i+j); \
				249	qof(output,4i+j)=(sin_vals_ ## j)iof(input,4i+j)+(cos_vals_ ## j)qof(input,4*i+j);
				250
				251	PF_TARGET_CLONES
				252	float shift_addfast_cc(complexf input, complexf output, int input_size, shift_addfast_data_t* d, float starting_phase)
				253	{
				254	//input_size should be multiple of 4
				255	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
				256	float cos_start=cos(starting_phase);
				257	float sin_start=sin(starting_phase);
				258	float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
				259	sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
				260	dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
				261	dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
				262
				263	for(int i=0;i<input_size/4; i++)
				264	{
				265	SADF_L1(0)
				266	SADF_L1(1)
				267	SADF_L1(2)
				268	SADF_L1(3)
				269	SADF_L2(0)
				270	SADF_L2(1)
				271	SADF_L2(2)
				272	SADF_L2(3)
				273	cos_start = cos_vals_3;
				274	sin_start = sin_vals_3;
				275	}
				276	starting_phase+=input_size*d->phase_increment;
				277	while(starting_phase>PI) starting_phase-=2*PI;
				278	while(starting_phase<-PI) starting_phase+=2*PI;
				279	return starting_phase;
				280	}
				281
				282	#undef SADF_L2
				283
				284
				285	#define SADF_L2(j) \
				286	tmp_inp_cos = iof(in_out,4*i+j); \
				287	tmp_inp_sin = qof(in_out,4*i+j); \
				288	iof(in_out,4i+j)=(cos_vals_ ## j)tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
				289	qof(in_out,4i+j)=(sin_vals_ ## j)tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
				290
				291	PF_TARGET_CLONES
				292	float shift_addfast_inp_c(complexf in_out, int N_cplx, shift_addfast_data_t d, float starting_phase)
				293	{
				294	//input_size should be multiple of 4
				295	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
				296	float cos_start=cos(starting_phase);
				297	float sin_start=sin(starting_phase);
				298	float register tmp_inp_cos, tmp_inp_sin,
				299	cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
				300	sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
				301	dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
				302	dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
				303
				304	for(int i=0;i<N_cplx/4; i++)
				305	{
				306	SADF_L1(0)
				307	SADF_L1(1)
				308	SADF_L1(2)
				309	SADF_L1(3)
				310	SADF_L2(0)
				311	SADF_L2(1)
				312	SADF_L2(2)
				313	SADF_L2(3)
				314	cos_start = cos_vals_3;
				315	sin_start = sin_vals_3;
				316	}
				317	starting_phase+=N_cplx*d->phase_increment;
				318	while(starting_phase>PI) starting_phase-=2*PI;
				319	while(starting_phase<-PI) starting_phase+=2*PI;
				320	return starting_phase;
				321	}
				322
				323	#undef SADF_L1
				324	#undef SADF_L2
				325
				326
				327	/*********************************************************************/
				328
				329	/**************/
				330	/* ALGO D */
				331	/**************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	332
				333	shift_unroll_data_t shift_unroll_init(float rate, int size)
				334	{
				335	shift_unroll_data_t output;
				336	output.phase_increment=2ratePI;
				337	output.size = size;
				338	output.dsin=(float)malloc(sizeof(float)size);
				339	output.dcos=(float)malloc(sizeof(float)size);
				340	float myphase = 0;
				341	for(int i=0;i<size;i++)
				342	{
				343	myphase += output.phase_increment;
				344	while(myphase>PI) myphase-=2*PI;
				345	while(myphase<-PI) myphase+=2*PI;
				346	output.dsin[i]=sin(myphase);
				347	output.dcos[i]=cos(myphase);
				348	}
				349	return output;
				350	}
				351
				352	void shift_unroll_deinit(shift_unroll_data_t* d)
				353	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	354	if (!d)
				355	return;
				356	free(d->dsin);
				357	free(d->dcos);
				358	d->dsin = NULL;
				359	d->dcos = NULL;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	360	}
				361
				362	PF_TARGET_CLONES
				363	float shift_unroll_cc(complexf input, complexf output, int input_size, shift_unroll_data_t* d, float starting_phase)
				364	{
				365	//input_size should be multiple of 4
				366	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
				367	float cos_start = cos(starting_phase);
				368	float sin_start = sin(starting_phase);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	369	register float cos_val = cos_start, sin_val = sin_start;
				370	for(int i=0;i<input_size; i++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	371	{
				372	iof(output,i) = cos_valiof(input,i) - sin_valqof(input,i);
				373	qof(output,i) = sin_valiof(input,i) + cos_valqof(input,i);
				374	// calculate complex phasor for next iteration
				375	cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
				376	sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
				377	}
				378	starting_phase+=input_size*d->phase_increment;
				379	while(starting_phase>PI) starting_phase-=2*PI;
				380	while(starting_phase<-PI) starting_phase+=2*PI;
				381	return starting_phase;
				382	}
				383
				384	PF_TARGET_CLONES
				385	float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
				386	{
				387	float cos_start = cos(starting_phase);
				388	float sin_start = sin(starting_phase);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	389	register float cos_val = cos_start, sin_val = sin_start;
				390	for(int i=0;i<size; i++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	391	{
				392	register float inp_i = iof(in_out,i);
				393	register float inp_q = qof(in_out,i);
				394	iof(in_out,i) = cos_valinp_i - sin_valinp_q;
				395	qof(in_out,i) = sin_valinp_i + cos_valinp_q;
				396	// calculate complex phasor for next iteration
				397	cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
				398	sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
				399	}
				400	starting_phase += size * d->phase_increment;
				401	while(starting_phase>PI) starting_phase-=2*PI;
				402	while(starting_phase<-PI) starting_phase+=2*PI;
				403	return starting_phase;
				404	}
				405
				406
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	407	/*********************************************************************/
				408
				409	/**************/
				410	/* ALGO E */
				411	/**************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	412
				413	shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
				414	{
				415	shift_limited_unroll_data_t output;
				416	output.phase_increment=2ratePI;
				417	float myphase = 0;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	418	for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	419	{
				420	myphase += output.phase_increment;
				421	while(myphase>PI) myphase-=2*PI;
				422	while(myphase<-PI) myphase+=2*PI;
				423	output.dcos[i] = cos(myphase);
				424	output.dsin[i] = sin(myphase);
				425	}
				426	output.complex_phase.i = 1.0F;
				427	output.complex_phase.q = 0.0F;
				428	return output;
				429	}
				430
				431	PF_TARGET_CLONES
				432	void shift_limited_unroll_cc(const complexf input, complexf output, int size, shift_limited_unroll_data_t* d)
				433	{
				434	float cos_start = d->complex_phase.i;
				435	float sin_start = d->complex_phase.q;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	436	register float cos_val = cos_start, sin_val = sin_start, mag;
				437	while (size > 0)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	438	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	439	int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
				440	for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	441	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	442	for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	443	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	444	iof(output,PF_SHIFT_LIMITED_SIMD_SZi+j) = cos_valiof(input,PF_SHIFT_LIMITED_SIMD_SZi+j) - sin_valqof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
				445	qof(output,PF_SHIFT_LIMITED_SIMD_SZi+j) = sin_valiof(input,PF_SHIFT_LIMITED_SIMD_SZi+j) + cos_valqof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	446	// calculate complex phasor for next iteration
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	447	cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] - sin_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				448	sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] + cos_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	449	}
				450	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	451	// "starts := vals := vals / \|vals\|"
				452	mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
				453	cos_val /= mag;
				454	sin_val /= mag;
				455	cos_start = cos_val;
				456	sin_start = sin_val;
				457
				458	input += PF_SHIFT_LIMITED_UNROLL_SIZE;
				459	output += PF_SHIFT_LIMITED_UNROLL_SIZE;
				460	size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	461	}
				462	d->complex_phase.i = cos_val;
				463	d->complex_phase.q = sin_val;
				464	}
				465
				466	PF_TARGET_CLONES
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	467	void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	468	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	469	float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
				470	float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
				471	// "vals := starts := phase_state"
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	472	float cos_start = d->complex_phase.i;
				473	float sin_start = d->complex_phase.q;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	474	register float cos_val = cos_start, sin_val = sin_start, mag;
				475	while (N_cplx)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	476	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	477	int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
				478	for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	479	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	480	for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
				481	inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
				482	for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
				483	inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
				484	for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	485	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	486	// "out[] = inp[] * vals"
				487	iof(in_out,PF_SHIFT_LIMITED_SIMD_SZi+j) = cos_valinp_i[j] - sin_val*inp_q[j];
				488	qof(in_out,PF_SHIFT_LIMITED_SIMD_SZi+j) = sin_valinp_i[j] + cos_val*inp_q[j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	489	// calculate complex phasor for next iteration
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	490	// "vals := d[] * starts"
				491	cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] - sin_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				492	sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] + cos_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	493	}
				494	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	495	// "starts := vals := vals / \|vals\|"
				496	mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
				497	cos_val /= mag;
				498	sin_val /= mag;
				499	cos_start = cos_val;
				500	sin_start = sin_val;
				501
				502	in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
				503	N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	504	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	505	// "phase_state := starts"
				506	d->complex_phase.i = cos_start;
				507	d->complex_phase.q = sin_start;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	508	}
				509
				510
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	511	#ifdef HAVE_SSE_INTRINSICS
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	512
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	513	/*********************************************************************/
				514
				515	/**************/
				516	/* ALGO F */
				517	/**************/
				518
				519	shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	520	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	521	shift_limited_unroll_A_sse_data_t output;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	522	float myphase;
				523
				524	output.phase_increment = 2relative_freqPI;
				525
				526	myphase = 0.0F;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	527	for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	528	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	529	for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
				530	{
				531	myphase += output.phase_increment;
				532	while(myphase>PI) myphase-=2*PI;
				533	while(myphase<-PI) myphase+=2*PI;
				534	}
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	535	output.dcos[i] = cos(myphase);
				536	output.dsin[i] = sin(myphase);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	537	for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
				538	{
				539	output.dcos[i+k] = output.dcos[i];
				540	output.dsin[i+k] = output.dsin[i];
				541	}
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	542	}
				543
				544	output.dcos_blk = 0.0F;
				545	output.dsin_blk = 0.0F;
				546
				547	myphase = phase_start_rad;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	548	for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	549	{
				550	output.phase_state_i[i] = cos(myphase);
				551	output.phase_state_q[i] = sin(myphase);
				552	myphase += output.phase_increment;
				553	while(myphase>PI) myphase-=2*PI;
				554	while(myphase<-PI) myphase+=2*PI;
				555	}
				556	return output;
				557	}
				558
				559
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	560	PF_TARGET_CLONES
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	561	void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	562	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	563	// "vals := starts := phase_state"
				564	__m128 cos_starts = VLOAD( &d->phase_state_i[0] );
				565	__m128 sin_starts = VLOAD( &d->phase_state_q[0] );
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	566	__m128 cos_vals = cos_starts;
				567	__m128 sin_vals = sin_starts;
				568	__m128 inp_re, inp_im;
				569	__m128 product_re, product_im;
				570	__m128 interl_prod_a, interl_prod_b;
				571	__m128 * RESTRICT p_trig_cos_tab;
				572	__m128 * RESTRICT p_trig_sin_tab;
				573	__m128 * RESTRICT u = (__m128*)in_out;
				574
				575	while (N_cplx)
				576	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	577	const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	578	int B = NB;
				579	p_trig_cos_tab = (__m128*)( &d->dcos[0] );
				580	p_trig_sin_tab = (__m128*)( &d->dsin[0] );
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	581	while (B)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	582	{
				583	// complex multiplication of 4 complex values from/to in_out[]
				584	// == u[0..3] = (cos_val[0..3] + i sin_val[0..3]):
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	585	// "out[] = inp[] * vals"
				586	UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	587	product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
				588	product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
				589	INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	590	VSTORE(u, interl_prod_a);
				591	VSTORE(u+1, interl_prod_b);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	592	u += 2;
				593	// calculate complex phasor for next iteration
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	594	// cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] - sin_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				595	// sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] + cos_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	596	// cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	597	// "vals := d[] * starts"
				598	inp_re = VLOAD(p_trig_cos_tab);
				599	inp_im = VLOAD(p_trig_sin_tab);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	600	cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
				601	sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
				602	++p_trig_cos_tab;
				603	++p_trig_sin_tab;
				604	B -= 4;
				605	}
				606	N_cplx -= NB;
				607	/* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
				608	/* re-use product_re[]/product_im[] for normalization */
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	609	// "starts := vals := vals / \|vals\|"
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	610	product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	611	#if 0
				612	// more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
				613	// higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	614	product_im = _mm_rsqrt_ps(product_re);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	615	cos_starts = cos_vals = VMUL(cos_vals, product_im);
				616	sin_starts = sin_vals = VMUL(sin_vals, product_im);
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	617	#else
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	618	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
				619	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
				620	product_im = _mm_sqrt_ps(product_re);
				621	cos_starts = cos_vals = VDIV(cos_vals, product_im);
				622	sin_starts = sin_vals = VDIV(sin_vals, product_im);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	623	#endif
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	624	}
				625	// "phase_state := starts"
				626	VSTORE( &d->phase_state_i[0], cos_starts );
				627	VSTORE( &d->phase_state_q[0], sin_starts );
				628	}
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	629
				630
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	631	/*********************************************************************/
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	632
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	633	/**************/
				634	/* ALGO G */
				635	/**************/
				636
				637	shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	638	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	639	shift_limited_unroll_B_sse_data_t output;
				640	float myphase;
				641
				642	output.phase_increment = 2relative_freqPI;
				643
				644	myphase = 0.0F;
				645	for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	646	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	647	for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
				648	{
				649	myphase += output.phase_increment;
				650	while(myphase>PI) myphase-=2*PI;
				651	while(myphase<-PI) myphase+=2*PI;
				652	}
				653	output.dtrig[i+0] = cos(myphase);
				654	output.dtrig[i+1] = sin(myphase);
				655	output.dtrig[i+2] = output.dtrig[i+0];
				656	output.dtrig[i+3] = output.dtrig[i+1];
				657	}
				658
				659	output.dcos_blk = 0.0F;
				660	output.dsin_blk = 0.0F;
				661
				662	myphase = phase_start_rad;
				663	for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
				664	{
				665	output.phase_state_i[i] = cos(myphase);
				666	output.phase_state_q[i] = sin(myphase);
				667	myphase += output.phase_increment;
				668	while(myphase>PI) myphase-=2*PI;
				669	while(myphase<-PI) myphase+=2*PI;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	670	}
				671	return output;
				672	}
				673
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	674
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	675	PF_TARGET_CLONES
				676	void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
				677	{
				678	// "vals := starts := phase_state"
				679	__m128 cos_starts = VLOAD( &d->phase_state_i[0] );
				680	__m128 sin_starts = VLOAD( &d->phase_state_q[0] );
				681	__m128 cos_vals = cos_starts;
				682	__m128 sin_vals = sin_starts;
				683	__m128 inp_re, inp_im;
				684	__m128 product_re, product_im;
				685	__m128 interl_prod_a, interl_prod_b;
				686	__m128 * RESTRICT p_trig_tab;
				687	__m128 * RESTRICT u = (__m128*)in_out;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	688
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	689	while (N_cplx)
				690	{
				691	const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
				692	int B = NB;
				693	p_trig_tab = (__m128*)( &d->dtrig[0] );
				694	while (B)
				695	{
				696	// complex multiplication of 4 complex values from/to in_out[]
				697	// == u[0..3] = (cos_val[0..3] + i sin_val[0..3]):
				698	// "out[] = inp[] * vals"
				699	UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
				700	product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
				701	product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
				702	INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
				703	VSTORE(u, interl_prod_a);
				704	VSTORE(u+1, interl_prod_b);
				705	u += 2;
				706	// calculate complex phasor for next iteration
				707	// cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] - sin_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				708	// sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] + cos_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				709	// cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
				710	// "vals := d[] * starts"
				711	product_re = VLOAD(p_trig_tab);
				712	UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
				713	cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
				714	sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
				715	++p_trig_tab;
				716	B -= 4;
				717	}
				718	N_cplx -= NB;
				719	/* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
				720	/* re-use product_re[]/product_im[] for normalization */
				721	// "starts := vals := vals / \|vals\|"
				722	product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
				723	#if 0
				724	// more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
				725	// higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
				726	product_im = _mm_rsqrt_ps(product_re);
				727	cos_starts = cos_vals = VMUL(cos_vals, product_im);
				728	sin_starts = sin_vals = VMUL(sin_vals, product_im);
				729	#else
				730	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
				731	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
				732	product_im = _mm_sqrt_ps(product_re);
				733	cos_starts = cos_vals = VDIV(cos_vals, product_im);
				734	sin_starts = sin_vals = VDIV(sin_vals, product_im);
				735	#endif
				736	}
				737	// "phase_state := starts"
				738	VSTORE( &d->phase_state_i[0], cos_starts );
				739	VSTORE( &d->phase_state_q[0], sin_starts );
				740	}
				741
				742
				743	/*********************************************************************/
				744
				745
				746	/**************/
				747	/* ALGO H */
				748	/**************/
				749
				750	shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
				751	{
				752	shift_limited_unroll_C_sse_data_t output;
				753	float myphase;
				754
				755	output.phase_increment = 2relative_freqPI;
				756
				757	myphase = 0.0F;
				758	for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
				759	{
				760	for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
				761	{
				762	myphase += output.phase_increment;
				763	while(myphase>PI) myphase-=2*PI;
				764	while(myphase<-PI) myphase+=2*PI;
				765	}
				766	output.dinterl_trig[2*i] = cos(myphase);
				767	output.dinterl_trig[2*i+4] = sin(myphase);
				768	for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
				769	{
				770	output.dinterl_trig[2i+k] = output.dinterl_trig[2i];
				771	output.dinterl_trig[2i+k+4] = output.dinterl_trig[2i+4];
				772	}
				773	}
				774
				775	output.dcos_blk = 0.0F;
				776	output.dsin_blk = 0.0F;
				777
				778	myphase = phase_start_rad;
				779	for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
				780	{
				781	output.phase_state_i[i] = cos(myphase);
				782	output.phase_state_q[i] = sin(myphase);
				783	myphase += output.phase_increment;
				784	while(myphase>PI) myphase-=2*PI;
				785	while(myphase<-PI) myphase+=2*PI;
				786	}
				787	return output;
				788	}
				789
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	790
				791	PF_TARGET_CLONES
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	792	void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	793	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	794	// "vals := starts := phase_state"
				795	__m128 cos_starts = VLOAD( &d->phase_state_i[0] );
				796	__m128 sin_starts = VLOAD( &d->phase_state_q[0] );
				797	__m128 cos_vals = cos_starts;
				798	__m128 sin_vals = sin_starts;
				799	__m128 inp_re, inp_im;
				800	__m128 product_re, product_im;
				801	__m128 interl_prod_a, interl_prod_b;
				802	__m128 * RESTRICT p_trig_tab;
				803	__m128 * RESTRICT u = (__m128*)in_out;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	804
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	805	while (N_cplx)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	806	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	807	const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
				808	int B = NB;
				809	p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
				810	while (B)
				811	{
				812	// complex multiplication of 4 complex values from/to in_out[]
				813	// == u[0..3] = (cos_val[0..3] + i sin_val[0..3]):
				814	// "out[] = inp[] * vals"
				815	UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
				816	product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
				817	product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
				818	INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
				819	VSTORE(u, interl_prod_a);
				820	VSTORE(u+1, interl_prod_b);
				821	u += 2;
				822	// calculate complex phasor for next iteration
				823	// cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] - sin_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				824	// sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZi+j] + cos_start d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
				825	// cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
				826	// "vals := d[] * starts"
				827	inp_re = VLOAD(p_trig_tab);
				828	inp_im = VLOAD(p_trig_tab+1);
				829	cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
				830	sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
				831	p_trig_tab += 2;
				832	B -= 4;
				833	}
				834	N_cplx -= NB;
				835	/* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
				836	/* re-use product_re[]/product_im[] for normalization */
				837	// "starts := vals := vals / \|vals\|"
				838	product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
				839	#if 0
				840	// more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
				841	// higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
				842	product_im = _mm_rsqrt_ps(product_re);
				843	cos_starts = cos_vals = VMUL(cos_vals, product_im);
				844	sin_starts = sin_vals = VMUL(sin_vals, product_im);
				845	#else
				846	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
				847	// spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
				848	product_im = _mm_sqrt_ps(product_re);
				849	cos_starts = cos_vals = VDIV(cos_vals, product_im);
				850	sin_starts = sin_vals = VDIV(sin_vals, product_im);
				851	#endif
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	852	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	853	// "phase_state := starts"
				854	VSTORE( &d->phase_state_i[0], cos_starts );
				855	VSTORE( &d->phase_state_q[0], sin_starts );
				856	}
				857
				858
				859	#else
				860
				861	/*********************************************************************/
				862
				863	shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
				864	assert(0);
				865	shift_limited_unroll_A_sse_data_t r;
				866	return r;
				867	}
				868	shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
				869	assert(0);
				870	shift_limited_unroll_B_sse_data_t r;
				871	return r;
				872	}
				873	shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
				874	assert(0);
				875	shift_limited_unroll_C_sse_data_t r;
				876	return r;
				877	}
				878
				879	void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
				880	assert(0);
				881	}
				882	void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
				883	assert(0);
				884	}
				885	void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
				886	assert(0);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	887	}
				888
				889	#endif
				890
				891
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	892	/*********************************************************************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	893
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	894	/**************/
				895	/* ALGO I */
				896	/**************/
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	897
				898	void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t conf, shift_recursive_osc_t state)
				899	{
				900	// constants for single phase step
				901	float phase_increment_s = rate*PI;
				902	float k1 = tan(0.5*phase_increment_s);
				903	float k2 = 2k1 /(1 + k1 k1);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	904	for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	905	{
				906	float tmp;
				907	state->u_cos[j] = state->u_cos[j-1];
				908	state->v_sin[j] = state->v_sin[j-1];
				909	// small steps
				910	tmp = state->u_cos[j] - k1 * state->v_sin[j];
				911	state->v_sin[j] += k2 * tmp;
				912	state->u_cos[j] = tmp - k1 * state->v_sin[j];
				913	}
				914
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	915	// constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
				916	float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	917	while(phase_increment_b > PI) phase_increment_b-=2*PI;
				918	while(phase_increment_b < -PI) phase_increment_b+=2*PI;
				919	conf->k1 = tan(0.5*phase_increment_b);
				920	conf->k2 = 2conf->k1 / (1 + conf->k1 conf->k1);
				921	}
				922
				923	void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t conf, shift_recursive_osc_t state)
				924	{
				925	if (starting_phase != 0.0F)
				926	{
				927	state->u_cos[0] = cos(starting_phase);
				928	state->v_sin[0] = sin(starting_phase);
				929	}
				930	else
				931	{
				932	state->u_cos[0] = 1.0F;
				933	state->v_sin[0] = 0.0F;
				934	}
				935	shift_recursive_osc_update_rate(rate, conf, state);
				936	}
				937
				938
				939	PF_TARGET_CLONES
				940	void shift_recursive_osc_cc(const complexf input, complexf output,
				941	int size, const shift_recursive_osc_conf_t conf, shift_recursive_osc_t state_ext)
				942	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	943	float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
				944	float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
				945	float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	946	shift_recursive_osc_t state = *state_ext;
				947	const float k1 = conf->k1;
				948	const float k2 = conf->k2;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	949	for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	950	{
				951	//we multiply two complex numbers - similar to shift_math_cc
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	952	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	953	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	954	inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
				955	inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	956	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	957	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	958	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	959	iof(output,PF_SHIFT_RECURSIVE_SIMD_SZi+j) = state.u_cos[j] inp_i[j] - state.v_sin[j] * inp_q[j];
				960	qof(output,PF_SHIFT_RECURSIVE_SIMD_SZi+j) = state.v_sin[j] inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	961	}
				962	// update complex phasor - like incrementing phase
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	963	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	964	tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	965	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	966	state.v_sin[j] += k2 * tmp[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	967	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	968	state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
				969	}
				970	*state_ext = state;
				971	}
				972
				973	PF_TARGET_CLONES
				974	void shift_recursive_osc_inp_c(complexf* in_out,
				975	int size, const shift_recursive_osc_conf_t conf, shift_recursive_osc_t state_ext)
				976	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	977	float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
				978	float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
				979	float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	980	shift_recursive_osc_t state = *state_ext;
				981	const float k1 = conf->k1;
				982	const float k2 = conf->k2;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	983	for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	984	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	985	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	986	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	987	inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
				988	inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	989	}
				990	//we multiply two complex numbers - similar to shift_math_cc
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	991	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	992	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	993	iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZi+j) = state.u_cos[j] inp_i[j] - state.v_sin[j] * inp_q[j];
				994	qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZi+j) = state.v_sin[j] inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	995	}
				996	// update complex phasor - like incrementing phase
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	997	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	998	tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	999	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1000	state.v_sin[j] += k2 * tmp[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1001	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1002	state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
				1003	}
				1004	*state_ext = state;
				1005	}
				1006
				1007	PF_TARGET_CLONES
				1008	void gen_recursive_osc_c(complexf* output,
				1009	int size, const shift_recursive_osc_conf_t conf, shift_recursive_osc_t state_ext)
				1010	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1011	float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1012	shift_recursive_osc_t state = *state_ext;
				1013	const float k1 = conf->k1;
				1014	const float k2 = conf->k2;
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1015	for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1016	{
				1017	// output complex oscillator value
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1018	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1019	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1020	iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
				1021	qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1022	}
				1023	// update complex phasor - like incrementing phase
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1024	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1025	tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1026	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1027	state.v_sin[j] += k2 * tmp[j];
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1028	for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1029	state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
				1030	}
				1031	*state_ext = state;
				1032	}
				1033
				1034
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	1035	#ifdef HAVE_SSE_INTRINSICS
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1036
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1037	/*********************************************************************/
				1038
				1039	/**************/
				1040	/* ALGO J */
				1041	/**************/
				1042
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1043	void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state)
				1044	{
				1045	// constants for single phase step
				1046	float phase_increment_s = rate*PI;
				1047	float k1 = tan(0.5*phase_increment_s);
				1048	float k2 = 2k1 /(1 + k1 k1);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1049	for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1050	{
				1051	float tmp;
				1052	state->u_cos[j] = state->u_cos[j-1];
				1053	state->v_sin[j] = state->v_sin[j-1];
				1054	// small steps
				1055	tmp = state->u_cos[j] - k1 * state->v_sin[j];
				1056	state->v_sin[j] += k2 * tmp;
				1057	state->u_cos[j] = tmp - k1 * state->v_sin[j];
				1058	}
				1059
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1060	// constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
				1061	float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1062	while(phase_increment_b > PI) phase_increment_b-=2*PI;
				1063	while(phase_increment_b < -PI) phase_increment_b+=2*PI;
				1064	conf->k1 = tan(0.5*phase_increment_b);
				1065	conf->k2 = 2conf->k1 / (1 + conf->k1 conf->k1);
				1066	}
				1067
				1068
				1069	void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state)
				1070	{
				1071	if (starting_phase != 0.0F)
				1072	{
				1073	state->u_cos[0] = cos(starting_phase);
				1074	state->v_sin[0] = sin(starting_phase);
				1075	}
				1076	else
				1077	{
				1078	state->u_cos[0] = 1.0F;
				1079	state->v_sin[0] = 0.0F;
				1080	}
				1081	shift_recursive_osc_sse_update_rate(rate, conf, state);
				1082	}
				1083
				1084
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	1085	PF_TARGET_CLONES
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1086	void shift_recursive_osc_sse_inp_c(complexf* in_out,
				1087	int N_cplx, const shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state_ext)
				1088	{
				1089	const __m128 k1 = LD_PS1( conf->k1 );
				1090	const __m128 k2 = LD_PS1( conf->k2 );
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1091	__m128 u_cos = VLOAD( &state_ext->u_cos[0] );
				1092	__m128 v_sin = VLOAD( &state_ext->v_sin[0] );
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1093	__m128 inp_re, inp_im;
				1094	__m128 product_re, product_im;
				1095	__m128 interl_prod_a, interl_prod_b;
				1096	__m128 * RESTRICT u = (__m128*)in_out;
				1097
				1098	while (N_cplx)
				1099	{
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1100	//inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
				1101	//inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
				1102	UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1103
				1104	//we multiply two complex numbers - similar to shift_math_cc
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1105	//iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZi+j) = state.u_cos[j] inp_i[j] - state.v_sin[j] * inp_q[j];
				1106	//qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZi+j) = state.v_sin[j] inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1107	product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
				1108	product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
				1109	INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1110	VSTORE(u, interl_prod_a);
				1111	VSTORE(u+1, interl_prod_b);
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1112	u += 2;
				1113
				1114	// update complex phasor - like incrementing phase
				1115	// tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
				1116	product_re = VSUB( u_cos, VMUL(k1, v_sin) );
				1117	// state.v_sin[j] += k2 * tmp[j];
				1118	v_sin = VADD( v_sin, VMUL(k2, product_re) );
				1119	// state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
				1120	u_cos = VSUB( product_re, VMUL(k1, v_sin) );
				1121
				1122	N_cplx -= 4;
				1123	}
hayati ayguen	d64eea5	2020-11-14 15:12:19 +0100	[diff] [blame]	1124	VSTORE( &state_ext->u_cos[0], u_cos );
				1125	VSTORE( &state_ext->v_sin[0], v_sin );
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1126	}
				1127
hayati ayguen	2587d83	2020-11-12 07:15:43 +0000	[diff] [blame]	1128	#else
				1129
				1130	void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state)
				1131	{
				1132	assert(0);
				1133	}
				1134
				1135	void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state)
				1136	{
				1137	assert(0);
				1138	}
				1139
				1140
				1141	void shift_recursive_osc_sse_inp_c(complexf* in_out,
				1142	int N_cplx, const shift_recursive_osc_sse_conf_t conf, shift_recursive_osc_sse_t state_ext)
				1143	{
				1144	assert(0);
				1145	}
				1146
hayati ayguen	29eb847	2020-11-11 23:09:58 +0100	[diff] [blame]	1147	#endif
				1148