blob: 0f2c31005fc64b8336751cbdeaecb5c5e4e7a3be [file] [log] [blame]
hayati ayguen29eb8472020-11-11 23:09:58 +01001/*
2This software is part of pffft/pfdsp, a set of simple DSP routines.
3
4Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
5Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
6All rights reserved.
7
8Redistribution and use in source and binary forms, with or without
9modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 * Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 * Neither the name of the copyright holder nor the
16 names of its contributors may be used to endorse or promote products
17 derived from this software without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
23DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31/* include own header first, to see missing includes */
32#include "pf_mixer.h"
33#include "fmv.h"
34
hayati ayguen29eb8472020-11-11 23:09:58 +010035#include <math.h>
36#include <stdlib.h>
hayati ayguen2587d832020-11-12 07:15:43 +000037#include <assert.h>
hayati ayguen29eb8472020-11-11 23:09:58 +010038
39//they dropped M_PI in C99, so we define it:
40#define PI ((float)3.14159265358979323846)
41
42//apply to pointers:
43#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
44#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
45
hayati ayguend64eea52020-11-14 15:12:19 +010046#define USE_ALIGNED_ADDRESSES 0
47
48
hayati ayguen29eb8472020-11-11 23:09:58 +010049
50/*
51 _____ _____ _____ __ _ _
52 | __ \ / ____| __ \ / _| | | (_)
53 | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___
54 | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
55 | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \
56 |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
57
58*/
59
hayati ayguen2587d832020-11-12 07:15:43 +000060
61#if defined(__GNUC__)
62# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
63# define RESTRICT __restrict
64#elif defined(_MSC_VER)
65# define ALWAYS_INLINE(return_type) __forceinline return_type
66# define RESTRICT __restrict
67#endif
68
69
70#ifndef PFFFT_SIMD_DISABLE
71#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
72 #pragma message "Manual SSE x86/x64 optimizations are ON"
73 #include <xmmintrin.h>
74 #define HAVE_SSE_INTRINSICS 1
75
76#elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
77 #pragma message "Manual NEON (arm32) optimizations are ON"
78 #include "sse2neon.h"
79 #define HAVE_SSE_INTRINSICS 1
80
81#elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
82 #pragma message "Manual NEON (aarch64) optimizations are ON"
83 #include "sse2neon.h"
84 #define HAVE_SSE_INTRINSICS 1
85
86#endif
87#endif
88
89#ifdef HAVE_SSE_INTRINSICS
90
91typedef __m128 v4sf;
92# define SIMD_SZ 4
93
94typedef union v4_union {
95 __m128 v;
96 float f[4];
97} v4_union;
98
hayati ayguend64eea52020-11-14 15:12:19 +010099#define VMUL(a,b) _mm_mul_ps(a,b)
100#define VDIV(a,b) _mm_div_ps(a,b)
101#define VADD(a,b) _mm_add_ps(a,b)
102#define VSUB(a,b) _mm_sub_ps(a,b)
103#define LD_PS1(s) _mm_set1_ps(s)
hayati ayguen2587d832020-11-12 07:15:43 +0000104#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr))
105#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr))
106#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v)
107#define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v)
108#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
109#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
110
hayati ayguend64eea52020-11-14 15:12:19 +0100111#if USE_ALIGNED_ADDRESSES
112 #define VLOAD(ptr) _mm_load_ps((const float *)(ptr))
113 #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v)
114#else
115 #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr))
116 #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v)
117#endif
118
hayati ayguen2587d832020-11-12 07:15:43 +0000119
120int have_sse_shift_mixer_impl()
121{
122 return 1;
123}
124
125#else
126
127int have_sse_shift_mixer_impl()
128{
129 return 0;
130}
131
132#endif
133
134
hayati ayguend64eea52020-11-14 15:12:19 +0100135/*********************************************************************/
136
137/**************/
138/*** ALGO A ***/
139/**************/
140
hayati ayguen29eb8472020-11-11 23:09:58 +0100141PF_TARGET_CLONES
142float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
143{
144 rate*=2;
145 //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
146 float phase=starting_phase;
147 float phase_increment=rate*PI;
148 float cosval, sinval;
149 for(int i=0;i<input_size; i++)
150 {
151 cosval=cos(phase);
152 sinval=sin(phase);
153 //we multiply two complex numbers.
154 //how? enter this to maxima (software) for explanation:
155 // (a+b*%i)*(c+d*%i), rectform;
156 iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
157 qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
158 phase+=phase_increment;
159 while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
160 while(phase<0) phase+=2*PI;
161 }
162 return phase;
163}
164
hayati ayguend64eea52020-11-14 15:12:19 +0100165/*********************************************************************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100166
hayati ayguend64eea52020-11-14 15:12:19 +0100167/**************/
168/*** ALGO B ***/
169/**************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100170
171shift_table_data_t shift_table_init(int table_size)
172{
hayati ayguen29eb8472020-11-11 23:09:58 +0100173 shift_table_data_t output;
174 output.table=(float*)malloc(sizeof(float)*table_size);
175 output.table_size=table_size;
176 for(int i=0;i<table_size;i++)
177 {
178 output.table[i]=sin(((float)i/table_size)*(PI/2));
179 }
180 return output;
181}
182
183void shift_table_deinit(shift_table_data_t table_data)
184{
185 free(table_data.table);
186}
187
188
189PF_TARGET_CLONES
190float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
191{
hayati ayguen29eb8472020-11-11 23:09:58 +0100192 rate*=2;
193 //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
194 float phase=starting_phase;
195 float phase_increment=rate*PI;
196 float cosval, sinval;
197 for(int i=0;i<input_size; i++) //@shift_math_cc
198 {
199 int sin_index, cos_index, temp_index, sin_sign, cos_sign;
hayati ayguen29eb8472020-11-11 23:09:58 +0100200 int quadrant=phase/(PI/2); //between 0 and 3
201 float vphase=phase-quadrant*(PI/2);
202 sin_index=(vphase/(PI/2))*table_data.table_size;
203 cos_index=table_data.table_size-1-sin_index;
204 if(quadrant&1) //in quadrant 1 and 3
205 {
206 temp_index=sin_index;
207 sin_index=cos_index;
208 cos_index=temp_index;
209 }
210 sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
211 cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
212 sinval=sin_sign*table_data.table[sin_index];
213 cosval=cos_sign*table_data.table[cos_index];
214 //we multiply two complex numbers.
215 //how? enter this to maxima (software) for explanation:
216 // (a+b*%i)*(c+d*%i), rectform;
217 iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
218 qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
219 phase+=phase_increment;
220 while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
221 while(phase<0) phase+=2*PI;
222 }
223 return phase;
224}
225
hayati ayguend64eea52020-11-14 15:12:19 +0100226/*********************************************************************/
227
228/**************/
229/*** ALGO C ***/
230/**************/
231
232shift_addfast_data_t shift_addfast_init(float rate)
233{
234 shift_addfast_data_t output;
235 output.phase_increment=2*rate*PI;
236 for(int i=0;i<4;i++)
237 {
238 output.dsin[i]=sin(output.phase_increment*(i+1));
239 output.dcos[i]=cos(output.phase_increment*(i+1));
240 }
241 return output;
242}
243
244#define SADF_L1(j) \
245 cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
246 sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
247#define SADF_L2(j) \
248 iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
249 qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
250
251PF_TARGET_CLONES
252float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
253{
254 //input_size should be multiple of 4
255 //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
256 float cos_start=cos(starting_phase);
257 float sin_start=sin(starting_phase);
258 float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
259 sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
260 dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
261 dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
262
263 for(int i=0;i<input_size/4; i++)
264 {
265 SADF_L1(0)
266 SADF_L1(1)
267 SADF_L1(2)
268 SADF_L1(3)
269 SADF_L2(0)
270 SADF_L2(1)
271 SADF_L2(2)
272 SADF_L2(3)
273 cos_start = cos_vals_3;
274 sin_start = sin_vals_3;
275 }
276 starting_phase+=input_size*d->phase_increment;
277 while(starting_phase>PI) starting_phase-=2*PI;
278 while(starting_phase<-PI) starting_phase+=2*PI;
279 return starting_phase;
280}
281
282#undef SADF_L2
283
284
285#define SADF_L2(j) \
286 tmp_inp_cos = iof(in_out,4*i+j); \
287 tmp_inp_sin = qof(in_out,4*i+j); \
288 iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
289 qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
290
291PF_TARGET_CLONES
292float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
293{
294 //input_size should be multiple of 4
295 //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
296 float cos_start=cos(starting_phase);
297 float sin_start=sin(starting_phase);
298 float register tmp_inp_cos, tmp_inp_sin,
299 cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
300 sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
301 dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
302 dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
303
304 for(int i=0;i<N_cplx/4; i++)
305 {
306 SADF_L1(0)
307 SADF_L1(1)
308 SADF_L1(2)
309 SADF_L1(3)
310 SADF_L2(0)
311 SADF_L2(1)
312 SADF_L2(2)
313 SADF_L2(3)
314 cos_start = cos_vals_3;
315 sin_start = sin_vals_3;
316 }
317 starting_phase+=N_cplx*d->phase_increment;
318 while(starting_phase>PI) starting_phase-=2*PI;
319 while(starting_phase<-PI) starting_phase+=2*PI;
320 return starting_phase;
321}
322
323#undef SADF_L1
324#undef SADF_L2
325
326
327/*********************************************************************/
328
329/**************/
330/*** ALGO D ***/
331/**************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100332
333shift_unroll_data_t shift_unroll_init(float rate, int size)
334{
335 shift_unroll_data_t output;
336 output.phase_increment=2*rate*PI;
337 output.size = size;
338 output.dsin=(float*)malloc(sizeof(float)*size);
339 output.dcos=(float*)malloc(sizeof(float)*size);
340 float myphase = 0;
341 for(int i=0;i<size;i++)
342 {
343 myphase += output.phase_increment;
344 while(myphase>PI) myphase-=2*PI;
345 while(myphase<-PI) myphase+=2*PI;
346 output.dsin[i]=sin(myphase);
347 output.dcos[i]=cos(myphase);
348 }
349 return output;
350}
351
352void shift_unroll_deinit(shift_unroll_data_t* d)
353{
hayati ayguend64eea52020-11-14 15:12:19 +0100354 if (!d)
355 return;
356 free(d->dsin);
357 free(d->dcos);
358 d->dsin = NULL;
359 d->dcos = NULL;
hayati ayguen29eb8472020-11-11 23:09:58 +0100360}
361
362PF_TARGET_CLONES
363float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
364{
365 //input_size should be multiple of 4
366 //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
367 float cos_start = cos(starting_phase);
368 float sin_start = sin(starting_phase);
hayati ayguend64eea52020-11-14 15:12:19 +0100369 register float cos_val = cos_start, sin_val = sin_start;
370 for(int i=0;i<input_size; i++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100371 {
372 iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
373 qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
374 // calculate complex phasor for next iteration
375 cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
376 sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
377 }
378 starting_phase+=input_size*d->phase_increment;
379 while(starting_phase>PI) starting_phase-=2*PI;
380 while(starting_phase<-PI) starting_phase+=2*PI;
381 return starting_phase;
382}
383
384PF_TARGET_CLONES
385float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
386{
387 float cos_start = cos(starting_phase);
388 float sin_start = sin(starting_phase);
hayati ayguend64eea52020-11-14 15:12:19 +0100389 register float cos_val = cos_start, sin_val = sin_start;
390 for(int i=0;i<size; i++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100391 {
392 register float inp_i = iof(in_out,i);
393 register float inp_q = qof(in_out,i);
394 iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
395 qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
396 // calculate complex phasor for next iteration
397 cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
398 sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
399 }
400 starting_phase += size * d->phase_increment;
401 while(starting_phase>PI) starting_phase-=2*PI;
402 while(starting_phase<-PI) starting_phase+=2*PI;
403 return starting_phase;
404}
405
406
hayati ayguend64eea52020-11-14 15:12:19 +0100407/*********************************************************************/
408
409/**************/
410/*** ALGO E ***/
411/**************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100412
413shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
414{
415 shift_limited_unroll_data_t output;
416 output.phase_increment=2*rate*PI;
417 float myphase = 0;
hayati ayguend64eea52020-11-14 15:12:19 +0100418 for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100419 {
420 myphase += output.phase_increment;
421 while(myphase>PI) myphase-=2*PI;
422 while(myphase<-PI) myphase+=2*PI;
423 output.dcos[i] = cos(myphase);
424 output.dsin[i] = sin(myphase);
425 }
426 output.complex_phase.i = 1.0F;
427 output.complex_phase.q = 0.0F;
428 return output;
429}
430
431PF_TARGET_CLONES
432void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
433{
434 float cos_start = d->complex_phase.i;
435 float sin_start = d->complex_phase.q;
hayati ayguend64eea52020-11-14 15:12:19 +0100436 register float cos_val = cos_start, sin_val = sin_start, mag;
437 while (size > 0)
hayati ayguen29eb8472020-11-11 23:09:58 +0100438 {
hayati ayguend64eea52020-11-14 15:12:19 +0100439 int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
440 for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
hayati ayguen29eb8472020-11-11 23:09:58 +0100441 {
hayati ayguend64eea52020-11-14 15:12:19 +0100442 for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100443 {
hayati ayguend64eea52020-11-14 15:12:19 +0100444 iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
445 qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
hayati ayguen29eb8472020-11-11 23:09:58 +0100446 // calculate complex phasor for next iteration
hayati ayguend64eea52020-11-14 15:12:19 +0100447 cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
448 sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100449 }
450 }
hayati ayguend64eea52020-11-14 15:12:19 +0100451 // "starts := vals := vals / |vals|"
452 mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
453 cos_val /= mag;
454 sin_val /= mag;
455 cos_start = cos_val;
456 sin_start = sin_val;
457
458 input += PF_SHIFT_LIMITED_UNROLL_SIZE;
459 output += PF_SHIFT_LIMITED_UNROLL_SIZE;
460 size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
hayati ayguen29eb8472020-11-11 23:09:58 +0100461 }
462 d->complex_phase.i = cos_val;
463 d->complex_phase.q = sin_val;
464}
465
466PF_TARGET_CLONES
hayati ayguend64eea52020-11-14 15:12:19 +0100467void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
hayati ayguen29eb8472020-11-11 23:09:58 +0100468{
hayati ayguend64eea52020-11-14 15:12:19 +0100469 float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
470 float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
471 // "vals := starts := phase_state"
hayati ayguen29eb8472020-11-11 23:09:58 +0100472 float cos_start = d->complex_phase.i;
473 float sin_start = d->complex_phase.q;
hayati ayguend64eea52020-11-14 15:12:19 +0100474 register float cos_val = cos_start, sin_val = sin_start, mag;
475 while (N_cplx)
hayati ayguen29eb8472020-11-11 23:09:58 +0100476 {
hayati ayguend64eea52020-11-14 15:12:19 +0100477 int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
478 for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
hayati ayguen29eb8472020-11-11 23:09:58 +0100479 {
hayati ayguend64eea52020-11-14 15:12:19 +0100480 for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
481 inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
482 for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
483 inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
484 for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100485 {
hayati ayguend64eea52020-11-14 15:12:19 +0100486 // "out[] = inp[] * vals"
487 iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
488 qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100489 // calculate complex phasor for next iteration
hayati ayguend64eea52020-11-14 15:12:19 +0100490 // "vals := d[] * starts"
491 cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
492 sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100493 }
494 }
hayati ayguend64eea52020-11-14 15:12:19 +0100495 // "starts := vals := vals / |vals|"
496 mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
497 cos_val /= mag;
498 sin_val /= mag;
499 cos_start = cos_val;
500 sin_start = sin_val;
501
502 in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
503 N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
hayati ayguen29eb8472020-11-11 23:09:58 +0100504 }
hayati ayguend64eea52020-11-14 15:12:19 +0100505 // "phase_state := starts"
506 d->complex_phase.i = cos_start;
507 d->complex_phase.q = sin_start;
hayati ayguen29eb8472020-11-11 23:09:58 +0100508}
509
510
hayati ayguen2587d832020-11-12 07:15:43 +0000511#ifdef HAVE_SSE_INTRINSICS
hayati ayguen29eb8472020-11-11 23:09:58 +0100512
hayati ayguend64eea52020-11-14 15:12:19 +0100513/*********************************************************************/
514
515/**************/
516/*** ALGO F ***/
517/**************/
518
519shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
hayati ayguen29eb8472020-11-11 23:09:58 +0100520{
hayati ayguend64eea52020-11-14 15:12:19 +0100521 shift_limited_unroll_A_sse_data_t output;
hayati ayguen29eb8472020-11-11 23:09:58 +0100522 float myphase;
523
524 output.phase_increment = 2*relative_freq*PI;
525
526 myphase = 0.0F;
hayati ayguend64eea52020-11-14 15:12:19 +0100527 for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
hayati ayguen29eb8472020-11-11 23:09:58 +0100528 {
hayati ayguend64eea52020-11-14 15:12:19 +0100529 for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
530 {
531 myphase += output.phase_increment;
532 while(myphase>PI) myphase-=2*PI;
533 while(myphase<-PI) myphase+=2*PI;
534 }
hayati ayguen29eb8472020-11-11 23:09:58 +0100535 output.dcos[i] = cos(myphase);
536 output.dsin[i] = sin(myphase);
hayati ayguend64eea52020-11-14 15:12:19 +0100537 for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
538 {
539 output.dcos[i+k] = output.dcos[i];
540 output.dsin[i+k] = output.dsin[i];
541 }
hayati ayguen29eb8472020-11-11 23:09:58 +0100542 }
543
544 output.dcos_blk = 0.0F;
545 output.dsin_blk = 0.0F;
546
547 myphase = phase_start_rad;
hayati ayguend64eea52020-11-14 15:12:19 +0100548 for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100549 {
550 output.phase_state_i[i] = cos(myphase);
551 output.phase_state_q[i] = sin(myphase);
552 myphase += output.phase_increment;
553 while(myphase>PI) myphase-=2*PI;
554 while(myphase<-PI) myphase+=2*PI;
555 }
556 return output;
557}
558
559
hayati ayguen2587d832020-11-12 07:15:43 +0000560PF_TARGET_CLONES
hayati ayguend64eea52020-11-14 15:12:19 +0100561void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
hayati ayguen29eb8472020-11-11 23:09:58 +0100562{
hayati ayguend64eea52020-11-14 15:12:19 +0100563 // "vals := starts := phase_state"
564 __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
565 __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
hayati ayguen29eb8472020-11-11 23:09:58 +0100566 __m128 cos_vals = cos_starts;
567 __m128 sin_vals = sin_starts;
568 __m128 inp_re, inp_im;
569 __m128 product_re, product_im;
570 __m128 interl_prod_a, interl_prod_b;
571 __m128 * RESTRICT p_trig_cos_tab;
572 __m128 * RESTRICT p_trig_sin_tab;
573 __m128 * RESTRICT u = (__m128*)in_out;
574
575 while (N_cplx)
576 {
hayati ayguend64eea52020-11-14 15:12:19 +0100577 const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
hayati ayguen29eb8472020-11-11 23:09:58 +0100578 int B = NB;
579 p_trig_cos_tab = (__m128*)( &d->dcos[0] );
580 p_trig_sin_tab = (__m128*)( &d->dsin[0] );
hayati ayguend64eea52020-11-14 15:12:19 +0100581 while (B)
hayati ayguen29eb8472020-11-11 23:09:58 +0100582 {
583 // complex multiplication of 4 complex values from/to in_out[]
584 // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
hayati ayguend64eea52020-11-14 15:12:19 +0100585 // "out[] = inp[] * vals"
586 UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
hayati ayguen29eb8472020-11-11 23:09:58 +0100587 product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
588 product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
589 INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
hayati ayguend64eea52020-11-14 15:12:19 +0100590 VSTORE(u, interl_prod_a);
591 VSTORE(u+1, interl_prod_b);
hayati ayguen29eb8472020-11-11 23:09:58 +0100592 u += 2;
593 // calculate complex phasor for next iteration
hayati ayguend64eea52020-11-14 15:12:19 +0100594 // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
595 // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100596 // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
hayati ayguend64eea52020-11-14 15:12:19 +0100597 // "vals := d[] * starts"
598 inp_re = VLOAD(p_trig_cos_tab);
599 inp_im = VLOAD(p_trig_sin_tab);
hayati ayguen29eb8472020-11-11 23:09:58 +0100600 cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
601 sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
602 ++p_trig_cos_tab;
603 ++p_trig_sin_tab;
604 B -= 4;
605 }
606 N_cplx -= NB;
607 /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
608 /* re-use product_re[]/product_im[] for normalization */
hayati ayguend64eea52020-11-14 15:12:19 +0100609 // "starts := vals := vals / |vals|"
hayati ayguen29eb8472020-11-11 23:09:58 +0100610 product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
hayati ayguend64eea52020-11-14 15:12:19 +0100611#if 0
612 // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
613 // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
hayati ayguen29eb8472020-11-11 23:09:58 +0100614 product_im = _mm_rsqrt_ps(product_re);
hayati ayguend64eea52020-11-14 15:12:19 +0100615 cos_starts = cos_vals = VMUL(cos_vals, product_im);
616 sin_starts = sin_vals = VMUL(sin_vals, product_im);
hayati ayguen2587d832020-11-12 07:15:43 +0000617#else
hayati ayguend64eea52020-11-14 15:12:19 +0100618 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
619 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
620 product_im = _mm_sqrt_ps(product_re);
621 cos_starts = cos_vals = VDIV(cos_vals, product_im);
622 sin_starts = sin_vals = VDIV(sin_vals, product_im);
hayati ayguen29eb8472020-11-11 23:09:58 +0100623#endif
hayati ayguend64eea52020-11-14 15:12:19 +0100624 }
625 // "phase_state := starts"
626 VSTORE( &d->phase_state_i[0], cos_starts );
627 VSTORE( &d->phase_state_q[0], sin_starts );
628}
hayati ayguen29eb8472020-11-11 23:09:58 +0100629
630
hayati ayguend64eea52020-11-14 15:12:19 +0100631/*********************************************************************/
hayati ayguen2587d832020-11-12 07:15:43 +0000632
hayati ayguend64eea52020-11-14 15:12:19 +0100633/**************/
634/*** ALGO G ***/
635/**************/
636
637shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
hayati ayguen29eb8472020-11-11 23:09:58 +0100638{
hayati ayguend64eea52020-11-14 15:12:19 +0100639 shift_limited_unroll_B_sse_data_t output;
640 float myphase;
641
642 output.phase_increment = 2*relative_freq*PI;
643
644 myphase = 0.0F;
645 for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
hayati ayguen29eb8472020-11-11 23:09:58 +0100646 {
hayati ayguend64eea52020-11-14 15:12:19 +0100647 for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
648 {
649 myphase += output.phase_increment;
650 while(myphase>PI) myphase-=2*PI;
651 while(myphase<-PI) myphase+=2*PI;
652 }
653 output.dtrig[i+0] = cos(myphase);
654 output.dtrig[i+1] = sin(myphase);
655 output.dtrig[i+2] = output.dtrig[i+0];
656 output.dtrig[i+3] = output.dtrig[i+1];
657 }
658
659 output.dcos_blk = 0.0F;
660 output.dsin_blk = 0.0F;
661
662 myphase = phase_start_rad;
663 for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
664 {
665 output.phase_state_i[i] = cos(myphase);
666 output.phase_state_q[i] = sin(myphase);
667 myphase += output.phase_increment;
668 while(myphase>PI) myphase-=2*PI;
669 while(myphase<-PI) myphase+=2*PI;
hayati ayguen29eb8472020-11-11 23:09:58 +0100670 }
671 return output;
672}
673
hayati ayguen29eb8472020-11-11 23:09:58 +0100674
hayati ayguend64eea52020-11-14 15:12:19 +0100675PF_TARGET_CLONES
676void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
677{
678 // "vals := starts := phase_state"
679 __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
680 __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
681 __m128 cos_vals = cos_starts;
682 __m128 sin_vals = sin_starts;
683 __m128 inp_re, inp_im;
684 __m128 product_re, product_im;
685 __m128 interl_prod_a, interl_prod_b;
686 __m128 * RESTRICT p_trig_tab;
687 __m128 * RESTRICT u = (__m128*)in_out;
hayati ayguen29eb8472020-11-11 23:09:58 +0100688
hayati ayguend64eea52020-11-14 15:12:19 +0100689 while (N_cplx)
690 {
691 const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
692 int B = NB;
693 p_trig_tab = (__m128*)( &d->dtrig[0] );
694 while (B)
695 {
696 // complex multiplication of 4 complex values from/to in_out[]
697 // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
698 // "out[] = inp[] * vals"
699 UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
700 product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
701 product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
702 INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
703 VSTORE(u, interl_prod_a);
704 VSTORE(u+1, interl_prod_b);
705 u += 2;
706 // calculate complex phasor for next iteration
707 // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
708 // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
709 // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
710 // "vals := d[] * starts"
711 product_re = VLOAD(p_trig_tab);
712 UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
713 cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
714 sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
715 ++p_trig_tab;
716 B -= 4;
717 }
718 N_cplx -= NB;
719 /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
720 /* re-use product_re[]/product_im[] for normalization */
721 // "starts := vals := vals / |vals|"
722 product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
723#if 0
724 // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
725 // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
726 product_im = _mm_rsqrt_ps(product_re);
727 cos_starts = cos_vals = VMUL(cos_vals, product_im);
728 sin_starts = sin_vals = VMUL(sin_vals, product_im);
729#else
730 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
731 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
732 product_im = _mm_sqrt_ps(product_re);
733 cos_starts = cos_vals = VDIV(cos_vals, product_im);
734 sin_starts = sin_vals = VDIV(sin_vals, product_im);
735#endif
736 }
737 // "phase_state := starts"
738 VSTORE( &d->phase_state_i[0], cos_starts );
739 VSTORE( &d->phase_state_q[0], sin_starts );
740}
741
742
743/*********************************************************************/
744
745
746/**************/
747/*** ALGO H ***/
748/**************/
749
750shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
751{
752 shift_limited_unroll_C_sse_data_t output;
753 float myphase;
754
755 output.phase_increment = 2*relative_freq*PI;
756
757 myphase = 0.0F;
758 for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
759 {
760 for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
761 {
762 myphase += output.phase_increment;
763 while(myphase>PI) myphase-=2*PI;
764 while(myphase<-PI) myphase+=2*PI;
765 }
766 output.dinterl_trig[2*i] = cos(myphase);
767 output.dinterl_trig[2*i+4] = sin(myphase);
768 for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
769 {
770 output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
771 output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
772 }
773 }
774
775 output.dcos_blk = 0.0F;
776 output.dsin_blk = 0.0F;
777
778 myphase = phase_start_rad;
779 for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
780 {
781 output.phase_state_i[i] = cos(myphase);
782 output.phase_state_q[i] = sin(myphase);
783 myphase += output.phase_increment;
784 while(myphase>PI) myphase-=2*PI;
785 while(myphase<-PI) myphase+=2*PI;
786 }
787 return output;
788}
789
hayati ayguen29eb8472020-11-11 23:09:58 +0100790
791PF_TARGET_CLONES
hayati ayguend64eea52020-11-14 15:12:19 +0100792void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
hayati ayguen29eb8472020-11-11 23:09:58 +0100793{
hayati ayguend64eea52020-11-14 15:12:19 +0100794 // "vals := starts := phase_state"
795 __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
796 __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
797 __m128 cos_vals = cos_starts;
798 __m128 sin_vals = sin_starts;
799 __m128 inp_re, inp_im;
800 __m128 product_re, product_im;
801 __m128 interl_prod_a, interl_prod_b;
802 __m128 * RESTRICT p_trig_tab;
803 __m128 * RESTRICT u = (__m128*)in_out;
hayati ayguen29eb8472020-11-11 23:09:58 +0100804
hayati ayguend64eea52020-11-14 15:12:19 +0100805 while (N_cplx)
hayati ayguen29eb8472020-11-11 23:09:58 +0100806 {
hayati ayguend64eea52020-11-14 15:12:19 +0100807 const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
808 int B = NB;
809 p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
810 while (B)
811 {
812 // complex multiplication of 4 complex values from/to in_out[]
813 // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
814 // "out[] = inp[] * vals"
815 UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
816 product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
817 product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
818 INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
819 VSTORE(u, interl_prod_a);
820 VSTORE(u+1, interl_prod_b);
821 u += 2;
822 // calculate complex phasor for next iteration
823 // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
824 // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
825 // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
826 // "vals := d[] * starts"
827 inp_re = VLOAD(p_trig_tab);
828 inp_im = VLOAD(p_trig_tab+1);
829 cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
830 sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
831 p_trig_tab += 2;
832 B -= 4;
833 }
834 N_cplx -= NB;
835 /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
836 /* re-use product_re[]/product_im[] for normalization */
837 // "starts := vals := vals / |vals|"
838 product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
839#if 0
840 // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
841 // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
842 product_im = _mm_rsqrt_ps(product_re);
843 cos_starts = cos_vals = VMUL(cos_vals, product_im);
844 sin_starts = sin_vals = VMUL(sin_vals, product_im);
845#else
846 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
847 // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
848 product_im = _mm_sqrt_ps(product_re);
849 cos_starts = cos_vals = VDIV(cos_vals, product_im);
850 sin_starts = sin_vals = VDIV(sin_vals, product_im);
851#endif
hayati ayguen29eb8472020-11-11 23:09:58 +0100852 }
hayati ayguend64eea52020-11-14 15:12:19 +0100853 // "phase_state := starts"
854 VSTORE( &d->phase_state_i[0], cos_starts );
855 VSTORE( &d->phase_state_q[0], sin_starts );
856}
857
858
859#else
860
861/*********************************************************************/
862
863shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
864 assert(0);
865 shift_limited_unroll_A_sse_data_t r;
866 return r;
867}
868shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
869 assert(0);
870 shift_limited_unroll_B_sse_data_t r;
871 return r;
872}
873shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
874 assert(0);
875 shift_limited_unroll_C_sse_data_t r;
876 return r;
877}
878
879void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
880 assert(0);
881}
882void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
883 assert(0);
884}
885void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
886 assert(0);
hayati ayguen29eb8472020-11-11 23:09:58 +0100887}
888
889#endif
890
891
hayati ayguend64eea52020-11-14 15:12:19 +0100892/*********************************************************************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100893
hayati ayguend64eea52020-11-14 15:12:19 +0100894/**************/
895/*** ALGO I ***/
896/**************/
hayati ayguen29eb8472020-11-11 23:09:58 +0100897
898void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
899{
900 // constants for single phase step
901 float phase_increment_s = rate*PI;
902 float k1 = tan(0.5*phase_increment_s);
903 float k2 = 2*k1 /(1 + k1 * k1);
hayati ayguend64eea52020-11-14 15:12:19 +0100904 for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100905 {
906 float tmp;
907 state->u_cos[j] = state->u_cos[j-1];
908 state->v_sin[j] = state->v_sin[j-1];
909 // small steps
910 tmp = state->u_cos[j] - k1 * state->v_sin[j];
911 state->v_sin[j] += k2 * tmp;
912 state->u_cos[j] = tmp - k1 * state->v_sin[j];
913 }
914
hayati ayguend64eea52020-11-14 15:12:19 +0100915 // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
916 float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
hayati ayguen29eb8472020-11-11 23:09:58 +0100917 while(phase_increment_b > PI) phase_increment_b-=2*PI;
918 while(phase_increment_b < -PI) phase_increment_b+=2*PI;
919 conf->k1 = tan(0.5*phase_increment_b);
920 conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
921}
922
923void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
924{
925 if (starting_phase != 0.0F)
926 {
927 state->u_cos[0] = cos(starting_phase);
928 state->v_sin[0] = sin(starting_phase);
929 }
930 else
931 {
932 state->u_cos[0] = 1.0F;
933 state->v_sin[0] = 0.0F;
934 }
935 shift_recursive_osc_update_rate(rate, conf, state);
936}
937
938
939PF_TARGET_CLONES
940void shift_recursive_osc_cc(const complexf *input, complexf* output,
941 int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
942{
hayati ayguend64eea52020-11-14 15:12:19 +0100943 float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
944 float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
945 float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen29eb8472020-11-11 23:09:58 +0100946 shift_recursive_osc_t state = *state_ext;
947 const float k1 = conf->k1;
948 const float k2 = conf->k2;
hayati ayguend64eea52020-11-14 15:12:19 +0100949 for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
hayati ayguen29eb8472020-11-11 23:09:58 +0100950 {
951 //we multiply two complex numbers - similar to shift_math_cc
hayati ayguend64eea52020-11-14 15:12:19 +0100952 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100953 {
hayati ayguend64eea52020-11-14 15:12:19 +0100954 inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
955 inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
hayati ayguen29eb8472020-11-11 23:09:58 +0100956 }
hayati ayguend64eea52020-11-14 15:12:19 +0100957 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100958 {
hayati ayguend64eea52020-11-14 15:12:19 +0100959 iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
960 qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100961 }
962 // update complex phasor - like incrementing phase
hayati ayguend64eea52020-11-14 15:12:19 +0100963 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100964 tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguend64eea52020-11-14 15:12:19 +0100965 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100966 state.v_sin[j] += k2 * tmp[j];
hayati ayguend64eea52020-11-14 15:12:19 +0100967 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100968 state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
969 }
970 *state_ext = state;
971}
972
973PF_TARGET_CLONES
974void shift_recursive_osc_inp_c(complexf* in_out,
975 int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
976{
hayati ayguend64eea52020-11-14 15:12:19 +0100977 float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
978 float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
979 float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen29eb8472020-11-11 23:09:58 +0100980 shift_recursive_osc_t state = *state_ext;
981 const float k1 = conf->k1;
982 const float k2 = conf->k2;
hayati ayguend64eea52020-11-14 15:12:19 +0100983 for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
hayati ayguen29eb8472020-11-11 23:09:58 +0100984 {
hayati ayguend64eea52020-11-14 15:12:19 +0100985 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100986 {
hayati ayguend64eea52020-11-14 15:12:19 +0100987 inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
988 inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
hayati ayguen29eb8472020-11-11 23:09:58 +0100989 }
990 //we multiply two complex numbers - similar to shift_math_cc
hayati ayguend64eea52020-11-14 15:12:19 +0100991 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100992 {
hayati ayguend64eea52020-11-14 15:12:19 +0100993 iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
994 qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen29eb8472020-11-11 23:09:58 +0100995 }
996 // update complex phasor - like incrementing phase
hayati ayguend64eea52020-11-14 15:12:19 +0100997 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +0100998 tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguend64eea52020-11-14 15:12:19 +0100999 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001000 state.v_sin[j] += k2 * tmp[j];
hayati ayguend64eea52020-11-14 15:12:19 +01001001 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001002 state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1003 }
1004 *state_ext = state;
1005}
1006
1007PF_TARGET_CLONES
1008void gen_recursive_osc_c(complexf* output,
1009 int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
1010{
hayati ayguend64eea52020-11-14 15:12:19 +01001011 float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
hayati ayguen29eb8472020-11-11 23:09:58 +01001012 shift_recursive_osc_t state = *state_ext;
1013 const float k1 = conf->k1;
1014 const float k2 = conf->k2;
hayati ayguend64eea52020-11-14 15:12:19 +01001015 for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
hayati ayguen29eb8472020-11-11 23:09:58 +01001016 {
1017 // output complex oscillator value
hayati ayguend64eea52020-11-14 15:12:19 +01001018 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001019 {
hayati ayguend64eea52020-11-14 15:12:19 +01001020 iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
1021 qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
hayati ayguen29eb8472020-11-11 23:09:58 +01001022 }
1023 // update complex phasor - like incrementing phase
hayati ayguend64eea52020-11-14 15:12:19 +01001024 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001025 tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
hayati ayguend64eea52020-11-14 15:12:19 +01001026 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001027 state.v_sin[j] += k2 * tmp[j];
hayati ayguend64eea52020-11-14 15:12:19 +01001028 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001029 state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1030 }
1031 *state_ext = state;
1032}
1033
1034
hayati ayguen2587d832020-11-12 07:15:43 +00001035#ifdef HAVE_SSE_INTRINSICS
hayati ayguen29eb8472020-11-11 23:09:58 +01001036
hayati ayguend64eea52020-11-14 15:12:19 +01001037/*********************************************************************/
1038
1039/**************/
1040/*** ALGO J ***/
1041/**************/
1042
hayati ayguen29eb8472020-11-11 23:09:58 +01001043void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
1044{
1045 // constants for single phase step
1046 float phase_increment_s = rate*PI;
1047 float k1 = tan(0.5*phase_increment_s);
1048 float k2 = 2*k1 /(1 + k1 * k1);
hayati ayguend64eea52020-11-14 15:12:19 +01001049 for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
hayati ayguen29eb8472020-11-11 23:09:58 +01001050 {
1051 float tmp;
1052 state->u_cos[j] = state->u_cos[j-1];
1053 state->v_sin[j] = state->v_sin[j-1];
1054 // small steps
1055 tmp = state->u_cos[j] - k1 * state->v_sin[j];
1056 state->v_sin[j] += k2 * tmp;
1057 state->u_cos[j] = tmp - k1 * state->v_sin[j];
1058 }
1059
hayati ayguend64eea52020-11-14 15:12:19 +01001060 // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
1061 float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
hayati ayguen29eb8472020-11-11 23:09:58 +01001062 while(phase_increment_b > PI) phase_increment_b-=2*PI;
1063 while(phase_increment_b < -PI) phase_increment_b+=2*PI;
1064 conf->k1 = tan(0.5*phase_increment_b);
1065 conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
1066}
1067
1068
1069void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
1070{
1071 if (starting_phase != 0.0F)
1072 {
1073 state->u_cos[0] = cos(starting_phase);
1074 state->v_sin[0] = sin(starting_phase);
1075 }
1076 else
1077 {
1078 state->u_cos[0] = 1.0F;
1079 state->v_sin[0] = 0.0F;
1080 }
1081 shift_recursive_osc_sse_update_rate(rate, conf, state);
1082}
1083
1084
hayati ayguen2587d832020-11-12 07:15:43 +00001085PF_TARGET_CLONES
hayati ayguen29eb8472020-11-11 23:09:58 +01001086void shift_recursive_osc_sse_inp_c(complexf* in_out,
1087 int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
1088{
1089 const __m128 k1 = LD_PS1( conf->k1 );
1090 const __m128 k2 = LD_PS1( conf->k2 );
hayati ayguend64eea52020-11-14 15:12:19 +01001091 __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
1092 __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
hayati ayguen29eb8472020-11-11 23:09:58 +01001093 __m128 inp_re, inp_im;
1094 __m128 product_re, product_im;
1095 __m128 interl_prod_a, interl_prod_b;
1096 __m128 * RESTRICT u = (__m128*)in_out;
1097
1098 while (N_cplx)
1099 {
hayati ayguend64eea52020-11-14 15:12:19 +01001100 //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
1101 //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
1102 UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
hayati ayguen29eb8472020-11-11 23:09:58 +01001103
1104 //we multiply two complex numbers - similar to shift_math_cc
hayati ayguend64eea52020-11-14 15:12:19 +01001105 //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
1106 //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
hayati ayguen29eb8472020-11-11 23:09:58 +01001107 product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
1108 product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
1109 INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
hayati ayguend64eea52020-11-14 15:12:19 +01001110 VSTORE(u, interl_prod_a);
1111 VSTORE(u+1, interl_prod_b);
hayati ayguen29eb8472020-11-11 23:09:58 +01001112 u += 2;
1113
1114 // update complex phasor - like incrementing phase
1115 // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
1116 product_re = VSUB( u_cos, VMUL(k1, v_sin) );
1117 // state.v_sin[j] += k2 * tmp[j];
1118 v_sin = VADD( v_sin, VMUL(k2, product_re) );
1119 // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1120 u_cos = VSUB( product_re, VMUL(k1, v_sin) );
1121
1122 N_cplx -= 4;
1123 }
hayati ayguend64eea52020-11-14 15:12:19 +01001124 VSTORE( &state_ext->u_cos[0], u_cos );
1125 VSTORE( &state_ext->v_sin[0], v_sin );
hayati ayguen29eb8472020-11-11 23:09:58 +01001126}
1127
hayati ayguen2587d832020-11-12 07:15:43 +00001128#else
1129
1130void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
1131{
1132 assert(0);
1133}
1134
1135void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
1136{
1137 assert(0);
1138}
1139
1140
1141void shift_recursive_osc_sse_inp_c(complexf* in_out,
1142 int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
1143{
1144 assert(0);
1145}
1146
hayati ayguen29eb8472020-11-11 23:09:58 +01001147#endif
1148