hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1 | /* |
| 2 | This software is part of pffft/pfdsp, a set of simple DSP routines. |
| 3 | |
| 4 | Copyright (c) 2014, Andras Retzler <randras@sdr.hu> |
| 5 | Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> |
| 6 | All rights reserved. |
| 7 | |
| 8 | Redistribution and use in source and binary forms, with or without |
| 9 | modification, are permitted provided that the following conditions are met: |
| 10 | * Redistributions of source code must retain the above copyright |
| 11 | notice, this list of conditions and the following disclaimer. |
| 12 | * Redistributions in binary form must reproduce the above copyright |
| 13 | notice, this list of conditions and the following disclaimer in the |
| 14 | documentation and/or other materials provided with the distribution. |
| 15 | * Neither the name of the copyright holder nor the |
| 16 | names of its contributors may be used to endorse or promote products |
| 17 | derived from this software without specific prior written permission. |
| 18 | |
| 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 22 | DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY |
| 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | */ |
| 30 | |
| 31 | /* include own header first, to see missing includes */ |
| 32 | #include "pf_mixer.h" |
| 33 | #include "fmv.h" |
| 34 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 35 | #include <math.h> |
| 36 | #include <stdlib.h> |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 37 | #include <assert.h> |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 38 | |
//M_PI is not guaranteed by standard C (it is a POSIX extension), so we define our own:
| 40 | #define PI ((float)3.14159265358979323846) |
| 41 | |
| 42 | //apply to pointers: |
| 43 | #define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i))) |
| 44 | #define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1)) |
| 45 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 46 | #define USE_ALIGNED_ADDRESSES 0 |
| 47 | |
| 48 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 49 | |
| 50 | /* |
| 51 | _____ _____ _____ __ _ _ |
| 52 | | __ \ / ____| __ \ / _| | | (_) |
| 53 | | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___ |
| 54 | | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __| |
| 55 | | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \ |
| 56 | |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/ |
| 57 | |
| 58 | */ |
| 59 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 60 | |
| 61 | #if defined(__GNUC__) |
| 62 | # define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) |
| 63 | # define RESTRICT __restrict |
| 64 | #elif defined(_MSC_VER) |
| 65 | # define ALWAYS_INLINE(return_type) __forceinline return_type |
| 66 | # define RESTRICT __restrict |
| 67 | #endif |
| 68 | |
| 69 | |
| 70 | #ifndef PFFFT_SIMD_DISABLE |
| 71 | #if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)) |
| 72 | #pragma message "Manual SSE x86/x64 optimizations are ON" |
| 73 | #include <xmmintrin.h> |
| 74 | #define HAVE_SSE_INTRINSICS 1 |
| 75 | |
| 76 | #elif defined(PFFFT_ENABLE_NEON) && defined(__arm__) |
| 77 | #pragma message "Manual NEON (arm32) optimizations are ON" |
| 78 | #include "sse2neon.h" |
| 79 | #define HAVE_SSE_INTRINSICS 1 |
| 80 | |
| 81 | #elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__) |
| 82 | #pragma message "Manual NEON (aarch64) optimizations are ON" |
| 83 | #include "sse2neon.h" |
| 84 | #define HAVE_SSE_INTRINSICS 1 |
| 85 | |
| 86 | #endif |
| 87 | #endif |
| 88 | |
| 89 | #ifdef HAVE_SSE_INTRINSICS |
| 90 | |
| 91 | typedef __m128 v4sf; |
| 92 | # define SIMD_SZ 4 |
| 93 | |
| 94 | typedef union v4_union { |
| 95 | __m128 v; |
| 96 | float f[4]; |
| 97 | } v4_union; |
| 98 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 99 | #define VMUL(a,b) _mm_mul_ps(a,b) |
| 100 | #define VDIV(a,b) _mm_div_ps(a,b) |
| 101 | #define VADD(a,b) _mm_add_ps(a,b) |
| 102 | #define VSUB(a,b) _mm_sub_ps(a,b) |
| 103 | #define LD_PS1(s) _mm_set1_ps(s) |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 104 | #define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr)) |
| 105 | #define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr)) |
| 106 | #define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v) |
| 107 | #define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v) |
| 108 | #define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } |
| 109 | #define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } |
| 110 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 111 | #if USE_ALIGNED_ADDRESSES |
| 112 | #define VLOAD(ptr) _mm_load_ps((const float *)(ptr)) |
| 113 | #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v) |
| 114 | #else |
| 115 | #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr)) |
| 116 | #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v) |
| 117 | #endif |
| 118 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 119 | |
/* Reports whether this build contains the SSE/NEON-optimized shift/mixer
 * implementations (this branch: HAVE_SSE_INTRINSICS was detected above). */
int have_sse_shift_mixer_impl()
{
    return 1;
}
| 124 | |
| 125 | #else |
| 126 | |
/* Reports whether this build contains the SSE/NEON-optimized shift/mixer
 * implementations (this branch: SIMD disabled or unsupported target). */
int have_sse_shift_mixer_impl()
{
    return 0;
}
| 131 | |
| 132 | #endif |
| 133 | |
| 134 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 135 | /*********************************************************************/ |
| 136 | |
| 137 | /**************/ |
| 138 | /*** ALGO A ***/ |
| 139 | /**************/ |
| 140 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 141 | PF_TARGET_CLONES |
| 142 | float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase) |
| 143 | { |
| 144 | rate*=2; |
| 145 | //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath. |
| 146 | float phase=starting_phase; |
| 147 | float phase_increment=rate*PI; |
| 148 | float cosval, sinval; |
| 149 | for(int i=0;i<input_size; i++) |
| 150 | { |
| 151 | cosval=cos(phase); |
| 152 | sinval=sin(phase); |
| 153 | //we multiply two complex numbers. |
| 154 | //how? enter this to maxima (software) for explanation: |
| 155 | // (a+b*%i)*(c+d*%i), rectform; |
| 156 | iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i); |
| 157 | qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i); |
| 158 | phase+=phase_increment; |
| 159 | while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase |
| 160 | while(phase<0) phase+=2*PI; |
| 161 | } |
| 162 | return phase; |
| 163 | } |
| 164 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 165 | /*********************************************************************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 166 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 167 | /**************/ |
| 168 | /*** ALGO B ***/ |
| 169 | /**************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 170 | |
| 171 | shift_table_data_t shift_table_init(int table_size) |
| 172 | { |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 173 | shift_table_data_t output; |
| 174 | output.table=(float*)malloc(sizeof(float)*table_size); |
| 175 | output.table_size=table_size; |
| 176 | for(int i=0;i<table_size;i++) |
| 177 | { |
| 178 | output.table[i]=sin(((float)i/table_size)*(PI/2)); |
| 179 | } |
| 180 | return output; |
| 181 | } |
| 182 | |
| 183 | void shift_table_deinit(shift_table_data_t table_data) |
| 184 | { |
| 185 | free(table_data.table); |
| 186 | } |
| 187 | |
| 188 | |
| 189 | PF_TARGET_CLONES |
| 190 | float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase) |
| 191 | { |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 192 | rate*=2; |
| 193 | //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table. |
| 194 | float phase=starting_phase; |
| 195 | float phase_increment=rate*PI; |
| 196 | float cosval, sinval; |
| 197 | for(int i=0;i<input_size; i++) //@shift_math_cc |
| 198 | { |
| 199 | int sin_index, cos_index, temp_index, sin_sign, cos_sign; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 200 | int quadrant=phase/(PI/2); //between 0 and 3 |
| 201 | float vphase=phase-quadrant*(PI/2); |
| 202 | sin_index=(vphase/(PI/2))*table_data.table_size; |
| 203 | cos_index=table_data.table_size-1-sin_index; |
| 204 | if(quadrant&1) //in quadrant 1 and 3 |
| 205 | { |
| 206 | temp_index=sin_index; |
| 207 | sin_index=cos_index; |
| 208 | cos_index=temp_index; |
| 209 | } |
| 210 | sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3 |
| 211 | cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2 |
| 212 | sinval=sin_sign*table_data.table[sin_index]; |
| 213 | cosval=cos_sign*table_data.table[cos_index]; |
| 214 | //we multiply two complex numbers. |
| 215 | //how? enter this to maxima (software) for explanation: |
| 216 | // (a+b*%i)*(c+d*%i), rectform; |
| 217 | iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i); |
| 218 | qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i); |
| 219 | phase+=phase_increment; |
| 220 | while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase |
| 221 | while(phase<0) phase+=2*PI; |
| 222 | } |
| 223 | return phase; |
| 224 | } |
| 225 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 226 | /*********************************************************************/ |
| 227 | |
| 228 | /**************/ |
| 229 | /*** ALGO C ***/ |
| 230 | /**************/ |
| 231 | |
| 232 | shift_addfast_data_t shift_addfast_init(float rate) |
| 233 | { |
| 234 | shift_addfast_data_t output; |
| 235 | output.phase_increment=2*rate*PI; |
| 236 | for(int i=0;i<4;i++) |
| 237 | { |
| 238 | output.dsin[i]=sin(output.phase_increment*(i+1)); |
| 239 | output.dcos[i]=cos(output.phase_increment*(i+1)); |
| 240 | } |
| 241 | return output; |
| 242 | } |
| 243 | |
/* SADF_L1(j): advance the complex oscillator by j+1 steps:
 * (cos_vals_j + i*sin_vals_j) = (cos_start + i*sin_start) * (dcos_j + i*dsin_j) */
#define SADF_L1(j) \
    cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
    sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
/* SADF_L2(j): complex-multiply input sample 4*i+j by phasor j into output */
#define SADF_L2(j) \
    iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
    qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
| 250 | |
| 251 | PF_TARGET_CLONES |
| 252 | float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) |
| 253 | { |
| 254 | //input_size should be multiple of 4 |
| 255 | //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); |
| 256 | float cos_start=cos(starting_phase); |
| 257 | float sin_start=sin(starting_phase); |
| 258 | float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, |
| 259 | sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, |
| 260 | dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], |
| 261 | dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; |
| 262 | |
| 263 | for(int i=0;i<input_size/4; i++) |
| 264 | { |
| 265 | SADF_L1(0) |
| 266 | SADF_L1(1) |
| 267 | SADF_L1(2) |
| 268 | SADF_L1(3) |
| 269 | SADF_L2(0) |
| 270 | SADF_L2(1) |
| 271 | SADF_L2(2) |
| 272 | SADF_L2(3) |
| 273 | cos_start = cos_vals_3; |
| 274 | sin_start = sin_vals_3; |
| 275 | } |
| 276 | starting_phase+=input_size*d->phase_increment; |
| 277 | while(starting_phase>PI) starting_phase-=2*PI; |
| 278 | while(starting_phase<-PI) starting_phase+=2*PI; |
| 279 | return starting_phase; |
| 280 | } |
| 281 | |
#undef SADF_L2


/* In-place variant of SADF_L2(j): the input sample must be copied into
 * temporaries first, because in_out[] is overwritten by the result. */
#define SADF_L2(j) \
    tmp_inp_cos = iof(in_out,4*i+j); \
    tmp_inp_sin = qof(in_out,4*i+j); \
    iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
    qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
| 290 | |
| 291 | PF_TARGET_CLONES |
| 292 | float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase) |
| 293 | { |
| 294 | //input_size should be multiple of 4 |
| 295 | //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); |
| 296 | float cos_start=cos(starting_phase); |
| 297 | float sin_start=sin(starting_phase); |
| 298 | float register tmp_inp_cos, tmp_inp_sin, |
| 299 | cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, |
| 300 | sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, |
| 301 | dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], |
| 302 | dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; |
| 303 | |
| 304 | for(int i=0;i<N_cplx/4; i++) |
| 305 | { |
| 306 | SADF_L1(0) |
| 307 | SADF_L1(1) |
| 308 | SADF_L1(2) |
| 309 | SADF_L1(3) |
| 310 | SADF_L2(0) |
| 311 | SADF_L2(1) |
| 312 | SADF_L2(2) |
| 313 | SADF_L2(3) |
| 314 | cos_start = cos_vals_3; |
| 315 | sin_start = sin_vals_3; |
| 316 | } |
| 317 | starting_phase+=N_cplx*d->phase_increment; |
| 318 | while(starting_phase>PI) starting_phase-=2*PI; |
| 319 | while(starting_phase<-PI) starting_phase+=2*PI; |
| 320 | return starting_phase; |
| 321 | } |
| 322 | |
| 323 | #undef SADF_L1 |
| 324 | #undef SADF_L2 |
| 325 | |
| 326 | |
| 327 | /*********************************************************************/ |
| 328 | |
| 329 | /**************/ |
| 330 | /*** ALGO D ***/ |
| 331 | /**************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 332 | |
| 333 | shift_unroll_data_t shift_unroll_init(float rate, int size) |
| 334 | { |
| 335 | shift_unroll_data_t output; |
| 336 | output.phase_increment=2*rate*PI; |
| 337 | output.size = size; |
| 338 | output.dsin=(float*)malloc(sizeof(float)*size); |
| 339 | output.dcos=(float*)malloc(sizeof(float)*size); |
| 340 | float myphase = 0; |
| 341 | for(int i=0;i<size;i++) |
| 342 | { |
| 343 | myphase += output.phase_increment; |
| 344 | while(myphase>PI) myphase-=2*PI; |
| 345 | while(myphase<-PI) myphase+=2*PI; |
| 346 | output.dsin[i]=sin(myphase); |
| 347 | output.dcos[i]=cos(myphase); |
| 348 | } |
| 349 | return output; |
| 350 | } |
| 351 | |
| 352 | void shift_unroll_deinit(shift_unroll_data_t* d) |
| 353 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 354 | if (!d) |
| 355 | return; |
| 356 | free(d->dsin); |
| 357 | free(d->dcos); |
| 358 | d->dsin = NULL; |
| 359 | d->dcos = NULL; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 360 | } |
| 361 | |
| 362 | PF_TARGET_CLONES |
| 363 | float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase) |
| 364 | { |
| 365 | //input_size should be multiple of 4 |
| 366 | //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); |
| 367 | float cos_start = cos(starting_phase); |
| 368 | float sin_start = sin(starting_phase); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 369 | register float cos_val = cos_start, sin_val = sin_start; |
| 370 | for(int i=0;i<input_size; i++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 371 | { |
| 372 | iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i); |
| 373 | qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i); |
| 374 | // calculate complex phasor for next iteration |
| 375 | cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i]; |
| 376 | sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; |
| 377 | } |
| 378 | starting_phase+=input_size*d->phase_increment; |
| 379 | while(starting_phase>PI) starting_phase-=2*PI; |
| 380 | while(starting_phase<-PI) starting_phase+=2*PI; |
| 381 | return starting_phase; |
| 382 | } |
| 383 | |
| 384 | PF_TARGET_CLONES |
| 385 | float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase) |
| 386 | { |
| 387 | float cos_start = cos(starting_phase); |
| 388 | float sin_start = sin(starting_phase); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 389 | register float cos_val = cos_start, sin_val = sin_start; |
| 390 | for(int i=0;i<size; i++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 391 | { |
| 392 | register float inp_i = iof(in_out,i); |
| 393 | register float inp_q = qof(in_out,i); |
| 394 | iof(in_out,i) = cos_val*inp_i - sin_val*inp_q; |
| 395 | qof(in_out,i) = sin_val*inp_i + cos_val*inp_q; |
| 396 | // calculate complex phasor for next iteration |
| 397 | cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i]; |
| 398 | sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; |
| 399 | } |
| 400 | starting_phase += size * d->phase_increment; |
| 401 | while(starting_phase>PI) starting_phase-=2*PI; |
| 402 | while(starting_phase<-PI) starting_phase+=2*PI; |
| 403 | return starting_phase; |
| 404 | } |
| 405 | |
| 406 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 407 | /*********************************************************************/ |
| 408 | |
| 409 | /**************/ |
| 410 | /*** ALGO E ***/ |
| 411 | /**************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 412 | |
| 413 | shift_limited_unroll_data_t shift_limited_unroll_init(float rate) |
| 414 | { |
| 415 | shift_limited_unroll_data_t output; |
| 416 | output.phase_increment=2*rate*PI; |
| 417 | float myphase = 0; |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 418 | for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 419 | { |
| 420 | myphase += output.phase_increment; |
| 421 | while(myphase>PI) myphase-=2*PI; |
| 422 | while(myphase<-PI) myphase+=2*PI; |
| 423 | output.dcos[i] = cos(myphase); |
| 424 | output.dsin[i] = sin(myphase); |
| 425 | } |
| 426 | output.complex_phase.i = 1.0F; |
| 427 | output.complex_phase.q = 0.0F; |
| 428 | return output; |
| 429 | } |
| 430 | |
/* Frequency-shifts input[] into output[] in chunks of at most
 * PF_SHIFT_LIMITED_UNROLL_SIZE samples.  After each chunk the oscillator is
 * renormalized to unit magnitude, so the phasor recurrence cannot decay
 * towards 0.  Phase state is carried in d->complex_phase across calls.
 * NOTE(review): assumes size is a multiple of PF_SHIFT_LIMITED_SIMD_SZ —
 * confirm with callers (a remainder smaller than SIMD_SZ is skipped). */
PF_TARGET_CLONES
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
{
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (size > 0)
    {
        /* process at most one full unroll block per outer iteration */
        int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                // calculate complex phasor for next iteration:
                // vals := starts * table[i*SIMD_SZ+j], so error stays bounded per chunk
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|" — renormalize to unit magnitude
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        /* advance by the full unroll size; on the final (partial) chunk the
         * loop terminates because size goes <= 0 */
        input += PF_SHIFT_LIMITED_UNROLL_SIZE;
        output += PF_SHIFT_LIMITED_UNROLL_SIZE;
        size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
    }
    /* save the (normalized) phase state for the next call */
    d->complex_phase.i = cos_val;
    d->complex_phase.q = sin_val;
}
| 465 | |
/* In-place variant of shift_limited_unroll_cc(): frequency-shifts in_out[]
 * in chunks of at most PF_SHIFT_LIMITED_UNROLL_SIZE samples, renormalizing
 * the oscillator magnitude after each chunk.  Phase state is carried in
 * d->complex_phase across calls.
 * NOTE(review): assumes N_cplx is a multiple of PF_SHIFT_LIMITED_SIMD_SZ —
 * confirm with callers. */
PF_TARGET_CLONES
void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
{
    /* staging buffers: samples must be copied out before being overwritten */
    float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
    // "vals := starts := phase_state"
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (N_cplx)
    {
        /* process at most one full unroll block per outer iteration */
        int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                // "out[] = inp[] * vals"
                iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
                qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
                // calculate complex phasor for next iteration
                // "vals := d[] * starts"
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|" — renormalize to unit magnitude
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        /* advance by the full unroll size; the final (partial) chunk makes
         * N_cplx go <= 0 and ends the loop */
        in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
        N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
    }
    // "phase_state := starts" (equal to the normalized vals at this point)
    d->complex_phase.i = cos_start;
    d->complex_phase.q = sin_start;
}
| 509 | |
| 510 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 511 | #ifdef HAVE_SSE_INTRINSICS |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 512 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 513 | /*********************************************************************/ |
| 514 | |
| 515 | /**************/ |
| 516 | /*** ALGO F ***/ |
| 517 | /**************/ |
| 518 | |
| 519 | shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 520 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 521 | shift_limited_unroll_A_sse_data_t output; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 522 | float myphase; |
| 523 | |
| 524 | output.phase_increment = 2*relative_freq*PI; |
| 525 | |
| 526 | myphase = 0.0F; |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 527 | for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 528 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 529 | for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) |
| 530 | { |
| 531 | myphase += output.phase_increment; |
| 532 | while(myphase>PI) myphase-=2*PI; |
| 533 | while(myphase<-PI) myphase+=2*PI; |
| 534 | } |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 535 | output.dcos[i] = cos(myphase); |
| 536 | output.dsin[i] = sin(myphase); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 537 | for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) |
| 538 | { |
| 539 | output.dcos[i+k] = output.dcos[i]; |
| 540 | output.dsin[i+k] = output.dsin[i]; |
| 541 | } |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 542 | } |
| 543 | |
| 544 | output.dcos_blk = 0.0F; |
| 545 | output.dsin_blk = 0.0F; |
| 546 | |
| 547 | myphase = phase_start_rad; |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 548 | for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 549 | { |
| 550 | output.phase_state_i[i] = cos(myphase); |
| 551 | output.phase_state_q[i] = sin(myphase); |
| 552 | myphase += output.phase_increment; |
| 553 | while(myphase>PI) myphase-=2*PI; |
| 554 | while(myphase<-PI) myphase+=2*PI; |
| 555 | } |
| 556 | return output; |
| 557 | } |
| 558 | |
| 559 | |
/* SSE in-place mixer: frequency-shifts in_out[] in chunks of at most
 * PF_SHIFT_LIMITED_UNROLL_SIZE complex samples, 4 samples per SIMD step,
 * renormalizing the 4-lane phasor after each chunk.  Phase state is carried
 * in d->phase_state_i/q across calls.
 * NOTE(review): assumes N_cplx is a multiple of 4 and in_out is suitably
 * aligned when USE_ALIGNED_ADDRESSES is set — confirm with callers. */
PF_TARGET_CLONES
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
{
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_cos_tab;
    __m128 * RESTRICT p_trig_sin_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        /* one chunk of at most PF_SHIFT_LIMITED_UNROLL_SIZE samples */
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        /* restart the trig tables at the top of every chunk */
        p_trig_cos_tab = (__m128*)( &d->dcos[0] );
        p_trig_sin_tab = (__m128*)( &d->dsin[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration; scalar equivalent:
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals := d[] * starts" (inp_re/inp_im are reused for the table values)
            inp_re = VLOAD(p_trig_cos_tab);
            inp_im = VLOAD(p_trig_sin_tab);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_cos_tab;
            ++p_trig_sin_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // fast reciprocal sqrt: more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 629 | |
| 630 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 631 | /*********************************************************************/ |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 632 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 633 | /**************/ |
| 634 | /*** ALGO G ***/ |
| 635 | /**************/ |
| 636 | |
| 637 | shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 638 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 639 | shift_limited_unroll_B_sse_data_t output; |
| 640 | float myphase; |
| 641 | |
| 642 | output.phase_increment = 2*relative_freq*PI; |
| 643 | |
| 644 | myphase = 0.0F; |
| 645 | for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 646 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 647 | for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) |
| 648 | { |
| 649 | myphase += output.phase_increment; |
| 650 | while(myphase>PI) myphase-=2*PI; |
| 651 | while(myphase<-PI) myphase+=2*PI; |
| 652 | } |
| 653 | output.dtrig[i+0] = cos(myphase); |
| 654 | output.dtrig[i+1] = sin(myphase); |
| 655 | output.dtrig[i+2] = output.dtrig[i+0]; |
| 656 | output.dtrig[i+3] = output.dtrig[i+1]; |
| 657 | } |
| 658 | |
| 659 | output.dcos_blk = 0.0F; |
| 660 | output.dsin_blk = 0.0F; |
| 661 | |
| 662 | myphase = phase_start_rad; |
| 663 | for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) |
| 664 | { |
| 665 | output.phase_state_i[i] = cos(myphase); |
| 666 | output.phase_state_q[i] = sin(myphase); |
| 667 | myphase += output.phase_increment; |
| 668 | while(myphase>PI) myphase-=2*PI; |
| 669 | while(myphase<-PI) myphase+=2*PI; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 670 | } |
| 671 | return output; |
| 672 | } |
| 673 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 674 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 675 | PF_TARGET_CLONES |
| 676 | void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) |
| 677 | { |
| 678 | // "vals := starts := phase_state" |
| 679 | __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); |
| 680 | __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); |
| 681 | __m128 cos_vals = cos_starts; |
| 682 | __m128 sin_vals = sin_starts; |
| 683 | __m128 inp_re, inp_im; |
| 684 | __m128 product_re, product_im; |
| 685 | __m128 interl_prod_a, interl_prod_b; |
| 686 | __m128 * RESTRICT p_trig_tab; |
| 687 | __m128 * RESTRICT u = (__m128*)in_out; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 688 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 689 | while (N_cplx) |
| 690 | { |
| 691 | const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; |
| 692 | int B = NB; |
| 693 | p_trig_tab = (__m128*)( &d->dtrig[0] ); |
| 694 | while (B) |
| 695 | { |
| 696 | // complex multiplication of 4 complex values from/to in_out[] |
| 697 | // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): |
| 698 | // "out[] = inp[] * vals" |
| 699 | UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ |
| 700 | product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); |
| 701 | product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); |
| 702 | INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); |
| 703 | VSTORE(u, interl_prod_a); |
| 704 | VSTORE(u+1, interl_prod_b); |
| 705 | u += 2; |
| 706 | // calculate complex phasor for next iteration |
| 707 | // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; |
| 708 | // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; |
| 709 | // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) |
| 710 | // "vals := d[] * starts" |
| 711 | product_re = VLOAD(p_trig_tab); |
| 712 | UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ |
| 713 | cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); |
| 714 | sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); |
| 715 | ++p_trig_tab; |
| 716 | B -= 4; |
| 717 | } |
| 718 | N_cplx -= NB; |
| 719 | /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ |
| 720 | /* re-use product_re[]/product_im[] for normalization */ |
| 721 | // "starts := vals := vals / |vals|" |
| 722 | product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); |
| 723 | #if 0 |
| 724 | // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 |
| 725 | // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 |
| 726 | product_im = _mm_rsqrt_ps(product_re); |
| 727 | cos_starts = cos_vals = VMUL(cos_vals, product_im); |
| 728 | sin_starts = sin_vals = VMUL(sin_vals, product_im); |
| 729 | #else |
| 730 | // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! |
| 731 | // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again |
| 732 | product_im = _mm_sqrt_ps(product_re); |
| 733 | cos_starts = cos_vals = VDIV(cos_vals, product_im); |
| 734 | sin_starts = sin_vals = VDIV(sin_vals, product_im); |
| 735 | #endif |
| 736 | } |
| 737 | // "phase_state := starts" |
| 738 | VSTORE( &d->phase_state_i[0], cos_starts ); |
| 739 | VSTORE( &d->phase_state_q[0], sin_starts ); |
| 740 | } |
| 741 | |
| 742 | |
| 743 | /*********************************************************************/ |
| 744 | |
| 745 | |
| 746 | /**************/ |
| 747 | /*** ALGO H ***/ |
| 748 | /**************/ |
| 749 | |
| 750 | shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) |
| 751 | { |
| 752 | shift_limited_unroll_C_sse_data_t output; |
| 753 | float myphase; |
| 754 | |
| 755 | output.phase_increment = 2*relative_freq*PI; |
| 756 | |
| 757 | myphase = 0.0F; |
| 758 | for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) |
| 759 | { |
| 760 | for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) |
| 761 | { |
| 762 | myphase += output.phase_increment; |
| 763 | while(myphase>PI) myphase-=2*PI; |
| 764 | while(myphase<-PI) myphase+=2*PI; |
| 765 | } |
| 766 | output.dinterl_trig[2*i] = cos(myphase); |
| 767 | output.dinterl_trig[2*i+4] = sin(myphase); |
| 768 | for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) |
| 769 | { |
| 770 | output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i]; |
| 771 | output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4]; |
| 772 | } |
| 773 | } |
| 774 | |
| 775 | output.dcos_blk = 0.0F; |
| 776 | output.dsin_blk = 0.0F; |
| 777 | |
| 778 | myphase = phase_start_rad; |
| 779 | for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) |
| 780 | { |
| 781 | output.phase_state_i[i] = cos(myphase); |
| 782 | output.phase_state_q[i] = sin(myphase); |
| 783 | myphase += output.phase_increment; |
| 784 | while(myphase>PI) myphase-=2*PI; |
| 785 | while(myphase<-PI) myphase+=2*PI; |
| 786 | } |
| 787 | return output; |
| 788 | } |
| 789 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 790 | |
/* ALGO H kernel: in-place complex frequency shift (mixer) of N_cplx samples.
 * Identical structure to ALGO G (shift_limited_unroll_B_sse_inp_c) except
 * the trig table d->dinterl_trig[] already stores the replicated cos vector
 * and the replicated sin vector as two consecutive __m128 loads, so no
 * UNINTERLEAVE2 shuffle is needed when advancing the phasor.
 * The phasor is re-normalized after each PF_SHIFT_LIMITED_UNROLL_SIZE block;
 * its state persists across calls in d->phase_state_i[]/q[].
 * NOTE(review): the inner loop consumes 4 complex samples per iteration
 * (B -= 4), so N_cplx is presumably required to be a multiple of
 * PF_SHIFT_LIMITED_SIMD_SZ (= 4) -- confirm with callers. */
PF_TARGET_CLONES
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
{
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        // NB: number of complex samples processed in this block
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals := d[] * starts"
            // table already holds replicated cos vector then sin vector
            inp_re = VLOAD(p_trig_tab);
            inp_im = VLOAD(p_trig_tab+1);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            p_trig_tab += 2;
            B -= 4;  // 4 == PF_SHIFT_LIMITED_SIMD_SZ complex samples done
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // fast reciprocal-sqrt variant: disabled, ~12-bit precision hurts spectrum
        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts" -- persist phasor for the next call
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
| 857 | |
| 858 | |
| 859 | #else |
| 860 | |
| 861 | /*********************************************************************/ |
| 862 | |
| 863 | shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) { |
| 864 | assert(0); |
| 865 | shift_limited_unroll_A_sse_data_t r; |
| 866 | return r; |
| 867 | } |
| 868 | shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) { |
| 869 | assert(0); |
| 870 | shift_limited_unroll_B_sse_data_t r; |
| 871 | return r; |
| 872 | } |
| 873 | shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) { |
| 874 | assert(0); |
| 875 | shift_limited_unroll_C_sse_data_t r; |
| 876 | return r; |
| 877 | } |
| 878 | |
| 879 | void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) { |
| 880 | assert(0); |
| 881 | } |
| 882 | void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) { |
| 883 | assert(0); |
| 884 | } |
| 885 | void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) { |
| 886 | assert(0); |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 887 | } |
| 888 | |
| 889 | #endif |
| 890 | |
| 891 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 892 | /*********************************************************************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 893 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 894 | /**************/ |
| 895 | /*** ALGO I ***/ |
| 896 | /**************/ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 897 | |
| 898 | void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state) |
| 899 | { |
| 900 | // constants for single phase step |
| 901 | float phase_increment_s = rate*PI; |
| 902 | float k1 = tan(0.5*phase_increment_s); |
| 903 | float k2 = 2*k1 /(1 + k1 * k1); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 904 | for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 905 | { |
| 906 | float tmp; |
| 907 | state->u_cos[j] = state->u_cos[j-1]; |
| 908 | state->v_sin[j] = state->v_sin[j-1]; |
| 909 | // small steps |
| 910 | tmp = state->u_cos[j] - k1 * state->v_sin[j]; |
| 911 | state->v_sin[j] += k2 * tmp; |
| 912 | state->u_cos[j] = tmp - k1 * state->v_sin[j]; |
| 913 | } |
| 914 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 915 | // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step |
| 916 | float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 917 | while(phase_increment_b > PI) phase_increment_b-=2*PI; |
| 918 | while(phase_increment_b < -PI) phase_increment_b+=2*PI; |
| 919 | conf->k1 = tan(0.5*phase_increment_b); |
| 920 | conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); |
| 921 | } |
| 922 | |
| 923 | void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state) |
| 924 | { |
| 925 | if (starting_phase != 0.0F) |
| 926 | { |
| 927 | state->u_cos[0] = cos(starting_phase); |
| 928 | state->v_sin[0] = sin(starting_phase); |
| 929 | } |
| 930 | else |
| 931 | { |
| 932 | state->u_cos[0] = 1.0F; |
| 933 | state->v_sin[0] = 0.0F; |
| 934 | } |
| 935 | shift_recursive_osc_update_rate(rate, conf, state); |
| 936 | } |
| 937 | |
| 938 | |
/* ALGO I kernel: frequency-shift input[] into output[] (out-of-place) using
 * the recursive oscillator: the phasor (u_cos, v_sin) is advanced by the
 * k1/k2 rotation constants instead of calling cos/sin per sample.
 * The per-lane loops of width PF_SHIFT_RECURSIVE_SIMD_SZ are written as
 * separate short loops so the compiler (see PF_TARGET_CLONES) can vectorize.
 * NOTE(review): samples beyond the last full PF_SHIFT_RECURSIVE_SIMD_SZ
 * group are not processed -- size is presumably a multiple of it; confirm.
 * The phasor state is read from and written back to *state_ext. */
PF_TARGET_CLONES
void shift_recursive_osc_cc(const complexf *input, complexf* output,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;  // work on a local copy
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
    {
        //we multiply two complex numbers - similar to shift_math_cc
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
            inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
        }
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;  // persist phasor for the next call
}
| 972 | |
/* ALGO I kernel: in-place variant of shift_recursive_osc_cc() -- identical
 * math, but input samples are buffered into inp_i[]/inp_q[] first so the
 * results can be written back over in_out[].
 * NOTE(review): size is presumably a multiple of
 * PF_SHIFT_RECURSIVE_SIMD_SZ; trailing samples are left untouched. */
PF_TARGET_CLONES
void shift_recursive_osc_inp_c(complexf* in_out,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;  // work on a local copy
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
    {
        // buffer the group before overwriting it in place
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
            inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
        }
        //we multiply two complex numbers - similar to shift_math_cc
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
            qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;  // persist phasor for the next call
}
| 1006 | |
/* ALGO I generator: emit the raw oscillator output (cos in .i, sin in .q)
 * into output[] without mixing any input -- same phasor recurrence as
 * shift_recursive_osc_cc().
 * NOTE(review): size is presumably a multiple of
 * PF_SHIFT_RECURSIVE_SIMD_SZ; trailing samples are not written. */
PF_TARGET_CLONES
void gen_recursive_osc_c(complexf* output,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;  // work on a local copy
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
    {
        // output complex oscillator value
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;  // persist phasor for the next call
}
| 1033 | |
| 1034 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 1035 | #ifdef HAVE_SSE_INTRINSICS |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1036 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1037 | /*********************************************************************/ |
| 1038 | |
| 1039 | /**************/ |
| 1040 | /*** ALGO J ***/ |
| 1041 | /**************/ |
| 1042 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1043 | void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state) |
| 1044 | { |
| 1045 | // constants for single phase step |
| 1046 | float phase_increment_s = rate*PI; |
| 1047 | float k1 = tan(0.5*phase_increment_s); |
| 1048 | float k2 = 2*k1 /(1 + k1 * k1); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1049 | for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++) |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1050 | { |
| 1051 | float tmp; |
| 1052 | state->u_cos[j] = state->u_cos[j-1]; |
| 1053 | state->v_sin[j] = state->v_sin[j-1]; |
| 1054 | // small steps |
| 1055 | tmp = state->u_cos[j] - k1 * state->v_sin[j]; |
| 1056 | state->v_sin[j] += k2 * tmp; |
| 1057 | state->u_cos[j] = tmp - k1 * state->v_sin[j]; |
| 1058 | } |
| 1059 | |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1060 | // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step |
| 1061 | float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1062 | while(phase_increment_b > PI) phase_increment_b-=2*PI; |
| 1063 | while(phase_increment_b < -PI) phase_increment_b+=2*PI; |
| 1064 | conf->k1 = tan(0.5*phase_increment_b); |
| 1065 | conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); |
| 1066 | } |
| 1067 | |
| 1068 | |
| 1069 | void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state) |
| 1070 | { |
| 1071 | if (starting_phase != 0.0F) |
| 1072 | { |
| 1073 | state->u_cos[0] = cos(starting_phase); |
| 1074 | state->v_sin[0] = sin(starting_phase); |
| 1075 | } |
| 1076 | else |
| 1077 | { |
| 1078 | state->u_cos[0] = 1.0F; |
| 1079 | state->v_sin[0] = 0.0F; |
| 1080 | } |
| 1081 | shift_recursive_osc_sse_update_rate(rate, conf, state); |
| 1082 | } |
| 1083 | |
| 1084 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 1085 | PF_TARGET_CLONES |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1086 | void shift_recursive_osc_sse_inp_c(complexf* in_out, |
| 1087 | int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext) |
| 1088 | { |
| 1089 | const __m128 k1 = LD_PS1( conf->k1 ); |
| 1090 | const __m128 k2 = LD_PS1( conf->k2 ); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1091 | __m128 u_cos = VLOAD( &state_ext->u_cos[0] ); |
| 1092 | __m128 v_sin = VLOAD( &state_ext->v_sin[0] ); |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1093 | __m128 inp_re, inp_im; |
| 1094 | __m128 product_re, product_im; |
| 1095 | __m128 interl_prod_a, interl_prod_b; |
| 1096 | __m128 * RESTRICT u = (__m128*)in_out; |
| 1097 | |
| 1098 | while (N_cplx) |
| 1099 | { |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1100 | //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i; |
| 1101 | //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q; |
| 1102 | UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1103 | |
| 1104 | //we multiply two complex numbers - similar to shift_math_cc |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1105 | //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; |
| 1106 | //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1107 | product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) ); |
| 1108 | product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) ); |
| 1109 | INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1110 | VSTORE(u, interl_prod_a); |
| 1111 | VSTORE(u+1, interl_prod_b); |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1112 | u += 2; |
| 1113 | |
| 1114 | // update complex phasor - like incrementing phase |
| 1115 | // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; |
| 1116 | product_re = VSUB( u_cos, VMUL(k1, v_sin) ); |
| 1117 | // state.v_sin[j] += k2 * tmp[j]; |
| 1118 | v_sin = VADD( v_sin, VMUL(k2, product_re) ); |
| 1119 | // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; |
| 1120 | u_cos = VSUB( product_re, VMUL(k1, v_sin) ); |
| 1121 | |
| 1122 | N_cplx -= 4; |
| 1123 | } |
hayati ayguen | d64eea5 | 2020-11-14 15:12:19 +0100 | [diff] [blame] | 1124 | VSTORE( &state_ext->u_cos[0], u_cos ); |
| 1125 | VSTORE( &state_ext->v_sin[0], v_sin ); |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1126 | } |
| 1127 | |
hayati ayguen | 2587d83 | 2020-11-12 07:15:43 +0000 | [diff] [blame] | 1128 | #else |
| 1129 | |
| 1130 | void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state) |
| 1131 | { |
| 1132 | assert(0); |
| 1133 | } |
| 1134 | |
| 1135 | void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state) |
| 1136 | { |
| 1137 | assert(0); |
| 1138 | } |
| 1139 | |
| 1140 | |
| 1141 | void shift_recursive_osc_sse_inp_c(complexf* in_out, |
| 1142 | int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext) |
| 1143 | { |
| 1144 | assert(0); |
| 1145 | } |
| 1146 | |
hayati ayguen | 29eb847 | 2020-11-11 23:09:58 +0100 | [diff] [blame] | 1147 | #endif |
| 1148 | |