blob: 90d36ca3372c634073128bb458718b4adae64766 [file] [log] [blame]
hayati ayguenb2d29362020-01-02 00:06:09 +01001/*
2 Copyright (c) 2013 Julien Pommier.
hayati ayguenca112412020-04-13 00:19:40 +02003 Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
hayati ayguenb2d29362020-01-02 00:06:09 +01004 */
hayati ayguenca112412020-04-13 00:19:40 +02005
hayati ayguenb2d29362020-01-02 00:06:09 +01006#define _WANT_SNAN 1
7
8#include "pffft.h"
9#include "pffastconv.h"
10
11#include <math.h>
12#include <float.h>
13#include <limits.h>
14#include <inttypes.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <time.h>
18#include <assert.h>
19#include <string.h>
20
21#ifdef HAVE_SYS_TIMES
22# include <sys/times.h>
23# include <unistd.h>
24#endif
25
26/*
27 vector support macros: the rest of the code is independant of
28 SSE/Altivec/NEON -- adding support for other platforms with 4-element
29 vectors should be limited to these macros
30*/
hayati ayguenca112412020-04-13 00:19:40 +020031#if 0
32#include "simd/pf_float.h"
33#endif
hayati ayguenb2d29362020-01-02 00:06:09 +010034
35#if defined(_MSC_VER)
36# define RESTRICT __restrict
37#elif defined(__GNUC__)
38# define RESTRICT __restrict
39#else
40# define RESTRICT
41#endif
42
43
44#if defined(_MSC_VER)
45#pragma warning( disable : 4244 )
46#endif
47
48
49#ifdef SNANF
50 #define INVALID_FLOAT_VAL SNANF
51#elif defined(SNAN)
52 #define INVALID_FLOAT_VAL SNAN
53#elif defined(NAN)
54 #define INVALID_FLOAT_VAL NAN
55#elif defined(INFINITY)
56 #define INVALID_FLOAT_VAL INFINITY
57#else
58 #define INVALID_FLOAT_VAL FLT_MAX
59#endif
60
61
62#if defined(HAVE_SYS_TIMES)
63 inline double uclock_sec(void) {
64 static double ttclk = 0.;
65 struct tms t;
66 if (ttclk == 0.)
67 ttclk = sysconf(_SC_CLK_TCK);
68 times(&t);
69 /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
70 return ((double)t.tms_utime)) / ttclk;
71 }
72# else
73 double uclock_sec(void)
74{ return (double)clock()/(double)CLOCKS_PER_SEC; }
75#endif
76
77
78
hayati ayguenb2d29362020-01-02 00:06:09 +010079typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
80typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags);
81typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
82typedef void (*pfnConvDestroy) (void * setup);
83
84
85struct ConvSetup
86{
87 pfnConvolution pfn;
88 int N;
89 int B;
90 float * H;
91 int flags;
92};
93
94
95void * convSetupRev( float * H, int N, int * BlkLen, int flags )
96{
97 struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
98 int i, Nr = N;
99 if (flags & PFFASTCONV_CPLX_INP_OUT)
100 Nr *= 2;
101 Nr += 4;
102 s->pfn = NULL;
103 s->N = N;
104 s->B = *BlkLen;
105 s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
106 s->flags = flags;
107 memset(s->H, 0, (unsigned)Nr * sizeof(float));
108 if (flags & PFFASTCONV_CPLX_INP_OUT)
109 {
110 for ( i = 0; i < N; ++i ) {
111 s->H[2*(N-1 -i) ] = H[i];
112 s->H[2*(N-1 -i)+1] = H[i];
113 }
114 /* simpler detection of overruns */
115 s->H[ 2*N ] = INVALID_FLOAT_VAL;
116 s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
117 s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
118 s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
119 }
120 else
121 {
122 for ( i = 0; i < N; ++i )
123 s->H[ N-1 -i ] = H[i];
124 /* simpler detection of overruns */
125 s->H[ N ] = INVALID_FLOAT_VAL;
126 s->H[ N +1 ] = INVALID_FLOAT_VAL;
127 s->H[ N +2 ] = INVALID_FLOAT_VAL;
128 s->H[ N +3 ] = INVALID_FLOAT_VAL;
129 }
130 return s;
131}
132
133void convDestroyRev( void * setup )
134{
135 struct ConvSetup * s = (struct ConvSetup*)setup;
136 pffastconv_free(s->H);
137 pffastconv_free(setup);
138}
139
140
141pfnConvolution ConvGetFnPtrRev( void * setup )
142{
143 struct ConvSetup * s = (struct ConvSetup*)setup;
144 if (!s)
145 return NULL;
146 return s->pfn;
147}
148
149
150void convSimdDestroy( void * setup )
151{
152 convDestroyRev(setup);
153}
154
155
156void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
157{
158 void * p = pffastconv_new_setup( H, N, BlkLen, flags );
159 if (!p)
160 printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
161 return p;
162}
163
164
165void fastConvDestroy( void * setup )
166{
167 pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
168}
169
170
171
172int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
173{
174 struct ConvSetup * p = (struct ConvSetup*)setup;
175 const float * RESTRICT X = input;
176 const float * RESTRICT Hrev = p->H;
177 float * RESTRICT Y = output;
178 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
179 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
180 int i, j;
181 (void)Yref;
182 (void)applyFlush;
183
184 if (p->flags & PFFASTCONV_CPLX_INP_OUT)
185 {
186 for ( i = 0; i <= lenNr; i += 2 )
187 {
188 float sumRe = 0.0F, sumIm = 0.0F;
189 for ( j = 0; j < Nr; j += 2 )
190 {
191 sumRe += X[i+j ] * Hrev[j];
192 sumIm += X[i+j+1] * Hrev[j+1];
193 }
194 Y[i ] = sumRe;
195 Y[i+1] = sumIm;
196 }
197 return i/2;
198 }
199 else
200 {
201 for ( i = 0; i <= lenNr; ++i )
202 {
203 float sum = 0.0F;
204 for (j = 0; j < Nr; ++j )
205 sum += X[i+j] * Hrev[j];
206 Y[i] = sum;
207 }
208 return i;
209 }
210}
211
212
213
214int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
215{
216 float sum[4];
217 struct ConvSetup * p = (struct ConvSetup*)setup;
218 const float * RESTRICT X = input;
219 const float * RESTRICT Hrev = p->H;
220 float * RESTRICT Y = output;
221 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
222 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
223 int i, j;
224 (void)Yref;
225 (void)applyFlush;
226
227 if (p->flags & PFFASTCONV_CPLX_INP_OUT)
228 {
229 if ( (Nr & 3) == 0 )
230 {
231 for ( i = 0; i <= lenNr; i += 2 )
232 {
233 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
234 for (j = 0; j < Nr; j += 4 )
235 {
236 sum[0] += X[i+j] * Hrev[j];
237 sum[1] += X[i+j+1] * Hrev[j+1];
238 sum[2] += X[i+j+2] * Hrev[j+2];
239 sum[3] += X[i+j+3] * Hrev[j+3];
240 }
241 Y[i ] = sum[0] + sum[2];
242 Y[i+1] = sum[1] + sum[3];
243 }
244 }
245 else
246 {
247 const int M = Nr & (~3);
248 for ( i = 0; i <= lenNr; i += 2 )
249 {
250 float tailSumRe = 0.0F, tailSumIm = 0.0F;
251 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
252 for (j = 0; j < M; j += 4 )
253 {
254 sum[0] += X[i+j ] * Hrev[j ];
255 sum[1] += X[i+j+1] * Hrev[j+1];
256 sum[2] += X[i+j+2] * Hrev[j+2];
257 sum[3] += X[i+j+3] * Hrev[j+3];
258 }
259 for ( ; j < Nr; j += 2 ) {
260 tailSumRe += X[i+j ] * Hrev[j ];
261 tailSumIm += X[i+j+1] * Hrev[j+1];
262 }
263 Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
264 Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
265 }
266 }
267 return i/2;
268 }
269 else
270 {
271 if ( (Nr & 3) == 0 )
272 {
273 for ( i = 0; i <= lenNr; ++i )
274 {
275 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
276 for (j = 0; j < Nr; j += 4 )
277 {
278 sum[0] += X[i+j] * Hrev[j];
279 sum[1] += X[i+j+1] * Hrev[j+1];
280 sum[2] += X[i+j+2] * Hrev[j+2];
281 sum[3] += X[i+j+3] * Hrev[j+3];
282 }
283 Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
284 }
285 return i;
286 }
287 else
288 {
289 const int M = Nr & (~3);
290 /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
291 for ( i = 0; i <= lenNr; ++i )
292 {
293 float tailSum = 0.0;
294 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
295 for (j = 0; j < M; j += 4 )
296 {
297 sum[0] += X[i+j] * Hrev[j];
298 sum[1] += X[i+j+1] * Hrev[j+1];
299 sum[2] += X[i+j+2] * Hrev[j+2];
300 sum[3] += X[i+j+3] * Hrev[j+3];
301 }
302 for ( ; j < Nr; ++j )
303 tailSum += X[i+j] * Hrev[j];
304 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
305 }
306 return i;
307 }
308 }
309}
310
311
312int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
313{
314 float sum[4];
315 struct ConvSetup * p = (struct ConvSetup*)setup;
316 (void)Yref;
317 (void)applyFlush;
318 if (p->flags & PFFASTCONV_SYMMETRIC)
319 {
320 const float * RESTRICT X = input;
321 const float * RESTRICT Hrev = p->H;
322 float * RESTRICT Y = output;
323 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
324 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
325 const int h = Nr / 2 -4;
326 const int E = Nr -4;
327 int i, j;
328
329 if (p->flags & PFFASTCONV_CPLX_INP_OUT)
330 {
331 for ( i = 0; i <= lenNr; i += 2 )
332 {
333 const int k = i + E;
334 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
335 for (j = 0; j <= h; j += 4 )
336 {
337 sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
338 sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
339 sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
340 sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
341 }
342 Y[i ] = sum[0] + sum[2];
343 Y[i+1] = sum[1] + sum[3];
344 }
345 return i/2;
346 }
347 else
348 {
349 for ( i = 0; i <= lenNr; ++i )
350 {
351 const int k = i + E;
352 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
353 for (j = 0; j <= h; j += 4 )
354 {
355 sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
356 sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
357 sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
358 sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
359 }
360 Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
361 }
362 return i;
363 }
364 }
365 else
366 {
367 const float * RESTRICT X = input;
368 const float * RESTRICT Hrev = p->H;
369 float * RESTRICT Y = output;
370 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
371 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
372 int i, j;
373
374 if (p->flags & PFFASTCONV_CPLX_INP_OUT)
375 {
376 for ( i = 0; i <= lenNr; i += 2 )
377 {
378 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
379 for (j = 0; j < Nr; j += 4 )
380 {
381 sum[0] += X[i+j] * Hrev[j];
382 sum[1] += X[i+j+1] * Hrev[j+1];
383 sum[2] += X[i+j+2] * Hrev[j+2];
384 sum[3] += X[i+j+3] * Hrev[j+3];
385 }
386 Y[i ] = sum[0] + sum[2];
387 Y[i+1] = sum[1] + sum[3];
388 }
389 return i/2;
390 }
391 else
392 {
393 if ( (Nr & 3) == 0 )
394 {
395 for ( i = 0; i <= lenNr; ++i )
396 {
397 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
398 for (j = 0; j < Nr; j += 4 )
399 {
400 sum[0] += X[i+j] * Hrev[j];
401 sum[1] += X[i+j+1] * Hrev[j+1];
402 sum[2] += X[i+j+2] * Hrev[j+2];
403 sum[3] += X[i+j+3] * Hrev[j+3];
404 }
405 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
406 }
407 return i;
408 }
409 else
410 {
411 const int M = Nr & (~3);
412 /* printf("B: Nr = %d\n", Nr ); */
413 for ( i = 0; i <= lenNr; ++i )
414 {
415 float tailSum = 0.0;
416 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
417 for (j = 0; j < M; j += 4 )
418 {
419 sum[0] += X[i+j] * Hrev[j];
420 sum[1] += X[i+j+1] * Hrev[j+1];
421 sum[2] += X[i+j+2] * Hrev[j+2];
422 sum[3] += X[i+j+3] * Hrev[j+3];
423 }
424 for ( ; j < Nr; ++j )
425 tailSum += X[i+j] * Hrev[j];
426 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
427 }
428 return i;
429 }
430 }
431 }
432
433}
434
435
hayati ayguenb2d29362020-01-02 00:06:09 +0100436int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
437{
438 (void)Yref;
439 return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
440}
441
442
443
444void printFirst( const float * V, const char * st, const int N, const int perLine )
445{
446 (void)V; (void)st; (void)N; (void)perLine;
447 return;
448#if 0
449 int i;
450 for ( i = 0; i < N; ++i )
451 {
452 if ( (i % perLine) == 0 )
453 printf("\n%s[%d]", st, i);
454 printf("\t%.1f", V[i]);
455 }
456 printf("\n");
457#endif
458}
459
460
461
462#define NUMY 11
463
464
465int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
466 double t0, t1, tstop, td, tdref;
467 float *X, *H;
468 float *Y[NUMY];
469 int64_t outN[NUMY];
470 /* 256 KFloats or 16 MFloats data */
471#if 1
472 const int len = testOutLen ? (1 << 18) : (1 << 24);
473#elif 0
474 const int len = testOutLen ? (1 << 18) : (1 << 13);
475#else
476 const int len = testOutLen ? (1 << 18) : (1024);
477#endif
478 const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
479 const int lenC = len / cplxFactor;
480
481 int yi, yc, posMaxErr;
482 float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
483 int i, j, numErrOverLimit, iter;
484 int retErr = 0;
485
hayati ayguenca112412020-04-13 00:19:40 +0200486 /* 0 1 2 3 4 5 6 7 8 9 */
487 pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup };
488 pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy };
489 pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
490 pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv };
491 const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K" };
492 int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 };
493 void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
494 int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096 };
hayati ayguenb2d29362020-01-02 00:06:09 +0100495#if 1
hayati ayguenca112412020-04-13 00:19:40 +0200496 int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096 };
497#elif 0
498 int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 };
hayati ayguenb2d29362020-01-02 00:06:09 +0100499#else
hayati ayguenca112412020-04-13 00:19:40 +0200500 int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 };
hayati ayguenb2d29362020-01-02 00:06:09 +0100501#endif
502 double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
503
504 X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
505 for ( i=0; i < NUMY; ++i)
506 {
hayati ayguenca112412020-04-13 00:19:40 +0200507 if ( 1 || i < 2 )
hayati ayguenb2d29362020-01-02 00:06:09 +0100508 Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
509 else
510 Y[i] = Y[1];
511
hayati ayguene6cffc92020-02-28 19:57:30 +0100512 Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */
hayati ayguenb2d29362020-01-02 00:06:09 +0100513 aSpeedFactor[i] = -1.0;
514 aDuration[i] = -1.0;
515 procSmpPerSec[i] = -1.0;
516 }
517
518 H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
519
520 /* initialize input */
521 if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
522 {
523 for ( i = 0; i < lenC; ++i )
524 {
525 X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
526 X[2*i+1] = (float)((i+2048) % 4093);
527 }
528 }
529 else
530 {
531 for ( i = 0; i < len; ++i )
532 X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
533 }
534 X[ len ] = INVALID_FLOAT_VAL;
535 X[ len +1 ] = INVALID_FLOAT_VAL;
536 X[ len +2 ] = INVALID_FLOAT_VAL;
537 X[ len +3 ] = INVALID_FLOAT_VAL;
538
539 if (!testOutLen)
540 printFirst( X, "X", 64, 8 );
541
542 /* filter coeffs */
543 memset( H, 0, FILTERLEN * sizeof(float) );
544#if 1
545 if ( convFlags & PFFASTCONV_SYMMETRIC )
546 {
547 const int half = FILTERLEN / 2;
548 for ( j = 0; j < half; ++j ) {
549 switch (j % 3) {
550 case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
551 case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
552 case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
553 }
554 }
555 }
556 else
557 {
558 for ( j = 0; j < FILTERLEN; ++j ) {
559 switch (j % 3) {
560 case 0: H[j] = -1.0F; break;
561 case 1: H[j] = 1.0F; break;
562 case 2: H[j] = 0.5F; break;
563 }
564 }
565 }
566#else
567 H[0] = 1.0F;
568 H[FILTERLEN -1] = 1.0F;
569#endif
570 if (!testOutLen)
571 printFirst( H, "H", FILTERLEN, 8 );
572
573 printf("\n");
574 printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
575 ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
576 (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
577 ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
578
579 while (1)
580 {
581
582 for ( yi = 0; yi < NUMY; ++yi )
583 {
584 if (!aRunAlgo[yi])
585 continue;
586
587 aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
588
589 /* get effective apply function ptr */
590 if ( aSetupCfg[yi] && aGetFnPtr[yi] )
591 aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
592
593 if ( aSetupCfg[yi] && aConv[yi] ) {
594 if (testOutLen)
595 {
596 t0 = uclock_sec();
597 outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
598 t1 = uclock_sec();
599 td = t1 - t0;
600 }
601 else
602 {
603 const int blkLen = 4096; /* required for 'fast_conv_4K' */
604 int64_t offC = 0, offS, Nout;
605 int k;
606 iter = 0;
607 outN[yi] = 0;
608 t0 = uclock_sec();
609 tstop = t0 + 0.25; /* benchmark duration: 250 ms */
610 do {
611 for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
612 {
613 offS = cplxFactor * offC;
614 Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
615 offC += Nout;
616 ++iter;
617 if ( !Nout )
618 break;
619 if ( offC +blkLen >= lenC )
620 {
621 outN[yi] += offC;
622 offC = 0;
623 }
624 }
625 t1 = uclock_sec();
626 } while ( t1 < tstop );
627 outN[yi] += offC;
628 td = t1 - t0;
hayati ayguene6cffc92020-02-28 19:57:30 +0100629 procSmpPerSec[yi] = cplxFactor * (double)outN[yi] / td;
hayati ayguenb2d29362020-01-02 00:06:09 +0100630 }
631 }
632 else
633 {
634 t0 = t1 = td = 0.0;
635 outN[yi] = 0;
636 }
637 aDuration[yi] = td;
638 if ( yi == 0 ) {
639 const float * Yvals = Y[0];
640 const int64_t refOutLen = cplxFactor * outN[0];
641 tdref = td;
642 if (printDbg) {
643 printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
644 printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
645 }
646 aSpeedFactor[yi] = 1.0;
647 /* */
648 yRangeMin = FLT_MAX;
649 yRangeMax = FLT_MIN;
650 for ( i = 0; i < refOutLen; ++i )
651 {
652 if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
653 if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
654 }
655 yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
656 /* yErrLimit = 0.01F; */
657 if (testOutLen) {
658 if (1) {
659 printf("reference output len = %" PRId64 " smp\n", outN[0]);
660 printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
661 }
662 printFirst( Yvals, "Yref", 64, 8 );
663 }
664 }
665 else
666 {
667 aSpeedFactor[yi] = tdref / td;
668 if (printDbg) {
669 printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
670 printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
671 }
672 }
673 }
674
675 int iMaxSpeedSlowAlgo = -1;
676 int iFirstFastAlgo = -1;
677 int iMaxSpeedFastAlgo = -1;
678 int iPrintedRefOutLen = 0;
679 {
680 for ( yc = 1; yc < NUMY; ++yc )
681 {
682 if (!aRunAlgo[yc])
683 continue;
684 if (aFastAlgo[yc]) {
685 if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
686 iMaxSpeedFastAlgo = yc;
687
688 if (iFirstFastAlgo < 0)
689 iFirstFastAlgo = yc;
690 }
691 else
692 {
693 if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
694 iMaxSpeedSlowAlgo = yc;
695 }
696 }
697
698 if (printSpeed)
699 {
700 if (testOutLen)
701 {
702 if (iMaxSpeedSlowAlgo >= 0 )
703 printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
hayati ayguenca112412020-04-13 00:19:40 +0200704 if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
hayati ayguenb2d29362020-01-02 00:06:09 +0100705 printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
hayati ayguenca112412020-04-13 00:19:40 +0200706 if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
hayati ayguenb2d29362020-01-02 00:06:09 +0100707 printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
hayati ayguenb2d29362020-01-02 00:06:09 +0100708
709 if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
710 printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
711 if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
712 printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
713
714 if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
715 {
716 printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
717 if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
718 printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
719 }
720 printf("\n");
721 }
722 else
723 {
724 for ( yc = 0; yc < NUMY; ++yc )
725 {
726 if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
727 continue;
hayati ayguene6cffc92020-02-28 19:57:30 +0100728 printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\n",
729 convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001 );
hayati ayguenb2d29362020-01-02 00:06:09 +0100730 }
731 }
732
733 }
734 }
735
736
737 for ( yc = 1; yc < NUMY; ++yc )
738 {
739 const float * Yref;
740 const float * Ycurr;
741 int outMin;
742
743 if (!aRunAlgo[yc])
744 continue;
745
746 if (printDbg)
747 printf("\n");
748
749 if ( outN[yc] == 0 )
750 {
751 printf("output size 0: '%s' not implemented\n", convText[yc]);
752 }
753 else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
754 {
755 if (!iPrintedRefOutLen)
756 {
757 printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
758 iPrintedRefOutLen = 1;
759 }
760 printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
761 FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
762 retErr = 1;
763 }
764
765 posMaxErr = 0;
766 maxErr = -1.0;
767 Yref = Y[0];
768 Ycurr = Y[yc];
769 outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
770 numErrOverLimit = 0;
771 for ( i = 0; i < outMin; ++i )
772 {
773 if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
774 {
775 printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
776 convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
777 ++numErrOverLimit;
778 }
779
780 if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
781 {
782 maxErr = fabsf(Ycurr[i] - Yref[i]);
783 posMaxErr = i;
784 }
785 }
786
787 if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
788 printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
789 }
790
791 break;
792 }
793
794 pffastconv_free(X);
795 for ( i=0; i < NUMY; ++i)
796 {
hayati ayguenca112412020-04-13 00:19:40 +0200797 if ( 1 || i < 2 )
hayati ayguenb2d29362020-01-02 00:06:09 +0100798 pffastconv_free( Y[i] );
799 if (!aRunAlgo[i])
800 continue;
801 aDestroy[i]( aSetupCfg[i] );
802 }
803
804 pffastconv_free(H);
805
806 return retErr;
807}
808
hayati ayguenca112412020-04-13 00:19:40 +0200809/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
810void validate_pffft_simd();
811int validate_pffft_simd_ex(FILE * DbgOut);
812
hayati ayguenb2d29362020-01-02 00:06:09 +0100813
814int main(int argc, char **argv)
815{
816 int result = 0;
817 int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
818 int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
819 int testReal = 1, testCplx = 1, testSymetric = 0;
820
821 for ( i = 1; i < argc; ++i ) {
hayati ayguenca112412020-04-13 00:19:40 +0200822
823 if (!strcmp(argv[i], "--test-simd")) {
824 int numErrs = validate_pffft_simd_ex(stdout);
825 fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
826 return ( numErrs > 0 ? 1 : 0 );
827 }
828
hayati ayguenb2d29362020-01-02 00:06:09 +0100829 if (!strcmp(argv[i], "--no-len")) {
830 testOutLens = 0;
831 }
832 else if (!strcmp(argv[i], "--no-bench")) {
833 benchConv = 0;
834 }
835 else if (!strcmp(argv[i], "--quick")) {
836 quickTest = 1;
837 }
838 else if (!strcmp(argv[i], "--slow")) {
839 slowTest = 1;
840 }
841 else if (!strcmp(argv[i], "--real")) {
842 testCplx = 0;
843 }
844 else if (!strcmp(argv[i], "--cplx")) {
845 testReal = 0;
846 }
847 else if (!strcmp(argv[i], "--sym")) {
848 testSymetric = 1;
849 }
850 else /* if (!strcmp(argv[i], "--help")) */ {
hayati ayguenca112412020-04-13 00:19:40 +0200851 printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
hayati ayguenb2d29362020-01-02 00:06:09 +0100852 exit(1);
853 }
854 }
855
856
857 if (testOutLens)
858 {
859 for ( k = 0; k < 3; ++k )
860 {
861 if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
862 continue;
863 printf("\n\n==========\n");
864 printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
865 printf("==========\n");
866 flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
867 flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
868 flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
869 testOutLen = 1;
870 printDbg = 0;
871 printSpeed = 0;
872 for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
873 {
874 if ( (M % 16) != 0 && testSymetric )
875 continue;
876 result |= test(M, flagsB, testOutLen, printDbg, printSpeed);
877 }
878 }
879 }
880
881 if (benchConv)
882 {
883 for ( k = 0; k < 3; ++k )
884 {
885 if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
886 continue;
887 printf("\n\n==========\n");
888 printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
889 printf("==========\n");
890 flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
891 flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
892 flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
893 testOutLen = 0;
894 printDbg = 0;
895 printSpeed = 1;
896 if (!slowTest) {
897 result |= test( 32, flagsC, testOutLen, printDbg, printSpeed);
898 result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
899 result |= test( 64, flagsC, testOutLen, printDbg, printSpeed);
900 result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
901 result |= test(128, flagsC, testOutLen, printDbg, printSpeed);
902 }
903 if (!quickTest) {
904 result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
905 result |= test(256, flagsC, testOutLen, printDbg, printSpeed);
906 result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
907 result |= test(512, flagsC, testOutLen, printDbg, printSpeed);
908 result |= test(1024, flagsC, testOutLen, printDbg, printSpeed);
909 }
910 }
911 }
912
913 return result;
914}
915