add PFFASTCONV library and test + bench
* API for fast convolution: pffastconv.h
* move simd macros from pffft.c into pfsimd_macros.h for reusability
Signed-off-by: hayati ayguen <h_ayguen@web.de>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70dcb20..fc2dc76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,9 +4,16 @@
option(USE_SIMD "use SIMD (SSE/NEON) CPU features?" ON)
option(USE_NEON "force using NEON on ARM? (requires SIMD)" OFF)
-option(USE_FFTW "use (system-installed) FFTW3 in benchmark?" OFF)
-option(USE_GREEN "use Green FFT in benchmark? - if exists in subdir" ON)
-option(USE_KISS "use KissFFT in benchmark? - if exists in subdir" ON)
+option(USE_FFTW "use (system-installed) FFTW3 in fft benchmark?" OFF)
+option(USE_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
+option(USE_KISS "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(USE_ASAN "use GCC's address sanitizer?" OFF)
+
+if (USE_ASAN)
+ set(ASANLIB "asan")
+else()
+ set(ASANLIB "")
+endif()
if (USE_GREEN)
@@ -39,34 +46,84 @@
message(STATUS "Build type not specified: defaulting to release.")
endif(NOT CMAKE_BUILD_TYPE)
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
+ # using Visual Studio C++
+ message(STATUS "INFO: detected MSVC: will not link math lib m")
+ set(MATHLIB "")
+else()
+ message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
+ set(MATHLIB "m")
+endif()
+
+
add_library(PFFFT STATIC pffft.c)
+target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+ target_compile_options(PFFFT PRIVATE "-fsanitize=address")
+endif()
if (NOT USE_SIMD)
target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
endif()
if (USE_SIMD AND USE_NEON)
target_compile_options(PFFFT PRIVATE "-mfpu=neon")
endif()
-
-
-target_link_libraries(PFFFT m)
+target_link_libraries( PFFFT ${MATHLIB} )
set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
)
+
add_library(FFTPACK STATIC fftpack.c)
-target_link_libraries(FFTPACK m)
+target_compile_definitions(FFTPACK PRIVATE _USE_MATH_DEFINES)
+target_link_libraries( FFTPACK ${MATHLIB} )
set_property(TARGET FFTPACK APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
)
-add_executable(bench_pffft bench_pffft.c )
+add_library(PFFASTCONV STATIC pffastconv.c)
+target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+ target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
+endif()
+if (NOT USE_SIMD)
+ target_compile_definitions(PFFASTCONV PRIVATE PFFFT_SIMD_DISABLE=1)
+endif()
+if (USE_SIMD AND USE_NEON)
+ target_compile_options(PFFASTCONV PRIVATE "-mfpu=neon")
+endif()
+target_link_libraries( PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB} )
+set_property(TARGET PFFASTCONV APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+)
+
+add_executable( test_pffft test_pffft.c )
+target_compile_definitions(test_pffft PRIVATE _USE_MATH_DEFINES)
+target_link_libraries( test_pffft PFFFT ${ASANLIB} )
+
+
+add_executable(test_pffastconv test_pffastconv.c )
+target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+ target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
+endif()
+if (NOT USE_SIMD)
+ target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
+endif()
+if (USE_SIMD AND USE_NEON)
+ target_compile_options(test_pffastconv PRIVATE "-mfpu=neon")
+endif()
+target_link_libraries( test_pffastconv PFFASTCONV ${ASANLIB} ${MATHLIB} )
+
+
+add_executable(bench_pffft bench_pffft.c )
+target_compile_definitions(bench_pffft PRIVATE _USE_MATH_DEFINES)
if (NOT USE_SIMD)
target_compile_definitions(bench_pffft PRIVATE PFFFT_SIMD_DISABLE=1)
endif()
-target_link_libraries(bench_pffft PFFFT FFTPACK)
+target_link_libraries( bench_pffft PFFFT FFTPACK ${ASANLIB} )
if (USE_FFTW)
target_compile_definitions(bench_pffft PRIVATE HAVE_FFTW=1)
@@ -101,4 +158,23 @@
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
+add_test(NAME test_pfconv_lens_symetric
+ COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick" "--sym"
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME test_pfconv_lens_non_sym
+ COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick"
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME bench_pfconv_symetric
+ COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--sym"
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME bench_pfconv_non_sym
+ COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len"
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
diff --git a/pffastconv.c b/pffastconv.c
new file mode 100644
index 0000000..0aaed62
--- /dev/null
+++ b/pffastconv.c
@@ -0,0 +1,291 @@
+/*
+ Copyright (c) 2019 Hayati Ayguen.
+
+ */
+
+#include "pffastconv.h"
+#include "pffft.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+# define COMPILER_MSVC
+#elif defined(__GNUC__)
+# define COMPILER_GCC
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+
+#define FASTCONV_DBG_OUT 0
+
+
+#if defined(COMPILER_GCC)
+# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+# define RESTRICT __restrict
+#elif defined(COMPILER_MSVC)
+# define ALWAYS_INLINE(return_type) __forceinline return_type
+# define RESTRICT __restrict
+#endif
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/*
+ vector support macros: the rest of the code is independant of
+ SSE/Altivec/NEON -- adding support for other platforms with 4-element
+ vectors should be limited to these macros
+*/
+#include "pfsimd_macros.h"
+
+
+static
+unsigned nextPowerOfTwo(unsigned v) {
+ /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+ /* compute the next highest power of 2 of 32-bit v */
+ v--;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+ return v;
+}
+
+void *pffastconv_malloc(size_t nb_bytes)
+{
+ return Valigned_malloc(nb_bytes);
+}
+
+void pffastconv_free(void *p)
+{
+ Valigned_free(p);
+}
+
+int pffastconv_simd_size()
+{
+ return SIMD_SZ;
+}
+
+
+
+struct PFFASTCONV_Setup
+{
+ float * Xt; /* input == x in time domain - copy for alignment */
+ float * Xf; /* input == X in freq domain */
+ float * Hf; /* filterCoeffs == H in freq domain */
+ float * Mf; /* input * filterCoeffs in freq domain */
+ PFFFT_Setup *st;
+ int filterLen; /* convolution length */
+ int Nfft; /* FFT/block length */
+ int flags;
+ float scale;
+};
+
+
+PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
+{
+ PFFASTCONV_Setup * s = NULL;
+ const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+ int i, Nfft = 2 * nextPowerOfTwo(filterLen -1);
+#if FASTCONV_DBG_OUT
+ const int iOldBlkLen = *blockLen;
+#endif
+
+ if ( flags & PFFASTCONV_CPLX_FILTER )
+ return NULL;
+
+ s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
+
+ if ( *blockLen > Nfft ) {
+ Nfft = *blockLen;
+ Nfft = nextPowerOfTwo(Nfft);
+ }
+ *blockLen = Nfft; /* this is in (complex) samples */
+
+ Nfft *= cplxFactor;
+
+ if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
+ s->Xt = NULL;
+ else
+ s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+ s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+ s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+ s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+ s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
+ s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
+ if ( cplxFactor == 2 )
+ s->filterLen = 2 * filterLen - 1;
+ s->Nfft = Nfft; /* FFT/block length */
+ s->flags = flags;
+ s->scale = (float)( 1.0 / Nfft );
+
+ memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
+ for ( i = 0; i < filterLen; ++i )
+ s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
+
+ pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+
+#if FASTCONV_DBG_OUT
+ printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
+ , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
+#endif
+
+ return s;
+}
+
+
+void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
+{
+ if (!s)
+ return;
+ pffft_destroy_setup(s->st);
+ pffastconv_free(s->Mf);
+ pffastconv_free(s->Hf);
+ pffastconv_free(s->Xf);
+ if ( s->Xt )
+ pffastconv_free(s->Xt);
+ pffastconv_free(s);
+}
+
+
+int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
+{
+ const float * RESTRICT X = input_;
+ float * RESTRICT Y = output_;
+ const int Nfft = s->Nfft;
+ const int filterLen = s->filterLen;
+ const int flags = s->flags;
+ const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+ const int inputLen = cplxFactor * cplxInputLen;
+ int inpOff, procLen, numOut = 0, j, part, cplxOff;
+
+ /* applyFlush != 0:
+ * inputLen - inpOff -filterLen + 1 > 0
+ * <=> inputLen -filterLen + 1 > inpOff
+ * <=> inpOff < inputLen -filterLen + 1
+ *
+ * applyFlush == 0:
+ * inputLen - inpOff >= Nfft
+ * <=> inputLen - Nfft >= inpOff
+ * <=> inpOff <= inputLen - Nfft
+ * <=> inpOff < inputLen - Nfft + 1
+ */
+
+ if ( cplxFactor == 2 )
+ {
+ const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+#if 0
+ printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
+#endif
+ for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+ {
+ procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+ numOut = ( procLen - filterLen + 1 ) & ( ~1 );
+ if (!numOut)
+ break;
+#if 0
+ if (!inpOff)
+ printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
+ if (inpOff + filterLen + 2 >= maxOff )
+ printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
+#endif
+
+ if ( flags & PFFASTCONV_DIRECT_INP )
+ {
+ pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+ }
+ else
+ {
+ memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+ if ( procLen < Nfft )
+ memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+ pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+ }
+
+ memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) ); /* why does pffft_zconvolve_accumulate() accumulate?? */
+ pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+ if ( flags & PFFASTCONV_DIRECT_OUT )
+ {
+ pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+ }
+ else
+ {
+ pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+ memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+ }
+ }
+ return inpOff / cplxFactor;
+ }
+ else
+ {
+ const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+ const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
+
+ for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+ {
+ procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+ numOut = procLen - filterLen + 1;
+
+ for ( part = 0; part < numParts; ++part ) /* iterate per real/imag component */
+ {
+
+ if ( flags & PFFASTCONV_CPLX_INP_OUT )
+ {
+ cplxOff = 2 * inpOff + part;
+ for ( j = 0; j < procLen; ++j )
+ s->Xt[j] = X[cplxOff + 2 * j];
+ if ( procLen < Nfft )
+ memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+ pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+ }
+ else if ( flags & PFFASTCONV_DIRECT_INP )
+ {
+ pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+ }
+ else
+ {
+ memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+ if ( procLen < Nfft )
+ memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+
+ pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+ }
+
+ memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) ); /* why does pffft_zconvolve_accumulate() accumulate?? */
+ pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+ if ( flags & PFFASTCONV_CPLX_INP_OUT )
+ {
+ pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+
+ cplxOff = 2 * inpOff + part;
+ for ( j = 0; j < numOut; ++j )
+ Y[ cplxOff + 2 * j ] = s->Xf[j];
+ }
+ else if ( flags & PFFASTCONV_DIRECT_OUT )
+ {
+ pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+ }
+ else
+ {
+ pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+ memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+ }
+
+ }
+ }
+
+ return inpOff;
+ }
+}
+
diff --git a/pffastconv.h b/pffastconv.h
new file mode 100644
index 0000000..6e59d42
--- /dev/null
+++ b/pffastconv.h
@@ -0,0 +1,164 @@
+/* Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
+
+ Redistribution and use of the Software in source and binary forms,
+ with or without modification, is permitted provided that the
+ following conditions are met:
+
+ - Neither the names of PFFFT, PFFASTCONV, nor the names of its
+ sponsors or contributors may be used to endorse or promote products
+ derived from this Software without specific prior written permission.
+
+ - Redistributions of source code must retain the above copyright
+ notices, this list of conditions, and the disclaimer below.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the disclaimer below in the
+ documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+ SOFTWARE.
+*/
+
+/*
+ PFFASTCONV : a Pretty Fast Fast Convolution
+
+ This is basically the implementation of fast convolution,
+ utilizing the FFT (pffft).
+
+ Restrictions:
+
+ - 1D transforms only, with 32-bit single precision.
+
+ - all (float*) pointers in the functions below are expected to
+ have an "simd-compatible" alignment, that is 16 bytes on x86 and
+ powerpc CPUs.
+
+ You can allocate such buffers with the functions
+ pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+ posix_memalign..)
+
+*/
+
+#ifndef PFFASTCONV_H
+#define PFFASTCONV_H
+
+#include <stddef.h> // for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* opaque struct holding internal stuff
+ this struct can't be shared by many threads as it contains
+ temporary data, computed within the convolution
+ */
+ typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
+
+ typedef enum {
+ PFFASTCONV_CPLX_INP_OUT = 1,
+ /* set when input and output is complex,
+ * with real and imag part interleaved in both vectors.
+ * input[] has inputLen complex values: 2 * inputLen floats,
+ * output[] is also written with complex values.
+ * without this flag, the input is interpreted as real vector
+ */
+
+ PFFASTCONV_CPLX_FILTER = 2,
+ /* set when filterCoeffs is complex,
+ * with real and imag part interleaved.
+ * filterCoeffs[] has filterLen complex values: 2 * filterLen floats
+ * without this flag, the filter is interpreted as real vector
+ * ATTENTION: this is not implemented yet!
+ */
+
+ PFFASTCONV_DIRECT_INP = 4,
+ /* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
+ * 1- input vecor X must be aligned
+ * 2- (all) inputLen <= ouput blockLen
+ * 3- X must have minimum length of output BlockLen
+ * 4- the additional samples from inputLen .. BlockLen-1
+ * must contain valid small and non-NAN samples (ideally zero)
+ *
+ * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+ */
+
+ PFFASTCONV_DIRECT_OUT = 8,
+ /* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
+ * 1- output vector Y must be aligned
+ * 2- (all) inputLen <= ouput blockLen
+ * 3- Y must have minimum length of output blockLen
+ *
+ * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+ */
+
+ PFFASTCONV_CPLX_SINGLE_FFT = 16,
+ /* hint to process complex data with one single FFT;
+ * default is to use 2 FFTs: one for real part, one for imag part
+ * */
+
+
+ PFFASTCONV_SYMMETRIC = 32
+ /* just informal, that filter is symmetric .. and filterLen is multiple of 8 */
+
+ } pffastconv_flags_t;
+
+ /*
+ prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
+ The output 'blockLen' might be bigger to allow the fast convolution.
+
+ 'flags' are bitmask over the 'pffastconv_flags_t' enum.
+
+ PFFASTCONV_Setup structure can't be shared accross multiple filters
+ or concurrent threads.
+ */
+ PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
+
+ void pffastconv_destroy_setup(PFFASTCONV_Setup *);
+
+ /*
+ Perform the fast convolution.
+
+ 'input' and 'output' don't need to be aligned - unless any of
+ PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
+
+ inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
+ in this case, multiple FFTs are called internally, to process the
+ input[].
+
+ 'output' vector must have size >= (inputLen - filterLen + 1)
+
+ set bool option 'applyFlush' to process the full input[].
+ with this option, 'tail samples' of input are also processed.
+ This might be inefficient, because the FFT is called to produce
+ few(er) output samples, than possible.
+ This option is useful to process the last samples of an input (file)
+ or to reduce latency.
+
+ return value is the number of produced samples in output[].
+ the same amount of samples is processed from input[]. to continue
+ processing, the caller must save/move the remaining samples of
+ input[].
+
+ */
+ int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
+
+ void *pffastconv_malloc(size_t nb_bytes);
+ void pffastconv_free(void *);
+
+ /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
+ int pffastconv_simd_size();
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFASTCONV_H */
diff --git a/pffft.c b/pffft.c
index ce76542..1a5ca4f 100644
--- a/pffft.c
+++ b/pffft.c
@@ -66,10 +66,6 @@
# define COMPILER_GCC
#endif
-#ifdef COMPILER_MSVC
-# define _USE_MATH_DEFINES
-#endif
-
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
@@ -103,173 +99,21 @@
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
-/*
- Altivec support macros
-*/
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
-typedef vector float v4sf;
-# define SIMD_SZ 4
-# define VZERO() ((vector float) vec_splat_u8(0))
-# define VMUL(a,b) vec_madd(a,b, VZERO())
-# define VADD(a,b) vec_add(a,b)
-# define VMADD(a,b,c) vec_madd(a,b,c)
-# define VSUB(a,b) vec_sub(a,b)
-inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
-# define LD_PS1(p) ld_ps1(&p)
-# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
-# define UNINTERLEAVE2(in1, in2, out1, out2) { \
- vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
- vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
- v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
- }
-# define VTRANSPOSE4(x0,x1,x2,x3) { \
- v4sf y0 = vec_mergeh(x0, x2); \
- v4sf y1 = vec_mergel(x0, x2); \
- v4sf y2 = vec_mergeh(x1, x3); \
- v4sf y3 = vec_mergel(x1, x3); \
- x0 = vec_mergeh(y0, y2); \
- x1 = vec_mergel(y0, y2); \
- x2 = vec_mergeh(y1, y3); \
- x3 = vec_mergel(y1, y3); \
- }
-# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
-
-/*
- SSE1 support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
-
-#include <xmmintrin.h>
-typedef __m128 v4sf;
-# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
-# define VZERO() _mm_setzero_ps()
-# define VMUL(a,b) _mm_mul_ps(a,b)
-# define VADD(a,b) _mm_add_ps(a,b)
-# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
-# define VSUB(a,b) _mm_sub_ps(a,b)
-# define LD_PS1(p) _mm_set1_ps(p)
-# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
-# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
-# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
-# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
-
-/*
- ARM NEON support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
-# include <arm_neon.h>
-typedef float32x4_t v4sf;
-# define SIMD_SZ 4
-# define VZERO() vdupq_n_f32(0)
-# define VMUL(a,b) vmulq_f32(a,b)
-# define VADD(a,b) vaddq_f32(a,b)
-# define VMADD(a,b,c) vmlaq_f32(c,a,b)
-# define VSUB(a,b) vsubq_f32(a,b)
-# define LD_PS1(p) vld1q_dup_f32(&(p))
-# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define VTRANSPOSE4(x0,x1,x2,x3) { \
- float32x4x2_t t0_ = vzipq_f32(x0, x2); \
- float32x4x2_t t1_ = vzipq_f32(x1, x3); \
- float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
- float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
- x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
- }
-// marginally faster version
-//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
-# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
-#else
-# if !defined(PFFFT_SIMD_DISABLE)
-# warning "building with simd disabled !\n";
-# define PFFFT_SIMD_DISABLE // fallback to scalar code
-# endif
-#endif
-
-// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
-#ifdef PFFFT_SIMD_DISABLE
-typedef float v4sf;
-# define SIMD_SZ 1
-# define VZERO() 0.f
-# define VMUL(a,b) ((a)*(b))
-# define VADD(a,b) ((a)+(b))
-# define VMADD(a,b,c) ((a)*(b)+(c))
-# define VSUB(a,b) ((a)-(b))
-# define LD_PS1(p) (p)
-# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
-#endif
-
-// shortcuts for complex multiplcations
-#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
-#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
-#ifndef SVMUL
-// multiply a scalar with a vector
-#define SVMUL(f,v) VMUL(LD_PS1(f),v)
-#endif
-
-#if !defined(PFFFT_SIMD_DISABLE)
-typedef union v4sf_union {
- v4sf v;
- float f[4];
-} v4sf_union;
-
-#include <string.h>
-
-#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+#include "pfsimd_macros.h"
/* detect bugs with the vector support macros */
void validate_pffft_simd() {
- float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
- v4sf_union a0, a1, a2, a3, t, u;
- memcpy(a0.f, f, 4*sizeof(float));
- memcpy(a1.f, f+4, 4*sizeof(float));
- memcpy(a2.f, f+8, 4*sizeof(float));
- memcpy(a3.f, f+12, 4*sizeof(float));
-
- t = a0; u = a1; t.v = VZERO();
- printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
- t.v = VADD(a1.v, a2.v);
- printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
- t.v = VMUL(a1.v, a2.v);
- printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
- t.v = VMADD(a1.v, a2.v,a0.v);
- printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
- INTERLEAVE2(a1.v,a2.v,t.v,u.v);
- printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
- assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
- UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
- printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
- assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
-
- t.v=LD_PS1(f[15]);
- printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
- assertv4(t, 15, 15, 15, 15);
- t.v = VSWAPHL(a1.v, a2.v);
- printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
- assertv4(t, 8, 9, 6, 7);
- VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
- printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
- a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
- a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
- assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+ Vvalidate_simd();
}
-#endif //!PFFFT_SIMD_DISABLE
/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
- void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
- if (!p0) return (void *) 0;
- p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
- *((void **) p - 1) = p0;
- return p;
+ return Valigned_malloc(nb_bytes);
}
void pffft_aligned_free(void *p) {
- if (p) free(*((void **) p - 1));
+ Valigned_free(p);
}
int pffft_simd_size() { return SIMD_SZ; }
diff --git a/pfsimd_macros.h b/pfsimd_macros.h
new file mode 100644
index 0000000..cfb3bd1
--- /dev/null
+++ b/pfsimd_macros.h
@@ -0,0 +1,250 @@
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
+
+ Redistribution and use of the Software in source and binary forms,
+ with or without modification, is permitted provided that the
+ following conditions are met:
+
+ - Neither the names of NCAR's Computational and Information Systems
+ Laboratory, the University Corporation for Atmospheric Research,
+ nor the names of its sponsors or contributors may be used to
+ endorse or promote products derived from this Software without
+ specific prior written permission.
+
+ - Redistributions of source code must retain the above copyright
+ notices, this list of conditions, and the disclaimer below.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the disclaimer below in the
+ documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+ SOFTWARE.
+*/
+
+#ifndef PF_SIMD_MACROS_H
+#define PF_SIMD_MACROS_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ * SIMD reference material:
+ *
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ *
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ *
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ *
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ *
+ */
+
+
+/*
+ Altivec support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+typedef vector float v4sf;
+# define SIMD_SZ 4
+# define VREQUIRES_ALIGN 1 /* not sure, if really required */
+# define VZERO() ((vector float) vec_splat_u8(0))
+# define VMUL(a,b) vec_madd(a,b, VZERO())
+# define VADD(a,b) vec_add(a,b)
+# define VMADD(a,b,c) vec_madd(a,b,c)
+# define VSUB(a,b) vec_sub(a,b)
+inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
+# define LD_PS1(p) ld_ps1(&p)
+# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+# define UNINTERLEAVE2(in1, in2, out1, out2) { \
+ vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+ vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+ v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+ }
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
+ v4sf y0 = vec_mergeh(x0, x2); \
+ v4sf y1 = vec_mergel(x0, x2); \
+ v4sf y2 = vec_mergeh(x1, x3); \
+ v4sf y3 = vec_mergel(x1, x3); \
+ x0 = vec_mergeh(y0, y2); \
+ x1 = vec_mergel(y0, y2); \
+ x2 = vec_mergeh(y1, y3); \
+ x3 = vec_mergel(y1, y3); \
+ }
+# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+/*
+ SSE1 support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
+ * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+# define SIMD_SZ 4
+# define VREQUIRES_ALIGN 1
+# define VZERO() _mm_setzero_ps()
+# define VMUL(a,b) _mm_mul_ps(a,b)
+# define VADD(a,b) _mm_add_ps(a,b)
+# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
+# define VSUB(a,b) _mm_sub_ps(a,b)
+# define LD_PS1(p) _mm_set1_ps(p)
+# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
+# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
+# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+/* reverse/flip all floats */
+# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
+/* reverse/flip complex floats */
+# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+/*
+ ARM NEON support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
+# include <arm_neon.h>
+typedef float32x4_t v4sf;
+# define SIMD_SZ 4
+# define VREQUIRES_ALIGN 0 /* usually no alignment required */
+# define VZERO() vdupq_n_f32(0)
+# define VMUL(a,b) vmulq_f32(a,b)
+# define VADD(a,b) vaddq_f32(a,b)
+# define VMADD(a,b,c) vmlaq_f32(c,a,b)
+# define VSUB(a,b) vsubq_f32(a,b)
+# define LD_PS1(p) vld1q_dup_f32(&(p))
+# define VLOAD_UNALIGNED(ptr) (*(ptr))
+# define VLOAD_ALIGNED(ptr) (*(ptr))
+# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
+ float32x4x2_t t0_ = vzipq_f32(x0, x2); \
+ float32x4x2_t t1_ = vzipq_f32(x1, x3); \
+ float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
+ float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
+ x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+ }
+// marginally faster version
+//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+#else
+# if !defined(PFFFT_SIMD_DISABLE)
+# warning "building with simd disabled !\n";
+# define PFFFT_SIMD_DISABLE // fallback to scalar code
+# endif
+#endif
+
+// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
+#ifdef PFFFT_SIMD_DISABLE
+typedef float v4sf;
+# define SIMD_SZ 1
+# define VREQUIRES_ALIGN 0
+# define VZERO() 0.f
+# define VMUL(a,b) ((a)*(b))
+# define VADD(a,b) ((a)+(b))
+# define VMADD(a,b,c) ((a)*(b)+(c))
+# define VSUB(a,b) ((a)-(b))
+# define LD_PS1(p) (p)
+# define VLOAD_UNALIGNED(ptr) (*(ptr))
+# define VLOAD_ALIGNED(ptr) (*(ptr))
+# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+#endif
+
+// shortcuts for complex multiplcations
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+// multiply a scalar with a vector
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+typedef union v4sf_union {
+ v4sf v;
+ float f[SIMD_SZ];
+} v4sf_union;
+
+#if !defined(PFFFT_SIMD_DISABLE)
+
+#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+
+/* detect bugs with the vector support macros */
+static void Vvalidate_simd() {
+ float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
+ v4sf_union a0, a1, a2, a3, t, u;
+ memcpy(a0.f, f, 4*sizeof(float));
+ memcpy(a1.f, f+4, 4*sizeof(float));
+ memcpy(a2.f, f+8, 4*sizeof(float));
+ memcpy(a3.f, f+12, 4*sizeof(float));
+
+ t = a0; u = a1; t.v = VZERO();
+ printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
+ t.v = VADD(a1.v, a2.v);
+ printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
+ t.v = VMUL(a1.v, a2.v);
+ printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
+ t.v = VMADD(a1.v, a2.v,a0.v);
+ printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
+
+ INTERLEAVE2(a1.v,a2.v,t.v,u.v);
+ printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+ assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
+ UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
+ printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+ assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
+
+ t.v=LD_PS1(f[15]);
+ printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+ assertv4(t, 15, 15, 15, 15);
+ t.v = VSWAPHL(a1.v, a2.v);
+ printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+ assertv4(t, 8, 9, 6, 7);
+ VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
+ printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
+ a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
+ a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
+ assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+}
+#endif //!PFFFT_SIMD_DISABLE
+
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
+
+static
+void *Valigned_malloc(size_t nb_bytes) {
+ void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+ if (!p0) return (void *) 0;
+ p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+ *((void **) p - 1) = p0;
+ return p;
+}
+
+static
+void Valigned_free(void *p) {
+ if (p) free(*((void **) p - 1));
+}
+
+
+#endif /* PF_SIMD_MACROS_H */
+
diff --git a/test_pffastconv.c b/test_pffastconv.c
new file mode 100644
index 0000000..58d1f8c
--- /dev/null
+++ b/test_pffastconv.c
@@ -0,0 +1,1093 @@
+/*
+ Copyright (c) 2013 Julien Pommier.
+
+ Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
+
+ How to build:
+
+ on linux, with fftw3:
+ gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+ on macos, without fftw3:
+ clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+ on macos, with fftw3:
+ clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+ as alternative: replace clang by gcc.
+
+ on windows, with visual c++:
+ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+ build without SIMD instructions:
+ gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#define _WANT_SNAN 1
+
+#include "pffft.h"
+#include "pffastconv.h"
+
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+# include <sys/times.h>
+# include <unistd.h>
+#endif
+
+/*
+ vector support macros: the rest of the code is independant of
+ SSE/Altivec/NEON -- adding support for other platforms with 4-element
+ vectors should be limited to these macros
+*/
+#include "pfsimd_macros.h"
+
+
+#if defined(_MSC_VER)
+# define RESTRICT __restrict
+#elif defined(__GNUC__)
+# define RESTRICT __restrict
+#else
+# define RESTRICT
+#endif
+
+
+#if defined(_MSC_VER)
+#pragma warning( disable : 4244 )
+#endif
+
+
+#ifdef SNANF
+ #define INVALID_FLOAT_VAL SNANF
+#elif defined(SNAN)
+ #define INVALID_FLOAT_VAL SNAN
+#elif defined(NAN)
+ #define INVALID_FLOAT_VAL NAN
+#elif defined(INFINITY)
+ #define INVALID_FLOAT_VAL INFINITY
+#else
+ #define INVALID_FLOAT_VAL FLT_MAX
+#endif
+
+
+#if defined(HAVE_SYS_TIMES)
+ inline double uclock_sec(void) {
+ static double ttclk = 0.;
+ struct tms t;
+ if (ttclk == 0.)
+ ttclk = sysconf(_SC_CLK_TCK);
+ times(&t);
+ /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+ return ((double)t.tms_utime)) / ttclk;
+ }
+# else
+ double uclock_sec(void)
+{ return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+
+
+unsigned nextPowerOfTwo(unsigned v) {
+ /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+ /* compute the next highest power of 2 of 32-bit v */
+ v--;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+ return v;
+}
+
+
+typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
+typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags);
+typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
+typedef void (*pfnConvDestroy) (void * setup);
+
+
+struct ConvSetup
+{
+ pfnConvolution pfn;
+ int N;
+ int B;
+ float * H;
+ int flags;
+};
+
+
+void * convSetupRev( float * H, int N, int * BlkLen, int flags )
+{
+ struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
+ int i, Nr = N;
+ if (flags & PFFASTCONV_CPLX_INP_OUT)
+ Nr *= 2;
+ Nr += 4;
+ s->pfn = NULL;
+ s->N = N;
+ s->B = *BlkLen;
+ s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
+ s->flags = flags;
+ memset(s->H, 0, (unsigned)Nr * sizeof(float));
+ if (flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i < N; ++i ) {
+ s->H[2*(N-1 -i) ] = H[i];
+ s->H[2*(N-1 -i)+1] = H[i];
+ }
+ /* simpler detection of overruns */
+ s->H[ 2*N ] = INVALID_FLOAT_VAL;
+ s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
+ s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
+ s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
+ }
+ else
+ {
+ for ( i = 0; i < N; ++i )
+ s->H[ N-1 -i ] = H[i];
+ /* simpler detection of overruns */
+ s->H[ N ] = INVALID_FLOAT_VAL;
+ s->H[ N +1 ] = INVALID_FLOAT_VAL;
+ s->H[ N +2 ] = INVALID_FLOAT_VAL;
+ s->H[ N +3 ] = INVALID_FLOAT_VAL;
+ }
+ return s;
+}
+
+void convDestroyRev( void * setup )
+{
+ struct ConvSetup * s = (struct ConvSetup*)setup;
+ pffastconv_free(s->H);
+ pffastconv_free(setup);
+}
+
+
+pfnConvolution ConvGetFnPtrRev( void * setup )
+{
+ struct ConvSetup * s = (struct ConvSetup*)setup;
+ if (!s)
+ return NULL;
+ return s->pfn;
+}
+
+
+void convSimdDestroy( void * setup )
+{
+ convDestroyRev(setup);
+}
+
+
+void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
+{
+ void * p = pffastconv_new_setup( H, N, BlkLen, flags );
+ if (!p)
+ printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
+ return p;
+}
+
+
+void fastConvDestroy( void * setup )
+{
+ pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
+}
+
+
+
+int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+ struct ConvSetup * p = (struct ConvSetup*)setup;
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ int i, j;
+ (void)Yref;
+ (void)applyFlush;
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ float sumRe = 0.0F, sumIm = 0.0F;
+ for ( j = 0; j < Nr; j += 2 )
+ {
+ sumRe += X[i+j ] * Hrev[j];
+ sumIm += X[i+j+1] * Hrev[j+1];
+ }
+ Y[i ] = sumRe;
+ Y[i+1] = sumIm;
+ }
+ return i/2;
+ }
+ else
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ float sum = 0.0F;
+ for (j = 0; j < Nr; ++j )
+ sum += X[i+j] * Hrev[j];
+ Y[i] = sum;
+ }
+ return i;
+ }
+}
+
+
+
+int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+ float sum[4];
+ struct ConvSetup * p = (struct ConvSetup*)setup;
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ int i, j;
+ (void)Yref;
+ (void)applyFlush;
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ if ( (Nr & 3) == 0 )
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < Nr; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ Y[i ] = sum[0] + sum[2];
+ Y[i+1] = sum[1] + sum[3];
+ }
+ }
+ else
+ {
+ const int M = Nr & (~3);
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ float tailSumRe = 0.0F, tailSumIm = 0.0F;
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < M; j += 4 )
+ {
+ sum[0] += X[i+j ] * Hrev[j ];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ for ( ; j < Nr; j += 2 ) {
+ tailSumRe += X[i+j ] * Hrev[j ];
+ tailSumIm += X[i+j+1] * Hrev[j+1];
+ }
+ Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
+ Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
+ }
+ }
+ return i/2;
+ }
+ else
+ {
+ if ( (Nr & 3) == 0 )
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < Nr; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+ }
+ return i;
+ }
+ else
+ {
+ const int M = Nr & (~3);
+ /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ float tailSum = 0.0;
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < M; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ for ( ; j < Nr; ++j )
+ tailSum += X[i+j] * Hrev[j];
+ Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+ }
+ return i;
+ }
+ }
+}
+
+
+int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+ float sum[4];
+ struct ConvSetup * p = (struct ConvSetup*)setup;
+ (void)Yref;
+ (void)applyFlush;
+ if (p->flags & PFFASTCONV_SYMMETRIC)
+ {
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ const int h = Nr / 2 -4;
+ const int E = Nr -4;
+ int i, j;
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ const int k = i + E;
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j <= h; j += 4 )
+ {
+ sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
+ sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
+ sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
+ sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
+ }
+ Y[i ] = sum[0] + sum[2];
+ Y[i+1] = sum[1] + sum[3];
+ }
+ return i/2;
+ }
+ else
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ const int k = i + E;
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j <= h; j += 4 )
+ {
+ sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
+ sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
+ sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
+ sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
+ }
+ Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+ }
+ return i;
+ }
+ }
+ else
+ {
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ int i, j;
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < Nr; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ Y[i ] = sum[0] + sum[2];
+ Y[i+1] = sum[1] + sum[3];
+ }
+ return i/2;
+ }
+ else
+ {
+ if ( (Nr & 3) == 0 )
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < Nr; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
+ }
+ return i;
+ }
+ else
+ {
+ const int M = Nr & (~3);
+ /* printf("B: Nr = %d\n", Nr ); */
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ float tailSum = 0.0;
+ sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+ for (j = 0; j < M; j += 4 )
+ {
+ sum[0] += X[i+j] * Hrev[j];
+ sum[1] += X[i+j+1] * Hrev[j+1];
+ sum[2] += X[i+j+2] * Hrev[j+2];
+ sum[3] += X[i+j+3] * Hrev[j+3];
+ }
+ for ( ; j < Nr; ++j )
+ tailSum += X[i+j] * Hrev[j];
+ Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+ }
+ return i;
+ }
+ }
+ }
+
+}
+
+
+int slow_conv_SIMD(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+ v4sf_union sum;
+ struct ConvSetup * p = (struct ConvSetup*)setup;
+ (void)Yref;
+ (void)applyFlush;
+
+#if defined(PFFFT_SIMD_DISABLE) || !(defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+ return 0;
+#endif
+#if SIMD_SZ != 4
+ #error "Error: SIMD_SZ != 4 not implemented!"
+ return 0;
+#endif
+
+ if (p->flags & PFFASTCONV_SYMMETRIC)
+ {
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ const int h = Nr / 2 -SIMD_SZ;
+ const int E = Nr -SIMD_SZ;
+ int i, j;
+
+ assert(VALIGNED(Hrev));
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ const int k = i + E;
+ sum.v = VZERO();
+ for ( j = 0; j <= h; j += SIMD_SZ )
+ {
+ v4sf t = VLOAD_UNALIGNED(X +(k-j));
+ sum.v = VMADD( VADD( VLOAD_UNALIGNED(X+(i+j)), VREV_C(t) ) , VLOAD_ALIGNED(Hrev+j), sum.v );
+ }
+ Y[i ] = sum.f[0] + sum.f[2];
+ Y[i+1] = sum.f[1] + sum.f[3];
+ }
+ return i/2;
+ }
+ else
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ const int k = i + E;
+ sum.v = VZERO();
+ for ( j = 0; j <= h; j += SIMD_SZ )
+ {
+ v4sf t = VLOAD_UNALIGNED(X +(k-j));
+ sum.v = VMADD( VADD( VLOAD_UNALIGNED(X+(i+j)), VREV_S(t) ) , VLOAD_ALIGNED(Hrev+j), sum.v );
+ }
+ Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]);
+ }
+ return i;
+ }
+ }
+ else
+ {
+ const float * RESTRICT X = input;
+ const float * RESTRICT Hrev = p->H;
+ float * RESTRICT Y = output;
+ const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+ const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+ int i, j;
+
+ assert(VALIGNED(Hrev));
+
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ {
+ for ( i = 0; i <= lenNr; i += 2 )
+ {
+ sum.v = VZERO();
+ for ( j = 0; j < Nr; j += SIMD_SZ )
+ sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+ Y[i ] = sum.f[0] + sum.f[2];
+ Y[i+1] = sum.f[1] + sum.f[3];
+ }
+ return i/2;
+ }
+ else
+ {
+ if ( (Nr & 3) == 0 )
+ {
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ sum.v = VZERO();
+ for ( j = 0; j < Nr; j += SIMD_SZ )
+ sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+ Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]);
+ }
+ return i;
+ }
+ else
+ {
+ const int M = Nr & (~3);
+ for ( i = 0; i <= lenNr; ++i )
+ {
+ float tailSum = 0.0;
+ sum.v = VZERO();
+ for (j = 0; j < M; j += SIMD_SZ )
+ sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+ for ( ; j < Nr; ++j )
+ tailSum += X[i+j] * Hrev[j];
+ Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]) + tailSum;
+ }
+ return i;
+ }
+ }
+ }
+
+}
+
+
+void * convSimdSetup( float * H, int N, int * BlkLen, int flags )
+{
+ struct ConvSetup * p = (struct ConvSetup*) convSetupRev( H, N, BlkLen, flags );
+ int NomMul = 1, DenMul = 1;
+
+ if (!p)
+ return p;
+
+#if defined(PFFFT_SIMD_DISABLE) || (SIMD_SZ != 4) || !(defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+ p->pfn = slow_conv_B;
+ return p;
+#endif
+
+ if (p->flags & PFFASTCONV_SYMMETRIC)
+ {
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ NomMul = DenMul = 1;
+ else
+ DenMul = 2;
+ }
+ else
+ {
+ if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+ NomMul = 2;
+ else
+ {
+ p->pfn = slow_conv_SIMD;
+ return p;
+ }
+ }
+
+ p->pfn = ( ( (NomMul * p->N) % (DenMul * SIMD_SZ) ) != 0 ) ? slow_conv_B : slow_conv_SIMD;
+ return p;
+}
+
+
+
+int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
+{
+ (void)Yref;
+ return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
+}
+
+
+
+void printFirst( const float * V, const char * st, const int N, const int perLine )
+{
+ (void)V; (void)st; (void)N; (void)perLine;
+ return;
+#if 0
+ int i;
+ for ( i = 0; i < N; ++i )
+ {
+ if ( (i % perLine) == 0 )
+ printf("\n%s[%d]", st, i);
+ printf("\t%.1f", V[i]);
+ }
+ printf("\n");
+#endif
+}
+
+
+
+#define NUMY 11
+
+
+int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
+ double t0, t1, tstop, td, tdref;
+ float *X, *H;
+ float *Y[NUMY];
+ int64_t outN[NUMY];
+ /* 256 KFloats or 16 MFloats data */
+#if 1
+ const int len = testOutLen ? (1 << 18) : (1 << 24);
+#elif 0
+ const int len = testOutLen ? (1 << 18) : (1 << 13);
+#else
+ const int len = testOutLen ? (1 << 18) : (1024);
+#endif
+ const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
+ const int lenC = len / cplxFactor;
+
+ int yi, yc, posMaxErr;
+ float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
+ int i, j, numErrOverLimit, iter;
+ int retErr = 0;
+
+ const int iSlowSIMDalgo = 3;
+ /* 0 1 2 3 4 5 6 7 8 9 10 */
+ pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, convSimdSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup };
+ pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, convSimdDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy };
+ pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, ConvGetFnPtrRev, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
+ pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, NULL, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv };
+ const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "slow_simd", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K" };
+ int aFastAlgo[NUMY] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 };
+ void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+ int aBlkLen[NUMY] = { 1024, 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096 };
+#if 1
+ int aRunAlgo[NUMY] = { 1, 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096 };
+#else
+ int aRunAlgo[NUMY] = { 1, 0, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 };
+#endif
+ double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
+
+ X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
+ for ( i=0; i < NUMY; ++i)
+ {
+ if ( i < 2 )
+ Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
+ else
+ Y[i] = Y[1];
+
+ aSpeedFactor[i] = -1.0;
+ aDuration[i] = -1.0;
+ procSmpPerSec[i] = -1.0;
+ }
+
+ H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
+
+ /* initialize input */
+ if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
+ {
+ for ( i = 0; i < lenC; ++i )
+ {
+ X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+ X[2*i+1] = (float)((i+2048) % 4093);
+ }
+ }
+ else
+ {
+ for ( i = 0; i < len; ++i )
+ X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+ }
+ X[ len ] = INVALID_FLOAT_VAL;
+ X[ len +1 ] = INVALID_FLOAT_VAL;
+ X[ len +2 ] = INVALID_FLOAT_VAL;
+ X[ len +3 ] = INVALID_FLOAT_VAL;
+
+ if (!testOutLen)
+ printFirst( X, "X", 64, 8 );
+
+ /* filter coeffs */
+ memset( H, 0, FILTERLEN * sizeof(float) );
+#if 1
+ if ( convFlags & PFFASTCONV_SYMMETRIC )
+ {
+ const int half = FILTERLEN / 2;
+ for ( j = 0; j < half; ++j ) {
+ switch (j % 3) {
+ case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
+ case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
+ case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
+ }
+ }
+ }
+ else
+ {
+ for ( j = 0; j < FILTERLEN; ++j ) {
+ switch (j % 3) {
+ case 0: H[j] = -1.0F; break;
+ case 1: H[j] = 1.0F; break;
+ case 2: H[j] = 0.5F; break;
+ }
+ }
+ }
+#else
+ H[0] = 1.0F;
+ H[FILTERLEN -1] = 1.0F;
+#endif
+ if (!testOutLen)
+ printFirst( H, "H", FILTERLEN, 8 );
+
+ printf("\n");
+ printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
+ ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
+ (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
+ ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
+
+ while (1)
+ {
+
+ for ( yi = 0; yi < NUMY; ++yi )
+ {
+ if (!aRunAlgo[yi])
+ continue;
+
+ aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
+
+ /* get effective apply function ptr */
+ if ( aSetupCfg[yi] && aGetFnPtr[yi] )
+ aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
+
+ if ( aSetupCfg[yi] && aConv[yi] ) {
+ if (testOutLen)
+ {
+ t0 = uclock_sec();
+ outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
+ t1 = uclock_sec();
+ td = t1 - t0;
+ }
+ else
+ {
+ const int blkLen = 4096; /* required for 'fast_conv_4K' */
+ int64_t offC = 0, offS, Nout;
+ int k;
+ iter = 0;
+ outN[yi] = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.25; /* benchmark duration: 250 ms */
+ do {
+ for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
+ {
+ offS = cplxFactor * offC;
+ Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
+ offC += Nout;
+ ++iter;
+ if ( !Nout )
+ break;
+ if ( offC +blkLen >= lenC )
+ {
+ outN[yi] += offC;
+ offC = 0;
+ }
+ }
+ t1 = uclock_sec();
+ } while ( t1 < tstop );
+ outN[yi] += offC;
+ td = t1 - t0;
+ procSmpPerSec[yi] = cplxFactor * (double)offC / td;
+ }
+ }
+ else
+ {
+ t0 = t1 = td = 0.0;
+ outN[yi] = 0;
+ }
+ aDuration[yi] = td;
+ if ( yi == 0 ) {
+ const float * Yvals = Y[0];
+ const int64_t refOutLen = cplxFactor * outN[0];
+ tdref = td;
+ if (printDbg) {
+ printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
+ printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+ }
+ aSpeedFactor[yi] = 1.0;
+ /* */
+ yRangeMin = FLT_MAX;
+ yRangeMax = FLT_MIN;
+ for ( i = 0; i < refOutLen; ++i )
+ {
+ if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
+ if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
+ }
+ yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
+ /* yErrLimit = 0.01F; */
+ if (testOutLen) {
+ if (1) {
+ printf("reference output len = %" PRId64 " smp\n", outN[0]);
+ printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
+ }
+ printFirst( Yvals, "Yref", 64, 8 );
+ }
+ }
+ else
+ {
+ aSpeedFactor[yi] = tdref / td;
+ if (printDbg) {
+ printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
+ printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+ }
+ }
+ }
+
+ int iMaxSpeedSlowAlgo = -1;
+ int iFirstFastAlgo = -1;
+ int iMaxSpeedFastAlgo = -1;
+ int iPrintedRefOutLen = 0;
+ {
+ for ( yc = 1; yc < NUMY; ++yc )
+ {
+ if (!aRunAlgo[yc])
+ continue;
+ if (aFastAlgo[yc]) {
+ if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
+ iMaxSpeedFastAlgo = yc;
+
+ if (iFirstFastAlgo < 0)
+ iFirstFastAlgo = yc;
+ }
+ else
+ {
+ if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
+ iMaxSpeedSlowAlgo = yc;
+ }
+ }
+
+ if (printSpeed)
+ {
+ if (testOutLen)
+ {
+ if (iMaxSpeedSlowAlgo >= 0 )
+ printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
+ if (0 != iMaxSpeedSlowAlgo && 0 != iSlowSIMDalgo && aRunAlgo[0])
+ printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
+ if (1 != iMaxSpeedSlowAlgo && 1 != iSlowSIMDalgo && aRunAlgo[1])
+ printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
+ if (iSlowSIMDalgo != iMaxSpeedSlowAlgo && aRunAlgo[iSlowSIMDalgo])
+ printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[iSlowSIMDalgo], aSpeedFactor[iSlowSIMDalgo], 1000.0 * aDuration[iSlowSIMDalgo]);
+
+ if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
+ printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
+ if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
+ printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
+
+ if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
+ {
+ printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
+ if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
+ printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
+ }
+ printf("\n");
+ }
+ else
+ {
+ for ( yc = 0; yc < NUMY; ++yc )
+ {
+ if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
+ continue;
+ printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g SmpPerSec\n",
+ convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] );
+ }
+ }
+
+ }
+ }
+
+
+ for ( yc = 1; yc < NUMY; ++yc )
+ {
+ const float * Yref;
+ const float * Ycurr;
+ int outMin;
+
+ if (!aRunAlgo[yc])
+ continue;
+
+ if (printDbg)
+ printf("\n");
+
+ if ( outN[yc] == 0 )
+ {
+ printf("output size 0: '%s' not implemented\n", convText[yc]);
+ }
+ else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
+ {
+ if (!iPrintedRefOutLen)
+ {
+ printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
+ iPrintedRefOutLen = 1;
+ }
+ printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
+ FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
+ retErr = 1;
+ }
+
+ posMaxErr = 0;
+ maxErr = -1.0;
+ Yref = Y[0];
+ Ycurr = Y[yc];
+ outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
+ numErrOverLimit = 0;
+ for ( i = 0; i < outMin; ++i )
+ {
+ if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
+ {
+ printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
+ convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
+ ++numErrOverLimit;
+ }
+
+ if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
+ {
+ maxErr = fabsf(Ycurr[i] - Yref[i]);
+ posMaxErr = i;
+ }
+ }
+
+ if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
+ printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
+ }
+
+ break;
+ }
+
+ pffastconv_free(X);
+ for ( i=0; i < NUMY; ++i)
+ {
+ if ( i < 2 )
+ pffastconv_free( Y[i] );
+ if (!aRunAlgo[i])
+ continue;
+ aDestroy[i]( aSetupCfg[i] );
+ }
+
+ pffastconv_free(H);
+
+ return retErr;
+}
+
+
+int main(int argc, char **argv)
+{
+ int result = 0;
+ int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
+ int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
+ int testReal = 1, testCplx = 1, testSymetric = 0;
+
+ for ( i = 1; i < argc; ++i ) {
+ if (!strcmp(argv[i], "--no-len")) {
+ testOutLens = 0;
+ }
+ else if (!strcmp(argv[i], "--no-bench")) {
+ benchConv = 0;
+ }
+ else if (!strcmp(argv[i], "--quick")) {
+ quickTest = 1;
+ }
+ else if (!strcmp(argv[i], "--slow")) {
+ slowTest = 1;
+ }
+ else if (!strcmp(argv[i], "--real")) {
+ testCplx = 0;
+ }
+ else if (!strcmp(argv[i], "--cplx")) {
+ testReal = 0;
+ }
+ else if (!strcmp(argv[i], "--sym")) {
+ testSymetric = 1;
+ }
+ else /* if (!strcmp(argv[i], "--help")) */ {
+ printf("usage: %s [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
+ exit(1);
+ }
+ }
+
+
+ if (testOutLens)
+ {
+ for ( k = 0; k < 3; ++k )
+ {
+ if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+ continue;
+ printf("\n\n==========\n");
+ printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+ printf("==========\n");
+ flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+ flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+ flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
+ testOutLen = 1;
+ printDbg = 0;
+ printSpeed = 0;
+ for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
+ {
+ if ( (M % 16) != 0 && testSymetric )
+ continue;
+ result |= test(M, flagsB, testOutLen, printDbg, printSpeed);
+ }
+ }
+ }
+
+ if (benchConv)
+ {
+ for ( k = 0; k < 3; ++k )
+ {
+ if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+ continue;
+ printf("\n\n==========\n");
+ printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+ printf("==========\n");
+ flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+ flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+ flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
+ testOutLen = 0;
+ printDbg = 0;
+ printSpeed = 1;
+ if (!slowTest) {
+ result |= test( 32, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test( 64, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test(128, flagsC, testOutLen, printDbg, printSpeed);
+ }
+ if (!quickTest) {
+ result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test(256, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test(512, flagsC, testOutLen, printDbg, printSpeed);
+ result |= test(1024, flagsC, testOutLen, printDbg, printSpeed);
+ }
+ }
+ }
+
+ return result;
+}
+
diff --git a/test_pffft.c b/test_pffft.c
new file mode 100644
index 0000000..294a43f
--- /dev/null
+++ b/test_pffft.c
@@ -0,0 +1,246 @@
+/*
+ Copyright (c) 2013 Julien Pommier.
+
+ Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
+
+ How to build:
+
+ on linux, with fftw3:
+ gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+ on macos, without fftw3:
+ clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+ on macos, with fftw3:
+ clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+ as alternative: replace clang by gcc.
+
+ on windows, with visual c++:
+ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+ build without SIMD instructions:
+ gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#include "pffft.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+
+/* EXPECTED_DYN_RANGE in dB:
+ * single precision float has 24 bits mantissa
+ * => 24 Bits * 6 dB = 144 dB
+ * allow a few dB tolerance (even 144 dB looks good on my PC)
+ */
+#define EXPECTED_DYN_RANGE 140.0
+
+/* maximum allowed phase error in degree */
+#define DEG_ERR_LIMIT 1E-4
+
+/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
+#define MAG_ERR_LIMIT 1E-6
+
+
+#define PRINT_SPEC 0
+
+#define PWR2LOG(PWR) ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
+
+
+int isPowerOfTwo(unsigned v) {
+ /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */
+ int f = v && !(v & (v - 1));
+ return f;
+}
+
+
+int test(int N, int cplx, int useOrdered) {
+ int Nfloat = (cplx ? N*2 : N);
+ float *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+ float *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+ float *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+ float *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+ int k, j, m, iter, kmaxOther, retError = 0;
+ double freq, dPhi, phi, phi0;
+ double pwr, pwrCar, pwrOther, err, errSum, mag, expextedMag;
+ float amp = 1.0F;
+
+ assert( isPowerOfTwo(N) );
+
+ PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+ assert(s);
+ if (!s) {
+ printf("Error setting up PFFFT!\n");
+ return 1;
+ }
+
+ for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
+ {
+ amp = ( (m % 3) == 0 ) ? 1.0F : 1.1F;
+ freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
+ dPhi = 2.0 * M_PI * freq;
+ if ( dPhi < 0.0 )
+ dPhi += 2.0 * M_PI;
+
+ iter = -1;
+ while (1)
+ {
+ ++iter;
+
+ if (iter)
+ printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
+
+ /* generate cosine carrier as time signal - start at defined phase phi0 */
+ phi = phi0 = (m % 4) * 0.125 * M_PI; /* have phi0 < 90 deg to be normalized */
+ for ( j = 0; j < N; ++j )
+ {
+ if (cplx) {
+ X[2*j] = amp * (float)cos(phi); /* real part */
+ X[2*j+1] = amp * (float)sin(phi); /* imag part */
+ }
+ else
+ X[j] = amp * (float)cos(phi); /* only real part */
+
+ /* phase increment .. stay normalized - cos()/sin() might degrade! */
+ phi += dPhi;
+ if ( phi >= M_PI )
+ phi -= 2.0 * M_PI;
+ }
+
+ /* forward transform from X --> Y .. using work buffer W */
+ if ( useOrdered )
+ pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
+ else
+ {
+ pffft_transform(s, X, Z, W, PFFFT_FORWARD ); /* temporarily use Z for reordering */
+ pffft_zreorder(s, Z, Y, PFFFT_FORWARD );
+ }
+
+ pwrOther = -1.0;
+ pwrCar = 0;
+
+
+ /* for positive frequencies: 0 to 0.5 * samplerate */
+ /* and also for negative frequencies: -0.5 * samplerate to 0 */
+ for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
+ {
+ if (!cplx && !j) /* special treatment for DC for real input */
+ pwr = Y[j]*Y[j];
+ else if (!cplx && j == N/2) /* treat 0.5 * samplerate */
+ pwr = Y[1] * Y[1]; /* despite j (for freq calculation) we have index 1 */
+ else
+ pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
+ if (iter || PRINT_SPEC)
+ printf("%s fft %d: pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
+ if (k == j)
+ pwrCar = pwr;
+ else if ( pwr > pwrOther ) {
+ pwrOther = pwr;
+ kmaxOther = j;
+ }
+ }
+
+ if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
+ printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
+ printf(" carrier power at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
+ printf(" carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
+ printf(" max other pwr at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
+ printf(" dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
+ retError = 1;
+ if ( iter == 0 )
+ continue;
+ }
+
+ if ( k > 0 && k != N/2 )
+ {
+ phi = atan2( Y[2*k+1], Y[2*k] );
+ if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
+ {
+ retError = 1;
+ printf("%s fft %d bin %d amp %f : phase mismatch! phase = %f deg expected = %f deg\n",
+ (cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
+ }
+ }
+
+ expextedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
+ mag = sqrt(pwrCar) / N;
+ if ( fabs(mag - expextedMag) > MAG_ERR_LIMIT )
+ {
+ retError = 1;
+ printf("%s fft %d bin %d amp %f : mag = %g expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expextedMag );
+ }
+
+
+ /* now convert spectrum back */
+ pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
+
+ errSum = 0.0;
+ for ( j = 0; j < (cplx ? (2*N) : N); ++j )
+ {
+ /* scale back */
+ Z[j] /= N;
+ /* square sum errors over real (and imag parts) */
+ err = (X[j]-Z[j]) * (X[j]-Z[j]);
+ errSum += err;
+ }
+
+ if ( errSum > N * 1E-7 )
+ {
+ retError = 1;
+ printf("%s fft %d bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
+ }
+
+ break;
+ }
+
+ }
+ pffft_destroy_setup(s);
+ pffft_aligned_free(X);
+ pffft_aligned_free(Y);
+ pffft_aligned_free(Z);
+ pffft_aligned_free(W);
+
+ return retError;
+}
+
+
+
+int main(int argc, char **argv)
+{
+ int N, result, resN, resAll;
+
+ resAll = 0;
+ for ( N = 32; N <= 65536; N *= 2 )
+ {
+ result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
+ resN = result;
+ resAll |= result;
+
+ result = test(N, 0 /* cplx fft */, 1 /* useOrdered */);
+ resN |= result;
+ resAll |= result;
+
+ result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
+ resN |= result;
+ resAll |= result;
+
+ result = test(N, 0 /* cplx fft */, 0 /* useOrdered */);
+ resN |= result;
+ resAll |= result;
+
+ if (!resN)
+ printf("tests for size %d succeeded successfully.\n", N);
+ }
+
+ if (!resAll)
+ printf("all tests succeeded successfully.\n");
+
+ return resAll;
+}
+