add PFFASTCONV library and test + bench

* API for fast convolution: pffastconv.h
* move simd macros from pffft.c into pfsimd_macros.h for reusability

Signed-off-by: hayati ayguen <h_ayguen@web.de>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70dcb20..fc2dc76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,9 +4,16 @@
 
 option(USE_SIMD  "use SIMD (SSE/NEON) CPU features?" ON)
 option(USE_NEON  "force using NEON on ARM? (requires SIMD)" OFF)
-option(USE_FFTW  "use (system-installed) FFTW3 in benchmark?" OFF)
-option(USE_GREEN "use Green FFT in benchmark? - if exists in subdir" ON)
-option(USE_KISS  "use KissFFT in benchmark? - if exists in subdir" ON)
+option(USE_FFTW  "use (system-installed) FFTW3 in fft benchmark?" OFF)
+option(USE_GREEN "use Green FFT in fft benchmark? - if exists in subdir" ON)
+option(USE_KISS  "use KissFFT in fft benchmark? - if exists in subdir" ON)
+option(USE_ASAN  "use GCC's address sanitizer?" OFF)
+
+if (USE_ASAN)
+  set(ASANLIB "asan")
+else()
+  set(ASANLIB "")
+endif()
 
 
 if (USE_GREEN)
@@ -39,34 +46,84 @@
    message(STATUS "Build type not specified: defaulting to release.")
 endif(NOT CMAKE_BUILD_TYPE)
 
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
+  # using Visual Studio C++
+  message(STATUS "INFO: detected MSVC: will not link math lib m")
+  set(MATHLIB "")
+else()
+  message(STATUS "INFO: detected NO MSVC: ${CMAKE_C_COMPILER_ID}: will link math lib m")
+  set(MATHLIB "m")
+endif()
+
+
 add_library(PFFFT STATIC pffft.c)
+target_compile_definitions(PFFFT PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+  target_compile_options(PFFFT PRIVATE "-fsanitize=address")
+endif()
 if (NOT USE_SIMD)
   target_compile_definitions(PFFFT PRIVATE PFFFT_SIMD_DISABLE=1)
 endif()
 if (USE_SIMD AND USE_NEON)
   target_compile_options(PFFFT PRIVATE "-mfpu=neon")
 endif()
-
-
-target_link_libraries(PFFFT m)
+target_link_libraries( PFFFT ${MATHLIB} )
 set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
 )
 
+
 add_library(FFTPACK STATIC fftpack.c)
-target_link_libraries(FFTPACK m)
+target_compile_definitions(FFTPACK PRIVATE _USE_MATH_DEFINES)
+target_link_libraries( FFTPACK ${MATHLIB} )
 set_property(TARGET FFTPACK APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
 )
 
 
-add_executable(bench_pffft   bench_pffft.c )
+add_library(PFFASTCONV STATIC pffastconv.c)
+target_compile_definitions(PFFASTCONV PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+  target_compile_options(PFFASTCONV PRIVATE "-fsanitize=address")
+endif()
+if (NOT USE_SIMD)
+  target_compile_definitions(PFFASTCONV PRIVATE PFFFT_SIMD_DISABLE=1)
+endif()
+if (USE_SIMD AND USE_NEON)
+  target_compile_options(PFFASTCONV PRIVATE "-mfpu=neon")
+endif()
+target_link_libraries( PFFASTCONV PFFFT ${ASANLIB} ${MATHLIB} )
+set_property(TARGET PFFASTCONV APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+)
 
+
+add_executable( test_pffft  test_pffft.c )
+target_compile_definitions(test_pffft PRIVATE _USE_MATH_DEFINES)
+target_link_libraries( test_pffft  PFFFT ${ASANLIB} )
+
+
+add_executable(test_pffastconv   test_pffastconv.c )
+target_compile_definitions(test_pffastconv PRIVATE _USE_MATH_DEFINES)
+if (USE_ASAN)
+  target_compile_options(test_pffastconv PRIVATE "-fsanitize=address")
+endif()
+if (NOT USE_SIMD)
+  target_compile_definitions(test_pffastconv PRIVATE PFFFT_SIMD_DISABLE=1)
+endif()
+if (USE_SIMD AND USE_NEON)
+  target_compile_options(test_pffastconv PRIVATE "-mfpu=neon")
+endif()
+target_link_libraries( test_pffastconv  PFFASTCONV ${ASANLIB} ${MATHLIB} )
+
+
+add_executable(bench_pffft   bench_pffft.c )
+target_compile_definitions(bench_pffft PRIVATE _USE_MATH_DEFINES)
 if (NOT USE_SIMD)
   target_compile_definitions(bench_pffft PRIVATE PFFFT_SIMD_DISABLE=1)
 endif()
 
-target_link_libraries(bench_pffft  PFFFT FFTPACK)
+target_link_libraries( bench_pffft  PFFFT FFTPACK ${ASANLIB} )
 
 if (USE_FFTW)
   target_compile_definitions(bench_pffft PRIVATE HAVE_FFTW=1)
@@ -101,4 +158,23 @@
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
 
+add_test(NAME test_pfconv_lens_symetric
+  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick" "--sym"
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME test_pfconv_lens_non_sym
+  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-bench" "--quick"
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME bench_pfconv_symetric
+  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len" "--sym"
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_test(NAME bench_pfconv_non_sym
+  COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_pffastconv" "--no-len"
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
 
diff --git a/pffastconv.c b/pffastconv.c
new file mode 100644
index 0000000..0aaed62
--- /dev/null
+++ b/pffastconv.c
@@ -0,0 +1,291 @@
+/*
+  Copyright (c) 2019 Hayati Ayguen.
+
+ */
+
+#include "pffastconv.h"
+#include "pffft.h"
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+
+#define FASTCONV_DBG_OUT  0
+
+
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define RESTRICT __restrict
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define RESTRICT __restrict
+#endif
+
+
+#ifdef COMPILER_MSVC
+#pragma warning( disable : 4244 4305 4204 4456 )
+#endif
+
+/* 
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "pfsimd_macros.h"
+
+
+static
+unsigned nextPowerOfTwo(unsigned v) {
+  /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+  /* compute the next highest power of 2 of 32-bit v */
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+void *pffastconv_malloc(size_t nb_bytes)
+{
+  return Valigned_malloc(nb_bytes);
+}
+
+void pffastconv_free(void *p)
+{
+  Valigned_free(p);
+}
+
+int pffastconv_simd_size()
+{
+  return SIMD_SZ;
+}
+
+
+
+struct PFFASTCONV_Setup
+{
+  float * Xt;      /* input == x in time domain - copy for alignment */
+  float * Xf;      /* input == X in freq domain */
+  float * Hf;      /* filterCoeffs == H in freq domain */
+  float * Mf;      /* input * filterCoeffs in freq domain */
+  PFFFT_Setup *st;
+  int filterLen;   /* convolution length */
+  int Nfft;        /* FFT/block length */
+  int flags;
+  float scale;
+};
+
+
+PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
+{
+  PFFASTCONV_Setup * s = NULL;
+  const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+  int i, Nfft = 2 * nextPowerOfTwo(filterLen -1);
+#if FASTCONV_DBG_OUT
+  const int iOldBlkLen = *blockLen;
+#endif
+
+  if ( flags & PFFASTCONV_CPLX_FILTER )
+    return NULL;
+
+  s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
+
+  if ( *blockLen > Nfft ) {
+    Nfft = *blockLen;
+    Nfft = nextPowerOfTwo(Nfft);
+  }
+  *blockLen = Nfft;  /* this is in (complex) samples */
+
+  Nfft *= cplxFactor;
+
+  if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
+    s->Xt = NULL;
+  else
+    s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
+  s->st = pffft_new_setup(Nfft, PFFFT_REAL);  /* with complex: we do 2 x fft() */
+  s->filterLen = filterLen;        /* filterLen == convolution length == length of impulse response */
+  if ( cplxFactor == 2 )
+    s->filterLen = 2 * filterLen - 1;
+  s->Nfft = Nfft;  /* FFT/block length */
+  s->flags = flags;
+  s->scale = (float)( 1.0 / Nfft );
+
+  memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
+  for ( i = 0; i < filterLen; ++i )
+    s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
+
+  pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+
+#if FASTCONV_DBG_OUT
+  printf("\n  fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
+    , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
+#endif
+
+  return s;
+}
+
+
+void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
+{
+  if (!s)
+    return;
+  pffft_destroy_setup(s->st);
+  pffastconv_free(s->Mf);
+  pffastconv_free(s->Hf);
+  pffastconv_free(s->Xf);
+  if ( s->Xt )
+    pffastconv_free(s->Xt);
+  pffastconv_free(s);
+}
+
+
+int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
+{
+  const float * RESTRICT X = input_;
+  float * RESTRICT Y = output_;
+  const int Nfft = s->Nfft;
+  const int filterLen = s->filterLen;
+  const int flags = s->flags;
+  const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
+  const int inputLen = cplxFactor * cplxInputLen;
+  int inpOff, procLen, numOut = 0, j, part, cplxOff;
+
+  /* applyFlush != 0:
+   *     inputLen - inpOff -filterLen + 1 > 0
+   * <=> inputLen -filterLen + 1 > inpOff
+   * <=> inpOff < inputLen -filterLen + 1
+   * 
+   * applyFlush == 0:
+   *     inputLen - inpOff >= Nfft
+   * <=> inputLen - Nfft >= inpOff
+   * <=> inpOff <= inputLen - Nfft
+   * <=> inpOff < inputLen - Nfft + 1
+   */
+
+  if ( cplxFactor == 2 )
+  {
+    const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+#if 0
+    printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
+#endif
+    for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+    {
+      procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+      numOut = ( procLen - filterLen + 1 ) & ( ~1 );
+      if (!numOut)
+        break;
+#if 0
+      if (!inpOff)
+        printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
+      if (inpOff + filterLen + 2 >= maxOff )
+        printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
+#endif
+
+      if ( flags & PFFASTCONV_DIRECT_INP )
+      {
+        pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+      }
+      else
+      {
+        memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+        if ( procLen < Nfft )
+          memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+    
+        pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+      }
+
+      memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) );  /* why does pffft_zconvolve_accumulate() accumulate?? */
+      pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+      if ( flags & PFFASTCONV_DIRECT_OUT )
+      {
+        pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+      }
+      else
+      {
+        pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+        memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+      }
+    }
+    return inpOff / cplxFactor;
+  }
+  else
+  {
+    const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
+    const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
+
+    for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
+    {
+      procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
+      numOut = procLen - filterLen + 1;
+
+      for ( part = 0; part < numParts; ++part )  /* iterate per real/imag component */
+      {
+
+        if ( flags & PFFASTCONV_CPLX_INP_OUT )
+        {
+          cplxOff = 2 * inpOff + part;
+          for ( j = 0; j < procLen; ++j )
+            s->Xt[j] = X[cplxOff + 2 * j];
+          if ( procLen < Nfft )
+            memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+    
+          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+        else if ( flags & PFFASTCONV_DIRECT_INP )
+        {
+          pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+        else
+        {
+          memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
+          if ( procLen < Nfft )
+            memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
+    
+          pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
+        }
+    
+        memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) );  /* why does pffft_zconvolve_accumulate() accumulate?? */
+        pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
+
+        if ( flags & PFFASTCONV_CPLX_INP_OUT )
+        {
+          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+    
+          cplxOff = 2 * inpOff + part;
+          for ( j = 0; j < numOut; ++j )
+            Y[ cplxOff + 2 * j ] = s->Xf[j];
+        }
+        else if ( flags & PFFASTCONV_DIRECT_OUT )
+        {
+          pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
+        }
+        else
+        {
+          pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
+          memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
+        }
+
+      }
+    }
+
+    return inpOff;
+  }
+}
+
diff --git a/pffastconv.h b/pffastconv.h
new file mode 100644
index 0000000..6e59d42
--- /dev/null
+++ b/pffastconv.h
@@ -0,0 +1,164 @@
+/* Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of PFFFT, PFFASTCONV, nor the names of its
+   sponsors or contributors may be used to endorse or promote products
+   derived from this Software without specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+   
+/*
+   PFFASTCONV : a Pretty Fast Fast Convolution
+
+   This is basically the implementation of fast convolution,
+   utilizing the FFT (pffft).
+
+   Restrictions: 
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - all (float*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFASTCONV_H
+#define PFFASTCONV_H
+
+#include <stddef.h> // for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff
+     this struct can't be shared by many threads as it contains
+     temporary data, computed within the convolution
+  */
+  typedef struct PFFASTCONV_Setup PFFASTCONV_Setup;
+
+  typedef enum {
+    PFFASTCONV_CPLX_INP_OUT = 1,
+    /* set when input and output is complex,
+     * with real and imag part interleaved in both vectors.
+     * input[] has inputLen complex values: 2 * inputLen floats,
+     * output[] is also written with complex values.
+     * without this flag, the input is interpreted as real vector
+     */
+
+    PFFASTCONV_CPLX_FILTER = 2,
+    /* set when filterCoeffs is complex,
+     * with real and imag part interleaved.
+     * filterCoeffs[] has filterLen complex values: 2 * filterLen floats
+     * without this flag, the filter is interpreted as real vector
+     * ATTENTION: this is not implemented yet!
+     */
+
+    PFFASTCONV_DIRECT_INP = 4,
+    /* set PFFASTCONV_DIRECT_INP only, when following conditions are met:
+     * 1- input vector X must be aligned
+     * 2- (all) inputLen <= output blockLen
+     * 3- X must have minimum length of output BlockLen
+     * 4- the additional samples from inputLen .. BlockLen-1
+     *   must contain valid small and non-NAN samples (ideally zero)
+     * 
+     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+     */
+
+    PFFASTCONV_DIRECT_OUT = 8,
+    /* set PFFASTCONV_DIRECT_OUT only when following conditions are met:
+     * 1- output vector Y must be aligned
+     * 2- (all) inputLen <= output blockLen
+     * 3- Y must have minimum length of output blockLen
+     * 
+     * this option is ignored when PFFASTCONV_CPLX_INP_OUT is set
+     */
+
+    PFFASTCONV_CPLX_SINGLE_FFT = 16,
+    /* hint to process complex data with one single FFT;
+     * default is to use 2 FFTs: one for real part, one for imag part
+     * */
+
+
+    PFFASTCONV_SYMMETRIC = 32
+    /* purely informational: the filter is symmetric .. and filterLen is a multiple of 8 */
+    
+  } pffastconv_flags_t;
+
+  /*
+    prepare for performing fast convolution(s) of 'filterLen' with input 'blockLen'.
+    The output 'blockLen' might be bigger to allow the fast convolution.
+    
+    'flags' are bitmask over the 'pffastconv_flags_t' enum.
+
+    PFFASTCONV_Setup structure can't be shared across multiple filters
+    or concurrent threads.
+  */
+  PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags );
+
+  void pffastconv_destroy_setup(PFFASTCONV_Setup *);
+
+  /* 
+     Perform the fast convolution.
+
+     'input' and 'output' don't need to be aligned - unless any of
+     PFFASTCONV_DIRECT_INP or PFFASTCONV_DIRECT_OUT is set in 'flags'.
+
+     inputLen > output 'blockLen' (from pffastconv_new_setup()) is allowed.
+     in this case, multiple FFTs are called internally, to process the
+     input[].
+
+     'output' vector must have size >= (inputLen - filterLen + 1)
+
+     set bool option 'applyFlush' to process the full input[].
+     with this option, 'tail samples' of input are also processed.
+     This might be inefficient, because the FFT is called to produce
+     few(er) output samples, than possible.
+     This option is useful to process the last samples of an input (file)
+     or to reduce latency.
+
+     return value is the number of produced samples in output[].
+     the same amount of samples is processed from input[]. to continue
+     processing, the caller must save/move the remaining samples of
+     input[].
+
+  */
+  int pffastconv_apply(PFFASTCONV_Setup * s, const float *input, int inputLen, float *output, int applyFlush);
+
+  void *pffastconv_malloc(size_t nb_bytes);
+  void pffastconv_free(void *);
+
+  /* returns 4 or 1, depending on whether SSE/Altivec support was enabled when building pffft.c */
+  int pffastconv_simd_size();
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PFFASTCONV_H */
diff --git a/pffft.c b/pffft.c
index ce76542..1a5ca4f 100644
--- a/pffft.c
+++ b/pffft.c
@@ -66,10 +66,6 @@
 #  define COMPILER_GCC
 #endif
 
-#ifdef COMPILER_MSVC
-#  define _USE_MATH_DEFINES
-#endif
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -103,173 +99,21 @@
 // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
 //#define PFFFT_SIMD_DISABLE
 
-/*
-   Altivec support macros 
-*/
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
-typedef vector float v4sf;
-#  define SIMD_SZ 4
-#  define VZERO() ((vector float) vec_splat_u8(0))
-#  define VMUL(a,b) vec_madd(a,b, VZERO())
-#  define VADD(a,b) vec_add(a,b)
-#  define VMADD(a,b,c) vec_madd(a,b,c)
-#  define VSUB(a,b) vec_sub(a,b)
-inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
-#  define LD_PS1(p) ld_ps1(&p)
-#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
-    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
-    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
-    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
-  }
-#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
-    v4sf y0 = vec_mergeh(x0, x2);               \
-    v4sf y1 = vec_mergel(x0, x2);               \
-    v4sf y2 = vec_mergeh(x1, x3);               \
-    v4sf y3 = vec_mergel(x1, x3);               \
-    x0 = vec_mergeh(y0, y2);                    \
-    x1 = vec_mergel(y0, y2);                    \
-    x2 = vec_mergeh(y1, y3);                    \
-    x3 = vec_mergel(y1, y3);                    \
-  }
-#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
-
-/*
-  SSE1 support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
-
-#include <xmmintrin.h>
-typedef __m128 v4sf;
-#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
-#  define VZERO() _mm_setzero_ps()
-#  define VMUL(a,b) _mm_mul_ps(a,b)
-#  define VADD(a,b) _mm_add_ps(a,b)
-#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
-#  define VSUB(a,b) _mm_sub_ps(a,b)
-#  define LD_PS1(p) _mm_set1_ps(p)
-#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
-#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
-#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
-
-/*
-  ARM NEON support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
-#  include <arm_neon.h>
-typedef float32x4_t v4sf;
-#  define SIMD_SZ 4
-#  define VZERO() vdupq_n_f32(0)
-#  define VMUL(a,b) vmulq_f32(a,b)
-#  define VADD(a,b) vaddq_f32(a,b)
-#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
-#  define VSUB(a,b) vsubq_f32(a,b)
-#  define LD_PS1(p) vld1q_dup_f32(&(p))
-#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
-    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
-    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
-    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
-    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
-    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
-  }
-// marginally faster version
-//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
-#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
-#else
-#  if !defined(PFFFT_SIMD_DISABLE)
-#    warning "building with simd disabled !\n";
-#    define PFFFT_SIMD_DISABLE // fallback to scalar code
-#  endif
-#endif
-
-// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
-#ifdef PFFFT_SIMD_DISABLE
-typedef float v4sf;
-#  define SIMD_SZ 1
-#  define VZERO() 0.f
-#  define VMUL(a,b) ((a)*(b))
-#  define VADD(a,b) ((a)+(b))
-#  define VMADD(a,b,c) ((a)*(b)+(c))
-#  define VSUB(a,b) ((a)-(b))
-#  define LD_PS1(p) (p)
-#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
-#endif
-
-// shortcuts for complex multiplcations
-#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
-#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
-#ifndef SVMUL
-// multiply a scalar with a vector
-#define SVMUL(f,v) VMUL(LD_PS1(f),v)
-#endif
-
-#if !defined(PFFFT_SIMD_DISABLE)
-typedef union v4sf_union {
-  v4sf  v;
-  float f[4];
-} v4sf_union;
-
-#include <string.h>
-
-#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+#include "pfsimd_macros.h"
 
 /* detect bugs with the vector support macros */
 void validate_pffft_simd() {
-  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
-  v4sf_union a0, a1, a2, a3, t, u; 
-  memcpy(a0.f, f, 4*sizeof(float));
-  memcpy(a1.f, f+4, 4*sizeof(float));
-  memcpy(a2.f, f+8, 4*sizeof(float));
-  memcpy(a3.f, f+12, 4*sizeof(float));
-
-  t = a0; u = a1; t.v = VZERO();
-  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
-  t.v = VADD(a1.v, a2.v);
-  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
-  t.v = VMUL(a1.v, a2.v);
-  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
-  t.v = VMADD(a1.v, a2.v,a0.v);
-  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
-  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
-  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
-  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
-  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
-  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
-  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
-
-  t.v=LD_PS1(f[15]);
-  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
-  assertv4(t, 15, 15, 15, 15);
-  t.v = VSWAPHL(a1.v, a2.v);
-  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
-  assertv4(t, 8, 9, 6, 7);
-  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
-  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", 
-         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], 
-         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); 
-  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+  Vvalidate_simd();
 }
-#endif //!PFFFT_SIMD_DISABLE
 
 /* SSE and co like 16-bytes aligned pointers */
 #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
 void *pffft_aligned_malloc(size_t nb_bytes) {
-  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
-  if (!p0) return (void *) 0;
-  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
-  *((void **) p - 1) = p0;
-  return p;
+  return Valigned_malloc(nb_bytes);
 }
 
 void pffft_aligned_free(void *p) {
-  if (p) free(*((void **) p - 1));
+  Valigned_free(p);
 }
 
 int pffft_simd_size() { return SIMD_SZ; }
diff --git a/pfsimd_macros.h b/pfsimd_macros.h
new file mode 100644
index 0000000..cfb3bd1
--- /dev/null
+++ b/pfsimd_macros.h
@@ -0,0 +1,250 @@
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+#ifndef PF_SIMD_MACROS_H
+#define PF_SIMD_MACROS_H
+
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+
+/*
+ *  SIMD reference material:
+ * 
+ * general SIMD introduction:
+ * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
+ * 
+ * SSE 1:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ * 
+ * ARM NEON:
+ * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
+ * 
+ * Altivec:
+ * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
+ * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
+ * better one?
+ * 
+ */
+
+
+/*
+   Altivec support macros 
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+typedef vector float v4sf;
+#  define SIMD_SZ 4
+#  define VREQUIRES_ALIGN 1  /* not sure, if really required */
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO())
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
+#  define LD_PS1(p) ld_ps1(&p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+/*
+  SSE1 support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
+ *  anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+#  define SIMD_SZ 4
+#  define VREQUIRES_ALIGN 1
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define VLOAD_UNALIGNED(ptr)  _mm_loadu_ps(ptr)
+#  define VLOAD_ALIGNED(ptr)    _mm_load_ps(ptr)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+/* reverse/flip all floats */
+#  define VREV_S(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
+/* reverse/flip complex floats */
+#  define VREV_C(a)    _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
+
+/*
+  ARM NEON support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
+#  include <arm_neon.h>
+typedef float32x4_t v4sf;
+#  define SIMD_SZ 4
+#  define VREQUIRES_ALIGN 0  /* usually no alignment required */
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
+#  define VSUB(a,b) vsubq_f32(a,b)
+#  define LD_PS1(p) vld1q_dup_f32(&(p))
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
+    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
+    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
+    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
+    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
+    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+  }
+// marginally faster version
+//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+#else
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    warning "building with simd disabled !\n";
+#    define PFFFT_SIMD_DISABLE // fallback to scalar code
+#  endif
+#endif
+
+// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
+#ifdef PFFFT_SIMD_DISABLE
+typedef float v4sf;
+#  define SIMD_SZ 1
+#  define VREQUIRES_ALIGN 0
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VLOAD_UNALIGNED(ptr)  (*(ptr))
+#  define VLOAD_ALIGNED(ptr)    (*(ptr))
+#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
+#endif
+
+// shortcuts for complex multiplications
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+// multiply a scalar with a vector
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+typedef union v4sf_union {
+  v4sf  v;
+  float f[SIMD_SZ];
+} v4sf_union;
+
+#if !defined(PFFFT_SIMD_DISABLE)
+
+#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+
+/* detect bugs with the vector support macros */
+static void Vvalidate_simd() {
+  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
+  v4sf_union a0, a1, a2, a3, t, u; 
+  memcpy(a0.f, f, 4*sizeof(float));
+  memcpy(a1.f, f+4, 4*sizeof(float));
+  memcpy(a2.f, f+8, 4*sizeof(float));
+  memcpy(a3.f, f+12, 4*sizeof(float));
+
+  t = a0; u = a1; t.v = VZERO();
+  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
+  t.v = VADD(a1.v, a2.v);
+  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
+  t.v = VMUL(a1.v, a2.v);
+  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
+  t.v = VMADD(a1.v, a2.v,a0.v);
+  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
+
+  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
+  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
+
+  t.v=LD_PS1(f[15]);
+  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 15, 15, 15, 15);
+  t.v = VSWAPHL(a1.v, a2.v);
+  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 8, 9, 6, 7);
+  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
+  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", 
+         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], 
+         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); 
+  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+}
+#endif //!PFFFT_SIMD_DISABLE
+
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
+
+static
+void *Valigned_malloc(size_t nb_bytes) {
+  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+  if (!p0) return (void *) 0;
+  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  *((void **) p - 1) = p0;
+  return p;
+}
+
+static
+void Valigned_free(void *p) {
+  if (p) free(*((void **) p - 1));
+}
+
+
+#endif /* PF_SIMD_MACROS_H */
+
diff --git a/test_pffastconv.c b/test_pffastconv.c
new file mode 100644
index 0000000..58d1f8c
--- /dev/null
+++ b/test_pffastconv.c
@@ -0,0 +1,1093 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+
+  Small test & bench for PFFASTCONV (fast convolution based on PFFFT), comparing against slow scalar/SIMD reference convolutions
+
+  How to build: 
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+  as alternative: replace clang by gcc.
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+  
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+ 
+#define _WANT_SNAN  1
+
+#include "pffft.h"
+#include "pffastconv.h"
+
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+/* 
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+#include "pfsimd_macros.h"
+
+
+#if defined(_MSC_VER)
+#  define RESTRICT __restrict
+#elif defined(__GNUC__)
+#  define RESTRICT __restrict
+#else
+#  define RESTRICT
+#endif
+
+
+#if defined(_MSC_VER)
+#pragma warning( disable : 4244 )
+#endif
+
+
+#ifdef SNANF
+  #define INVALID_FLOAT_VAL  SNANF
+#elif defined(SNAN)
+  #define INVALID_FLOAT_VAL  SNAN
+#elif defined(NAN)
+  #define INVALID_FLOAT_VAL  NAN
+#elif defined(INFINITY)
+  #define INVALID_FLOAT_VAL  INFINITY
+#else
+  #define INVALID_FLOAT_VAL  FLT_MAX
+#endif
+
+
+#if defined(HAVE_SYS_TIMES)
+  inline double uclock_sec(void) {
+    static double ttclk = 0.;
+    struct tms t;
+    if (ttclk == 0.)
+      ttclk = sysconf(_SC_CLK_TCK);
+    times(&t);
+    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+    return ((double)t.tms_utime) / ttclk;
+  }
+# else
+  double uclock_sec(void)
+{ return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+
+
+unsigned nextPowerOfTwo(unsigned v) {
+  /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+  /* compute the next highest power of 2 of 32-bit v */
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+
+typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
+typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);
+typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
+typedef void           (*pfnConvDestroy)  (void * setup);
+
+
+struct ConvSetup
+{
+  pfnConvolution pfn;
+  int N;
+  int B;
+  float * H;
+  int flags;
+};
+
+
+void * convSetupRev( float * H, int N, int * BlkLen, int flags )
+{
+  struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
+  int i, Nr = N;
+  if (flags & PFFASTCONV_CPLX_INP_OUT)
+    Nr *= 2;
+  Nr += 4;
+  s->pfn = NULL;
+  s->N = N;
+  s->B = *BlkLen;
+  s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
+  s->flags = flags;
+  memset(s->H, 0, (unsigned)Nr * sizeof(float));
+  if (flags & PFFASTCONV_CPLX_INP_OUT)
+  {
+    for ( i = 0; i < N; ++i ) {
+      s->H[2*(N-1 -i)  ] = H[i];
+      s->H[2*(N-1 -i)+1] = H[i];
+    }
+    /* simpler detection of overruns */
+    s->H[ 2*N    ] = INVALID_FLOAT_VAL;
+    s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
+    s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
+    s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
+  }
+  else
+  {
+    for ( i = 0; i < N; ++i )
+      s->H[ N-1 -i ] = H[i];
+    /* simpler detection of overruns */
+    s->H[ N    ] = INVALID_FLOAT_VAL;
+    s->H[ N +1 ] = INVALID_FLOAT_VAL;
+    s->H[ N +2 ] = INVALID_FLOAT_VAL;
+    s->H[ N +3 ] = INVALID_FLOAT_VAL;
+  }
+  return s;
+}
+
+void convDestroyRev( void * setup )
+{
+  struct ConvSetup * s = (struct ConvSetup*)setup;
+  pffastconv_free(s->H);
+  pffastconv_free(setup);
+}
+
+
+pfnConvolution ConvGetFnPtrRev( void * setup )
+{
+  struct ConvSetup * s = (struct ConvSetup*)setup;
+  if (!s)
+    return NULL;
+  return s->pfn;
+}
+
+
+void convSimdDestroy( void * setup )
+{
+  convDestroyRev(setup);
+}
+
+
+void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
+{
+  void * p = pffastconv_new_setup( H, N, BlkLen, flags );
+  if (!p)
+    printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
+  return p;
+}
+
+
+void fastConvDestroy( void * setup )
+{
+  pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
+}
+
+
+
+int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  const float * RESTRICT X = input;
+  const float * RESTRICT Hrev = p->H;
+  float * RESTRICT Y = output;
+  const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+  const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+  int i, j;
+  (void)Yref;
+  (void)applyFlush;
+
+  if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+  {
+    for ( i = 0; i <= lenNr; i += 2 )
+    {
+      float sumRe = 0.0F, sumIm = 0.0F;
+      for ( j = 0; j < Nr; j += 2 )
+      {
+        sumRe += X[i+j  ] * Hrev[j];
+        sumIm += X[i+j+1] * Hrev[j+1];
+      }
+      Y[i  ] = sumRe;
+      Y[i+1] = sumIm;
+    }
+    return i/2;
+  }
+  else
+  {
+    for ( i = 0; i <= lenNr; ++i )
+    {
+      float sum = 0.0F;
+      for (j = 0; j < Nr; ++j )
+        sum += X[i+j]   * Hrev[j];
+      Y[i] = sum;
+    }
+    return i;
+  }
+}
+
+
+
+int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  float sum[4];
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  const float * RESTRICT X = input;
+  const float * RESTRICT Hrev = p->H;
+  float * RESTRICT Y = output;
+  const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+  const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+  int i, j;
+  (void)Yref;
+  (void)applyFlush;
+
+  if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+  {
+    if ( (Nr & 3) == 0 )
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+    }
+    else
+    {
+      const int M = Nr & (~3);
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        float tailSumRe = 0.0F, tailSumIm = 0.0F;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < M; j += 4 )
+        {
+          sum[0] += X[i+j  ] * Hrev[j  ];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        for ( ; j < Nr; j += 2 ) {
+          tailSumRe += X[i+j  ] * Hrev[j  ];
+          tailSumIm += X[i+j+1] * Hrev[j+1];
+        }
+        Y[i  ] = ( sum[0] + sum[2] ) + tailSumRe;
+        Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
+      }
+    }
+    return i/2;
+  }
+  else
+  {
+    if ( (Nr & 3) == 0 )
+    {
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+      }
+      return i;
+    }
+    else
+    {
+      const int M = Nr & (~3);
+      /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        float tailSum = 0.0;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < M; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        for ( ; j < Nr; ++j )
+          tailSum += X[i+j] * Hrev[j];
+        Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+      }
+      return i;
+    }
+  }
+}
+
+
+int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  float sum[4];
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  (void)Yref;
+  (void)applyFlush;
+  if (p->flags & PFFASTCONV_SYMMETRIC)
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    const int h = Nr / 2 -4;
+    const int E = Nr -4;
+    int i, j;
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        const int k = i + E;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j <= h; j += 4 )
+        {
+          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+2] );
+          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
+          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j  ] );
+          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        const int k = i + E;
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j <= h; j += 4 )
+        {
+          sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+3] );
+          sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
+          sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
+          sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j  ] );
+        }
+        Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
+      }
+      return i;
+    }
+  }
+  else
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    int i, j;
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+        for (j = 0; j < Nr; j += 4 )
+        {
+          sum[0] += X[i+j]   * Hrev[j];
+          sum[1] += X[i+j+1] * Hrev[j+1];
+          sum[2] += X[i+j+2] * Hrev[j+2];
+          sum[3] += X[i+j+3] * Hrev[j+3];
+        }
+        Y[i  ] = sum[0] + sum[2];
+        Y[i+1] = sum[1] + sum[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      if ( (Nr & 3) == 0 )
+      {
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+          for (j = 0; j < Nr; j += 4 )
+          {
+            sum[0] += X[i+j]   * Hrev[j];
+            sum[1] += X[i+j+1] * Hrev[j+1];
+            sum[2] += X[i+j+2] * Hrev[j+2];
+            sum[3] += X[i+j+3] * Hrev[j+3];
+          }
+          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
+        }
+        return i;
+      }
+      else
+      {
+        const int M = Nr & (~3);
+        /* printf("B: Nr = %d\n", Nr ); */
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          float tailSum = 0.0;
+          sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
+          for (j = 0; j < M; j += 4 )
+          {
+            sum[0] += X[i+j]   * Hrev[j];
+            sum[1] += X[i+j+1] * Hrev[j+1];
+            sum[2] += X[i+j+2] * Hrev[j+2];
+            sum[3] += X[i+j+3] * Hrev[j+3];
+          }
+          for ( ; j < Nr; ++j )
+            tailSum += X[i+j] * Hrev[j];
+          Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
+        }
+        return i;
+      }
+    }
+  }
+
+}
+
+
+int slow_conv_SIMD(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
+{
+  v4sf_union sum;
+  struct ConvSetup * p = (struct ConvSetup*)setup;
+  (void)Yref;
+  (void)applyFlush;
+
+#if defined(PFFFT_SIMD_DISABLE) || !(defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+  return 0;
+#endif
+#if SIMD_SZ != 4
+  #error "Error: SIMD_SZ != 4 not implemented!"
+  return 0;
+#endif
+
+  if (p->flags & PFFASTCONV_SYMMETRIC)
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    const int h = Nr / 2 -SIMD_SZ;
+    const int E = Nr -SIMD_SZ;
+    int i, j;
+
+    assert(VALIGNED(Hrev));
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        const int k = i + E;
+        sum.v = VZERO();
+        for ( j = 0; j <= h; j += SIMD_SZ )
+        {
+          v4sf t = VLOAD_UNALIGNED(X +(k-j));
+          sum.v = VMADD( VADD( VLOAD_UNALIGNED(X+(i+j)), VREV_C(t) ) , VLOAD_ALIGNED(Hrev+j), sum.v );
+        }
+        Y[i  ] = sum.f[0] + sum.f[2];
+        Y[i+1] = sum.f[1] + sum.f[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      for ( i = 0; i <= lenNr; ++i )
+      {
+        const int k = i + E;
+        sum.v = VZERO();
+        for ( j = 0; j <= h; j += SIMD_SZ )
+        {
+          v4sf t = VLOAD_UNALIGNED(X +(k-j));
+          sum.v = VMADD( VADD( VLOAD_UNALIGNED(X+(i+j)), VREV_S(t) ) , VLOAD_ALIGNED(Hrev+j), sum.v );
+        }
+        Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]);
+      }
+      return i;
+    }
+  }
+  else
+  {
+    const float * RESTRICT X = input;
+    const float * RESTRICT Hrev = p->H;
+    float * RESTRICT Y = output;
+    const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
+    const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
+    int i, j;
+
+    assert(VALIGNED(Hrev));
+
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+    {
+      for ( i = 0; i <= lenNr; i += 2 )
+      {
+        sum.v = VZERO();
+        for ( j = 0; j < Nr; j += SIMD_SZ )
+          sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+        Y[i  ] = sum.f[0] + sum.f[2];
+        Y[i+1] = sum.f[1] + sum.f[3];
+      }
+      return i/2;
+    }
+    else
+    {
+      if ( (Nr & 3) == 0 )
+      {
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          sum.v = VZERO();
+          for ( j = 0; j < Nr; j += SIMD_SZ )
+            sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+          Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]);
+        }
+        return i;
+      }
+      else
+      {
+        const int M = Nr & (~3);
+        for ( i = 0; i <= lenNr; ++i )
+        {
+          float tailSum = 0.0;
+          sum.v = VZERO();
+          for (j = 0; j < M; j += SIMD_SZ )
+            sum.v = VMADD( VLOAD_UNALIGNED(X+(i+j)), VLOAD_ALIGNED(Hrev+j), sum.v );
+          for ( ; j < Nr; ++j )
+            tailSum += X[i+j] * Hrev[j];
+          Y[i] = (sum.f[0] + sum.f[1]) + (sum.f[2] + sum.f[3]) + tailSum;
+        }
+        return i;
+      }
+    }
+  }
+
+}
+
+
+void * convSimdSetup( float * H, int N, int * BlkLen, int flags )
+{
+  struct ConvSetup * p = (struct ConvSetup*) convSetupRev( H, N, BlkLen, flags );
+  int NomMul = 1, DenMul = 1;
+
+  if (!p)
+    return p;
+
+#if defined(PFFFT_SIMD_DISABLE) || (SIMD_SZ != 4) || !(defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+  p->pfn = slow_conv_B;
+  return p;
+#endif
+
+  if (p->flags & PFFASTCONV_SYMMETRIC)
+  {
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+      NomMul = DenMul = 1;
+    else
+      DenMul = 2;
+  }
+  else
+  {
+    if (p->flags & PFFASTCONV_CPLX_INP_OUT)
+      NomMul = 2;
+    else
+    {
+      p->pfn = slow_conv_SIMD;
+      return p;
+    }
+  }
+
+  p->pfn = ( ( (NomMul * p->N) % (DenMul * SIMD_SZ) ) != 0 ) ? slow_conv_B : slow_conv_SIMD;
+  return p;
+}
+
+
+
+int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
+{
+  (void)Yref;
+  return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
+}
+
+
+
+void printFirst( const float * V, const char * st, const int N, const int perLine )
+{
+  (void)V;  (void)st;  (void)N;  (void)perLine;
+  return;
+#if 0
+  int i;
+  for ( i = 0; i < N; ++i )
+  {
+    if ( (i % perLine) == 0 )
+      printf("\n%s[%d]", st, i);
+    printf("\t%.1f", V[i]);
+  }
+  printf("\n");
+#endif
+}
+
+
+
+#define NUMY       11
+
+
+int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
+  double t0, t1, tstop, td, tdref;
+  float *X, *H;
+  float *Y[NUMY];
+  int64_t outN[NUMY];
+  /* 256 KFloats or 16 MFloats data */
+#if 1
+  const int len = testOutLen ? (1 << 18) : (1 << 24);
+#elif 0
+  const int len = testOutLen ? (1 << 18) : (1 << 13);
+#else
+  const int len = testOutLen ? (1 << 18) : (1024);
+#endif
+  const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
+  const int lenC = len / cplxFactor;
+
+  int yi, yc, posMaxErr;
+  float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
+  int i, j, numErrOverLimit, iter;
+  int retErr = 0;
+
+  const int iSlowSIMDalgo = 3;
+  /*                                  0               1               2               3                  4                   5                   6                   7                   8                   9                      10              */
+  pfnConvSetup   aSetup[NUMY]     = { convSetupRev,   convSetupRev,   convSetupRev,   convSimdSetup,     fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,         fastConvSetup   };
+  pfnConvDestroy aDestroy[NUMY]   = { convDestroyRev, convDestroyRev, convDestroyRev, convSimdDestroy,   fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,       fastConvDestroy };
+  pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL,           NULL,           NULL,           ConvGetFnPtrRev,   NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,           };
+  pfnConvolution aConv[NUMY]      = { slow_conv_R,    slow_conv_A,    slow_conv_B,    NULL,              fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,             fast_conv       };
+  const char * convText[NUMY]     = { "R(non-simd)",  "A(non-simd)",  "B(non-simd)",  "slow_simd",       "fast_conv_64",     "fast_conv_128",    "fast_conv_256",    "fast_conv_512",    "fast_conv_1K",     "fast_conv_2K",        "fast_conv_4K"  };
+  int    aFastAlgo[NUMY]          = { 0,              0,              0,              0,                 1,                  1,                  1,                  1,                  1,                  1,                     1               };
+  void * aSetupCfg[NUMY]          = { NULL,           NULL,           NULL,           NULL,              NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL            };
+  int    aBlkLen[NUMY]            = { 1024,           1024,           1024,           1024,              64,                 128,                256,                512,                1024,               2048,                  4096            };
+#if 1
+  int    aRunAlgo[NUMY]           = { 1,              1,              1,              1,                 FILTERLEN<64,       FILTERLEN<128,      FILTERLEN<256,      FILTERLEN<512,      FILTERLEN<1024,     FILTERLEN<2048,        FILTERLEN<4096  };
+#else
+  int    aRunAlgo[NUMY]           = { 1,              0,              0,              0,                 0 && FILTERLEN<64,  1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096  };
+#endif
+  double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
+
+  X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
+  for ( i=0; i < NUMY; ++i)
+  {
+    if ( i < 2 )
+      Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
+    else
+      Y[i] = Y[1];
+
+    aSpeedFactor[i] = -1.0;
+    aDuration[i] = -1.0;
+    procSmpPerSec[i] = -1.0;
+  }
+
+  H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
+
+  /* initialize input */
+  if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
+  {
+    for ( i = 0; i < lenC; ++i )
+    {
+      X[2*i  ] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+      X[2*i+1] = (float)((i+2048) % 4093);
+    }
+  }
+  else
+  {
+    for ( i = 0; i < len; ++i )
+      X[i] = (float)(i % 4093);  /* 4093 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
+  }
+  X[ len    ] = INVALID_FLOAT_VAL;
+  X[ len +1 ] = INVALID_FLOAT_VAL;
+  X[ len +2 ] = INVALID_FLOAT_VAL;
+  X[ len +3 ] = INVALID_FLOAT_VAL;
+
+  if (!testOutLen)
+    printFirst( X, "X", 64, 8 );
+
+  /* filter coeffs */
+  memset( H, 0, FILTERLEN * sizeof(float) );
+#if 1
+  if ( convFlags & PFFASTCONV_SYMMETRIC )
+  {
+    const int half = FILTERLEN / 2;
+    for ( j = 0; j < half; ++j ) {
+      switch (j % 3) {
+        case 0: H[j] = H[FILTERLEN-1-j] = -1.0F;  break;
+        case 1: H[j] = H[FILTERLEN-1-j] =  1.0F;  break;
+        case 2: H[j] = H[FILTERLEN-1-j] =  0.5F;  break;
+      }
+    }
+  }
+  else
+  {
+    for ( j = 0; j < FILTERLEN; ++j ) {
+      switch (j % 3) {
+        case 0: H[j] = -1.0F;  break;
+        case 1: H[j] = 1.0F;   break;
+        case 2: H[j] = 0.5F;   break;
+      }
+    }
+  }
+#else
+  H[0] = 1.0F;
+  H[FILTERLEN -1] = 1.0F;
+#endif
+  if (!testOutLen)
+    printFirst( H, "H", FILTERLEN, 8 );
+
+  printf("\n");
+  printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
+    ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
+    (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
+    ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
+
+  while (1)
+  {
+
+    for ( yi = 0; yi < NUMY; ++yi )
+    {
+      if (!aRunAlgo[yi])
+        continue;
+
+      aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
+
+      /* get effective apply function ptr */
+      if ( aSetupCfg[yi] && aGetFnPtr[yi] )
+        aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
+
+      if ( aSetupCfg[yi] && aConv[yi] ) {
+        if (testOutLen)
+        {
+          t0 = uclock_sec();
+          outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
+          t1 = uclock_sec();
+          td = t1 - t0;
+        }
+        else
+        {
+          const int blkLen = 4096;  /* required for 'fast_conv_4K' */
+          int64_t offC = 0, offS, Nout;
+          int k;
+          iter = 0;
+          outN[yi] = 0;
+          t0 = uclock_sec();
+          tstop = t0 + 0.25;  /* benchmark duration: 250 ms */
+          do {
+            for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
+            {
+              offS = cplxFactor * offC;
+              Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
+              offC += Nout;
+              ++iter;
+              if ( !Nout )
+                break;
+              if ( offC +blkLen >= lenC )
+              {
+                outN[yi] += offC;
+                offC = 0;
+              }
+            }
+            t1 = uclock_sec();
+          } while ( t1 < tstop );
+          outN[yi] += offC;
+          td = t1 - t0;
+          procSmpPerSec[yi] = cplxFactor * (double)offC / td;
+        }
+      }
+      else
+      {
+        t0 = t1 = td = 0.0;
+        outN[yi] = 0;
+      }
+      aDuration[yi] = td;
+      if ( yi == 0 ) {
+        const float * Yvals = Y[0];
+        const int64_t refOutLen = cplxFactor * outN[0];
+        tdref = td;
+        if (printDbg) {
+          printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
+          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+        }
+        aSpeedFactor[yi] = 1.0;
+        /*  */
+        yRangeMin = FLT_MAX;
+        yRangeMax = FLT_MIN;
+        for ( i = 0; i < refOutLen; ++i )
+        {
+          if ( yRangeMax < Yvals[i] )  yRangeMax = Yvals[i];
+          if ( yRangeMin > Yvals[i] )  yRangeMin = Yvals[i];
+        }
+        yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
+        /* yErrLimit = 0.01F; */
+        if (testOutLen) {
+          if (1) {
+            printf("reference output len = %" PRId64 " smp\n", outN[0]);
+            printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
+          }
+          printFirst( Yvals, "Yref", 64, 8 );
+        }
+      }
+      else
+      {
+        aSpeedFactor[yi] = tdref / td;
+        if (printDbg) {
+          printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
+          printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
+        }
+      }
+    }
+
+    int iMaxSpeedSlowAlgo = -1;
+    int iFirstFastAlgo = -1;
+    int iMaxSpeedFastAlgo = -1;
+    int iPrintedRefOutLen = 0;
+    {
+      for ( yc = 1; yc < NUMY; ++yc )
+      {
+        if (!aRunAlgo[yc])
+          continue;
+        if (aFastAlgo[yc]) {
+          if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
+            iMaxSpeedFastAlgo = yc;
+            
+          if (iFirstFastAlgo < 0)
+            iFirstFastAlgo = yc;
+        }
+        else
+        {
+          if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
+            iMaxSpeedSlowAlgo = yc;
+        }
+      }
+
+      if (printSpeed)
+      {
+        if (testOutLen)
+        {
+          if (iMaxSpeedSlowAlgo >= 0 )
+            printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
+          if (0 != iMaxSpeedSlowAlgo && 0 != iSlowSIMDalgo && aRunAlgo[0])
+            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
+          if (1 != iMaxSpeedSlowAlgo && 1 != iSlowSIMDalgo && aRunAlgo[1])
+            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
+          if (iSlowSIMDalgo != iMaxSpeedSlowAlgo && aRunAlgo[iSlowSIMDalgo])
+            printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[iSlowSIMDalgo], aSpeedFactor[iSlowSIMDalgo], 1000.0 * aDuration[iSlowSIMDalgo]);
+
+          if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
+            printf("first   fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo],    aSpeedFactor[iFirstFastAlgo],    1000.0 * aDuration[iFirstFastAlgo]);
+          if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
+            printf("2nd     fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1],  aSpeedFactor[iFirstFastAlgo+1],  1000.0 * aDuration[iFirstFastAlgo+1]);
+
+          if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
+          {
+            printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
+            if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
+              printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
+          }
+          printf("\n");
+        }
+        else
+        {
+          for ( yc = 0; yc < NUMY; ++yc )
+          {
+            if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
+              continue;
+            printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g SmpPerSec\n",
+              convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] );
+          }
+        }
+
+      }
+    }
+
+
+    for ( yc = 1; yc < NUMY; ++yc )
+    {
+      const float * Yref;
+      const float * Ycurr;
+      int outMin;
+
+      if (!aRunAlgo[yc])
+        continue;
+
+      if (printDbg)
+        printf("\n");
+
+      if ( outN[yc] == 0 )
+      {
+        printf("output size 0: '%s' not implemented\n", convText[yc]);
+      }
+      else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
+      {
+        if (!iPrintedRefOutLen)
+        {
+          printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
+          iPrintedRefOutLen = 1;
+        }
+        printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
+          FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
+        retErr = 1;
+      }
+
+      posMaxErr = 0;
+      maxErr = -1.0;
+      Yref = Y[0];
+      Ycurr = Y[yc];
+      outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
+      numErrOverLimit = 0;
+      for ( i = 0; i < outMin; ++i )
+      {
+        if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
+        {
+          printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
+            convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
+          ++numErrOverLimit;
+        }
+
+        if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
+        {
+          maxErr = fabsf(Ycurr[i] - Yref[i]);
+          posMaxErr = i;
+        }
+      }
+
+      if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
+        printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
+    }
+
+    break;
+  }
+
+  pffastconv_free(X);
+  for ( i=0; i < NUMY; ++i)
+  {
+    if ( i < 2 )
+      pffastconv_free( Y[i] );
+    if (!aRunAlgo[i])
+      continue;
+    aDestroy[i]( aSetupCfg[i] );
+  }
+
+  pffastconv_free(H);
+
+  return retErr;
+}
+
+
+int main(int argc, char **argv)
+{
+  /* entry point: parses test-selection options, then runs the output-length
+   * checks and/or the convolution benchmark; returns OR of all test results */
+  int result = 0;
+  int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
+  int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
+  int testReal = 1, testCplx = 1, testSymetric = 0;
+
+  for ( i = 1; i < argc; ++i ) {
+    if (!strcmp(argv[i], "--no-len")) {
+      testOutLens = 0;
+    }
+    else if (!strcmp(argv[i], "--no-bench")) {
+      benchConv = 0;
+    }
+    else if (!strcmp(argv[i], "--quick")) {
+      quickTest = 1;
+    }
+    else if (!strcmp(argv[i], "--slow")) {
+      slowTest = 1;
+    }
+    else if (!strcmp(argv[i], "--real")) {
+      testCplx = 0;
+    }
+    else if (!strcmp(argv[i], "--cplx")) {
+      testReal = 0;
+    }
+    else if (!strcmp(argv[i], "--sym")) {
+      testSymetric = 1;
+    }
+    else /* if (!strcmp(argv[i], "--help")) */ {
+      printf("usage: %s [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
+      exit(1);
+    }
+  }
+
+
+  if (testOutLens)
+  {
+    for ( k = 0; k < 3; ++k )  /* k == 0: real, k == 1: cplx 2x FFT, k == 2: cplx single FFT */
+    {
+      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+        continue;
+      printf("\n\n==========\n");
+      printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+      printf("==========\n");
+      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );  /* single FFT only for k == 2 - same as benchmark loop below */
+      testOutLen = 1;
+      printDbg = 0;
+      printSpeed = 0;
+      for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
+      {
+        if ( (M % 16) != 0 && testSymetric )
+          continue;
+        result |= test(M, flagsC, testOutLen, printDbg, printSpeed);  /* was flagsB: k == 2 never exercised the single-FFT mode */
+      }
+    }
+  }
+
+  if (benchConv)
+  {
+    for ( k = 0; k < 3; ++k )
+    {
+      if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
+        continue;
+      printf("\n\n==========\n");
+      printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
+      printf("==========\n");
+      flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
+      flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
+      flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
+      testOutLen = 0;
+      printDbg = 0;
+      printSpeed = 1;
+      if (!slowTest) {
+        result |= test( 32,     flagsC, testOutLen, printDbg, printSpeed);
+        result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
+        result |= test( 64,     flagsC, testOutLen, printDbg, printSpeed);
+        result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
+        result |= test(128,     flagsC, testOutLen, printDbg, printSpeed);
+      }
+      if (!quickTest) {
+        result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
+        result |= test(256,     flagsC, testOutLen, printDbg, printSpeed);
+        result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
+        result |= test(512,     flagsC, testOutLen, printDbg, printSpeed);
+        result |= test(1024,    flagsC, testOutLen, printDbg, printSpeed);
+      }
+    }
+  }
+
+  return result;
+}
+
diff --git a/test_pffft.c b/test_pffft.c
new file mode 100644
index 0000000..294a43f
--- /dev/null
+++ b/test_pffft.c
@@ -0,0 +1,246 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+
+  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
+
+  How to build: 
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+  as alternative: replace clang by gcc.
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+  
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#include "pffft.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+
+/* EXPECTED_DYN_RANGE in dB:
+ * single precision float has 24 bits mantissa
+ * => 24 Bits * 6 dB = 144 dB
+ * allow a few dB tolerance (even 144 dB looks good on my PC)
+ */
+#define EXPECTED_DYN_RANGE  140.0
+
+/* maximum allowed phase error in degree */
+#define DEG_ERR_LIMIT   1E-4
+
+/* maximum allowed magnitude error in amplitude (of 1.0 or 1.1) */
+#define MAG_ERR_LIMIT  1E-6
+
+
+#define PRINT_SPEC  0
+
+#define PWR2LOG(PWR)  ( (PWR) < 1E-30 ? 10.0*log10(1E-30) : 10.0*log10(PWR) )
+
+
+int isPowerOfTwo(unsigned v) {
+  /* a power of two has exactly one bit set (Stanford bit-twiddling hacks) */
+  const int onlyOneBitSet = (v != 0u) && ((v & (v - 1u)) == 0u);
+  return onlyOneBitSet;
+}
+
+
+int test(int N, int cplx, int useOrdered) {  /* one self-test for FFT size N (power of two); returns 0 on success, 1 on any failed check */
+  int Nfloat = (cplx ? N*2 : N);
+  float *X = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+  float *Y = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+  float *Z = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+  float *W = pffft_aligned_malloc((unsigned)Nfloat * sizeof(float));
+  int k, j, m, iter, kmaxOther = -1, retError = 0;  /* kmaxOther: bin with strongest non-carrier power; init silences -Wall warning */
+  double freq, dPhi, phi, phi0;
+  double pwr, pwrCar, pwrOther, err, errSum, mag, expectedMag;
+  float amp = 1.0F;
+
+  assert( isPowerOfTwo(N) );
+
+  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+  assert(s);
+  if (!s) {
+    printf("Error setting up PFFFT!\n");
+    pffft_aligned_free(X); pffft_aligned_free(Y); pffft_aligned_free(Z); pffft_aligned_free(W); return 1;  /* don't leak the buffers */
+  }
+
+  for ( k = m = 0; k < (cplx? N : (1 + N/2) ); k += N/16, ++m )
+  {
+    amp = ( (m % 3) == 0 ) ? 1.0F : 1.1F;
+    freq = (k < N/2) ? ((double)k / N) : ((double)(k-N) / N);
+    dPhi = 2.0 * M_PI * freq;
+    if ( dPhi < 0.0 )
+      dPhi += 2.0 * M_PI;
+
+    iter = -1;
+    while (1)
+    {
+      ++iter;
+
+      if (iter)
+        printf("bin %d: dphi = %f for freq %f\n", k, dPhi, freq);
+
+      /* generate cosine carrier as time signal - start at defined phase phi0 */
+      phi = phi0 = (m % 4) * 0.125 * M_PI;  /* have phi0 < 90 deg to be normalized */
+      for ( j = 0; j < N; ++j )
+      {
+        if (cplx) {
+          X[2*j] = amp * (float)cos(phi);  /* real part */
+          X[2*j+1] = amp * (float)sin(phi);  /* imag part */
+        }
+        else
+          X[j] = amp * (float)cos(phi);  /* only real part */
+
+        /* phase increment .. stay normalized - cos()/sin() might degrade! */
+        phi += dPhi;
+        if ( phi >= M_PI )
+          phi -= 2.0 * M_PI;
+      }
+
+      /* forward transform from X --> Y  .. using work buffer W */
+      if ( useOrdered )
+        pffft_transform_ordered(s, X, Y, W, PFFFT_FORWARD );
+      else
+      {
+        pffft_transform(s, X, Z, W, PFFFT_FORWARD );  /* temporarily use Z for reordering */
+        pffft_zreorder(s, Z, Y, PFFFT_FORWARD );
+      }
+
+      pwrOther = -1.0;
+      pwrCar = 0;
+
+
+      /* for positive frequencies: 0 to 0.5 * samplerate */
+      /* and also for negative frequencies: -0.5 * samplerate to 0 */
+      for ( j = 0; j < ( cplx ? N : (1 + N/2) ); ++j )
+      {
+        if (!cplx && !j)  /* special treatment for DC for real input */
+          pwr = Y[j]*Y[j];
+        else if (!cplx && j == N/2)  /* treat 0.5 * samplerate */
+          pwr = Y[1] * Y[1];  /* despite j (for freq calculation) we have index 1 */
+        else
+          pwr = Y[2*j] * Y[2*j] + Y[2*j+1] * Y[2*j+1];
+        if (iter || PRINT_SPEC)
+          printf("%s fft %d:  pwr[j = %d] = %g == %f dB\n", (cplx ? "cplx":"real"), N, j, pwr, PWR2LOG(pwr) );
+        if (k == j)
+          pwrCar = pwr;
+        else if ( pwr > pwrOther ) {
+          pwrOther = pwr;
+          kmaxOther = j;
+        }
+      }
+
+      if ( PWR2LOG(pwrCar) - PWR2LOG(pwrOther) < EXPECTED_DYN_RANGE ) {
+        printf("%s fft %d amp %f iter %d:\n", (cplx ? "cplx":"real"), N, amp, iter);
+        printf("  carrier power  at bin %d: %g == %f dB\n", k, pwrCar, PWR2LOG(pwrCar) );
+        printf("  carrier mag || at bin %d: %g\n", k, sqrt(pwrCar) );
+        printf("  max other pwr  at bin %d: %g == %f dB\n", kmaxOther, pwrOther, PWR2LOG(pwrOther) );
+        printf("  dynamic range: %f dB\n\n", PWR2LOG(pwrCar) - PWR2LOG(pwrOther) );
+        retError = 1;
+        if ( iter == 0 )
+          continue;
+      }
+
+      if ( k > 0 && k != N/2 )
+      {
+        phi = atan2( Y[2*k+1], Y[2*k] );
+        if ( fabs( phi - phi0) > DEG_ERR_LIMIT * M_PI / 180.0 )
+        {
+          retError = 1;
+          printf("%s fft %d  bin %d amp %f : phase mismatch! phase = %f deg   expected = %f deg\n",
+              (cplx ? "cplx":"real"), N, k, amp, phi * 180.0 / M_PI, phi0 * 180.0 / M_PI );
+        }
+      }
+
+      expectedMag = cplx ? amp : ( (k == 0 || k == N/2) ? amp : (amp/2) );
+      mag = sqrt(pwrCar) / N;
+      if ( fabs(mag - expectedMag) > MAG_ERR_LIMIT )
+      {
+        retError = 1;
+        printf("%s fft %d  bin %d amp %f : mag = %g   expected = %g\n", (cplx ? "cplx":"real"), N, k, amp, mag, expectedMag );
+      }
+
+
+      /* now convert spectrum back */
+      pffft_transform_ordered(s, Y, Z, W, PFFFT_BACKWARD);
+
+      errSum = 0.0;
+      for ( j = 0; j < (cplx ? (2*N) : N); ++j )
+      {
+        /* scale back */
+        Z[j] /= N;
+        /* square sum errors over real (and imag parts) */
+        err = (X[j]-Z[j]) * (X[j]-Z[j]);
+        errSum += err;
+      }
+
+      if ( errSum > N * 1E-7 )
+      {
+        retError = 1;
+        printf("%s fft %d  bin %d : inverse FFT doesn't match original signal! errSum = %g ; mean err = %g\n", (cplx ? "cplx":"real"), N, k, errSum, errSum / N);
+      }
+
+      break;
+    }
+
+  }
+  pffft_destroy_setup(s);
+  pffft_aligned_free(X);
+  pffft_aligned_free(Y);
+  pffft_aligned_free(Z);
+  pffft_aligned_free(W);
+
+  return retError;
+}
+
+
+
+int main(int argc, char **argv)
+{
+  int N, result, resN, resAll;  (void)argc; (void)argv;  /* unused: no arguments; silences warnings under the -Wall -W build */
+
+  resAll = 0;
+  for ( N = 32; N <= 65536; N *= 2 )
+  {
+    result = test(N, 1 /* cplx fft */, 1 /* useOrdered */);
+    resN = result;
+    resAll |= result;
+
+    result = test(N, 0 /* real fft */, 1 /* useOrdered */);
+    resN |= result;
+    resAll |= result;
+
+    result = test(N, 1 /* cplx fft */, 0 /* useOrdered */);
+    resN |= result;
+    resAll |= result;
+
+    result = test(N, 0 /* real fft */, 0 /* useOrdered */);
+    resN |= result;
+    resAll |= result;
+
+    if (!resN)
+      printf("tests for size %d succeeded successfully.\n", N);
+  }
+
+  if (!resAll)
+    printf("all tests succeeded successfully.\n");
+
+  return resAll;
+}
+