blob: 9094bb4be9c8656dfb5a1e43e8c72310ace03ca7 [file] [log] [blame]
hayati ayguenb2d29362020-01-02 00:06:09 +01001/*
2 Copyright (c) 2019 Hayati Ayguen.
3
4 */
5
6#include "pffastconv.h"
7#include "pffft.h"
8
9/* detect compiler flavour */
10#if defined(_MSC_VER)
11# define COMPILER_MSVC
12#elif defined(__GNUC__)
13# define COMPILER_GCC
14#endif
15
16#include <stdlib.h>
17#include <stdint.h>
18#include <stdio.h>
19#include <math.h>
20#include <assert.h>
21#include <string.h>
22
23#define FASTCONV_DBG_OUT 0
24
25
26#if defined(COMPILER_GCC)
27# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
28# define RESTRICT __restrict
29#elif defined(COMPILER_MSVC)
30# define ALWAYS_INLINE(return_type) __forceinline return_type
31# define RESTRICT __restrict
32#endif
33
34
35#ifdef COMPILER_MSVC
36#pragma warning( disable : 4244 4305 4204 4456 )
37#endif
38
hayati ayguen55be34f2020-01-15 11:37:06 +010039#ifdef PFFFT_FLOAT
40#define float PFFFT_FLOAT
41#endif
42
43
hayati ayguenb2d29362020-01-02 00:06:09 +010044/*
45 vector support macros: the rest of the code is independant of
46 SSE/Altivec/NEON -- adding support for other platforms with 4-element
47 vectors should be limited to these macros
48*/
49#include "pfsimd_macros.h"
50
51
52static
53unsigned nextPowerOfTwo(unsigned v) {
54 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
55 /* compute the next highest power of 2 of 32-bit v */
56 v--;
57 v |= v >> 1;
58 v |= v >> 2;
59 v |= v >> 4;
60 v |= v >> 8;
61 v |= v >> 16;
62 v++;
63 return v;
64}
65
66void *pffastconv_malloc(size_t nb_bytes)
67{
68 return Valigned_malloc(nb_bytes);
69}
70
71void pffastconv_free(void *p)
72{
73 Valigned_free(p);
74}
75
76int pffastconv_simd_size()
77{
78 return SIMD_SZ;
79}
80
81
82
83struct PFFASTCONV_Setup
84{
85 float * Xt; /* input == x in time domain - copy for alignment */
86 float * Xf; /* input == X in freq domain */
87 float * Hf; /* filterCoeffs == H in freq domain */
88 float * Mf; /* input * filterCoeffs in freq domain */
89 PFFFT_Setup *st;
90 int filterLen; /* convolution length */
91 int Nfft; /* FFT/block length */
92 int flags;
93 float scale;
94};
95
96
97PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
98{
99 PFFASTCONV_Setup * s = NULL;
100 const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
101 int i, Nfft = 2 * nextPowerOfTwo(filterLen -1);
102#if FASTCONV_DBG_OUT
103 const int iOldBlkLen = *blockLen;
104#endif
105
106 if ( flags & PFFASTCONV_CPLX_FILTER )
107 return NULL;
108
109 s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
110
111 if ( *blockLen > Nfft ) {
112 Nfft = *blockLen;
113 Nfft = nextPowerOfTwo(Nfft);
114 }
115 *blockLen = Nfft; /* this is in (complex) samples */
116
117 Nfft *= cplxFactor;
118
119 if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
120 s->Xt = NULL;
121 else
122 s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
123 s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
124 s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
125 s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
126 s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
127 s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
128 if ( cplxFactor == 2 )
129 s->filterLen = 2 * filterLen - 1;
130 s->Nfft = Nfft; /* FFT/block length */
131 s->flags = flags;
132 s->scale = (float)( 1.0 / Nfft );
133
134 memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
135 for ( i = 0; i < filterLen; ++i )
136 s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
137
138 pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
139
140#if FASTCONV_DBG_OUT
141 printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
142 , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
143#endif
144
145 return s;
146}
147
148
149void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
150{
151 if (!s)
152 return;
153 pffft_destroy_setup(s->st);
154 pffastconv_free(s->Mf);
155 pffastconv_free(s->Hf);
156 pffastconv_free(s->Xf);
157 if ( s->Xt )
158 pffastconv_free(s->Xt);
159 pffastconv_free(s);
160}
161
162
163int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
164{
165 const float * RESTRICT X = input_;
166 float * RESTRICT Y = output_;
167 const int Nfft = s->Nfft;
168 const int filterLen = s->filterLen;
169 const int flags = s->flags;
170 const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
171 const int inputLen = cplxFactor * cplxInputLen;
172 int inpOff, procLen, numOut = 0, j, part, cplxOff;
173
174 /* applyFlush != 0:
175 * inputLen - inpOff -filterLen + 1 > 0
176 * <=> inputLen -filterLen + 1 > inpOff
177 * <=> inpOff < inputLen -filterLen + 1
178 *
179 * applyFlush == 0:
180 * inputLen - inpOff >= Nfft
181 * <=> inputLen - Nfft >= inpOff
182 * <=> inpOff <= inputLen - Nfft
183 * <=> inpOff < inputLen - Nfft + 1
184 */
185
186 if ( cplxFactor == 2 )
187 {
188 const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
189#if 0
190 printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
191#endif
192 for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
193 {
194 procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
195 numOut = ( procLen - filterLen + 1 ) & ( ~1 );
196 if (!numOut)
197 break;
198#if 0
199 if (!inpOff)
200 printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
201 if (inpOff + filterLen + 2 >= maxOff )
202 printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
203#endif
204
205 if ( flags & PFFASTCONV_DIRECT_INP )
206 {
207 pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
208 }
209 else
210 {
211 memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
212 if ( procLen < Nfft )
213 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
214
215 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
216 }
217
218 memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) ); /* why does pffft_zconvolve_accumulate() accumulate?? */
219 pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
220
221 if ( flags & PFFASTCONV_DIRECT_OUT )
222 {
223 pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
224 }
225 else
226 {
227 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
228 memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
229 }
230 }
231 return inpOff / cplxFactor;
232 }
233 else
234 {
235 const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
236 const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
237
238 for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
239 {
240 procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
241 numOut = procLen - filterLen + 1;
242
243 for ( part = 0; part < numParts; ++part ) /* iterate per real/imag component */
244 {
245
246 if ( flags & PFFASTCONV_CPLX_INP_OUT )
247 {
248 cplxOff = 2 * inpOff + part;
249 for ( j = 0; j < procLen; ++j )
250 s->Xt[j] = X[cplxOff + 2 * j];
251 if ( procLen < Nfft )
252 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
253
254 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
255 }
256 else if ( flags & PFFASTCONV_DIRECT_INP )
257 {
258 pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
259 }
260 else
261 {
262 memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
263 if ( procLen < Nfft )
264 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
265
266 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
267 }
268
269 memset( s->Mf, 0, (unsigned)Nfft * sizeof(float) ); /* why does pffft_zconvolve_accumulate() accumulate?? */
270 pffft_zconvolve_accumulate(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
271
272 if ( flags & PFFASTCONV_CPLX_INP_OUT )
273 {
274 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
275
276 cplxOff = 2 * inpOff + part;
277 for ( j = 0; j < numOut; ++j )
278 Y[ cplxOff + 2 * j ] = s->Xf[j];
279 }
280 else if ( flags & PFFASTCONV_DIRECT_OUT )
281 {
282 pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
283 }
284 else
285 {
286 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
287 memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
288 }
289
290 }
291 }
292
293 return inpOff;
294 }
295}
296