/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
49 PFFFT : a Pretty Fast FFT.
50
51 This file is largely based on the original FFTPACK implementation, modified in
52 order to take advantage of SIMD instructions of modern CPUs.
53*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
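/*
  Minimal usage sketch (illustrative only, not part of the library -- see
  pffft.h for the authoritative API documentation). Buffers handed to the
  transform functions must be suitably aligned, e.g. obtained from
  pffft_aligned_malloc:

    #include "pffft.h"

    void example(void) {
      int N = 1024;                      // real transforms require N >= 32 here
      PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
      float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
      float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
      float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
      // ... fill 'in' with N samples ...
      pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
      // ... use the spectrum in 'out' (PFFFT_BACKWARD for the inverse) ...
      pffft_aligned_free(work);
      pffft_aligned_free(out);
      pffft_aligned_free(in);
      pffft_destroy_setup(s);
    }
*/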
59
60#include "pffft.h"
61#include <stdlib.h>
62#include <stdio.h>
63#include <math.h>
64#include <assert.h>
65
66/* detect compiler flavour */
67#if defined(_MSC_VER)
68# define COMPILER_MSVC
69#elif defined(__GNUC__)
70# define COMPILER_GCC
71#endif
72
73#if defined(COMPILER_GCC)
74# define ALWAYS_INLINE(return_type) return_type __attribute__ ((always_inline))
75# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
76# define RESTRICT __restrict
77# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
78#elif defined(COMPILER_MSVC)
79# define ALWAYS_INLINE(return_type) __forceinline return_type
80# define NEVER_INLINE(return_type) __declspec(noinline) return_type
81# define RESTRICT __restrict
82# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
83#endif
84
85
86/*
87 vector support macros: the rest of the code is independent of
88 SSE/Altivec/NEON -- adding support for other platforms with 4-element
89 vectors should be limited to these macros
90*/
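/*
  Concretely, a port to another 4-float SIMD instruction set only needs to
  provide the v4sf typedef plus the macros defined in the per-architecture
  blocks below: SIMD_SZ, VZERO, VMUL, VADD, VMADD, VSUB, LD_PS1, INTERLEAVE2,
  UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED. The scalar fallback at the
  end of this block shows the semantics of the arithmetic macros, and
  validate_pffft_simd() below documents (and checks) the expected behaviour
  of the shuffling ones.
*/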
91
92
93// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
94//#define PFFFT_SIMD_DISABLE
95
96/*
97 Altivec support macros
98*/
99#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
100typedef vector float v4sf;
101# define SIMD_SZ 4
102# define VZERO() ((vector float) vec_splat_u8(0))
103# define VMUL(a,b) vec_madd(a,b, VZERO())
104# define VADD(a,b) vec_add(a,b)
105# define VMADD(a,b,c) vec_madd(a,b,c)
106# define VSUB(a,b) vec_sub(a,b)
107inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
108# define LD_PS1(p) ld_ps1(&p)
109# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
110# define UNINTERLEAVE2(in1, in2, out1, out2) { \
111 vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
112 vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
113 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
114 }
115# define VTRANSPOSE4(x0,x1,x2,x3) { \
116 v4sf y0 = vec_mergeh(x0, x2); \
117 v4sf y1 = vec_mergel(x0, x2); \
118 v4sf y2 = vec_mergeh(x1, x3); \
119 v4sf y3 = vec_mergel(x1, x3); \
120 x0 = vec_mergeh(y0, y2); \
121 x1 = vec_mergel(y0, y2); \
122 x2 = vec_mergeh(y1, y3); \
123 x3 = vec_mergel(y1, y3); \
124 }
125# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
126# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
127
128/*
129 SSE1 support macros
130*/
131#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
132
133#include <xmmintrin.h>
134typedef __m128 v4sf;
135# define SIMD_SZ 4 // 4 floats per simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to do some work if you want to enable AVX with its 256-bit vectors.
136# define VZERO() _mm_setzero_ps()
137# define VMUL(a,b) _mm_mul_ps(a,b)
138# define VADD(a,b) _mm_add_ps(a,b)
139# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
140# define VSUB(a,b) _mm_sub_ps(a,b)
141# define LD_PS1(p) _mm_set1_ps(p)
142# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
143# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
144# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
145# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
146# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
147
148/*
149 ARM NEON support macros
150*/
151#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
152# include <arm_neon.h>
153typedef float32x4_t v4sf;
154# define SIMD_SZ 4
155# define VZERO() vdupq_n_f32(0)
156# define VMUL(a,b) vmulq_f32(a,b)
157# define VADD(a,b) vaddq_f32(a,b)
158# define VMADD(a,b,c) vmlaq_f32(c,a,b)
159# define VSUB(a,b) vsubq_f32(a,b)
160# define LD_PS1(p) vld1q_dup_f32(&(p))
161# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
162# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
163# define VTRANSPOSE4_(x0,x1,x2,x3) { \
164 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
165 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
166 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
167 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
168 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
169 }
170// marginally faster version
171# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
172# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
173# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
174#else
175# if !defined(PFFFT_SIMD_DISABLE)
176# warning "building with simd disabled !\n";
177# define PFFFT_SIMD_DISABLE // fallback to scalar code
178# endif
179#endif
180
181// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
182#ifdef PFFFT_SIMD_DISABLE
183typedef float v4sf;
184# define SIMD_SZ 1
185# define VZERO() 0.f
186# define VMUL(a,b) ((a)*(b))
187# define VADD(a,b) ((a)+(b))
188# define VMADD(a,b,c) ((a)*(b)+(c))
189# define VSUB(a,b) ((a)-(b))
190# define LD_PS1(p) (p)
191# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
192#endif
193
194// shortcuts for complex multiplications
195#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
196#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
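/*
  These expand to the usual complex products, lane by lane. With a = ar + i*ai
  and b = br + i*bi (right-hand sides below use the original ar, ai):
    VCPLXMUL:      a*b       ->  ar = ar*br - ai*bi,   ai = ai*br + ar*bi
    VCPLXMULCONJ:  a*conj(b) ->  ar = ar*br + ai*bi,   ai = ai*br - ar*bi
*/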
197
198#if !defined(PFFFT_SIMD_DISABLE)
199typedef union v4sf_union {
200 v4sf v;
201 float f[4];
202} v4sf_union;
203
204#include <string.h>
205
206#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
207
208/* detect bugs with the vector support macros */
209void validate_pffft_simd() {
210 float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
211 v4sf_union a0, a1, a2, a3, t, u;
212 memcpy(a0.f, f, 4*sizeof(float));
213 memcpy(a1.f, f+4, 4*sizeof(float));
214 memcpy(a2.f, f+8, 4*sizeof(float));
215 memcpy(a3.f, f+12, 4*sizeof(float));
216
217 t = a0; u = a1; t.v = VZERO();
218 printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
219 t.v = VADD(a1.v, a2.v);
220 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
221 t.v = VMUL(a1.v, a2.v);
222 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
223 t.v = VMADD(a1.v, a2.v,a0.v);
224 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
225
226 INTERLEAVE2(a1.v,a2.v,t.v,u.v);
227 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
228 assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
229 UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
230 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
231 assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
232
233 t.v=LD_PS1(f[15]);
234 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
235 assertv4(t, 15, 15, 15, 15);
236 t.v = VSWAPHL(a1.v, a2.v);
237 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
238 assertv4(t, 8, 9, 6, 7);
239 VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
240 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
241 a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
242 a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
243 assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
244}
245#endif //!PFFFT_SIMD_DISABLE
246
247/* SSE and co like 16-bytes aligned pointers */
248#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
249void *pffft_aligned_malloc(size_t nb_bytes) {
250 void *p0, *p;
251 if (!(p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT))) return (void *) 0;
252 p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
253 *((void **) p - 1) = p0;
254 return p;
255}
256
257void pffft_aligned_free(void *p) {
258 if (p) free(*((void **) p - 1));
259}
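/*
  pffft_aligned_malloc over-allocates by MALLOC_V4SF_ALIGNMENT bytes, moves
  the returned address to a 64-byte boundary inside the over-allocated block,
  and stashes the original malloc() pointer in the word just before the
  aligned block so that pffft_aligned_free can recover and free it. A pointer
  obtained from pffft_aligned_malloc must therefore be released with
  pffft_aligned_free, never with free(). Illustrative use:

    float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
    // buf is 64-byte aligned, which satisfies the VALIGNED() checks below
    pffft_aligned_free(buf);
*/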
260
261int pffft_simd_size() { return SIMD_SZ; }
262
263/*
264 passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
265*/
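/*
  Each radix-2 pass computes, for every pair of complex inputs a and b:
     out_low  = a + b
     out_high = (a - b) * w
  where w is the twiddle factor taken from wa1; fsign flips the sign of the
  imaginary part of w to select the forward or backward transform.
*/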
266static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
267 int k, i;
268 int l1ido = l1*ido;
269 if (ido <= 2) {
270 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
271 ch[0] = VADD(cc[0], cc[ido+0]);
272 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
273 ch[1] = VADD(cc[1], cc[ido+1]);
274 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
275 }
276 } else {
277 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
278 for (i=0; i<ido-1; i+=2) {
279 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
280 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
281 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
282 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
283 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
284 VCPLXMUL(tr2, ti2, wr, wi);
285 ch[i+l1ido] = tr2;
286 ch[i+l1ido+1] = ti2;
287 }
288 }
289 }
290}
291
292/*
293 passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
294*/
295static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
296 const float *wa1, const float *wa2, float fsign) {
297 static const float taur = -0.5f;
298 float taui = 0.866025403784439f*fsign;
299 int i, k;
300 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
301 int l1ido = l1*ido;
302 float wr1, wi1, wr2, wi2;
303 assert(ido > 2);
304 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
305 for (i=0; i<ido-1; i+=2) {
306 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
307 cr2 = VADD(cc[i], VMUL(LD_PS1(taur),tr2));
308 ch[i] = VADD(cc[i], tr2);
309 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
310 ci2 = VADD(cc[i +1], VMUL(LD_PS1(taur),ti2));
311 ch[i+1] = VADD(cc[i+1], ti2);
312 cr3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido], cc[i+2*ido]));
313 ci3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido+1], cc[i+2*ido+1]));
314 dr2 = VSUB(cr2, ci3);
315 dr3 = VADD(cr2, ci3);
316 di2 = VADD(ci2, cr3);
317 di3 = VSUB(ci2, cr3);
318 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
319 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
320 ch[i+l1ido] = dr2;
321 ch[i+l1ido + 1] = di2;
322 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
323 ch[i+2*l1ido] = dr3;
324 ch[i+2*l1ido+1] = di3;
325 }
326 }
327} /* passf3 */
328
329static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
330 const float *wa1, const float *wa2, const float *wa3, float fsign) {
331 /* isign == -1 for forward transform and +1 for backward transform */
332
333 int i, k;
334 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
335 int l1ido = l1*ido;
336 if (ido == 2) {
337 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
338 tr1 = VSUB(cc[0], cc[2*ido + 0]);
339 tr2 = VADD(cc[0], cc[2*ido + 0]);
340 ti1 = VSUB(cc[1], cc[2*ido + 1]);
341 ti2 = VADD(cc[1], cc[2*ido + 1]);
342 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
343 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
344 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
345 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
346
347 ch[0*l1ido + 0] = VADD(tr2, tr3);
348 ch[0*l1ido + 1] = VADD(ti2, ti3);
349 ch[1*l1ido + 0] = VADD(tr1, tr4);
350 ch[1*l1ido + 1] = VADD(ti1, ti4);
351 ch[2*l1ido + 0] = VSUB(tr2, tr3);
352 ch[2*l1ido + 1] = VSUB(ti2, ti3);
353 ch[3*l1ido + 0] = VSUB(tr1, tr4);
354 ch[3*l1ido + 1] = VSUB(ti1, ti4);
355 }
356 } else {
357 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
358 for (i=0; i<ido-1; i+=2) {
359 float wr1, wi1, wr2, wi2, wr3, wi3;
360 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
361 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
362 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
363 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
364 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
365 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
366 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
367 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
368
369 ch[i] = VADD(tr2, tr3);
370 cr3 = VSUB(tr2, tr3);
371 ch[i + 1] = VADD(ti2, ti3);
372 ci3 = VSUB(ti2, ti3);
373
374 cr2 = VADD(tr1, tr4);
375 cr4 = VSUB(tr1, tr4);
376 ci2 = VADD(ti1, ti4);
377 ci4 = VSUB(ti1, ti4);
378 wr1=wa1[i], wi1=fsign*wa1[i+1];
379 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
380 wr2=wa2[i], wi2=fsign*wa2[i+1];
381 ch[i + l1ido] = cr2;
382 ch[i + l1ido + 1] = ci2;
383
384 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
385 wr3=wa3[i], wi3=fsign*wa3[i+1];
386 ch[i + 2*l1ido] = cr3;
387 ch[i + 2*l1ido + 1] = ci3;
388
389 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
390 ch[i + 3*l1ido] = cr4;
391 ch[i + 3*l1ido + 1] = ci4;
392 }
393 }
394 }
395} /* passf4 */
396
397static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
398 static const float minus_one = -1.f;
399 int i, k, l1ido = l1*ido;
400 for (k=0; k < l1ido; k += ido) {
401 v4sf a = cc[k], b = cc[k + l1ido];
402 ch[2*k] = VADD(a, b);
403 ch[2*(k+ido)-1] = VSUB(a, b);
404 }
405 if (ido < 2) return;
406 if (ido != 2) {
407 for (k=0; k < l1ido; k += ido) {
408 for (i=2; i<ido; i+=2) {
409 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
410 v4sf br = cc[i - 1 + k], bi = cc[i + k];
411 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
412 ch[i + 2*k] = VADD(bi, ti2);
413 ch[2*(k+ido) - i] = VSUB(ti2, bi);
414 ch[i - 1 + 2*k] = VADD(br, tr2);
415 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
416 }
417 }
418 if (ido % 2 == 1) return;
419 }
420 for (k=0; k < l1ido; k += ido) {
421 ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]);
422 ch[2*k + ido-1] = cc[k + ido-1];
423 }
424} /* radf2 */
425
426
427static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
428 static const float minus_two=-2;
429 int i, k, l1ido = l1*ido;
430 v4sf a,b,c,d, tr2, ti2;
431 for (k=0; k < l1ido; k += ido) {
432 a = cc[2*k]; b = cc[2*(k+ido) - 1];
433 ch[k] = VADD(a, b);
434 ch[k + l1ido] =VSUB(a, b);
435 }
436 if (ido < 2) return;
437 if (ido != 2) {
438 for (k = 0; k < l1ido; k += ido) {
439 for (i = 2; i < ido; i += 2) {
440 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
441 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
442 ch[i-1 + k] = VADD(a, b);
443 tr2 = VSUB(a, b);
444 ch[i+0 + k] = VSUB(c, d);
445 ti2 = VADD(c, d);
446 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
447 ch[i-1 + k + l1ido] = tr2;
448 ch[i+0 + k + l1ido] = ti2;
449 }
450 }
451 if (ido % 2 == 1) return;
452 }
453 for (k = 0; k < l1ido; k += ido) {
454 a = cc[2*k + ido-1]; b = cc[2*k + ido];
455 ch[k + ido-1] = VADD(a,a);
456 ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b);
457 }
458} /* radb2 */
459
460static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
461 const float *wa1, const float *wa2) {
462 static const float taur = -0.5f;
463 static const float taui = 0.866025403784439f;
464 int i, k, ic;
465 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
466 for (k=0; k<l1; k++) {
467 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
468 ch[3*k*ido] = VADD(cc[k*ido], cr2);
469 ch[(3*k+2)*ido] = VMUL(LD_PS1(taui), VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
470 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(LD_PS1(taur), cr2));
471 }
472 if (ido == 1) return;
473 for (k=0; k<l1; k++) {
474 for (i=2; i<ido; i+=2) {
475 ic = ido - i;
476 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
477 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
478 VCPLXMULCONJ(dr2, di2, wr1, wi1);
479
480 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
481 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
482 VCPLXMULCONJ(dr3, di3, wr2, wi2);
483
484 cr2 = VADD(dr2, dr3);
485 ci2 = VADD(di2, di3);
486 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
487 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
488 tr2 = VADD(cc[i - 1 + k*ido], VMUL(LD_PS1(taur), cr2));
489 ti2 = VADD(cc[i + k*ido], VMUL(LD_PS1(taur), ci2));
490 tr3 = VMUL(LD_PS1(taui), VSUB(di2, di3));
491 ti3 = VMUL(LD_PS1(taui), VSUB(dr3, dr2));
492 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
493 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
494 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
495 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
496 }
497 }
498} /* radf3 */
499
500
501static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
502 const float *wa1, const float *wa2)
503{
504 static const float taur = -0.5f;
505 static const float taui = 0.866025403784439f;
506 static const float taui_2 = 0.866025403784439f*2;
507 int i, k, ic;
508 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
509 for (k=0; k<l1; k++) {
510 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
511 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
512 ch[k*ido] = VADD(cc[3*k*ido], tr2);
513 ci3 = VMUL(LD_PS1(taui_2), cc[(3*k + 2)*ido]);
514 ch[(k + l1)*ido] = VSUB(cr2, ci3);
515 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
516 }
517 if (ido == 1) return;
518 for (k=0; k<l1; k++) {
519 for (i=2; i<ido; i+=2) {
520 ic = ido - i;
521 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
522 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
523 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
524 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
525 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
526 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
527 cr3 = VMUL(LD_PS1(taui), VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
528 ci3 = VMUL(LD_PS1(taui), VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
529 dr2 = VSUB(cr2, ci3);
530 dr3 = VADD(cr2, ci3);
531 di2 = VADD(ci2, cr3);
532 di3 = VSUB(ci2, cr3);
533 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
534 ch[i - 1 + (k + l1)*ido] = dr2;
535 ch[i + (k + l1)*ido] = di2;
536 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
537 ch[i - 1 + (k + 2*l1)*ido] = dr3;
538 ch[i + (k + 2*l1)*ido] = di3;
539 }
540 }
541} /* radb3 */
542
543
544static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
545 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
546{
547 static const float minus_hsqt2 = (float)-0.7071067811865475;
548 int i, k, l1ido = l1*ido;
549 {
550 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
551 v4sf * RESTRICT ch_ = ch;
552 while (cc < cc_end) {
553 // this loop represents between 25% and 40% of total radf4_ps cost !
554 v4sf a0 = cc[0], a1 = cc[l1ido];
555 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
556 v4sf tr1 = VADD(a1, a3);
557 v4sf tr2 = VADD(a0, a2);
558 ch[2*ido-1] = VSUB(a0, a2);
559 ch[2*ido ] = VSUB(a3, a1);
560 ch[0 ] = VADD(tr1, tr2);
561 ch[4*ido-1] = VSUB(tr2, tr1);
562 cc += ido; ch += 4*ido;
563 }
564 cc = cc_; ch = ch_;
565 }
566 if (ido < 2) return;
567 if (ido != 2) {
568 for (k = 0; k < l1ido; k += ido) {
569 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
570 for (i=2; i<ido; i += 2, pc += 2) {
571 int ic = ido - i;
572 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
573 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
574
575 cr2 = pc[1*l1ido+0];
576 ci2 = pc[1*l1ido+1];
577 wr=LD_PS1(wa1[i - 2]);
578 wi=LD_PS1(wa1[i - 1]);
579 VCPLXMULCONJ(cr2,ci2,wr,wi);
580
581 cr3 = pc[2*l1ido+0];
582 ci3 = pc[2*l1ido+1];
583 wr = LD_PS1(wa2[i-2]);
584 wi = LD_PS1(wa2[i-1]);
585 VCPLXMULCONJ(cr3, ci3, wr, wi);
586
587 cr4 = pc[3*l1ido];
588 ci4 = pc[3*l1ido+1];
589 wr = LD_PS1(wa3[i-2]);
590 wi = LD_PS1(wa3[i-1]);
591 VCPLXMULCONJ(cr4, ci4, wr, wi);
592
593 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
594
595 tr1 = VADD(cr2,cr4);
596 tr4 = VSUB(cr4,cr2);
597 tr2 = VADD(pc[0],cr3);
598 tr3 = VSUB(pc[0],cr3);
599 ch[i - 1 + 4*k] = VADD(tr1,tr2);
600 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
601 ti1 = VADD(ci2,ci4);
602 ti4 = VSUB(ci2,ci4);
603 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
604 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
605 ti2 = VADD(pc[1],ci3);
606 ti3 = VSUB(pc[1],ci3);
607 ch[i + 4*k] = VADD(ti1, ti2);
608 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
609 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
610 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
611 }
612 }
613 if (ido % 2 == 1) return;
614 }
615 for (k=0; k<l1ido; k += ido) {
616 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
617 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
618 v4sf ti1 = VMUL(LD_PS1(minus_hsqt2), VADD(a, b));
619 v4sf tr1 = VMUL(LD_PS1(minus_hsqt2), VSUB(b, a));
620 ch[ido-1 + 4*k] = VADD(tr1, c);
621 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
622 ch[4*k + 1*ido] = VSUB(ti1, d);
623 ch[4*k + 3*ido] = VADD(ti1, d);
624 }
625} /* radf4 */
626
627
628static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
629 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
630{
631 static const float minus_sqrt2 = (float)-1.414213562373095;
632 static const float two = 2.f;
633 int i, k, l1ido = l1*ido;
634 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
635 {
636 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
637 v4sf *ch_ = ch;
638 while (ch < ch_end) {
639 v4sf a = cc[0], b = cc[4*ido-1];
640 v4sf c = cc[2*ido], d = cc[2*ido-1];
641 tr3 = VMUL(LD_PS1(two),d);
642 tr2 = VADD(a,b);
643 tr1 = VSUB(a,b);
644 tr4 = VMUL(LD_PS1(two),c);
645 ch[0*l1ido] = VADD(tr2, tr3);
646 ch[2*l1ido] = VSUB(tr2, tr3);
647 ch[1*l1ido] = VSUB(tr1, tr4);
648 ch[3*l1ido] = VADD(tr1, tr4);
649
650 cc += 4*ido; ch += ido;
651 }
652 cc = cc_; ch = ch_;
653 }
654 if (ido < 2) return;
655 if (ido != 2) {
656 for (k = 0; k < l1ido; k += ido) {
657 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
658 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
659 for (i = 2; i < ido; i += 2) {
660
661 tr1 = VSUB(pc[i], pc[4*ido - i]);
662 tr2 = VADD(pc[i], pc[4*ido - i]);
663 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
664 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
665 ph[0] = VADD(tr2, tr3);
666 cr3 = VSUB(tr2, tr3);
667
668 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
669 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
670 cr2 = VSUB(tr1, tr4);
671 cr4 = VADD(tr1, tr4);
672
673 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
674 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
675
676 ph[1] = VADD(ti2, ti3); ph += l1ido;
677 ci3 = VSUB(ti2, ti3);
678 ci2 = VADD(ti1, ti4);
679 ci4 = VSUB(ti1, ti4);
680 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
681 ph[0] = cr2;
682 ph[1] = ci2; ph += l1ido;
683 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
684 ph[0] = cr3;
685 ph[1] = ci3; ph += l1ido;
686 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
687 ph[0] = cr4;
688 ph[1] = ci4; ph = ph - 3*l1ido + 2;
689 }
690 }
691 if (ido % 2 == 1) return;
692 }
693 for (k=0; k < l1ido; k+=ido) {
694 int i0 = 4*k + ido;
695 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
696 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
697 tr1 = VSUB(c,d);
698 tr2 = VADD(c,d);
699 ti1 = VADD(b,a);
700 ti2 = VSUB(b,a);
701 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
702 ch[ido-1 + k + 1*l1ido] = VMUL(LD_PS1(minus_sqrt2), VSUB(ti1, tr1));
703 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
704 ch[ido-1 + k + 3*l1ido] = VMUL(LD_PS1(minus_sqrt2), VADD(ti1, tr1));
705 }
706} /* radb4 */
707
708static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
709 const float *wa, const int *ifac) {
710 v4sf *in = (v4sf*)input_readonly;
711 v4sf *out = (in == work2 ? work1 : work2);
712 int nf = ifac[1], k1;
713 int l2 = n;
714 int iw = n-1;
715 assert(in != out && work1 != work2);
716 for (k1 = 1; k1 <= nf; ++k1) {
717 int kh = nf - k1;
718 int ip = ifac[kh + 2];
719 int l1 = l2 / ip;
720 int ido = n / l2;
721 iw -= (ip - 1)*ido;
722 switch (ip) {
723 case 4: {
724 int ix2 = iw + ido;
725 int ix3 = ix2 + ido;
726 radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
727 } break;
728 case 3: {
729 int ix2 = iw + ido;
730 radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
731 } break;
732 case 2:
733 radf2_ps(ido, l1, in, out, &wa[iw]);
734 break;
735 default:
736 assert(0);
737 break;
738 }
739 l2 = l1;
740 if (out == work2) {
741 out = work1; in = work2;
742 } else {
743 out = work2; in = work1;
744 }
745 }
746 return in; /* this is in fact the output .. */
747} /* rfftf1 */
748
749static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
750 const float *wa, const int *ifac) {
751 v4sf *in = (v4sf*)input_readonly;
752 v4sf *out = (in == work2 ? work1 : work2);
753 int nf = ifac[1], k1;
754 int l1 = 1;
755 int iw = 0;
756 assert(in != out);
757 for (k1=1; k1<=nf; k1++) {
758 int ip = ifac[k1 + 1];
759 int l2 = ip*l1;
760 int ido = n / l2;
761 switch (ip) {
762 case 4: {
763 int ix2 = iw + ido;
764 int ix3 = ix2 + ido;
765 radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
766 } break;
767 case 3: {
768 int ix2 = iw + ido;
769 radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
770 } break;
771 case 2:
772 radb2_ps(ido, l1, in, out, &wa[iw]);
773 break;
774 default:
775 assert(0);
776 break;
777 }
778 l1 = l2;
779 iw += (ip - 1)*ido;
780
781 if (out == work2) {
782 out = work1; in = work2;
783 } else {
784 out = work2; in = work1;
785 }
786 }
787 return in; /* this is in fact the output .. */
788}
789
790static int decompose(int n, int *ifac, const int ntryh[3]) {
791 int nl = n, nf = 0, i, j = 0;
792 for (j=0; j < 3; ++j) {
793 int ntry = ntryh[j];
794 while (nl != 1) {
795 int nq = nl / ntry;
796 int nr = nl - ntry * nq;
797 if (nr == 0) {
798 ifac[2+nf++] = ntry;
799 nl = nq;
800 if (ntry == 2 && nf != 1) {
801 for (i = 2; i <= nf; ++i) {
802 int ib = nf - i + 2;
803 ifac[ib + 1] = ifac[ib];
804 }
805 ifac[2] = 2;
806 }
807 } else break;
808 }
809 }
810 ifac[0] = n;
811 ifac[1] = nf;
812 return nf;
813}
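/*
  decompose() factors n into a product of 4s, 2s and 3s (tried in the order
  given by ntryh), storing n in ifac[0], the number of factors in ifac[1] and
  the factors themselves from ifac[2] on; a single factor of 2, if present,
  is moved to the front. For example, decompose(96, ifac, {4,2,3}) yields
  ifac = { 96, 4,  2, 4, 4, 3 }   (96 = 2*4*4*3).
*/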
814
815
816
817static void rffti1_ps(int n, float *wa, int *ifac)
818{
819 static const int ntryh[3] = { 4,2,3 };
820 int k1, j, ii;
821
822 int nf = decompose(n,ifac,ntryh);
823 float argh = (2*M_PI) / n;
824 int is = 0;
825 int nfm1 = nf - 1;
826 int l1 = 1;
827 if (nfm1 == 0) return;
828 for (k1 = 1; k1 <= nfm1; k1++) {
829 int ip = ifac[k1 + 1];
830 int ld = 0;
831 int l2 = l1*ip;
832 int ido = n / l2;
833 int ipm = ip - 1;
834 for (j = 1; j <= ipm; ++j) {
835 float argld;
836 int i = is, fi=0;
837 ld += l1;
838 argld = ld*argh;
839 for (ii = 3; ii <= ido; ii += 2) {
840 i += 2;
841 fi += 1;
842 wa[i - 2] = cos(fi*argld);
843 wa[i - 1] = sin(fi*argld);
844 }
845 is += ido;
846 }
847 l1 = l2;
848 }
849} /* rffti1 */
850
851void cffti1_ps(int n, float *wa, int *ifac)
852{
853 static const int ntryh[3] = { 3,4,2 };
854 int k1, j, ii;
855
856 int nf = decompose(n,ifac,ntryh);
857 float argh = (2*M_PI)/(float)n;
858 int i = 1;
859 int l1 = 1;
860 for (k1=1; k1<=nf; k1++) {
861 int ip = ifac[k1+1];
862 int ld = 0;
863 int l2 = l1*ip;
864 int ido = n / l2;
865 int idot = ido + ido + 2;
866 int ipm = ip - 1;
867 for (j=1; j<=ipm; j++) {
868 float argld;
869 int i1 = i, fi = 0;
870 wa[i-1] = 1;
871 wa[i] = 0;
872 ld += l1;
873 argld = ld*argh;
874 for (ii = 4; ii <= idot; ii += 2) {
875 i += 2;
876 fi += 1;
877 wa[i-1] = cos(fi*argld);
878 wa[i] = sin(fi*argld);
879 }
880 if (ip > 5) {
881 wa[i1-1] = wa[i-1];
882 wa[i1] = wa[i];
883 }
884 }
885 l1 = l2;
886 }
887} /* cffti1 */
888
889
890v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
891 v4sf *in = (v4sf*)input_readonly;
892 v4sf *out = (in == work2 ? work1 : work2);
893 int nf = ifac[1], k1;
894 int l1 = 1;
895 int iw = 0;
896 assert(in != out && work1 != work2);
897 for (k1=2; k1<=nf+1; k1++) {
898 int ip = ifac[k1];
899 int l2 = ip*l1;
900 int ido = n / l2;
901 int idot = ido + ido;
902 switch (ip) {
903 case 4: {
904 int ix2 = iw + idot;
905 int ix3 = ix2 + idot;
906 passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
907 } break;
908 case 2: {
909 passf2_ps(idot, l1, in, out, &wa[iw], isign);
910 } break;
911 case 3: {
912 int ix2 = iw + idot;
913 passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
914 } break;
915 default:
916 assert(0);
917 }
918 l1 = l2;
919 iw += (ip - 1)*idot;
920 if (out == work2) {
921 out = work1; in = work2;
922 } else {
923 out = work2; in = work1;
924 }
925 }
926
927 return in; /* this is in fact the output .. */
928}
929
930
931struct PFFFT_Setup {
932 int N;
933 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
934 int ifac[15];
935 pffft_transform_t transform;
936 v4sf *data; // allocated room for twiddle coefs
937 float *e; // points into 'data' , N/4*3 elements
938 float *twiddle; // points into 'data', N/4 elements
939};
940
941PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
942 PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
943 int k, m;
944 if (transform == PFFFT_REAL) { assert(N >= 32); }
945 if (transform == PFFFT_COMPLEX) { assert(N >= 16); }
946 //assert((N % 32) == 0);
947 s->N = N;
948 s->transform = transform;
949 /* nb of complex simd vectors */
950 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
951 s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
952 s->e = (float*)s->data;
953 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
954
955 if (transform == PFFFT_REAL) {
956 for (k=0; k < s->Ncvec; ++k) {
957 int i = k/SIMD_SZ;
958 int j = k%SIMD_SZ;
959 for (m=0; m < SIMD_SZ-1; ++m) {
960 float A = -2*M_PI*(m+1)*k / N;
961 s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
962 s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
963 }
964 }
965 rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
966 } else {
967 for (k=0; k < s->Ncvec; ++k) {
968 int i = k/SIMD_SZ;
969 int j = k%SIMD_SZ;
970 for (m=0; m < SIMD_SZ-1; ++m) {
971 float A = -2*M_PI*(m+1)*k / N;
972 s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
973 s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
974 }
975 }
976 cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
977 }
978 return s;
979}
980
981
982void pffft_destroy_setup(PFFFT_Setup *s) {
983 pffft_aligned_free(s->data);
984 free(s);
985}
986
987#if !defined(PFFFT_SIMD_DISABLE)
988
989/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
990static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
991 v4sf g0, g1;
992 int k;
993 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
994
995 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
996 for (k=1; k < N; ++k) {
997 v4sf h0, h1;
998 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
999 *--out = VSWAPHL(g1, h0);
1000 *--out = VSWAPHL(h0, h1);
1001 g1 = h1;
1002 }
1003 *--out = VSWAPHL(g1, g0);
1004}
1005
1006static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1007 v4sf g0, g1, h0, h1;
1008 int k;
1009 g0 = g1 = in[0]; ++in;
1010 for (k=1; k < N; ++k) {
1011 h0 = *in++; h1 = *in++;
1012 g1 = VSWAPHL(g1, h0);
1013 h0 = VSWAPHL(h0, h1);
1014 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1015 g1 = h1;
1016 }
1017 h0 = *in++; h1 = g0;
1018 g1 = VSWAPHL(g1, h0);
1019 h0 = VSWAPHL(h0, h1);
1020 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1021}
1022
1023void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1024 int k, N = setup->N, Ncvec = setup->Ncvec;
1025 const v4sf *vin = (const v4sf*)in;
1026 v4sf *vout = (v4sf*)out;
1027 assert(in != out);
1028 if (setup->transform == PFFFT_REAL) {
1029 int k, dk = N/32;
1030 if (direction == PFFFT_FORWARD) {
1031 for (k=0; k < dk; ++k) {
1032 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1033 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1034 }
1035 reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1036 reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1037 } else {
1038 for (k=0; k < dk; ++k) {
1039 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1040 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1041 }
1042 unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1043 unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1044 }
1045 } else {
1046 if (direction == PFFFT_FORWARD) {
1047 for (k=0; k < Ncvec; ++k) {
1048 int kk = (k/4) + (k%4)*(Ncvec/4);
1049 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1050 }
1051 } else {
1052 for (k=0; k < Ncvec; ++k) {
1053 int kk = (k/4) + (k%4)*(Ncvec/4);
1054 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1055 }
1056 }
1057 }
1058}
1059
1060void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1061 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1062 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1063 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1064 assert(in != out);
1065 for (k=0; k < dk; ++k) {
1066 r0 = in[8*k+0]; i0 = in[8*k+1];
1067 r1 = in[8*k+2]; i1 = in[8*k+3];
1068 r2 = in[8*k+4]; i2 = in[8*k+5];
1069 r3 = in[8*k+6]; i3 = in[8*k+7];
1070 VTRANSPOSE4(r0,r1,r2,r3);
1071 VTRANSPOSE4(i0,i1,i2,i3);
1072 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1073 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1074 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1075
1076 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1077 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1078 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1079 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1080
1081 /*
1082 transformation for each column is:
1083
1084 [1 1 1 1 0 0 0 0] [r0]
1085 [1 0 -1 0 0 -1 0 1] [r1]
1086 [1 -1 1 -1 0 0 0 0] [r2]
1087 [1 0 -1 0 0 1 0 -1] [r3]
1088 [0 0 0 0 1 1 1 1] * [i0]
1089 [0 1 0 -1 1 0 -1 0] [i1]
1090 [0 0 0 0 1 -1 1 -1] [i2]
1091 [0 -1 0 1 1 0 -1 0] [i3]
1092 */
1093
1094 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1095 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1096 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1097 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1098
1099 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1100 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1101 }
1102}
1103
1104void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1105 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1106 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1107 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1108 assert(in != out);
1109 for (k=0; k < dk; ++k) {
1110 r0 = in[8*k+0]; i0 = in[8*k+1];
1111 r1 = in[8*k+2]; i1 = in[8*k+3];
1112 r2 = in[8*k+4]; i2 = in[8*k+5];
1113 r3 = in[8*k+6]; i3 = in[8*k+7];
1114
1115 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1116 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1117 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1118 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1119
1120 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1121 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1122 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1123 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1124
1125 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1126 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1127 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1128
1129 VTRANSPOSE4(r0,r1,r2,r3);
1130 VTRANSPOSE4(i0,i1,i2,i3);
1131
1132 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1133 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1134 }
1135}
1136
1137
1138ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1139 const v4sf *e, v4sf *out) {
1140 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1141 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1142 r0 = *in0; i0 = *in1;
1143 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1144 VTRANSPOSE4(r0,r1,r2,r3);
1145 VTRANSPOSE4(i0,i1,i2,i3);
1146
1147 /*
1148 transformation for each column is:
1149
1150 [1 1 1 1 0 0 0 0] [r0]
1151 [1 0 -1 0 0 -1 0 1] [r1]
1152 [1 0 -1 0 0 1 0 -1] [r2]
1153 [1 -1 1 -1 0 0 0 0] [r3]
1154 [0 0 0 0 1 1 1 1] * [i0]
1155 [0 -1 0 1 -1 0 1 0] [i1]
1156 [0 -1 0 1 1 0 -1 0] [i2]
1157 [0 0 0 0 -1 1 -1 1] [i3]
1158 */
1159
1160 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1161 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1162
1163 VCPLXMUL(r1,i1,e[0],e[1]);
1164 VCPLXMUL(r2,i2,e[2],e[3]);
1165 VCPLXMUL(r3,i3,e[4],e[5]);
1166
1167 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1168 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1169
1170 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1171 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1172 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1173 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1174
1175 r0 = VADD(sr0, sr1);
1176 r3 = VSUB(sr0, sr1);
1177 i0 = VADD(si0, si1);
1178 i3 = VSUB(si1, si0);
1179 r1 = VADD(dr0, di1);
1180 r2 = VSUB(dr0, di1);
1181 i1 = VSUB(dr1, di0);
1182 i2 = VADD(dr1, di0);
1183
1184 *out++ = r0;
1185 *out++ = i0;
1186 *out++ = r1;
1187 *out++ = i1;
1188 *out++ = r2;
1189 *out++ = i2;
1190 *out++ = r3;
1191 *out++ = i3;
1192
1193}
1194
1195static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1196 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1197 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1198
1199 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1200 v4sf save = in[7], zero=VZERO();
1201 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1202 static const float s = M_SQRT2/2;
1203
1204 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1205 assert(in != out);
1206 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1207
1208 /*
1209 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1210
1211 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1212 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1213 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1214 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1215 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1216 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1217 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1218 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1219 */
1220
1221 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1222 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1223 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1224 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1225 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1226 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1227 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1228 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1229
1230 for (k=1; k < dk; ++k) {
1231 v4sf save_next = in[8*k+7];
1232 pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
1233 e + k*6, out + k*8);
1234 save = save_next;
1235 }
1236
1237}
1238
1239ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
1240 const v4sf *e, v4sf *out, int first) {
1241 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1242 /*
1243 transformation for each column is:
1244
1245 [1 1 1 1 0 0 0 0] [r0]
1246 [1 0 0 -1 0 -1 -1 0] [r1]
1247 [1 -1 -1 1 0 0 0 0] [r2]
1248 [1 0 0 -1 0 1 1 0] [r3]
1249 [0 0 0 0 1 -1 1 -1] * [i0]
1250 [0 -1 1 0 1 0 0 1] [i1]
1251 [0 0 0 0 1 1 -1 -1] [i2]
1252 [0 1 -1 0 1 0 0 1] [i3]
1253 */
1254
1255 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1256 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1257 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1258 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1259
1260 r0 = VADD(sr0, sr1);
1261 r2 = VSUB(sr0, sr1);
1262 r1 = VSUB(dr0, si1);
1263 r3 = VADD(dr0, si1);
1264 i0 = VSUB(di0, di1);
1265 i2 = VADD(di0, di1);
1266 i1 = VSUB(si0, dr1);
1267 i3 = VADD(si0, dr1);
1268
1269 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1270 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1271 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1272
1273 VTRANSPOSE4(r0,r1,r2,r3);
1274 VTRANSPOSE4(i0,i1,i2,i3);
1275
1276 if (!first) {
1277 *out++ = r0;
1278 *out++ = i0;
1279 }
1280 *out++ = r1;
1281 *out++ = i1;
1282 *out++ = r2;
1283 *out++ = i2;
1284 *out++ = r3;
1285 *out++ = i3;
1286}
1287
1288static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1289 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1290 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1291
1292 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1293 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1294 static const float s = M_SQRT2;
1295 assert(in != out);
1296 for (k=0; k < 4; ++k) {
1297 Xr.f[k] = ((float*)in)[8*k];
1298 Xi.f[k] = ((float*)in)[8*k+4];
1299 }
1300
1301 pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1302
1303 /*
1304 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1305
1306 [cr0] [1 0 2 0 1 0 0 0]
1307 [cr1] [1 0 0 0 -1 0 -2 0]
1308 [cr2] [1 0 -2 0 1 0 0 0]
1309 [cr3] [1 0 0 0 -1 0 2 0]
1310 [ci0] [0 2 0 2 0 0 0 0]
1311 [ci1] [0 s 0 -s 0 -s 0 -s]
1312 [ci2] [0 0 0 0 0 -2 0 2]
1313 [ci3] [0 -s 0 s 0 -s 0 -s]
1314 */
1315 for (k=1; k < dk; ++k) {
1316 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1317 }
1318
1319 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1320 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1321 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1322 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1323 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1324 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1325 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1326 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1327}
1328
1329
1330void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
1331 pffft_direction_t direction, int ordered) {
1332 int k, Ncvec = setup->Ncvec;
1333 int nf_odd = (setup->ifac[1] & 1);
1334
1335 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1336 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1337 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1338 if (scratch == 0) scratch = scratch_on_stack;
1339
1340 const v4sf *vinput = (const v4sf*)finput;
1341 v4sf *voutput = (v4sf*)foutput;
1342 v4sf *buff[2] = { voutput, scratch };
1343 int ib = (nf_odd ^ ordered ? 1 : 0);
1344
1345 assert(VALIGNED(finput) && VALIGNED(foutput));
1346
1347 //assert(finput != foutput);
1348 if (direction == PFFFT_FORWARD) {
1349 ib = !ib;
1350 if (setup->transform == PFFFT_REAL) {
1351 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1352 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1353 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1354 } else {
1355 v4sf *tmp = buff[ib];
1356 for (k=0; k < Ncvec; ++k) {
1357 UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1358 }
1359 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1360 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1361 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1362 }
1363 if (ordered) {
1364 pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1365 } else ib = !ib;
1366 } else {
1367 if (vinput == buff[ib]) {
1368 ib = !ib; // may happen when finput == foutput
1369 }
1370 if (ordered) {
1371 pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1372 vinput = buff[ib]; ib = !ib;
1373 }
1374 if (setup->transform == PFFFT_REAL) {
1375 pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1376 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1377 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1378 } else {
1379 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1380 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1381 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1382 for (k=0; k < Ncvec; ++k) {
1383 INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1384 }
1385 }
1386 }
1387
1388 if (buff[ib] != voutput) {
1389 /* extra copy required -- this situation should only happen when finput == foutput */
1390 assert(finput==foutput);
1391 for (k=0; k < Ncvec; ++k) {
1392 v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1393 voutput[2*k] = a; voutput[2*k+1] = b;
1394 }
1395 ib = !ib;
1396 }
1397 assert(buff[ib] == voutput);
1398}
1399
1400void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
1401 int i, Ncvec = s->Ncvec;
1402 const v4sf * RESTRICT va = (const v4sf*)a;
1403 const v4sf * RESTRICT vb = (const v4sf*)b;
1404 v4sf * RESTRICT vab = (v4sf*)ab;
1405
1406#ifdef __arm__
1407 __builtin_prefetch(va);
1408 __builtin_prefetch(vb);
1409 __builtin_prefetch(vab);
1410 __builtin_prefetch(va+2);
1411 __builtin_prefetch(vb+2);
1412 __builtin_prefetch(vab+2);
1413 __builtin_prefetch(va+4);
1414 __builtin_prefetch(vb+4);
1415 __builtin_prefetch(vab+4);
1416 __builtin_prefetch(va+6);
1417 __builtin_prefetch(vb+6);
1418 __builtin_prefetch(vab+6);
1419#endif
1420
1421 float ar, ai, br, bi, abr, abi;
1422 v4sf vscal = LD_PS1(scaling);
1423
1424 assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1425 ar = ((v4sf_union*)va)[0].f[0];
1426 ai = ((v4sf_union*)va)[1].f[0];
1427 br = ((v4sf_union*)vb)[0].f[0];
1428 bi = ((v4sf_union*)vb)[1].f[0];
1429 abr = ((v4sf_union*)vab)[0].f[0];
1430 abi = ((v4sf_union*)vab)[1].f[0];
1431
1432#ifdef __arm__
1433# if 1 // inline asm version
1434 const float *a_ = a, *b_ = b; float *ab_ = ab;
1435 int N = Ncvec;
1436 asm volatile("mov r8, %2 \n"
1437 "vdup.f32 q15, %4 \n"
1438 "1: \n"
1439 "pld [%0,#64] \n"
1440 "pld [%1,#64] \n"
1441 "pld [%2,#64] \n"
1442 "pld [%0,#96] \n"
1443 "pld [%1,#96] \n"
1444 "pld [%2,#96] \n"
1445 "vld1.f32 {q0,q1}, [%0,:128]! \n"
1446 "vld1.f32 {q4,q5}, [%1,:128]! \n"
1447 "vld1.f32 {q2,q3}, [%0,:128]! \n"
1448 "vld1.f32 {q6,q7}, [%1,:128]! \n"
1449 "vld1.f32 {q8,q9}, [r8,:128]! \n"
1450
1451 "vmul.f32 q10, q0, q4 \n"
1452 "vmul.f32 q11, q0, q5 \n"
1453 "vmul.f32 q12, q2, q6 \n"
1454 "vmul.f32 q13, q2, q7 \n"
1455 "vmls.f32 q10, q1, q5 \n"
1456 "vmla.f32 q11, q1, q4 \n"
1457 "vld1.f32 {q0,q1}, [r8,:128]! \n"
1458 "vmls.f32 q12, q3, q7 \n"
1459 "vmla.f32 q13, q3, q6 \n"
1460 "vmla.f32 q8, q10, q15 \n"
1461 "vmla.f32 q9, q11, q15 \n"
1462 "vmla.f32 q0, q12, q15 \n"
1463 "vmla.f32 q1, q13, q15 \n"
1464 "vst1.f32 {q8,q9},[%2,:128]! \n"
1465 "vst1.f32 {q0,q1},[%2,:128]! \n"
1466 "subs %3, #2 \n"
1467 "bne 1b \n"
1468 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1469
1470# else // neon intrinsics version, 30% slower than the asm one with gcc 4.6
1471 v4sf a1r, a1i, b1r, b1i;
1472 v4sf a2r, a2i, b2r, b2i;
1473 v4sf ab1r, ab1i, ab2r, ab2i;
1474 for (i=0; i < Ncvec; i += 2) {
1475 __builtin_prefetch(va+8);
1476 __builtin_prefetch(va+10);
1477
1478 a1r = *va++; a1i = *va++;
1479 a2r = *va++; a2i = *va++;
1480 b1r = *vb++; b1i = *vb++;
1481 b2r = *vb++; b2i = *vb++;
1482 ab1r = vab[0]; ab1i = vab[1];
1483 ab2r = vab[2]; ab2i = vab[3];
1484
1485 v4sf z1r = VMUL(a1r, b1r);
1486 v4sf z2r = VMUL(a2r, b2r);
1487 v4sf z1i = VMUL(a1r, b1i);
1488 v4sf z2i = VMUL(a2r, b2i);
1489
1490 __builtin_prefetch(vb+4);
1491 __builtin_prefetch(vb+6);
1492
1493 z1r = vmlsq_f32(z1r, a1i, b1i);
1494 z2r = vmlsq_f32(z2r, a2i, b2i);
1495 z1i = vmlaq_f32(z1i, a1i, b1r);
1496 z2i = vmlaq_f32(z2i, a2i, b2r);
1497
1498 __builtin_prefetch(vab+4);
1499 __builtin_prefetch(vab+6);
1500
1501 ab1r = vmlaq_f32(ab1r, z1r, vscal);
1502 ab2r = vmlaq_f32(ab2r, z2r, vscal);
1503 ab1i = vmlaq_f32(ab1i, z1i, vscal);
1504 ab2i = vmlaq_f32(ab2i, z2i, vscal);
1505
1506 *vab++ = ab1r; *vab++ = ab1i;
1507 *vab++ = ab2r; *vab++ = ab2i;
1508 }
1509# endif
1510
1511#else // not ARM, no need to use a special routine
1512 for (i=0; i < Ncvec; i += 2) {
1513 v4sf ar, ai, br, bi;
1514 ar = va[2*i+0]; ai = va[2*i+1];
1515 br = vb[2*i+0]; bi = vb[2*i+1];
1516 VCPLXMUL(ar, ai, br, bi);
1517 vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1518 vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1519 ar = va[2*i+2]; ai = va[2*i+3];
1520 br = vb[2*i+2]; bi = vb[2*i+3];
1521 VCPLXMUL(ar, ai, br, bi);
1522 vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1523 vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1524 }
1525#endif
1526 if (s->transform == PFFFT_REAL) {
1527 ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1528 ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1529 }
1530}
1531
1532
1533#else // defined(PFFFT_SIMD_DISABLE)
1534
1535// standard routine using scalar floats, without SIMD stuff.
1536
1537#define pffft_zreorder_nosimd pffft_zreorder
1538void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1539 int k, N = setup->N;
1540 if (setup->transform == PFFFT_COMPLEX) {
1541 for (k=0; k < 2*N; ++k) out[k] = in[k];
1542 return;
1543 }
1544 else if (direction == PFFFT_FORWARD) {
1545 float x_N = in[N-1];
1546 for (k=N-1; k > 1; --k) out[k] = in[k-1];
1547 out[0] = in[0];
1548 out[1] = x_N;
1549 } else {
1550 float x_N = in[1];
1551 for (k=1; k < N-1; ++k) out[k] = in[k+1];
1552 out[0] = in[0];
1553 out[N-1] = x_N;
1554 }
1555}
1556
1557#define pffft_transform_internal_nosimd pffft_transform_internal
1558void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
1559 pffft_direction_t direction, int ordered) {
1560 int Ncvec = setup->Ncvec;
1561 int nf_odd = (setup->ifac[1] & 1);
1562
1563 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1564 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1565 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1566 if (scratch == 0) scratch = scratch_on_stack;
1567
1568 float *buff[2] = { output, scratch };
1569 int ib;
1570 if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
1571 ib = (nf_odd ^ ordered ? 1 : 0);
1572
1573 if (direction == PFFFT_FORWARD) {
1574 if (setup->transform == PFFFT_REAL) {
1575 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1576 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1577 } else {
1578 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1579 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1580 }
1581 if (ordered) {
1582 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1583 }
1584 } else {
1585 if (input == buff[ib]) {
1586 ib = !ib; // may happen when finput == foutput
1587 }
1588 if (ordered) {
1589 pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
1590 input = buff[!ib];
1591 }
1592 if (setup->transform == PFFFT_REAL) {
1593 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1594 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1595 } else {
1596 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1597 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1598 }
1599 }
1600 if (buff[ib] != output) {
1601 int k;
1602 // extra copy required -- this situation should happen only when finput == foutput
1603 assert(input==output);
1604 for (k=0; k < Ncvec; ++k) {
1605 float a = buff[ib][2*k], b = buff[ib][2*k+1];
1606 output[2*k] = a; output[2*k+1] = b;
1607 }
1608 ib = !ib;
1609 }
1610 assert(buff[ib] == output);
1611}
1612
1613#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
1614void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
1615 float *ab, float scaling) {
1616 int i, Ncvec = s->Ncvec;
1617
1618 if (s->transform == PFFFT_REAL) {
1619 // take care of the fftpack ordering
1620 ab[0] += a[0]*b[0]*scaling;
1621 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
1622 ++ab; ++a; ++b; --Ncvec;
1623 }
1624 for (i=0; i < Ncvec; ++i) {
1625 float ar, ai, br, bi;
1626 ar = a[2*i+0]; ai = a[2*i+1];
1627 br = b[2*i+0]; bi = b[2*i+1];
1628 VCPLXMUL(ar, ai, br, bi);
1629 ab[2*i+0] += ar*scaling;
1630 ab[2*i+1] += ai*scaling;
1631 }
1632}
1633
1634#endif // defined(PFFFT_SIMD_DISABLE)
1635
1636void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1637 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
1638}
1639
1640void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1641 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
1642}
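/*
  Sketch of a typical FFT convolution built from the functions above
  (illustrative only -- see pffft.h for the exact contract of each call).
  The spectra passed to pffft_zconvolve_accumulate are expected in the
  internal, unordered layout produced by pffft_transform(), and the
  transforms are unscaled, hence the 1.0f/N factor passed as 'scaling':

    // a, b, ab, work: N floats each, from pffft_aligned_malloc; ab zeroed
    pffft_transform(s, a, a, work, PFFFT_FORWARD);
    pffft_transform(s, b, b, work, PFFFT_FORWARD);
    pffft_zconvolve_accumulate(s, a, b, ab, 1.0f/N);
    pffft_transform(s, ab, ab, work, PFFFT_BACKWARD);  // ab now holds a (*) b
*/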