1/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
49 PFFFT : a Pretty Fast FFT.
50
51 This file is largely based on the original FFTPACK implementation, modified in
52 order to take advantage of SIMD instructions of modern CPUs.
53*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
59
60#include "pffft.h"
61#include <stdlib.h>
62#include <stdio.h>
63#include <math.h>
64#include <assert.h>
65
66/* detect compiler flavour */
67#if defined(_MSC_VER)
68# define COMPILER_MSVC
69#elif defined(__GNUC__)
70# define COMPILER_GCC
71#endif
72
73#if defined(COMPILER_GCC)
74# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
75# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
76# define RESTRICT __restrict
77# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
78#elif defined(COMPILER_MSVC)
79# define ALWAYS_INLINE(return_type) __forceinline return_type
80# define NEVER_INLINE(return_type) __declspec(noinline) return_type
81# define RESTRICT __restrict
82# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
83#endif
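/*
  VLA_ARRAY_ON_STACK expands to a C99 variable-length array under GCC and to an
  _alloca() call under MSVC; e.g. VLA_ARRAY_ON_STACK(v4sf, buf, n) -- names here
  are illustrative only -- declares a stack buffer of n v4sf elements either way.
  pffft_transform_internal below relies on this when no scratch buffer is given.
*/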
84
85
86/*
87 vector support macros: the rest of the code is independent of
88 SSE/Altivec/NEON -- adding support for other platforms with 4-element
89 vectors should be limited to these macros
90*/
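/*
  As a rough guide for porters (a sketch only -- the authoritative reference is
  the scalar fallback further below and the Altivec/SSE/NEON sections), a port
  has to provide at least:

    typedef float v4sf;           // the SIMD register type (here: scalar fallback)
    #define SIMD_SZ 1             // number of floats per register
    #define VZERO()       0.f
    #define VMUL(a,b)     ((a)*(b))
    #define VADD(a,b)     ((a)+(b))
    #define VMADD(a,b,c)  ((a)*(b)+(c))
    #define VSUB(a,b)     ((a)-(b))
    #define LD_PS1(p)     (p)
    #define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)

  plus, for a real 4-wide back-end, INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4 and
  VSWAPHL as done in the sections that follow.
*/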
91
92
93// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
94//#define PFFFT_SIMD_DISABLE
95
96/*
97 Altivec support macros
98*/
99#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
100typedef vector float v4sf;
101# define SIMD_SZ 4
102# define VZERO() ((vector float) vec_splat_u8(0))
103# define VMUL(a,b) vec_madd(a,b, VZERO())
104# define VADD(a,b) vec_add(a,b)
105# define VMADD(a,b,c) vec_madd(a,b,c)
106# define VSUB(a,b) vec_sub(a,b)
107inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
108# define LD_PS1(p) ld_ps1(&p)
109# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
110# define UNINTERLEAVE2(in1, in2, out1, out2) { \
111 vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
112 vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
113 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
114 }
115# define VTRANSPOSE4(x0,x1,x2,x3) { \
116 v4sf y0 = vec_mergeh(x0, x2); \
117 v4sf y1 = vec_mergel(x0, x2); \
118 v4sf y2 = vec_mergeh(x1, x3); \
119 v4sf y3 = vec_mergel(x1, x3); \
120 x0 = vec_mergeh(y0, y2); \
121 x1 = vec_mergel(y0, y2); \
122 x2 = vec_mergeh(y1, y3); \
123 x3 = vec_mergel(y1, y3); \
124 }
125# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
126# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
127
128/*
129 SSE1 support macros
130*/
131#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
132
133#include <xmmintrin.h>
134typedef __m128 v4sf;
135# define SIMD_SZ 4 // 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway, so you will have to do some work if you want to enable AVX with its 256-bit vectors.
136# define VZERO() _mm_setzero_ps()
137# define VMUL(a,b) _mm_mul_ps(a,b)
138# define VADD(a,b) _mm_add_ps(a,b)
139# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
140# define VSUB(a,b) _mm_sub_ps(a,b)
141# define LD_PS1(p) _mm_set1_ps(p)
142# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
143# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
144# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
145# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
146# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
147
148/*
149 ARM NEON support macros
150*/
151#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
152# include <arm_neon.h>
153typedef float32x4_t v4sf;
154# define SIMD_SZ 4
155# define VZERO() vdupq_n_f32(0)
156# define VMUL(a,b) vmulq_f32(a,b)
157# define VADD(a,b) vaddq_f32(a,b)
158# define VMADD(a,b,c) vmlaq_f32(c,a,b)
159# define VSUB(a,b) vsubq_f32(a,b)
160# define LD_PS1(p) vld1q_dup_f32(&(p))
161# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
162# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
163# define VTRANSPOSE4(x0,x1,x2,x3) { \
164 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
165 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
166 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
167 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
168 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
169 }
170// marginally faster version
171//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
172# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
173# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
174#else
175# if !defined(PFFFT_SIMD_DISABLE)
176# warning "building with SIMD disabled!"
177# define PFFFT_SIMD_DISABLE // fallback to scalar code
178# endif
179#endif
180
181// fallback mode for situations where SSE/Altivec/NEON are not available: use scalar code instead
182#ifdef PFFFT_SIMD_DISABLE
183typedef float v4sf;
184# define SIMD_SZ 1
185# define VZERO() 0.f
186# define VMUL(a,b) ((a)*(b))
187# define VADD(a,b) ((a)+(b))
188# define VMADD(a,b,c) ((a)*(b)+(c))
189# define VSUB(a,b) ((a)-(b))
190# define LD_PS1(p) (p)
191# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
192#endif
193
194// shortcuts for complex multiplications
195#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
196#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
197#ifndef SVMUL
198// multiply a scalar with a vector
199#define SVMUL(f,v) VMUL(LD_PS1(f),v)
200#endif
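/*
  For reference, VCPLXMUL and VCPLXMULCONJ above compute, lane by lane (this is
  just the arithmetic of the macros written out):

    VCPLXMUL:     (ar + i*ai) * (br + i*bi)  ->  ar = ar*br - ai*bi,  ai = ai*br + ar*bi
    VCPLXMULCONJ: (ar + i*ai) * (br - i*bi)  ->  ar = ar*br + ai*bi,  ai = ai*br - ar*bi

  i.e. the result overwrites (ar, ai); SVMUL(f, v) simply broadcasts the scalar
  f and multiplies it with the vector v.
*/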
201
202#if !defined(PFFFT_SIMD_DISABLE)
203typedef union v4sf_union {
204 v4sf v;
205 float f[4];
206} v4sf_union;
207
208#include <string.h>
209
210#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
211
212/* detect bugs with the vector support macros */
213void validate_pffft_simd() {
214 float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
215 v4sf_union a0, a1, a2, a3, t, u;
216 memcpy(a0.f, f, 4*sizeof(float));
217 memcpy(a1.f, f+4, 4*sizeof(float));
218 memcpy(a2.f, f+8, 4*sizeof(float));
219 memcpy(a3.f, f+12, 4*sizeof(float));
220
221 t = a0; u = a1; t.v = VZERO();
222 printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
223 t.v = VADD(a1.v, a2.v);
224 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
225 t.v = VMUL(a1.v, a2.v);
226 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
227 t.v = VMADD(a1.v, a2.v,a0.v);
228 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
229
230 INTERLEAVE2(a1.v,a2.v,t.v,u.v);
231 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
232 assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
233 UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
234 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
235 assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
236
237 t.v=LD_PS1(f[15]);
238 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
239 assertv4(t, 15, 15, 15, 15);
240 t.v = VSWAPHL(a1.v, a2.v);
241 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
242 assertv4(t, 8, 9, 6, 7);
243 VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
244 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
245 a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
246 a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
247 assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
248}
249#endif //!PFFFT_SIMD_DISABLE
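/*
  validate_pffft_simd() is not called anywhere else in this file; it is meant as
  a porting aid, e.g. from a throwaway test driver such as (hypothetical):

    int main(void) { validate_pffft_simd(); return 0; }

  which prints each macro's result and asserts the expected values.
*/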
250
251/* SSE and co like 16-byte aligned pointers */
252#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
253void *pffft_aligned_malloc(size_t nb_bytes) {
254  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
255 if (!p0) return (void *) 0;
256  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
257 *((void **) p - 1) = p0;
258 return p;
259}
260
261void pffft_aligned_free(void *p) {
262 if (p) free(*((void **) p - 1));
263}
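/*
  How the pair above works: malloc() gets MALLOC_V4SF_ALIGNMENT extra bytes, the
  address handed back to the caller is p0 rounded up to the next 64-byte
  boundary, and the original malloc pointer p0 is stashed in the pointer-sized
  slot just below that aligned address, where pffft_aligned_free() retrieves it
  with *((void**)p - 1). The padding left for that slot is at least
  sizeof(void*) in practice, since malloc itself returns aligned blocks.
*/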
264
265int pffft_simd_size() { return SIMD_SZ; }
266
267/*
268  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
269*/
270static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
271 int k, i;
272 int l1ido = l1*ido;
273 if (ido <= 2) {
274 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
275 ch[0] = VADD(cc[0], cc[ido+0]);
276 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
277 ch[1] = VADD(cc[1], cc[ido+1]);
278 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
279 }
280 } else {
281 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
282 for (i=0; i<ido-1; i+=2) {
283 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
284 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
285 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
286 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
287 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
288 VCPLXMUL(tr2, ti2, wr, wi);
289 ch[i+l1ido] = tr2;
290 ch[i+l1ido+1] = ti2;
291 }
292 }
293 }
294}
295
296/*
297  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
298*/
299static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
300 const float *wa1, const float *wa2, float fsign) {
301 static const float taur = -0.5f;
302 float taui = 0.866025403784439f*fsign;
303 int i, k;
304 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
305 int l1ido = l1*ido;
306 float wr1, wi1, wr2, wi2;
307 assert(ido > 2);
308 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
309 for (i=0; i<ido-1; i+=2) {
310 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
311      cr2 = VADD(cc[i], SVMUL(taur,tr2));
312      ch[i] = VADD(cc[i], tr2);
313 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
314      ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
315      ch[i+1] = VADD(cc[i+1], ti2);
316      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
317      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
318      dr2 = VSUB(cr2, ci3);
319 dr3 = VADD(cr2, ci3);
320 di2 = VADD(ci2, cr3);
321 di3 = VSUB(ci2, cr3);
322 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
323 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
324 ch[i+l1ido] = dr2;
325 ch[i+l1ido + 1] = di2;
326 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
327 ch[i+2*l1ido] = dr3;
328 ch[i+2*l1ido+1] = di3;
329 }
330 }
331} /* passf3 */
332
333static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
334 const float *wa1, const float *wa2, const float *wa3, float fsign) {
335 /* isign == -1 for forward transform and +1 for backward transform */
336
337 int i, k;
338 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
339 int l1ido = l1*ido;
340 if (ido == 2) {
341 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
342 tr1 = VSUB(cc[0], cc[2*ido + 0]);
343 tr2 = VADD(cc[0], cc[2*ido + 0]);
344 ti1 = VSUB(cc[1], cc[2*ido + 1]);
345 ti2 = VADD(cc[1], cc[2*ido + 1]);
346 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
347 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
348 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
349 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
350
351 ch[0*l1ido + 0] = VADD(tr2, tr3);
352 ch[0*l1ido + 1] = VADD(ti2, ti3);
353 ch[1*l1ido + 0] = VADD(tr1, tr4);
354 ch[1*l1ido + 1] = VADD(ti1, ti4);
355 ch[2*l1ido + 0] = VSUB(tr2, tr3);
356 ch[2*l1ido + 1] = VSUB(ti2, ti3);
357 ch[3*l1ido + 0] = VSUB(tr1, tr4);
358 ch[3*l1ido + 1] = VSUB(ti1, ti4);
359 }
360 } else {
361 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
362 for (i=0; i<ido-1; i+=2) {
363 float wr1, wi1, wr2, wi2, wr3, wi3;
364 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
365 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
366 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
367 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
368 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
369 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
370 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
371 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
372
373 ch[i] = VADD(tr2, tr3);
374 cr3 = VSUB(tr2, tr3);
375 ch[i + 1] = VADD(ti2, ti3);
376 ci3 = VSUB(ti2, ti3);
377
378 cr2 = VADD(tr1, tr4);
379 cr4 = VSUB(tr1, tr4);
380 ci2 = VADD(ti1, ti4);
381 ci4 = VSUB(ti1, ti4);
382 wr1=wa1[i], wi1=fsign*wa1[i+1];
383 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
384 wr2=wa2[i], wi2=fsign*wa2[i+1];
385 ch[i + l1ido] = cr2;
386 ch[i + l1ido + 1] = ci2;
387
388 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
389 wr3=wa3[i], wi3=fsign*wa3[i+1];
390 ch[i + 2*l1ido] = cr3;
391 ch[i + 2*l1ido + 1] = ci3;
392
393 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
394 ch[i + 3*l1ido] = cr4;
395 ch[i + 3*l1ido + 1] = ci4;
396 }
397 }
398 }
399} /* passf4 */
400
401/*
402  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
403*/
404static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
405 const float *wa1, const float *wa2,
406 const float *wa3, const float *wa4, float fsign) {
407 static const float tr11 = .309016994374947f;
408 const float ti11 = .951056516295154f*fsign;
409 static const float tr12 = -.809016994374947f;
410 const float ti12 = .587785252292473f*fsign;
411
412 /* Local variables */
413 int i, k;
414 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
415 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
416
417 float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
418
419#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
420#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
421
422 assert(ido > 2);
423 for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
424 for (i = 0; i < ido-1; i += 2) {
425 ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
426 ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
427 ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
428 ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
429 tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
430 tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
431 tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
432 tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
433 ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
434 ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
435 cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
436 ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
437 cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
438 ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
439 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
440 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
441 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
442 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
443 dr3 = VSUB(cr3, ci4);
444 dr4 = VADD(cr3, ci4);
445 di3 = VADD(ci3, cr4);
446 di4 = VSUB(ci3, cr4);
447 dr5 = VADD(cr2, ci5);
448 dr2 = VSUB(cr2, ci5);
449 di5 = VSUB(ci2, cr5);
450 di2 = VADD(ci2, cr5);
451 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
452 wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
453 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
454 ch_ref(i - 1, 2) = dr2;
455 ch_ref(i, 2) = di2;
456 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
457 ch_ref(i - 1, 3) = dr3;
458 ch_ref(i, 3) = di3;
459 VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
460 ch_ref(i - 1, 4) = dr4;
461 ch_ref(i, 4) = di4;
462 VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
463 ch_ref(i - 1, 5) = dr5;
464 ch_ref(i, 5) = di5;
465 }
466 }
467#undef ch_ref
468#undef cc_ref
469}
470
471static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
472 static const float minus_one = -1.f;
473 int i, k, l1ido = l1*ido;
474 for (k=0; k < l1ido; k += ido) {
475 v4sf a = cc[k], b = cc[k + l1ido];
476 ch[2*k] = VADD(a, b);
477 ch[2*(k+ido)-1] = VSUB(a, b);
478 }
479 if (ido < 2) return;
480 if (ido != 2) {
481 for (k=0; k < l1ido; k += ido) {
482 for (i=2; i<ido; i+=2) {
483 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
484 v4sf br = cc[i - 1 + k], bi = cc[i + k];
485 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
486 ch[i + 2*k] = VADD(bi, ti2);
487 ch[2*(k+ido) - i] = VSUB(ti2, bi);
488 ch[i - 1 + 2*k] = VADD(br, tr2);
489 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
490 }
491 }
492 if (ido % 2 == 1) return;
493 }
494 for (k=0; k < l1ido; k += ido) {
495    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
496    ch[2*k + ido-1] = cc[k + ido-1];
497 }
498} /* radf2 */
499
500
501static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
502 static const float minus_two=-2;
503 int i, k, l1ido = l1*ido;
504 v4sf a,b,c,d, tr2, ti2;
505 for (k=0; k < l1ido; k += ido) {
506 a = cc[2*k]; b = cc[2*(k+ido) - 1];
507 ch[k] = VADD(a, b);
508 ch[k + l1ido] =VSUB(a, b);
509 }
510 if (ido < 2) return;
511 if (ido != 2) {
512 for (k = 0; k < l1ido; k += ido) {
513 for (i = 2; i < ido; i += 2) {
514 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
515 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
516 ch[i-1 + k] = VADD(a, b);
517 tr2 = VSUB(a, b);
518 ch[i+0 + k] = VSUB(c, d);
519 ti2 = VADD(c, d);
520 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
521 ch[i-1 + k + l1ido] = tr2;
522 ch[i+0 + k + l1ido] = ti2;
523 }
524 }
525 if (ido % 2 == 1) return;
526 }
527 for (k = 0; k < l1ido; k += ido) {
528 a = cc[2*k + ido-1]; b = cc[2*k + ido];
529 ch[k + ido-1] = VADD(a,a);
530    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
531  }
532} /* radb2 */
533
534static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
535 const float *wa1, const float *wa2) {
536 static const float taur = -0.5f;
537 static const float taui = 0.866025403784439f;
538 int i, k, ic;
539 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
540 for (k=0; k<l1; k++) {
541 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
542 ch[3*k*ido] = VADD(cc[k*ido], cr2);
543    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
544    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
545  }
546 if (ido == 1) return;
547 for (k=0; k<l1; k++) {
548 for (i=2; i<ido; i+=2) {
549 ic = ido - i;
550 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
551 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
552 VCPLXMULCONJ(dr2, di2, wr1, wi1);
553
554 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
555 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
556 VCPLXMULCONJ(dr3, di3, wr2, wi2);
557
558 cr2 = VADD(dr2, dr3);
559 ci2 = VADD(di2, di3);
560 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
561 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
562      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
563      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
564      tr3 = SVMUL(taui, VSUB(di2, di3));
565      ti3 = SVMUL(taui, VSUB(dr3, dr2));
566      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
567 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
568 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
569 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
570 }
571 }
572} /* radf3 */
573
574
575static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
576 const float *wa1, const float *wa2)
577{
578 static const float taur = -0.5f;
579 static const float taui = 0.866025403784439f;
580 static const float taui_2 = 0.866025403784439f*2;
581 int i, k, ic;
582 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
583 for (k=0; k<l1; k++) {
584 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
585 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
586 ch[k*ido] = VADD(cc[3*k*ido], tr2);
587    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
588    ch[(k + l1)*ido] = VSUB(cr2, ci3);
589 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
590 }
591 if (ido == 1) return;
592 for (k=0; k<l1; k++) {
593 for (i=2; i<ido; i+=2) {
594 ic = ido - i;
595 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
596 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
597 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
598 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
599 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
600 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
601      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
602      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
603      dr2 = VSUB(cr2, ci3);
604 dr3 = VADD(cr2, ci3);
605 di2 = VADD(ci2, cr3);
606 di3 = VSUB(ci2, cr3);
607 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
608 ch[i - 1 + (k + l1)*ido] = dr2;
609 ch[i + (k + l1)*ido] = di2;
610 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
611 ch[i - 1 + (k + 2*l1)*ido] = dr3;
612 ch[i + (k + 2*l1)*ido] = di3;
613 }
614 }
615} /* radb3 */
616
617static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
618 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
619{
620 static const float minus_hsqt2 = (float)-0.7071067811865475;
621 int i, k, l1ido = l1*ido;
622 {
623 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
624 v4sf * RESTRICT ch_ = ch;
625 while (cc < cc_end) {
626      // this loop represents between 25% and 40% of the total radf4_ps cost!
627 v4sf a0 = cc[0], a1 = cc[l1ido];
628 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
629 v4sf tr1 = VADD(a1, a3);
630 v4sf tr2 = VADD(a0, a2);
631 ch[2*ido-1] = VSUB(a0, a2);
632 ch[2*ido ] = VSUB(a3, a1);
633 ch[0 ] = VADD(tr1, tr2);
634 ch[4*ido-1] = VSUB(tr2, tr1);
635 cc += ido; ch += 4*ido;
636 }
637 cc = cc_; ch = ch_;
638 }
639 if (ido < 2) return;
640 if (ido != 2) {
641 for (k = 0; k < l1ido; k += ido) {
642 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
643 for (i=2; i<ido; i += 2, pc += 2) {
644 int ic = ido - i;
645 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
646 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
647
648 cr2 = pc[1*l1ido+0];
649 ci2 = pc[1*l1ido+1];
650 wr=LD_PS1(wa1[i - 2]);
651 wi=LD_PS1(wa1[i - 1]);
652 VCPLXMULCONJ(cr2,ci2,wr,wi);
653
654 cr3 = pc[2*l1ido+0];
655 ci3 = pc[2*l1ido+1];
656 wr = LD_PS1(wa2[i-2]);
657 wi = LD_PS1(wa2[i-1]);
658 VCPLXMULCONJ(cr3, ci3, wr, wi);
659
660 cr4 = pc[3*l1ido];
661 ci4 = pc[3*l1ido+1];
662 wr = LD_PS1(wa3[i-2]);
663 wi = LD_PS1(wa3[i-1]);
664 VCPLXMULCONJ(cr4, ci4, wr, wi);
665
666 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
667
668 tr1 = VADD(cr2,cr4);
669 tr4 = VSUB(cr4,cr2);
670 tr2 = VADD(pc[0],cr3);
671 tr3 = VSUB(pc[0],cr3);
672 ch[i - 1 + 4*k] = VADD(tr1,tr2);
673 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
674 ti1 = VADD(ci2,ci4);
675 ti4 = VSUB(ci2,ci4);
676 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
677 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
678 ti2 = VADD(pc[1],ci3);
679 ti3 = VSUB(pc[1],ci3);
680 ch[i + 4*k] = VADD(ti1, ti2);
681 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
682 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
683 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
684 }
685 }
686 if (ido % 2 == 1) return;
687 }
688 for (k=0; k<l1ido; k += ido) {
689 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
690 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
691    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
692    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
693    ch[ido-1 + 4*k] = VADD(tr1, c);
694 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
695 ch[4*k + 1*ido] = VSUB(ti1, d);
696 ch[4*k + 3*ido] = VADD(ti1, d);
697 }
698} /* radf4 */
699
700
701static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
702 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
703{
704 static const float minus_sqrt2 = (float)-1.414213562373095;
705 static const float two = 2.f;
706 int i, k, l1ido = l1*ido;
707 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
708 {
709 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
710 v4sf *ch_ = ch;
711 while (ch < ch_end) {
712 v4sf a = cc[0], b = cc[4*ido-1];
713 v4sf c = cc[2*ido], d = cc[2*ido-1];
714      tr3 = SVMUL(two,d);
715      tr2 = VADD(a,b);
716 tr1 = VSUB(a,b);
717      tr4 = SVMUL(two,c);
718      ch[0*l1ido] = VADD(tr2, tr3);
719 ch[2*l1ido] = VSUB(tr2, tr3);
720 ch[1*l1ido] = VSUB(tr1, tr4);
721 ch[3*l1ido] = VADD(tr1, tr4);
722
723 cc += 4*ido; ch += ido;
724 }
725 cc = cc_; ch = ch_;
726 }
727 if (ido < 2) return;
728 if (ido != 2) {
729 for (k = 0; k < l1ido; k += ido) {
730 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
731 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
732 for (i = 2; i < ido; i += 2) {
733
734 tr1 = VSUB(pc[i], pc[4*ido - i]);
735 tr2 = VADD(pc[i], pc[4*ido - i]);
736 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
737 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
738 ph[0] = VADD(tr2, tr3);
739 cr3 = VSUB(tr2, tr3);
740
741 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
742 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
743 cr2 = VSUB(tr1, tr4);
744 cr4 = VADD(tr1, tr4);
745
746 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
747 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
748
749 ph[1] = VADD(ti2, ti3); ph += l1ido;
750 ci3 = VSUB(ti2, ti3);
751 ci2 = VADD(ti1, ti4);
752 ci4 = VSUB(ti1, ti4);
753 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
754 ph[0] = cr2;
755 ph[1] = ci2; ph += l1ido;
756 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
757 ph[0] = cr3;
758 ph[1] = ci3; ph += l1ido;
759 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
760 ph[0] = cr4;
761 ph[1] = ci4; ph = ph - 3*l1ido + 2;
762 }
763 }
764 if (ido % 2 == 1) return;
765 }
766 for (k=0; k < l1ido; k+=ido) {
767 int i0 = 4*k + ido;
768 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
769 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
770 tr1 = VSUB(c,d);
771 tr2 = VADD(c,d);
772 ti1 = VADD(b,a);
773 ti2 = VSUB(b,a);
774 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
775    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
776    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
777    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
778  }
779} /* radb4 */
780
781static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
782 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
783{
784 static const float tr11 = .309016994374947f;
785 static const float ti11 = .951056516295154f;
786 static const float tr12 = -.809016994374947f;
787 static const float ti12 = .587785252292473f;
788
789 /* System generated locals */
790 int cc_offset, ch_offset;
791
792 /* Local variables */
793 int i, k, ic;
794 v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
795 cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
796 int idp2;
797
798
799#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
800#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
801
802 /* Parameter adjustments */
803 ch_offset = 1 + ido * 6;
804 ch -= ch_offset;
805 cc_offset = 1 + ido * (1 + l1);
806 cc -= cc_offset;
807
808 /* Function Body */
809 for (k = 1; k <= l1; ++k) {
810 cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
811 ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
812 cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
813 ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
814 ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
815 ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
816 ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
817 ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
818 ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
819 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
820 }
821 if (ido == 1) {
822 return;
823 }
824 idp2 = ido + 2;
825 for (k = 1; k <= l1; ++k) {
826 for (i = 3; i <= ido; i += 2) {
827 ic = idp2 - i;
828      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
829 dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
830 dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
831 dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
832 VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
833 VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
834 VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
835 VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
836      cr2 = VADD(dr2, dr5);
837 ci5 = VSUB(dr5, dr2);
838 cr5 = VSUB(di2, di5);
839 ci2 = VADD(di2, di5);
840 cr3 = VADD(dr3, dr4);
841 ci4 = VSUB(dr4, dr3);
842 cr4 = VSUB(di3, di4);
843 ci3 = VADD(di3, di4);
844 ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
845      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
846      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
847      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
848      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
849      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
850      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
851 ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
852 tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
853 ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
854      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
855      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
856      ch_ref(i, 3, k) = VADD(ti2, ti5);
857      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
858      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
859      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
860      ch_ref(i, 5, k) = VADD(ti3, ti4);
861 ch_ref(ic, 4, k) = VSUB(ti4, ti3);
862 }
863 }
864#undef cc_ref
865#undef ch_ref
866} /* radf5 */
867
868static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
869 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
870{
871 static const float tr11 = .309016994374947f;
872 static const float ti11 = .951056516295154f;
873 static const float tr12 = -.809016994374947f;
874 static const float ti12 = .587785252292473f;
875
876 int cc_offset, ch_offset;
877
878 /* Local variables */
879 int i, k, ic;
880 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
881 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
882 int idp2;
883
884#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
885#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
886
887 /* Parameter adjustments */
888 ch_offset = 1 + ido * (1 + l1);
889 ch -= ch_offset;
890 cc_offset = 1 + ido * 6;
891 cc -= cc_offset;
892
893 /* Function Body */
894 for (k = 1; k <= l1; ++k) {
895 ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
896 ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
897 tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
898 tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
899 ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
900 cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
901 cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
902 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
903 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
904 ch_ref(1, k, 2) = VSUB(cr2, ci5);
905 ch_ref(1, k, 3) = VSUB(cr3, ci4);
906 ch_ref(1, k, 4) = VADD(cr3, ci4);
907 ch_ref(1, k, 5) = VADD(cr2, ci5);
908 }
909 if (ido == 1) {
910 return;
911 }
912 idp2 = ido + 2;
913 for (k = 1; k <= l1; ++k) {
914 for (i = 3; i <= ido; i += 2) {
915 ic = idp2 - i;
916 ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
917 ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
918 ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
919 ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
920 tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
921 tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
922 tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
923 tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
924 ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
925 ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
926 cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
927 ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
928 cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
929 ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
930 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
931 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
932 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
933 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
934 dr3 = VSUB(cr3, ci4);
935 dr4 = VADD(cr3, ci4);
936 di3 = VADD(ci3, cr4);
937 di4 = VSUB(ci3, cr4);
938 dr5 = VADD(cr2, ci5);
939 dr2 = VSUB(cr2, ci5);
940 di5 = VSUB(ci2, cr5);
941 di2 = VADD(ci2, cr5);
942      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
943 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
944 VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
945 VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
946
947 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
948 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
949 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
950 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
951 }
952 }
953#undef cc_ref
954#undef ch_ref
955} /* radb5 */
956
957static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
958 const float *wa, const int *ifac) {
959 v4sf *in = (v4sf*)input_readonly;
960 v4sf *out = (in == work2 ? work1 : work2);
961 int nf = ifac[1], k1;
962 int l2 = n;
963 int iw = n-1;
964 assert(in != out && work1 != work2);
965 for (k1 = 1; k1 <= nf; ++k1) {
966 int kh = nf - k1;
967 int ip = ifac[kh + 2];
968 int l1 = l2 / ip;
969 int ido = n / l2;
970 iw -= (ip - 1)*ido;
971 switch (ip) {
972      case 5: {
973 int ix2 = iw + ido;
974 int ix3 = ix2 + ido;
975 int ix4 = ix3 + ido;
976 radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
977 } break;
978      case 4: {
979 int ix2 = iw + ido;
980 int ix3 = ix2 + ido;
981 radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
982 } break;
983 case 3: {
984 int ix2 = iw + ido;
985 radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
986 } break;
987 case 2:
988 radf2_ps(ido, l1, in, out, &wa[iw]);
989 break;
990 default:
991 assert(0);
992 break;
993 }
994 l2 = l1;
995 if (out == work2) {
996 out = work1; in = work2;
997 } else {
998 out = work2; in = work1;
999 }
1000 }
1001 return in; /* this is in fact the output .. */
1002} /* rfftf1 */
1003
1004static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1005 const float *wa, const int *ifac) {
1006 v4sf *in = (v4sf*)input_readonly;
1007 v4sf *out = (in == work2 ? work1 : work2);
1008 int nf = ifac[1], k1;
1009 int l1 = 1;
1010 int iw = 0;
1011 assert(in != out);
1012 for (k1=1; k1<=nf; k1++) {
1013 int ip = ifac[k1 + 1];
1014 int l2 = ip*l1;
1015 int ido = n / l2;
1016 switch (ip) {
1017      case 5: {
1018 int ix2 = iw + ido;
1019 int ix3 = ix2 + ido;
1020 int ix4 = ix3 + ido;
1021 radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1022 } break;
1023      case 4: {
1024 int ix2 = iw + ido;
1025 int ix3 = ix2 + ido;
1026 radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1027 } break;
1028 case 3: {
1029 int ix2 = iw + ido;
1030 radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1031 } break;
1032 case 2:
1033 radb2_ps(ido, l1, in, out, &wa[iw]);
1034 break;
1035 default:
1036 assert(0);
1037 break;
1038 }
1039 l1 = l2;
1040 iw += (ip - 1)*ido;
1041
1042 if (out == work2) {
1043 out = work1; in = work2;
1044 } else {
1045 out = work2; in = work1;
1046 }
1047 }
1048 return in; /* this is in fact the output .. */
1049}
1050
1051static int decompose(int n, int *ifac, const int *ntryh) {
1052  int nl = n, nf = 0, i, j = 0;
1053  for (j=0; ntryh[j]; ++j) {
1054    int ntry = ntryh[j];
1055 while (nl != 1) {
1056 int nq = nl / ntry;
1057 int nr = nl - ntry * nq;
1058 if (nr == 0) {
1059 ifac[2+nf++] = ntry;
1060 nl = nq;
1061 if (ntry == 2 && nf != 1) {
1062 for (i = 2; i <= nf; ++i) {
1063 int ib = nf - i + 2;
1064 ifac[ib + 1] = ifac[ib];
1065 }
1066 ifac[2] = 2;
1067 }
1068 } else break;
1069 }
1070 }
1071 ifac[0] = n;
1072 ifac[1] = nf;
1073 return nf;
1074}
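/*
  Worked example (matching the code above): with ntryh = { 4,2,3,5,0 } as used
  by rffti1_ps below, decompose(240, ifac, ntryh) finds 240 = 4*4*3*5 and leaves
  ifac = { 240, 4, 4, 4, 3, 5, ... } -- ifac[0] is n, ifac[1] the number of
  factors, and the factors themselves start at ifac[2].
*/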
1075
1076
1077
1078static void rffti1_ps(int n, float *wa, int *ifac)
1079{
1080  static const int ntryh[] = { 4,2,3,5,0 };
1081  int k1, j, ii;
1082
1083 int nf = decompose(n,ifac,ntryh);
1084 float argh = (2*M_PI) / n;
1085 int is = 0;
1086 int nfm1 = nf - 1;
1087 int l1 = 1;
1088  for (k1 = 1; k1 <= nfm1; k1++) {
1089 int ip = ifac[k1 + 1];
1090 int ld = 0;
1091 int l2 = l1*ip;
1092 int ido = n / l2;
1093 int ipm = ip - 1;
1094 for (j = 1; j <= ipm; ++j) {
1095 float argld;
1096 int i = is, fi=0;
1097 ld += l1;
1098 argld = ld*argh;
1099 for (ii = 3; ii <= ido; ii += 2) {
1100 i += 2;
1101 fi += 1;
1102 wa[i - 2] = cos(fi*argld);
1103 wa[i - 1] = sin(fi*argld);
1104 }
1105 is += ido;
1106 }
1107 l1 = l2;
1108 }
1109} /* rffti1 */
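/*
  rffti1_ps above fills wa[] with the real-FFT twiddle factors cos(fi*argld) and
  sin(fi*argld), argld = 2*pi*ld/n, which the radfN/radbN kernels consume;
  cffti1_ps below builds the corresponding table for the complex transform.
*/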
1110
1111void cffti1_ps(int n, float *wa, int *ifac)
1112{
1113  static const int ntryh[] = { 5,3,4,2,0 };
1114  int k1, j, ii;
1115
1116 int nf = decompose(n,ifac,ntryh);
1117 float argh = (2*M_PI)/(float)n;
1118 int i = 1;
1119 int l1 = 1;
1120 for (k1=1; k1<=nf; k1++) {
1121 int ip = ifac[k1+1];
1122 int ld = 0;
1123 int l2 = l1*ip;
1124 int ido = n / l2;
1125 int idot = ido + ido + 2;
1126 int ipm = ip - 1;
1127 for (j=1; j<=ipm; j++) {
1128 float argld;
1129 int i1 = i, fi = 0;
1130 wa[i-1] = 1;
1131 wa[i] = 0;
1132 ld += l1;
1133 argld = ld*argh;
1134 for (ii = 4; ii <= idot; ii += 2) {
1135 i += 2;
1136 fi += 1;
1137 wa[i-1] = cos(fi*argld);
1138 wa[i] = sin(fi*argld);
1139 }
1140 if (ip > 5) {
1141 wa[i1-1] = wa[i-1];
1142 wa[i1] = wa[i];
1143 }
1144 }
1145 l1 = l2;
1146 }
1147} /* cffti1 */
1148
1149
1150v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1151 v4sf *in = (v4sf*)input_readonly;
1152 v4sf *out = (in == work2 ? work1 : work2);
1153 int nf = ifac[1], k1;
1154 int l1 = 1;
1155 int iw = 0;
1156 assert(in != out && work1 != work2);
1157 for (k1=2; k1<=nf+1; k1++) {
1158 int ip = ifac[k1];
1159 int l2 = ip*l1;
1160 int ido = n / l2;
1161 int idot = ido + ido;
1162 switch (ip) {
1163      case 5: {
1164 int ix2 = iw + idot;
1165 int ix3 = ix2 + idot;
1166 int ix4 = ix3 + idot;
1167 passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1168 } break;
1169      case 4: {
1170 int ix2 = iw + idot;
1171 int ix3 = ix2 + idot;
1172 passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1173 } break;
1174 case 2: {
1175 passf2_ps(idot, l1, in, out, &wa[iw], isign);
1176 } break;
1177 case 3: {
1178 int ix2 = iw + idot;
1179 passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1180 } break;
1181 default:
1182 assert(0);
1183 }
1184 l1 = l2;
1185 iw += (ip - 1)*idot;
1186 if (out == work2) {
1187 out = work1; in = work2;
1188 } else {
1189 out = work2; in = work1;
1190 }
1191 }
1192
1193 return in; /* this is in fact the output .. */
1194}
1195
1196
1197struct PFFFT_Setup {
1198 int N;
1199 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1200 int ifac[15];
1201 pffft_transform_t transform;
1202 v4sf *data; // allocated room for twiddle coefs
1203 float *e; // points into 'data' , N/4*3 elements
1204 float *twiddle; // points into 'data', N/4 elements
1205};
1206
1207PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
1208 PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1209 int k, m;
1210  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1211 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1212 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1213 if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1214 if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1215  //assert((N % 32) == 0);
1216 s->N = N;
1217 s->transform = transform;
1218 /* nb of complex simd vectors */
1219 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1220 s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1221 s->e = (float*)s->data;
1222 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1223
1224 if (transform == PFFFT_REAL) {
1225 for (k=0; k < s->Ncvec; ++k) {
1226 int i = k/SIMD_SZ;
1227 int j = k%SIMD_SZ;
1228 for (m=0; m < SIMD_SZ-1; ++m) {
1229 float A = -2*M_PI*(m+1)*k / N;
1230 s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1231 s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1232 }
1233 }
1234 rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1235 } else {
1236 for (k=0; k < s->Ncvec; ++k) {
1237 int i = k/SIMD_SZ;
1238 int j = k%SIMD_SZ;
1239 for (m=0; m < SIMD_SZ-1; ++m) {
1240 float A = -2*M_PI*(m+1)*k / N;
1241 s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1242 s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1243 }
1244 }
1245 cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1246 }
1247
1248 /* check that N is decomposable with allowed prime factors */
1249 for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1250 if (m != N/SIMD_SZ) {
1251 pffft_destroy_setup(s); s = 0;
1252 }
1253
1254  return s;
1255}
1256
1257
1258void pffft_destroy_setup(PFFFT_Setup *s) {
1259 pffft_aligned_free(s->data);
1260 free(s);
1261}
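/*
  Typical use of the public API declared in pffft.h (a sketch only -- see that
  header for the authoritative documentation; error handling omitted):

    PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL); // N must be a multiple of 32 for real FFTs
    float *in  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
    float *out = (float*)pffft_aligned_malloc(1024 * sizeof(float));
    // ... fill 'in' ...
    pffft_transform(s, in, out, 0, PFFFT_FORWARD);      // work == 0: scratch goes on the stack
    // ... use 'out' (z-domain, internal ordering; see pffft_zreorder below) ...
    pffft_aligned_free(in);
    pffft_aligned_free(out);
    pffft_destroy_setup(s);
*/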
1262
1263#if !defined(PFFFT_SIMD_DISABLE)
1264
1265/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1266static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1267 v4sf g0, g1;
1268 int k;
1269 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1270
1271 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1272 for (k=1; k < N; ++k) {
1273 v4sf h0, h1;
1274 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1275 *--out = VSWAPHL(g1, h0);
1276 *--out = VSWAPHL(h0, h1);
1277 g1 = h1;
1278 }
1279 *--out = VSWAPHL(g1, g0);
1280}
1281
1282static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1283 v4sf g0, g1, h0, h1;
1284 int k;
1285 g0 = g1 = in[0]; ++in;
1286 for (k=1; k < N; ++k) {
1287 h0 = *in++; h1 = *in++;
1288 g1 = VSWAPHL(g1, h0);
1289 h0 = VSWAPHL(h0, h1);
1290 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1291 g1 = h1;
1292 }
1293 h0 = *in++; h1 = g0;
1294 g1 = VSWAPHL(g1, h0);
1295 h0 = VSWAPHL(h0, h1);
1296 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1297}
1298
1299void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1300 int k, N = setup->N, Ncvec = setup->Ncvec;
1301 const v4sf *vin = (const v4sf*)in;
1302 v4sf *vout = (v4sf*)out;
1303 assert(in != out);
1304 if (setup->transform == PFFFT_REAL) {
1305 int k, dk = N/32;
1306 if (direction == PFFFT_FORWARD) {
1307 for (k=0; k < dk; ++k) {
1308 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1309 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1310 }
1311 reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1312 reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1313 } else {
1314 for (k=0; k < dk; ++k) {
1315 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1316 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1317 }
1318 unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1319 unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1320 }
1321 } else {
1322 if (direction == PFFFT_FORWARD) {
1323 for (k=0; k < Ncvec; ++k) {
1324 int kk = (k/4) + (k%4)*(Ncvec/4);
1325 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1326 }
1327 } else {
1328 for (k=0; k < Ncvec; ++k) {
1329 int kk = (k/4) + (k%4)*(Ncvec/4);
1330 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1331 }
1332 }
1333 }
1334}
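/*
  In the complex branch above, the forward pass maps SIMD block k to
  kk = (k/4) + (k%4)*(Ncvec/4) and interleaves each (real, imaginary) vector
  pair, producing the canonical complex ordering; the backward pass undoes it.
  (Index-mapping sketch only -- pffft.h documents the exact canonical layouts.)
*/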
1335
1336void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1337 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1338 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1339 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1340 assert(in != out);
1341 for (k=0; k < dk; ++k) {
1342 r0 = in[8*k+0]; i0 = in[8*k+1];
1343 r1 = in[8*k+2]; i1 = in[8*k+3];
1344 r2 = in[8*k+4]; i2 = in[8*k+5];
1345 r3 = in[8*k+6]; i3 = in[8*k+7];
1346 VTRANSPOSE4(r0,r1,r2,r3);
1347 VTRANSPOSE4(i0,i1,i2,i3);
1348 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1349 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1350 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1351
1352 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1353 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1354 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1355 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1356
1357 /*
1358 transformation for each column is:
1359
1360 [1 1 1 1 0 0 0 0] [r0]
1361 [1 0 -1 0 0 -1 0 1] [r1]
1362 [1 -1 1 -1 0 0 0 0] [r2]
1363 [1 0 -1 0 0 1 0 -1] [r3]
1364 [0 0 0 0 1 1 1 1] * [i0]
1365 [0 1 0 -1 1 0 -1 0] [i1]
1366 [0 0 0 0 1 -1 1 -1] [i2]
1367 [0 -1 0 1 1 0 -1 0] [i3]
1368 */
1369
1370 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1371 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1372 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1373 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1374
1375 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1376 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1377 }
1378}
1379
1380void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1381 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1382 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1383 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1384 assert(in != out);
1385 for (k=0; k < dk; ++k) {
1386 r0 = in[8*k+0]; i0 = in[8*k+1];
1387 r1 = in[8*k+2]; i1 = in[8*k+3];
1388 r2 = in[8*k+4]; i2 = in[8*k+5];
1389 r3 = in[8*k+6]; i3 = in[8*k+7];
1390
1391 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1392 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1393 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1394 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1395
1396 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1397 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1398 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1399 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1400
1401 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1402 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1403 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1404
1405 VTRANSPOSE4(r0,r1,r2,r3);
1406 VTRANSPOSE4(i0,i1,i2,i3);
1407
1408 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1409 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1410 }
1411}
1412
1413
1414static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1415                                                   const v4sf *e, v4sf *out) {
1416 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1417 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1418 r0 = *in0; i0 = *in1;
1419 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1420 VTRANSPOSE4(r0,r1,r2,r3);
1421 VTRANSPOSE4(i0,i1,i2,i3);
1422
1423 /*
1424 transformation for each column is:
1425
1426 [1 1 1 1 0 0 0 0] [r0]
1427 [1 0 -1 0 0 -1 0 1] [r1]
1428 [1 0 -1 0 0 1 0 -1] [r2]
1429 [1 -1 1 -1 0 0 0 0] [r3]
1430 [0 0 0 0 1 1 1 1] * [i0]
1431 [0 -1 0 1 -1 0 1 0] [i1]
1432 [0 -1 0 1 1 0 -1 0] [i2]
1433 [0 0 0 0 -1 1 -1 1] [i3]
1434 */
1435
1436 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1437 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1438
1439 VCPLXMUL(r1,i1,e[0],e[1]);
1440 VCPLXMUL(r2,i2,e[2],e[3]);
1441 VCPLXMUL(r3,i3,e[4],e[5]);
1442
1443 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1444 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1445
1446 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1447 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1448 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1449 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1450
1451 r0 = VADD(sr0, sr1);
1452 r3 = VSUB(sr0, sr1);
1453 i0 = VADD(si0, si1);
1454 i3 = VSUB(si1, si0);
1455 r1 = VADD(dr0, di1);
1456 r2 = VSUB(dr0, di1);
1457 i1 = VSUB(dr1, di0);
1458 i2 = VADD(dr1, di0);
1459
1460 *out++ = r0;
1461 *out++ = i0;
1462 *out++ = r1;
1463 *out++ = i1;
1464 *out++ = r2;
1465 *out++ = i2;
1466 *out++ = r3;
1467 *out++ = i3;
1468
1469}
1470
static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r
     (a small worked example of this layout follows the function) */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = M_SQRT2/2;

  cr.v = in[0]; ci.v = in[Ncvec*2-1];
  assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)   ] [1   1   1   1   0   0   0   0]
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
    save = save_next;
  }
}

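/* Worked example of the fftpack real ordering mentioned above, for a length-8
   real sequence (bin values are placeholders): the packed spectrum is
     [ X0.r, X1.r, X1.i, X2.r, X2.i, X3.r, X3.i, X4.r ]
   i.e. the purely real DC and Nyquist terms occupy the first and last slots and
   every other bin is stored as an (r,i) pair. Note that the first 4x4 block in
   pffft_real_finalize is invoked with zero vectors for in0/in1, and the scalar
   xr/xi fix-up then overwrites lane 0 of the first eight output vectors. */
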
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                                     const v4sf *e, v4sf *out, int first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]

    (a scalar, per-column sketch of this butterfly follows the function)
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

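/* The same kind of scalar sketch for the inverse-direction butterfly above
   (illustrative only, not compiled); the conjugated twiddle multiplies
   (VCPLXMULCONJ) that follow it in pffft_real_preprocess_4x4 are omitted,
   and the helper name is an assumption made for the example. */
#if 0
static void real_preprocess_column_sketch(const float r[4], const float i[4], float out[8]) {
  float sr0 = r[0]+r[3], dr0 = r[0]-r[3];
  float sr1 = r[1]+r[2], dr1 = r[1]-r[2];
  float si0 = i[0]+i[3], di0 = i[0]-i[3];
  float si1 = i[1]+i[2], di1 = i[1]-i[2];
  out[0] = sr0 + sr1; /* r0 */   out[4] = di0 - di1; /* i0 */
  out[1] = dr0 - si1; /* r1 */   out[5] = si0 - dr1; /* i1 */
  out[2] = sr0 - sr1; /* r2 */   out[6] = di0 + di1; /* i2 */
  out[3] = dr0 + si1; /* r3 */   out[7] = si0 + dr1; /* i3 */
}
#endif
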
static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r
     (a sketch of the resulting output layout follows the function) */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}

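/* Sketch of where pffft_real_preprocess writes, in units of v4sf vectors
   (dk = Ncvec/SIMD_SZ blocks of 4x4):
     out[0]                  : cr0..cr3 (scalar fix-up, one lane per value)
     out[1] .. out[6]        : block 0, which skips its r0/i0 pair (first == 1)
     out[8k-1] .. out[8k+6]  : block k, for k = 1 .. dk-1
     out[2*Ncvec-1]          : ci0..ci3 (scalar fix-up)
   which together account for all 2*Ncvec vectors handed to rfftb1_ps by
   pffft_transform_internal below. */
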
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered) {
  int k, Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  // (see the usage sketch after this function for the buffer-size contract)
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput = (v4sf*)foutput;
  v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}

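/* Illustrative sketch (not compiled) of the work/scratch buffer contract above:
   a non-NULL buffer must be SIMD-aligned and hold 2*Ncvec vectors -- N floats
   for a PFFFT_REAL setup of size N, 2*N floats for PFFFT_COMPLEX -- while a
   NULL pointer makes the transform fall back to the on-stack allocation seen
   in pffft_transform_internal. The helper name and the assumption that N
   matches the setup's size are example-only. */
#if 0
static void example_forward_real_with_work_buffer(PFFFT_Setup *s, const float *in, float *out, int N) {
  float *work = (float*)pffft_aligned_malloc(N * sizeof(float)); /* aligned, N floats (PFFFT_REAL) */
  pffft_transform(s, in, out, work, PFFFT_FORWARD); /* spectrum in the internal (unordered) layout */
  pffft_transform(s, in, out, 0, PFFFT_FORWARD);    /* same transform, scratch taken on the stack */
  pffft_aligned_free(work);
}
#endif
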
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_ASM
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2 (at least on ubuntu), so this path is restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov r8, %2 \n"
               "vdup.f32 q15, %4 \n"
               "1: \n"
               "pld [%0,#64] \n"
               "pld [%1,#64] \n"
               "pld [%2,#64] \n"
               "pld [%0,#96] \n"
               "pld [%1,#96] \n"
               "pld [%2,#96] \n"
               "vld1.f32 {q0,q1}, [%0,:128]! \n"
               "vld1.f32 {q4,q5}, [%1,:128]! \n"
               "vld1.f32 {q2,q3}, [%0,:128]! \n"
               "vld1.f32 {q6,q7}, [%1,:128]! \n"
               "vld1.f32 {q8,q9}, [r8,:128]! \n"

               "vmul.f32 q10, q0, q4 \n"
               "vmul.f32 q11, q0, q5 \n"
               "vmul.f32 q12, q2, q6 \n"
               "vmul.f32 q13, q2, q7 \n"
               "vmls.f32 q10, q1, q5 \n"
               "vmla.f32 q11, q1, q4 \n"
               "vld1.f32 {q0,q1}, [r8,:128]! \n"
               "vmls.f32 q12, q3, q7 \n"
               "vmla.f32 q13, q3, q6 \n"
               "vmla.f32 q8, q10, q15 \n"
               "vmla.f32 q9, q11, q15 \n"
               "vmla.f32 q0, q12, q15 \n"
               "vmla.f32 q1, q13, q15 \n"
               "vst1.f32 {q8,q9},[%2,:128]! \n"
               "vst1.f32 {q0,q1},[%2,:128]! \n"
               "subs %3, #2 \n"
               "bne 1b \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}

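/* Illustrative sketch (not compiled) of a typical use of pffft_zconvolve_accumulate
   for a PFFFT_REAL setup of size N: multiply two spectra in the internal
   (unordered) layout produced by pffft_transform and accumulate into spec_ab.
   The 1/N scaling assumes the usual convention that these transforms are
   unscaled, so the final backward transform yields a circular convolution of x
   and y; buffer and helper names are assumptions made for the example. */
#if 0
static void example_fft_convolution(PFFFT_Setup *s, int N,
                                    const float *x, const float *y, float *xy) {
  float *spec_a  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *spec_b  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *spec_ab = (float*)pffft_aligned_malloc(N * sizeof(float));
  int k;
  for (k = 0; k < N; ++k) spec_ab[k] = 0;                        /* accumulator starts at zero */
  pffft_transform(s, x, spec_a, 0, PFFFT_FORWARD);               /* unordered spectra */
  pffft_transform(s, y, spec_b, 0, PFFFT_FORWARD);
  pffft_zconvolve_accumulate(s, spec_a, spec_b, spec_ab, 1.0f/N); /* ab += (a*b)/N */
  pffft_transform(s, spec_ab, xy, 0, PFFFT_BACKWARD);            /* back to the time domain */
  pffft_aligned_free(spec_a); pffft_aligned_free(spec_b); pffft_aligned_free(spec_ab);
}
#endif
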
#else // defined(PFFFT_SIMD_DISABLE)

// standard routines using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}

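/* Worked example of the reordering above for a PFFFT_REAL transform with N=8
   (scalar build, SIMD_SZ == 1; N=8 is smaller than any real setup would allow,
   but it keeps the example short): the internal/fftpack layout
     [ X0.r, X1.r, X1.i, X2.r, X2.i, X3.r, X3.i, X4.r ]
   becomes, after pffft_zreorder(..., PFFFT_FORWARD), the canonical layout
     [ X0.r, X4.r, X1.r, X1.i, X2.r, X2.i, X3.r, X3.i ]
   i.e. the Nyquist term in[N-1] moves to out[1] and every other value shifts up
   by one slot; PFFFT_BACKWARD undoes exactly this. */
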
#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should only happen when finput == foutput
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
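
/* Illustrative sketch (not compiled) of the public entry points above: a forward
   transform into the canonical, ordered spectrum followed by the backward
   transform. Dividing by N after the round trip assumes the usual convention
   that these transforms are unscaled; the helper name, the test signal, and the
   choice of N are example-only (N must satisfy the size constraints documented
   in pffft.h, e.g. a power of two >= 32 for PFFFT_REAL). */
#if 0
static void example_round_trip(int N) {
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *x  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *X  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *x2 = (float*)pffft_aligned_malloc(N * sizeof(float));
  int k;
  for (k = 0; k < N; ++k) x[k] = (float)k;             /* arbitrary test signal */
  pffft_transform_ordered(s, x, X, 0, PFFFT_FORWARD);  /* X[0]=DC, X[1]=Nyquist, then (re,im) pairs */
  pffft_transform_ordered(s, X, x2, 0, PFFFT_BACKWARD);
  for (k = 0; k < N; ++k) x2[k] /= N;                  /* undo the factor N picked up by the round trip */
  pffft_aligned_free(x); pffft_aligned_free(X); pffft_aligned_free(x2);
  pffft_destroy_setup(s);
}
#endif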