/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
*/

/*
  ChangeLog:
  - 2011/10/02, version 1: This is the very first release of this file.
*/

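/* Usage sketch (not compiled): a minimal forward/inverse round trip driven
   through the public API of pffft.h. It assumes the pffft_transform_ordered()
   entry point and the PFFFT_BACKWARD direction declared in pffft.h, which are
   not visible in this part of the file. Buffers must come from
   pffft_aligned_malloc and N must satisfy the constraints checked in
   pffft_new_setup below. */
#if 0
#include "pffft.h"
static void example_roundtrip(void) {
  const int N = 1024;                       /* multiple of 32, as required for PFFFT_REAL */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
  int k;
  for (k = 0; k < N; ++k) in[k] = (k == 1) ? 1.f : 0.f;      /* unit impulse */
  pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);  /* spectrum */
  pffft_transform_ordered(s, out, in, work, PFFFT_BACKWARD); /* un-normalized inverse: scale by 1/N to recover the input */
  pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
  pffft_destroy_setup(s);
}
#endif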
#include "pffft.h"

/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
#elif defined(__GNUC__)
# define COMPILER_GCC
#endif

#ifdef COMPILER_MSVC
# define _USE_MATH_DEFINES
#endif

#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>

#if defined(COMPILER_GCC)
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif


#ifdef COMPILER_MSVC
#pragma warning( disable : 4244 4305 4204 4456 )
#endif

/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
# define SIMD_SZ 4
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    v4sf y0 = vec_mergeh(x0, x2); \
    v4sf y1 = vec_mergel(x0, x2); \
    v4sf y2 = vec_mergeh(x1, x3); \
    v4sf y3 = vec_mergel(x1, x3); \
    x0 = vec_mergeh(y0, y2); \
    x1 = vec_mergel(y0, y2); \
    x2 = vec_mergeh(y1, y3); \
    x3 = vec_mergel(y1, y3); \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    float32x4x2_t t0_ = vzipq_f32(x0, x2); \
    float32x4x2_t t1_ = vzipq_f32(x1, x3); \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
# define PFFFT_SIMD_DISABLE // fallback to scalar code
# endif
#endif

// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif
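
/* Porting sketch (not compiled): per the "vector support macros" note above, a
   new 4-wide SIMD backend only has to supply this macro set. The "my_simd.h"
   header and the my_* intrinsic names below are placeholders, not a real API. */
#if 0
#include "my_simd.h"          /* hypothetical platform header */
typedef my_float4 v4sf;       /* 4 packed single-precision floats */
# define SIMD_SZ 4
# define VZERO()      my_zero()
# define VMUL(a,b)    my_mul(a,b)
# define VADD(a,b)    my_add(a,b)
# define VMADD(a,b,c) my_add(my_mul(a,b), c)   /* a*b + c */
# define VSUB(a,b)    my_sub(a,b)
# define LD_PS1(p)    my_broadcast(p)          /* splat one float into all lanes */
/* plus INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED */
#endif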

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
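
/* Reference sketch (not compiled): what VCPLXMUL / VCPLXMULCONJ compute,
   written out for a single complex pair so the lane-wise macros above are
   easier to check. (ar,ai) is updated in place with the product with
   (br + i*bi), or with its conjugate for the CONJ variant. */
#if 0
static void cplx_mul_scalar(float *ar, float *ai, float br, float bi) {
  float re = (*ar)*br - (*ai)*bi;   /* real part of (ar+i*ai)*(br+i*bi) */
  float im = (*ai)*br + (*ar)*bi;   /* imaginary part */
  *ar = re; *ai = im;
}
static void cplx_mulconj_scalar(float *ar, float *ai, float br, float bi) {
  float re = (*ar)*br + (*ai)*bi;   /* multiply by (br - i*bi) instead */
  float im = (*ai)*br - (*ar)*bi;
  *ar = re; *ai = im;
}
#endif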

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf  v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd() {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
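
/* Usage sketch (not compiled): validate_pffft_simd() is self-checking through
   the assertv4() asserts above, so a quick SIMD sanity test is simply calling
   it from a scratch main() in a build where PFFFT_SIMD_DISABLE is not defined. */
#if 0
int main(void) {
  validate_pffft_simd();   /* aborts via assert() if a macro is miswired */
  return 0;
}
#endif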

/* SSE and co require 16-byte aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}

int pffft_simd_size() { return SIMD_SZ; }

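/* Usage sketch (not compiled): pffft_aligned_malloc over-allocates by
   MALLOC_V4SF_ALIGNMENT bytes, rounds the pointer up, and stashes the original
   malloc() pointer in the slot just before the returned block -- which is why
   such blocks must be released with pffft_aligned_free, never with free(). */
#if 0
static float *alloc_fft_buffer(int N) {
  float *buf = (float*)pffft_aligned_malloc(N * sizeof(float));
  assert(VALIGNED(buf));   /* at least 16-byte aligned, as SSE/Altivec require */
  return buf;              /* caller releases it with pffft_aligned_free(buf) */
}
#endif
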
/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
  int k, i;
  int l1ido = l1*ido;
  if (ido <= 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      ch[0] = VADD(cc[0], cc[ido+0]);
      ch[l1ido] = VSUB(cc[0], cc[ido+0]);
      ch[1] = VADD(cc[1], cc[ido+1]);
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      for (i=0; i<ido-1; i+=2) {
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
        ch[i] = VADD(cc[i+0], cc[i+ido+0]);
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
        VCPLXMUL(tr2, ti2, wr, wi);
        ch[i+l1ido] = tr2;
        ch[i+l1ido+1] = ti2;
      }
    }
  }
}

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign) {
  static const float taur = -0.5f;
  float taui = 0.866025403784439f*fsign;
  int i, k;
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
  int l1ido = l1*ido;
  float wr1, wi1, wr2, wi2;
  assert(ido > 2);
  for (k=0; k<l1ido; k += ido, cc += 3*ido, ch += ido) {
    for (i=0; i<ido-1; i+=2) {
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
      ch[i] = VADD(cc[i], tr2);
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
      ci2 = VADD(cc[i+1], SVMUL(taur,ti2));
      ch[i+1] = VADD(cc[i+1], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch[i+l1ido] = dr2;
      ch[i+l1ido + 1] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch[i+2*l1ido] = dr3;
      ch[i+2*l1ido+1] = di3;
    }
  }
} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* fsign == -1 for forward transform and +1 for backward transform */

  int i, k;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  int l1ido = l1*ido;
  if (ido == 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
      tr2 = VADD(cc[0], cc[2*ido + 0]);
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
      ti2 = VADD(cc[1], cc[2*ido + 1]);
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);

      ch[0*l1ido + 0] = VADD(tr2, tr3);
      ch[0*l1ido + 1] = VADD(ti2, ti3);
      ch[1*l1ido + 0] = VADD(tr1, tr4);
      ch[1*l1ido + 1] = VADD(ti1, ti4);
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
      ch[2*l1ido + 1] = VSUB(ti2, ti3);
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
      for (i=0; i<ido-1; i+=2) {
        float wr1, wi1, wr2, wi2, wr3, wi3;
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);

        ch[i] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);
        ch[i + 1] = VADD(ti2, ti3);
        ci3 = VSUB(ti2, ti3);

        cr2 = VADD(tr1, tr4);
        cr4 = VSUB(tr1, tr4);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        wr1=wa1[i], wi1=fsign*wa1[i+1];
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
        wr2=wa2[i], wi2=fsign*wa2[i+1];
        ch[i + l1ido] = cr2;
        ch[i + l1ido + 1] = ci2;

        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
        wr3=wa3[i], wi3=fsign*wa3[i+1];
        ch[i + 2*l1ido] = cr3;
        ch[i + 2*l1ido + 1] = ci3;

        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
        ch[i + 3*l1ido] = cr4;
        ch[i + 3*l1ido + 1] = ci4;
      }
    }
  }
} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2,
                                    const float *wa3, const float *wa4, float fsign) {
  static const float tr11 = .309016994374947f;
  const float ti11 = .951056516295154f*fsign;
  static const float tr12 = -.809016994374947f;
  const float ti12 = .587785252292473f*fsign;

  /* Local variables */
  int i, k;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;

  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;

#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]

  assert(ido > 2);
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
    for (i = 0; i < ido-1; i += 2) {
      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch_ref(i - 1, 2) = dr2;
      ch_ref(i, 2) = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch_ref(i - 1, 3) = dr3;
      ch_ref(i, 3) = di3;
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
      ch_ref(i - 1, 4) = dr4;
      ch_ref(i, 4) = di4;
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
      ch_ref(i - 1, 5) = dr5;
      ch_ref(i, 5) = di5;
    }
  }
#undef ch_ref
#undef cc_ref
}

static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
  static const float minus_one = -1.f;
  int i, k, l1ido = l1*ido;
  for (k=0; k < l1ido; k += ido) {
    v4sf a = cc[k], b = cc[k + l1ido];
    ch[2*k] = VADD(a, b);
    ch[2*(k+ido)-1] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k=0; k < l1ido; k += ido) {
      for (i=2; i<ido; i+=2) {
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i + 2*k] = VADD(bi, ti2);
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
        ch[i - 1 + 2*k] = VADD(br, tr2);
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k += ido) {
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
    ch[2*k + ido-1] = cc[k + ido-1];
  }
} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
  static const float minus_two = -2;
  int i, k, l1ido = l1*ido;
  v4sf a,b,c,d, tr2, ti2;
  for (k=0; k < l1ido; k += ido) {
    a = cc[2*k]; b = cc[2*(k+ido) - 1];
    ch[k] = VADD(a, b);
    ch[k + l1ido] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      for (i = 2; i < ido; i += 2) {
        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
        ch[i-1 + k] = VADD(a, b);
        tr2 = VSUB(a, b);
        ch[i+0 + k] = VSUB(c, d);
        ti2 = VADD(c, d);
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i-1 + k + l1ido] = tr2;
        ch[i+0 + k + l1ido] = ti2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k = 0; k < l1ido; k += ido) {
    a = cc[2*k + ido-1]; b = cc[2*k + ido];
    ch[k + ido-1] = VADD(a,a);
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
  }
} /* radb2 */

static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  int i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);

      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */


static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  int i, k, ic;
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
  for (k=0; k<l1; k++) {
    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      ch[i - 1 + (k + l1)*ido] = dr2;
      ch[i + (k + l1)*ido] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
      ch[i + (k + 2*l1)*ido] = di3;
    }
  }
} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{
  static const float minus_hsqt2 = (float)-0.7071067811865475;
  int i, k, l1ido = l1*ido;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
    v4sf * RESTRICT ch_ = ch;
    while (cc < cc_end) {
      // this loop represents between 25% and 40% of total radf4_ps cost !
      v4sf a0 = cc[0], a1 = cc[l1ido];
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
      v4sf tr1 = VADD(a1, a3);
      v4sf tr2 = VADD(a0, a2);
      ch[2*ido-1] = VSUB(a0, a2);
      ch[2*ido  ] = VSUB(a3, a1);
      ch[0      ] = VADD(tr1, tr2);
      ch[4*ido-1] = VSUB(tr2, tr1);
      cc += ido; ch += 4*ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
      for (i=2; i<ido; i += 2, pc += 2) {
        int ic = ido - i;
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;

        cr2 = pc[1*l1ido+0];
        ci2 = pc[1*l1ido+1];
        wr = LD_PS1(wa1[i - 2]);
        wi = LD_PS1(wa1[i - 1]);
        VCPLXMULCONJ(cr2,ci2,wr,wi);

        cr3 = pc[2*l1ido+0];
        ci3 = pc[2*l1ido+1];
        wr = LD_PS1(wa2[i-2]);
        wi = LD_PS1(wa2[i-1]);
        VCPLXMULCONJ(cr3, ci3, wr, wi);

        cr4 = pc[3*l1ido];
        ci4 = pc[3*l1ido+1];
        wr = LD_PS1(wa3[i-2]);
        wi = LD_PS1(wa3[i-1]);
        VCPLXMULCONJ(cr4, ci4, wr, wi);

        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

        tr1 = VADD(cr2,cr4);
        tr4 = VSUB(cr4,cr2);
        tr2 = VADD(pc[0],cr3);
        tr3 = VSUB(pc[0],cr3);
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
        ti1 = VADD(ci2,ci4);
        ti4 = VSUB(ci2,ci4);
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
        ti2 = VADD(pc[1],ci3);
        ti3 = VSUB(pc[1],ci3);
        ch[i + 4*k] = VADD(ti1, ti2);
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k<l1ido; k += ido) {
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
    ch[ido-1 + 4*k] = VADD(tr1, c);
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
    ch[4*k + 1*ido] = VSUB(ti1, d);
    ch[4*k + 3*ido] = VADD(ti1, d);
  }
} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
  static const float minus_sqrt2 = (float)-1.414213562373095;
  static const float two = 2.f;
  int i, k, l1ido = l1*ido;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
    v4sf *ch_ = ch;
    while (ch < ch_end) {
      v4sf a = cc[0], b = cc[4*ido-1];
      v4sf c = cc[2*ido], d = cc[2*ido-1];
      tr3 = SVMUL(two,d);
      tr2 = VADD(a,b);
      tr1 = VSUB(a,b);
      tr4 = SVMUL(two,c);
      ch[0*l1ido] = VADD(tr2, tr3);
      ch[2*l1ido] = VSUB(tr2, tr3);
      ch[1*l1ido] = VSUB(tr1, tr4);
      ch[3*l1ido] = VADD(tr1, tr4);

      cc += 4*ido; ch += ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
      for (i = 2; i < ido; i += 2) {

        tr1 = VSUB(pc[i], pc[4*ido - i]);
        tr2 = VADD(pc[i], pc[4*ido - i]);
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
        ph[0] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);

        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        cr2 = VSUB(tr1, tr4);
        cr4 = VADD(tr1, tr4);

        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);

        ph[1] = VADD(ti2, ti3); ph += l1ido;
        ci3 = VSUB(ti2, ti3);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
        ph[0] = cr2;
        ph[1] = ci2; ph += l1ido;
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
        ph[0] = cr3;
        ph[1] = ci3; ph += l1ido;
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
        ph[0] = cr4;
        ph[1] = ci4; ph = ph - 3*l1ido + 2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k+=ido) {
    int i0 = 4*k + ido;
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
    tr1 = VSUB(c,d);
    tr2 = VADD(c,d);
    ti1 = VADD(b,a);
    ti2 = VSUB(b,a);
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
  }
} /* radb4 */

static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* System generated locals */
  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * 6;
  ch -= ch_offset;
  cc_offset = 1 + ido * (1 + l1);
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */

static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * (1 + l1);
  ch -= ch_offset;
  cc_offset = 1 + ido * 6;
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */

static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l2 = n;
  int iw = n-1;
  assert(in != out && work1 != work2);
  for (k1 = 1; k1 <= nf; ++k1) {
    int kh = nf - k1;
    int ip = ifac[kh + 2];
    int l1 = l2 / ip;
    int ido = n / l2;
    iw -= (ip - 1)*ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l2 = l1;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out);
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1 + 1];
    int l2 = ip*l1;
    int ido = n / l2;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido;

    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
}

static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
    int ntry = ntryh[j];
    while (nl != 1) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr == 0) {
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
          for (i = 2; i <= nf; ++i) {
            int ib = nf - i + 2;
            ifac[ib + 1] = ifac[ib];
          }
          ifac[2] = 2;
        }
      } else break;
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
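
/* Worked example (sketch): decompose(120, ifac, ntryh) with the radix table
   { 4,2,3,5,0 } used by rffti1_ps below peels factors off in table order and
   then moves a factor 2 to the front: 120 = 4*30, 30 = 2*15, 15 = 3*5, 5 = 5*1,
   which yields ifac = { 120, 4,  2, 4, 3, 5 } -- i.e. the size, the number of
   stages, then the radix of each stage. */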



static void rffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 4,2,3,5,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*(float)M_PI) / n;
  int is = 0;
  int nfm1 = nf - 1;
  int l1 = 1;
  for (k1 = 1; k1 <= nfm1; k1++) {
    int ip = ifac[k1 + 1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int ipm = ip - 1;
    for (j = 1; j <= ipm; ++j) {
      float argld;
      int i = is, fi=0;
      ld += l1;
      argld = ld*argh;
      for (ii = 3; ii <= ido; ii += 2) {
        i += 2;
        fi += 1;
        wa[i - 2] = cosf(fi*argld);
        wa[i - 1] = sinf(fi*argld);
      }
      is += ido;
    }
    l1 = l2;
  }
} /* rffti1 */

void cffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 5,3,4,2,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*(float)M_PI) / n;
  int i = 1;
  int l1 = 1;
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1+1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int idot = ido + ido + 2;
    int ipm = ip - 1;
    for (j=1; j<=ipm; j++) {
      float argld;
      int i1 = i, fi = 0;
      wa[i-1] = 1;
      wa[i] = 0;
      ld += l1;
      argld = ld*argh;
      for (ii = 4; ii <= idot; ii += 2) {
        i += 2;
        fi += 1;
        wa[i-1] = cosf(fi*argld);
        wa[i] = sinf(fi*argld);
      }
      if (ip > 5) {
        wa[i1-1] = wa[i-1];
        wa[i1] = wa[i];
      }
    }
    l1 = l2;
  }
} /* cffti1 */


v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out && work1 != work2);
  for (k1=2; k1<=nf+1; k1++) {
    int ip = ifac[k1];
    int l2 = ip*l1;
    int ido = n / l2;
    int idot = ido + ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
      } break;
      case 4: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
      } break;
      case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
      } break;
      case 3: {
        int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
      } break;
      default:
        assert(0);
    }
    l1 = l2;
    iw += (ip - 1)*idot;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }

  return in; /* this is in fact the output .. */
}


struct PFFFT_Setup {
  int     N;
  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
  int     ifac[15];
  pffft_transform_t transform;
  v4sf   *data;    // allocated room for twiddle coefs
  float  *e;       // points into 'data', N/4*3 elements
  float  *twiddle; // points into 'data', N/4 elements
};

PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
  //assert((N % 32) == 0);
  s->N = N;
  s->transform = transform;
  /* nb of complex simd vectors */
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
  s->e = (float*)s->data;
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);

  if (transform == PFFFT_REAL) {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*(float)M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cosf(A);
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sinf(A);
      }
    }
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*(float)M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cosf(A);
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sinf(A);
      }
    }
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }

  /* check that N is decomposable with allowed prime factors */
  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
  if (m != N/SIMD_SZ) {
    pffft_destroy_setup(s); s = 0;
  }

  return s;
}

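/* Usage sketch (not compiled): with SIMD enabled (SIMD_SZ == 4) the asserts
   above require N to be a multiple of 32 for PFFFT_REAL and of 16 for
   PFFFT_COMPLEX, and N/SIMD_SZ must factor into 2, 3, 4 and 5 only --
   otherwise pffft_new_setup returns 0. */
#if 0
static int setup_is_supported(int N, pffft_transform_t transform) {
  PFFFT_Setup *s = pffft_new_setup(N, transform);  /* e.g. N = 480 = 32*3*5 is fine for PFFFT_REAL */
  if (!s) return 0;                                /* N/SIMD_SZ not decomposable into 2,3,4,5 */
  pffft_destroy_setup(s);
  return 1;
}
#endif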

void pffft_destroy_setup(PFFFT_Setup *s) {
  pffft_aligned_free(s->data);
  free(s);
}

#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
  v4sf g0, g1;
  int k;
  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;

  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
  for (k=1; k < N; ++k) {
    v4sf h0, h1;
    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
    *--out = VSWAPHL(g1, h0);
    *--out = VSWAPHL(h0, h1);
    g1 = h1;
  }
  *--out = VSWAPHL(g1, g0);
}

static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
  v4sf g0, g1, h0, h1;
  int k;
  g0 = g1 = in[0]; ++in;
  for (k=1; k < N; ++k) {
    h0 = *in++; h1 = *in++;
    g1 = VSWAPHL(g1, h0);
    h0 = VSWAPHL(h0, h1);
    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
    g1 = h1;
  }
  h0 = *in++; h1 = g0;
  g1 = VSWAPHL(g1, h0);
  h0 = VSWAPHL(h0, h1);
  UNINTERLEAVE2(h0, g1, out[0], out[1]);
}

void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N, Ncvec = setup->Ncvec;
  const v4sf *vin = (const v4sf*)in;
  v4sf *vout = (v4sf*)out;
  assert(in != out);
  if (setup->transform == PFFFT_REAL) {
    int k, dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
      }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
    } else {
      for (k=0; k < dk; ++k) {
        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
      }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
    }
  } else {
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
      }
    } else {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
      }
    }
  }
}

void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];
    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);
    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    /*
      transformation for each column is:

      [1  1  1  1  0  0  0  0]   [r0]
      [1  0 -1  0  0 -1  0  1]   [r1]
      [1 -1  1 -1  0  0  0  0]   [r2]
      [1  0 -1  0  0  1  0 -1]   [r3]
      [0  0  0  0  1  1  1  1] * [i0]
      [0  1  0 -1  1  0 -1  0]   [i1]
      [0  0  0  0  1 -1  1 -1]   [i2]
      [0 -1  0  1  1  0 -1  0]   [i3]
    */

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}

void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);

    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}


static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                                                   const v4sf *e, v4sf *out) {
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0; i0 = *in1;
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1  1  1  1  0  0  0  0]   [r0]
    [1  0 -1  0  0 -1  0  1]   [r1]
    [1  0 -1  0  0  1  0 -1]   [r2]
    [1 -1  1 -1  0  0  0  0]   [r3]
    [0  0  0  0  1  1  1  1] * [i0]
    [0 -1  0  1 -1  0  1  0]   [i1]
    [0 -1  0  1  1  0 -1  0]   [i2]
    [0  0  0  0 -1  1 -1  1]   [i3]
  */

  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;

}

1481static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1482 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1483 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
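  /* e.g. for a length-8 real FFT: [X0.re, X1.re, X1.im, X2.re, X2.im, X3.re, X3.im, X4.re];
     X(0) and X(4) (the Nyquist bin) are purely real. */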
1484
1485 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1486 v4sf save = in[7], zero=VZERO();
1487 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
Márton Danóczyc5062dc2016-04-07 18:59:33 +02001488 static const float s = (float)M_SQRT2/2;
Julien Pommier370d2092011-11-19 18:04:25 +01001489
1490 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1491 assert(in != out);
1492 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1493
1494 /*
1495 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1496
1497 [Xr(1) ] [1 1 1 1 0 0 0 0]
1498 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1499 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1500 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1501 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1502 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1503 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1504 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1505 */
1506
1507 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1508 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1509 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1510 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1511 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1512 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1513 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1514 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1515
1516 for (k=1; k < dk; ++k) {
1517 v4sf save_next = in[8*k+7];
1518 pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
1519 e + k*6, out + k*8);
1520 save = save_next;
1521 }
1522
1523}
1524
Julien Pommier432b3e82013-01-12 19:28:03 +01001525static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
Julien Pommier370d2092011-11-19 18:04:25 +01001526 const v4sf *e, v4sf *out, int first) {
1527 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1528 /*
1529 transformation for each column is:
1530
1531 [1 1 1 1 0 0 0 0] [r0]
1532 [1 0 0 -1 0 -1 -1 0] [r1]
1533 [1 -1 -1 1 0 0 0 0] [r2]
1534 [1 0 0 -1 0 1 1 0] [r3]
1535 [0 0 0 0 1 -1 1 -1] * [i0]
1536 [0 -1 1 0 1 0 0 1] [i1]
1537 [0 0 0 0 1 1 -1 -1] [i2]
1538 [0 1 -1 0 1 0 0 1] [i3]
1539 */
1540
1541 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1542 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1543 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1544 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1545
1546 r0 = VADD(sr0, sr1);
1547 r2 = VSUB(sr0, sr1);
1548 r1 = VSUB(dr0, si1);
1549 r3 = VADD(dr0, si1);
1550 i0 = VSUB(di0, di1);
1551 i2 = VADD(di0, di1);
1552 i1 = VSUB(si0, dr1);
1553 i3 = VADD(si0, dr1);
1554
1555 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1556 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1557 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1558
1559 VTRANSPOSE4(r0,r1,r2,r3);
1560 VTRANSPOSE4(i0,i1,i2,i3);
1561
1562 if (!first) {
1563 *out++ = r0;
1564 *out++ = i0;
1565 }
1566 *out++ = r1;
1567 *out++ = i1;
1568 *out++ = r2;
1569 *out++ = i2;
1570 *out++ = r3;
1571 *out++ = i3;
1572}
1573
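/* Counterpart of pffft_real_finalize, used by the backward real transform: the 4x4
   blocks are run through the inverse butterfly, multiplied by the conjugate twiddle
   factors and transposed, so the data can be fed to the scalar FFTPACK backward
   passes (rfftb1_ps). The scalar code at the end fills the first and last output
   vectors, which FFTPACK packs specially. */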
1574static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1575 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1576 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1577
1578 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1579 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
Márton Danóczyc5062dc2016-04-07 18:59:33 +02001580 static const float s = (float)M_SQRT2;
Julien Pommier370d2092011-11-19 18:04:25 +01001581 assert(in != out);
1582 for (k=0; k < 4; ++k) {
1583 Xr.f[k] = ((float*)in)[8*k];
1584 Xi.f[k] = ((float*)in)[8*k+4];
1585 }
1586
1587 pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1588
1589 /*
1590 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1591
1592 [cr0] [1 0 2 0 1 0 0 0]
1593 [cr1] [1 0 0 0 -1 0 -2 0]
1594 [cr2] [1 0 -2 0 1 0 0 0]
1595 [cr3] [1 0 0 0 -1 0 2 0]
1596 [ci0] [0 2 0 2 0 0 0 0]
1597 [ci1] [0 s 0 -s 0 -s 0 -s]
1598 [ci2] [0 0 0 0 0 -2 0 2]
1599 [ci3] [0 -s 0 s 0 -s 0 -s]
1600 */
1601 for (k=1; k < dk; ++k) {
1602 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1603 }
1604
1605 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1606 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1607 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1608 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1609 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1610 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1611 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1612 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1613}
1614
1615
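/* Core driver shared by pffft_transform and pffft_transform_ordered. The data
   ping-pongs between the caller's output buffer and the scratch buffer
   (buff[0]/buff[1]); 'ib' tracks which of the two currently holds the result, and
   its initial value is derived from the parity of the FFTPACK stages (and from
   'ordered') so that the last pass normally writes straight into the output. If the
   result nevertheless lands in the scratch buffer, it is copied back at the end,
   which is only expected for in-place transforms. */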
1616void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
1617 pffft_direction_t direction, int ordered) {
1618 int k, Ncvec = setup->Ncvec;
1619 int nf_odd = (setup->ifac[1] & 1);
1620
1621 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1622 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1623 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
Julien Pommier370d2092011-11-19 18:04:25 +01001624
1625 const v4sf *vinput = (const v4sf*)finput;
1626 v4sf *voutput = (v4sf*)foutput;
Julien Pommier836bc4b2011-11-20 10:58:07 +01001627 v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
Julien Pommier370d2092011-11-19 18:04:25 +01001628 int ib = (nf_odd ^ ordered ? 1 : 0);
1629
Julien Pommier370d2092011-11-19 18:04:25 +01001630 assert(VALIGNED(finput) && VALIGNED(foutput));
1631
1632 //assert(finput != foutput);
1633 if (direction == PFFFT_FORWARD) {
1634 ib = !ib;
1635 if (setup->transform == PFFFT_REAL) {
1636 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1637 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1638 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1639 } else {
1640 v4sf *tmp = buff[ib];
1641 for (k=0; k < Ncvec; ++k) {
1642 UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1643 }
1644 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1645 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1646 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1647 }
1648 if (ordered) {
1649 pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1650 } else ib = !ib;
1651 } else {
1652 if (vinput == buff[ib]) {
1653 ib = !ib; // may happen when finput == foutput
1654 }
1655 if (ordered) {
1656 pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1657 vinput = buff[ib]; ib = !ib;
1658 }
1659 if (setup->transform == PFFFT_REAL) {
1660 pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1661 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1662 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1663 } else {
1664 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1665 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1666 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1667 for (k=0; k < Ncvec; ++k) {
1668 INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1669 }
1670 }
1671 }
1672
1673 if (buff[ib] != voutput) {
1674 /* extra copy required -- this situation should only happen when finput == foutput */
1675 assert(finput==foutput);
1676 for (k=0; k < Ncvec; ++k) {
1677 v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1678 voutput[2*k] = a; voutput[2*k+1] = b;
1679 }
1680 ib = !ib;
1681 }
1682 assert(buff[ib] == voutput);
1683}
1684
1685void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
Julien Pommier432b3e82013-01-12 19:28:03 +01001686 int Ncvec = s->Ncvec;
Julien Pommier370d2092011-11-19 18:04:25 +01001687 const v4sf * RESTRICT va = (const v4sf*)a;
1688 const v4sf * RESTRICT vb = (const v4sf*)b;
1689 v4sf * RESTRICT vab = (v4sf*)ab;
1690
1691#ifdef __arm__
1692 __builtin_prefetch(va);
1693 __builtin_prefetch(vb);
1694 __builtin_prefetch(vab);
1695 __builtin_prefetch(va+2);
1696 __builtin_prefetch(vb+2);
1697 __builtin_prefetch(vab+2);
1698 __builtin_prefetch(va+4);
1699 __builtin_prefetch(vb+4);
1700 __builtin_prefetch(vab+4);
1701 __builtin_prefetch(va+6);
1702 __builtin_prefetch(vb+6);
1703 __builtin_prefetch(vab+6);
Julien Pommier432b3e82013-01-12 19:28:03 +01001704# ifndef __clang__
1705# define ZCONVOLVE_USING_INLINE_NEON_ASM
1706# endif
Julien Pommier370d2092011-11-19 18:04:25 +01001707#endif
1708
1709 float ar, ai, br, bi, abr, abi;
Julien Pommier432b3e82013-01-12 19:28:03 +01001710#ifndef ZCONVOLVE_USING_INLINE_NEON_ASM
Julien Pommier370d2092011-11-19 18:04:25 +01001711 v4sf vscal = LD_PS1(scaling);
Julien Pommier432b3e82013-01-12 19:28:03 +01001712 int i;
1713#endif
Julien Pommier370d2092011-11-19 18:04:25 +01001714
1715 assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1716 ar = ((v4sf_union*)va)[0].f[0];
1717 ai = ((v4sf_union*)va)[1].f[0];
1718 br = ((v4sf_union*)vb)[0].f[0];
1719 bi = ((v4sf_union*)vb)[1].f[0];
1720 abr = ((v4sf_union*)vab)[0].f[0];
1721 abi = ((v4sf_union*)vab)[1].f[0];
1722
Julien Pommier432b3e82013-01-12 19:28:03 +01001723#ifdef ZCONVOLVE_USING_INLINE_NEON_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to gcc
Julien Pommier370d2092011-11-19 18:04:25 +01001724 const float *a_ = a, *b_ = b; float *ab_ = ab;
1725 int N = Ncvec;
1726 asm volatile("mov r8, %2 \n"
1727 "vdup.f32 q15, %4 \n"
1728 "1: \n"
1729 "pld [%0,#64] \n"
1730 "pld [%1,#64] \n"
1731 "pld [%2,#64] \n"
1732 "pld [%0,#96] \n"
1733 "pld [%1,#96] \n"
1734 "pld [%2,#96] \n"
1735 "vld1.f32 {q0,q1}, [%0,:128]! \n"
1736 "vld1.f32 {q4,q5}, [%1,:128]! \n"
1737 "vld1.f32 {q2,q3}, [%0,:128]! \n"
1738 "vld1.f32 {q6,q7}, [%1,:128]! \n"
1739 "vld1.f32 {q8,q9}, [r8,:128]! \n"
1740
1741 "vmul.f32 q10, q0, q4 \n"
1742 "vmul.f32 q11, q0, q5 \n"
1743 "vmul.f32 q12, q2, q6 \n"
1744 "vmul.f32 q13, q2, q7 \n"
1745 "vmls.f32 q10, q1, q5 \n"
1746 "vmla.f32 q11, q1, q4 \n"
1747 "vld1.f32 {q0,q1}, [r8,:128]! \n"
1748 "vmls.f32 q12, q3, q7 \n"
1749 "vmla.f32 q13, q3, q6 \n"
1750 "vmla.f32 q8, q10, q15 \n"
1751 "vmla.f32 q9, q11, q15 \n"
1752 "vmla.f32 q0, q12, q15 \n"
1753 "vmla.f32 q1, q13, q15 \n"
1754 "vst1.f32 {q8,q9},[%2,:128]! \n"
1755 "vst1.f32 {q0,q1},[%2,:128]! \n"
1756 "subs %3, #2 \n"
1757 "bne 1b \n"
1758 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
Julien Pommier432b3e82013-01-12 19:28:03 +01001759#else // default routine, works fine for non-ARM CPUs with current compilers
Julien Pommier370d2092011-11-19 18:04:25 +01001760 for (i=0; i < Ncvec; i += 2) {
1761 v4sf ar, ai, br, bi;
1762 ar = va[2*i+0]; ai = va[2*i+1];
1763 br = vb[2*i+0]; bi = vb[2*i+1];
1764 VCPLXMUL(ar, ai, br, bi);
1765 vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1766 vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1767 ar = va[2*i+2]; ai = va[2*i+3];
1768 br = vb[2*i+2]; bi = vb[2*i+3];
1769 VCPLXMUL(ar, ai, br, bi);
1770 vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1771 vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1772 }
1773#endif
1774 if (s->transform == PFFFT_REAL) {
1775 ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1776 ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1777 }
1778}
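
/* Usage sketch for pffft_zconvolve_accumulate (illustrative only; error handling
   omitted, buffer sizes assume the PFFFT_REAL case, memset is from <string.h>).
   The spectra must come from pffft_transform(), i.e. the internal unordered
   layout, not from pffft_transform_ordered(); since the transforms are
   unnormalized, a scaling of 1.0/N yields the circular convolution after the
   backward pass.

     int N = 1024;                                        // any supported FFT size
     PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
     float *x    = pffft_aligned_malloc(N * sizeof(float)); // first signal (filled by caller)
     float *y    = pffft_aligned_malloc(N * sizeof(float)); // second signal (filled by caller)
     float *xy   = pffft_aligned_malloc(N * sizeof(float)); // accumulator / result
     float *work = pffft_aligned_malloc(N * sizeof(float));
     pffft_transform(s, x, x, work, PFFFT_FORWARD);         // in-place: x now holds its spectrum
     pffft_transform(s, y, y, work, PFFFT_FORWARD);
     memset(xy, 0, N * sizeof(float));                      // the function accumulates into xy
     pffft_zconvolve_accumulate(s, x, y, xy, 1.0f / N);
     pffft_transform(s, xy, xy, work, PFFFT_BACKWARD);      // xy now holds the circular convolution
     pffft_aligned_free(work); pffft_aligned_free(xy);
     pffft_aligned_free(y); pffft_aligned_free(x);
     pffft_destroy_setup(s);
*/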
1779
1780
1781#else // defined(PFFFT_SIMD_DISABLE)
1782
1783// fallback routines using scalar floats, without any SIMD instructions.
1784
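/* Scalar fallback for pffft_zreorder. For the forward real transform it simply
   moves the last FFTPACK value (the purely real Nyquist term) next to the DC term:
   e.g. for N=8, [X0.re, X1.re, X1.im, X2.re, X2.im, X3.re, X3.im, X4.re] becomes
   [X0.re, X4.re, X1.re, X1.im, X2.re, X2.im, X3.re, X3.im]; the backward direction
   undoes this permutation. */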
1785#define pffft_zreorder_nosimd pffft_zreorder
1786void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1787 int k, N = setup->N;
1788 if (setup->transform == PFFFT_COMPLEX) {
1789 for (k=0; k < 2*N; ++k) out[k] = in[k];
1790 return;
1791 }
1792 else if (direction == PFFFT_FORWARD) {
1793 float x_N = in[N-1];
1794 for (k=N-1; k > 1; --k) out[k] = in[k-1];
1795 out[0] = in[0];
1796 out[1] = x_N;
1797 } else {
1798 float x_N = in[1];
1799 for (k=1; k < N-1; ++k) out[k] = in[k+1];
1800 out[0] = in[0];
1801 out[N-1] = x_N;
1802 }
1803}
1804
1805#define pffft_transform_internal_nosimd pffft_transform_internal
1806void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
1807 pffft_direction_t direction, int ordered) {
1808 int Ncvec = setup->Ncvec;
1809 int nf_odd = (setup->ifac[1] & 1);
1810
1811 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1812 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1813 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
Julien Pommier2a195842012-10-11 11:11:41 +02001814 float *buff[2];
Julien Pommier370d2092011-11-19 18:04:25 +01001815 int ib;
Julien Pommier2a195842012-10-11 11:11:41 +02001816 if (scratch == 0) scratch = scratch_on_stack;
1817 buff[0] = output; buff[1] = scratch;
1818
Julien Pommier370d2092011-11-19 18:04:25 +01001819 if (setup->transform == PFFFT_COMPLEX) ordered = 0; // the complex transform output is always ordered.
1820 ib = (nf_odd ^ ordered ? 1 : 0);
1821
1822 if (direction == PFFFT_FORWARD) {
1823 if (setup->transform == PFFFT_REAL) {
1824 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1825 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1826 } else {
1827 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1828 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1829 }
1830 if (ordered) {
1831 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1832 }
1833 } else {
1834 if (input == buff[ib]) {
1835 ib = !ib; // may happen when finput == foutput
1836 }
1837 if (ordered) {
1838 pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
1839 input = buff[!ib];
1840 }
1841 if (setup->transform == PFFFT_REAL) {
1842 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1843 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1844 } else {
1845 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1846 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1847 }
1848 }
1849 if (buff[ib] != output) {
1850 int k;
1851 // extra copy required -- this situation should only happen when input == output
1852 assert(input==output);
1853 for (k=0; k < Ncvec; ++k) {
1854 float a = buff[ib][2*k], b = buff[ib][2*k+1];
1855 output[2*k] = a; output[2*k+1] = b;
1856 }
1857 ib = !ib;
1858 }
1859 assert(buff[ib] == output);
1860}
1861
1862#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
1863void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
1864 float *ab, float scaling) {
1865 int i, Ncvec = s->Ncvec;
1866
1867 if (s->transform == PFFFT_REAL) {
1868 // take care of the fftpack ordering
1869 ab[0] += a[0]*b[0]*scaling;
1870 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
1871 ++ab; ++a; ++b; --Ncvec;
1872 }
1873 for (i=0; i < Ncvec; ++i) {
1874 float ar, ai, br, bi;
1875 ar = a[2*i+0]; ai = a[2*i+1];
1876 br = b[2*i+0]; bi = b[2*i+1];
1877 VCPLXMUL(ar, ai, br, bi);
1878 ab[2*i+0] += ar*scaling;
1879 ab[2*i+1] += ai*scaling;
1880 }
1881}
1882
1883#endif // defined(PFFFT_SIMD_DISABLE)
1884
1885void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1886 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
1887}
1888
1889void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1890 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
1891}
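
/* Minimal usage sketch of the ordered API (illustrative; see pffft.h for the
   authoritative documentation). Buffers must be allocated with
   pffft_aligned_malloc, and the transforms are unnormalized:
   PFFFT_BACKWARD(PFFFT_FORWARD(x)) == N*x.

     int N = 4096;                                         // e.g. a power of two >= 32 for PFFFT_REAL
     PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
     float *in   = pffft_aligned_malloc(N * sizeof(float));
     float *out  = pffft_aligned_malloc(N * sizeof(float));
     float *work = pffft_aligned_malloc(N * sizeof(float));
     // ... fill 'in' with N samples ...
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     // out[0] = X(0).re, out[1] = X(N/2).re, then out[2k], out[2k+1] = X(k).re, X(k).im
     pffft_transform_ordered(s, out, in, work, PFFFT_BACKWARD); // 'in' is now N times the original signal
     pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/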