1/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
2 Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
3 Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
4
5 Based on original fortran 77 code from FFTPACKv4 from NETLIB
6 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
7 of NCAR, in 1985.
8
9 As confirmed by the NCAR fftpack software curators, the following
10 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
11 released under the same terms.
12
13 FFTPACK license:
14
15 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
16
17 Copyright (c) 2004 the University Corporation for Atmospheric
18 Research ("UCAR"). All rights reserved. Developed by NCAR's
19 Computational and Information Systems Laboratory, UCAR,
20 www.cisl.ucar.edu.
21
22 Redistribution and use of the Software in source and binary forms,
23 with or without modification, is permitted provided that the
24 following conditions are met:
25
26 - Neither the names of NCAR's Computational and Information Systems
27 Laboratory, the University Corporation for Atmospheric Research,
28 nor the names of its sponsors or contributors may be used to
29 endorse or promote products derived from this Software without
30 specific prior written permission.
31
32 - Redistributions of source code must retain the above copyright
33 notices, this list of conditions, and the disclaimer below.
34
35 - Redistributions in binary form must reproduce the above copyright
36 notice, this list of conditions, and the disclaimer below in the
37 documentation and/or other materials provided with the
38 distribution.
39
40 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
42 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
43 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
44 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
45 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
46 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
47 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
48 SOFTWARE.
49
50
51 PFFFT : a Pretty Fast FFT.
52
53 This file is largely based on the original FFTPACK implementation, modified in
54 order to take advantage of the SIMD instructions of modern CPUs.
55*/
56
57/* this file requires architecture-specific preprocessor definitions;
58 * it is intended for library-internal use only
59 */
60
61
62/* define our own constants, required when g++ extensions are turned off .. */
63#ifndef M_PI
64 #define M_PI 3.14159265358979323846 /* pi */
65#endif
66
67#ifndef M_SQRT2
68 #define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
69#endif
70
71
72int FUNC_SIMD_SIZE() { return SIMD_SZ; }
73
74const char * FUNC_SIMD_ARCH() { return VARCH; }
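/* Note (illustrative, not part of the original source): SIMD_SZ is the number of float lanes
   per v4sf vector -- 4 for the vectorized code paths guarded by "SIMD_SZ == 4" below, and
   typically 1 for the scalar fallback -- while VARCH names the SIMD architecture this
   translation unit was compiled for. */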
75
76
77/*
78 passf2 and passb2 have been merged here; fsign = -1 for passf2, +1 for passb2
79*/
80static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
81 int k, i;
82 int l1ido = l1*ido;
83 if (ido <= 2) {
84 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
85 ch[0] = VADD(cc[0], cc[ido+0]);
86 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
87 ch[1] = VADD(cc[1], cc[ido+1]);
88 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
89 }
90 } else {
91 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
92 for (i=0; i<ido-1; i+=2) {
93 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
94 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
95 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
96 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
97 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
98 VCPLXMUL(tr2, ti2, wr, wi);
99 ch[i+l1ido] = tr2;
100 ch[i+l1ido+1] = ti2;
101 }
102 }
103 }
104}
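/* Scalar sketch (illustrative) of the radix-2 butterfly implemented above, per complex element
   c0 = cc[i] + j*cc[i+1] and c1 = cc[i+ido] + j*cc[i+ido+1]:
       ch[i]       + j*ch[i+1]       = c0 + c1
       ch[i+l1ido] + j*ch[i+l1ido+1] = (c0 - c1) * (wa1[i] + j*fsign*wa1[i+1])
   i.e. fsign only selects whether the twiddle factor is conjugated (forward, -1) or not
   (backward, +1). */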
105
106/*
107 passf3 and passb3 have been merged here; fsign = -1 for passf3, +1 for passb3
108*/
109static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
110 const float *wa1, const float *wa2, float fsign) {
111 static const float taur = -0.5f;
112 float taui = 0.866025403784439f*fsign;
113 int i, k;
114 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
115 int l1ido = l1*ido;
116 float wr1, wi1, wr2, wi2;
117 assert(ido > 2);
118 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
119 for (i=0; i<ido-1; i+=2) {
120 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
121 cr2 = VADD(cc[i], SVMUL(taur,tr2));
122 ch[i] = VADD(cc[i], tr2);
123 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
124 ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
125 ch[i+1] = VADD(cc[i+1], ti2);
126 cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
127 ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
128 dr2 = VSUB(cr2, ci3);
129 dr3 = VADD(cr2, ci3);
130 di2 = VADD(ci2, cr3);
131 di3 = VSUB(ci2, cr3);
132 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
133 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
134 ch[i+l1ido] = dr2;
135 ch[i+l1ido + 1] = di2;
136 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
137 ch[i+2*l1ido] = dr3;
138 ch[i+2*l1ido+1] = di3;
139 }
140 }
141} /* passf3 */
142
143static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
144 const float *wa1, const float *wa2, const float *wa3, float fsign) {
145 /* fsign == -1 for forward transform and +1 for backward transform */
146
147 int i, k;
148 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
149 int l1ido = l1*ido;
150 if (ido == 2) {
151 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
152 tr1 = VSUB(cc[0], cc[2*ido + 0]);
153 tr2 = VADD(cc[0], cc[2*ido + 0]);
154 ti1 = VSUB(cc[1], cc[2*ido + 1]);
155 ti2 = VADD(cc[1], cc[2*ido + 1]);
156 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
157 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
158 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
159 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
160
161 ch[0*l1ido + 0] = VADD(tr2, tr3);
162 ch[0*l1ido + 1] = VADD(ti2, ti3);
163 ch[1*l1ido + 0] = VADD(tr1, tr4);
164 ch[1*l1ido + 1] = VADD(ti1, ti4);
165 ch[2*l1ido + 0] = VSUB(tr2, tr3);
166 ch[2*l1ido + 1] = VSUB(ti2, ti3);
167 ch[3*l1ido + 0] = VSUB(tr1, tr4);
168 ch[3*l1ido + 1] = VSUB(ti1, ti4);
169 }
170 } else {
171 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
172 for (i=0; i<ido-1; i+=2) {
173 float wr1, wi1, wr2, wi2, wr3, wi3;
174 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
175 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
176 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
177 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
178 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
179 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
180 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
181 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
182
183 ch[i] = VADD(tr2, tr3);
184 cr3 = VSUB(tr2, tr3);
185 ch[i + 1] = VADD(ti2, ti3);
186 ci3 = VSUB(ti2, ti3);
187
188 cr2 = VADD(tr1, tr4);
189 cr4 = VSUB(tr1, tr4);
190 ci2 = VADD(ti1, ti4);
191 ci4 = VSUB(ti1, ti4);
192 wr1=wa1[i], wi1=fsign*wa1[i+1];
193 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
194 wr2=wa2[i], wi2=fsign*wa2[i+1];
195 ch[i + l1ido] = cr2;
196 ch[i + l1ido + 1] = ci2;
197
198 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
199 wr3=wa3[i], wi3=fsign*wa3[i+1];
200 ch[i + 2*l1ido] = cr3;
201 ch[i + 2*l1ido + 1] = ci3;
202
203 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
204 ch[i + 3*l1ido] = cr4;
205 ch[i + 3*l1ido + 1] = ci4;
206 }
207 }
208 }
209} /* passf4 */
210
211/*
212 passf5 and passb5 have been merged here; fsign = -1 for passf5, +1 for passb5
213*/
214static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
215 const float *wa1, const float *wa2,
216 const float *wa3, const float *wa4, float fsign) {
217 static const float tr11 = .309016994374947f;
218 const float ti11 = .951056516295154f*fsign;
219 static const float tr12 = -.809016994374947f;
220 const float ti12 = .587785252292473f*fsign;
221
222 /* Local variables */
223 int i, k;
224 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
225 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
226
227 float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
228
229#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
230#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
231
232 assert(ido > 2);
233 for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
234 for (i = 0; i < ido-1; i += 2) {
235 ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
236 ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
237 ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
238 ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
239 tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
240 tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
241 tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
242 tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
243 ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
244 ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
245 cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
246 ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
247 cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
248 ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
249 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
250 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
251 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
252 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
253 dr3 = VSUB(cr3, ci4);
254 dr4 = VADD(cr3, ci4);
255 di3 = VADD(ci3, cr4);
256 di4 = VSUB(ci3, cr4);
257 dr5 = VADD(cr2, ci5);
258 dr2 = VSUB(cr2, ci5);
259 di5 = VSUB(ci2, cr5);
260 di2 = VADD(ci2, cr5);
261 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
262 wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
263 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
264 ch_ref(i - 1, 2) = dr2;
265 ch_ref(i, 2) = di2;
266 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
267 ch_ref(i - 1, 3) = dr3;
268 ch_ref(i, 3) = di3;
269 VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
270 ch_ref(i - 1, 4) = dr4;
271 ch_ref(i, 4) = di4;
272 VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
273 ch_ref(i - 1, 5) = dr5;
274 ch_ref(i, 5) = di5;
275 }
276 }
277#undef ch_ref
278#undef cc_ref
279}
280
281static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
282 static const float minus_one = -1.f;
283 int i, k, l1ido = l1*ido;
284 for (k=0; k < l1ido; k += ido) {
285 v4sf a = cc[k], b = cc[k + l1ido];
286 ch[2*k] = VADD(a, b);
287 ch[2*(k+ido)-1] = VSUB(a, b);
288 }
289 if (ido < 2) return;
290 if (ido != 2) {
291 for (k=0; k < l1ido; k += ido) {
292 for (i=2; i<ido; i+=2) {
293 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
294 v4sf br = cc[i - 1 + k], bi = cc[i + k];
295 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
296 ch[i + 2*k] = VADD(bi, ti2);
297 ch[2*(k+ido) - i] = VSUB(ti2, bi);
298 ch[i - 1 + 2*k] = VADD(br, tr2);
299 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
300 }
301 }
302 if (ido % 2 == 1) return;
303 }
304 for (k=0; k < l1ido; k += ido) {
305 ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
306 ch[2*k + ido-1] = cc[k + ido-1];
307 }
308} /* radf2 */
309
310
311static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
312 static const float minus_two=-2;
313 int i, k, l1ido = l1*ido;
314 v4sf a,b,c,d, tr2, ti2;
315 for (k=0; k < l1ido; k += ido) {
316 a = cc[2*k]; b = cc[2*(k+ido) - 1];
317 ch[k] = VADD(a, b);
318 ch[k + l1ido] =VSUB(a, b);
319 }
320 if (ido < 2) return;
321 if (ido != 2) {
322 for (k = 0; k < l1ido; k += ido) {
323 for (i = 2; i < ido; i += 2) {
324 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
325 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
326 ch[i-1 + k] = VADD(a, b);
327 tr2 = VSUB(a, b);
328 ch[i+0 + k] = VSUB(c, d);
329 ti2 = VADD(c, d);
330 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
331 ch[i-1 + k + l1ido] = tr2;
332 ch[i+0 + k + l1ido] = ti2;
333 }
334 }
335 if (ido % 2 == 1) return;
336 }
337 for (k = 0; k < l1ido; k += ido) {
338 a = cc[2*k + ido-1]; b = cc[2*k + ido];
339 ch[k + ido-1] = VADD(a,a);
340 ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
341 }
342} /* radb2 */
343
344static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
345 const float *wa1, const float *wa2) {
346 static const float taur = -0.5f;
347 static const float taui = 0.866025403784439f;
348 int i, k, ic;
349 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
350 for (k=0; k<l1; k++) {
351 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
352 ch[3*k*ido] = VADD(cc[k*ido], cr2);
353 ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
354 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
355 }
356 if (ido == 1) return;
357 for (k=0; k<l1; k++) {
358 for (i=2; i<ido; i+=2) {
359 ic = ido - i;
360 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
361 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
362 VCPLXMULCONJ(dr2, di2, wr1, wi1);
363
364 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
365 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
366 VCPLXMULCONJ(dr3, di3, wr2, wi2);
367
368 cr2 = VADD(dr2, dr3);
369 ci2 = VADD(di2, di3);
370 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
371 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
372 tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
373 ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
374 tr3 = SVMUL(taui, VSUB(di2, di3));
375 ti3 = SVMUL(taui, VSUB(dr3, dr2));
376 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
377 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
378 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
379 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
380 }
381 }
382} /* radf3 */
383
384
385static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
386 const float *wa1, const float *wa2)
387{
388 static const float taur = -0.5f;
389 static const float taui = 0.866025403784439f;
390 static const float taui_2 = 0.866025403784439f*2;
391 int i, k, ic;
392 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
393 for (k=0; k<l1; k++) {
394 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
395 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
396 ch[k*ido] = VADD(cc[3*k*ido], tr2);
397 ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
398 ch[(k + l1)*ido] = VSUB(cr2, ci3);
399 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
400 }
401 if (ido == 1) return;
402 for (k=0; k<l1; k++) {
403 for (i=2; i<ido; i+=2) {
404 ic = ido - i;
405 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
406 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
407 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
408 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
409 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
410 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
411 cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
412 ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
413 dr2 = VSUB(cr2, ci3);
414 dr3 = VADD(cr2, ci3);
415 di2 = VADD(ci2, cr3);
416 di3 = VSUB(ci2, cr3);
417 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
418 ch[i - 1 + (k + l1)*ido] = dr2;
419 ch[i + (k + l1)*ido] = di2;
420 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
421 ch[i - 1 + (k + 2*l1)*ido] = dr3;
422 ch[i + (k + 2*l1)*ido] = di3;
423 }
424 }
425} /* radb3 */
426
427static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
428 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
429{
430 static const float minus_hsqt2 = (float)-0.7071067811865475;
431 int i, k, l1ido = l1*ido;
432 {
433 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
434 v4sf * RESTRICT ch_ = ch;
435 while (cc < cc_end) {
436 /* this loop represents between 25% and 40% of total radf4_ps cost ! */
437 v4sf a0 = cc[0], a1 = cc[l1ido];
438 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
439 v4sf tr1 = VADD(a1, a3);
440 v4sf tr2 = VADD(a0, a2);
441 ch[2*ido-1] = VSUB(a0, a2);
442 ch[2*ido ] = VSUB(a3, a1);
443 ch[0 ] = VADD(tr1, tr2);
444 ch[4*ido-1] = VSUB(tr2, tr1);
445 cc += ido; ch += 4*ido;
446 }
447 cc = cc_; ch = ch_;
448 }
449 if (ido < 2) return;
450 if (ido != 2) {
451 for (k = 0; k < l1ido; k += ido) {
452 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
453 for (i=2; i<ido; i += 2, pc += 2) {
454 int ic = ido - i;
455 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
456 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
457
458 cr2 = pc[1*l1ido+0];
459 ci2 = pc[1*l1ido+1];
460 wr=LD_PS1(wa1[i - 2]);
461 wi=LD_PS1(wa1[i - 1]);
462 VCPLXMULCONJ(cr2,ci2,wr,wi);
463
464 cr3 = pc[2*l1ido+0];
465 ci3 = pc[2*l1ido+1];
466 wr = LD_PS1(wa2[i-2]);
467 wi = LD_PS1(wa2[i-1]);
468 VCPLXMULCONJ(cr3, ci3, wr, wi);
469
470 cr4 = pc[3*l1ido];
471 ci4 = pc[3*l1ido+1];
472 wr = LD_PS1(wa3[i-2]);
473 wi = LD_PS1(wa3[i-1]);
474 VCPLXMULCONJ(cr4, ci4, wr, wi);
475
476 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
477
478 tr1 = VADD(cr2,cr4);
479 tr4 = VSUB(cr4,cr2);
480 tr2 = VADD(pc[0],cr3);
481 tr3 = VSUB(pc[0],cr3);
482 ch[i - 1 + 4*k] = VADD(tr1,tr2);
483 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); /* at this point tr1 and tr2 can be disposed */
484 ti1 = VADD(ci2,ci4);
485 ti4 = VSUB(ci2,ci4);
486 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
487 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); /* dispose tr3, ti4 */
488 ti2 = VADD(pc[1],ci3);
489 ti3 = VSUB(pc[1],ci3);
490 ch[i + 4*k] = VADD(ti1, ti2);
491 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
492 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
493 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
494 }
495 }
496 if (ido % 2 == 1) return;
497 }
498 for (k=0; k<l1ido; k += ido) {
499 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
500 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
501 v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
502 v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
503 ch[ido-1 + 4*k] = VADD(tr1, c);
504 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
505 ch[4*k + 1*ido] = VSUB(ti1, d);
506 ch[4*k + 3*ido] = VADD(ti1, d);
507 }
508} /* radf4 */
509
510
511static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
512 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
513{
514 static const float minus_sqrt2 = (float)-1.414213562373095;
515 static const float two = 2.f;
516 int i, k, l1ido = l1*ido;
517 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
518 {
519 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
520 v4sf *ch_ = ch;
521 while (ch < ch_end) {
522 v4sf a = cc[0], b = cc[4*ido-1];
523 v4sf c = cc[2*ido], d = cc[2*ido-1];
524 tr3 = SVMUL(two,d);
525 tr2 = VADD(a,b);
526 tr1 = VSUB(a,b);
527 tr4 = SVMUL(two,c);
528 ch[0*l1ido] = VADD(tr2, tr3);
529 ch[2*l1ido] = VSUB(tr2, tr3);
530 ch[1*l1ido] = VSUB(tr1, tr4);
531 ch[3*l1ido] = VADD(tr1, tr4);
532
533 cc += 4*ido; ch += ido;
534 }
535 cc = cc_; ch = ch_;
536 }
537 if (ido < 2) return;
538 if (ido != 2) {
539 for (k = 0; k < l1ido; k += ido) {
540 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
541 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
542 for (i = 2; i < ido; i += 2) {
543
544 tr1 = VSUB(pc[i], pc[4*ido - i]);
545 tr2 = VADD(pc[i], pc[4*ido - i]);
546 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
547 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
548 ph[0] = VADD(tr2, tr3);
549 cr3 = VSUB(tr2, tr3);
550
551 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
552 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
553 cr2 = VSUB(tr1, tr4);
554 cr4 = VADD(tr1, tr4);
555
556 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
557 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
558
559 ph[1] = VADD(ti2, ti3); ph += l1ido;
560 ci3 = VSUB(ti2, ti3);
561 ci2 = VADD(ti1, ti4);
562 ci4 = VSUB(ti1, ti4);
563 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
564 ph[0] = cr2;
565 ph[1] = ci2; ph += l1ido;
566 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
567 ph[0] = cr3;
568 ph[1] = ci3; ph += l1ido;
569 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
570 ph[0] = cr4;
571 ph[1] = ci4; ph = ph - 3*l1ido + 2;
572 }
573 }
574 if (ido % 2 == 1) return;
575 }
576 for (k=0; k < l1ido; k+=ido) {
577 int i0 = 4*k + ido;
578 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
579 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
580 tr1 = VSUB(c,d);
581 tr2 = VADD(c,d);
582 ti1 = VADD(b,a);
583 ti2 = VSUB(b,a);
584 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
585 ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
586 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
587 ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
588 }
589} /* radb4 */
590
591static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
592 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
593{
594 static const float tr11 = .309016994374947f;
595 static const float ti11 = .951056516295154f;
596 static const float tr12 = -.809016994374947f;
597 static const float ti12 = .587785252292473f;
598
599 /* System generated locals */
600 int cc_offset, ch_offset;
601
602 /* Local variables */
603 int i, k, ic;
604 v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
605 cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
606 int idp2;
607
608
609#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
610#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
611
612 /* Parameter adjustments */
613 ch_offset = 1 + ido * 6;
614 ch -= ch_offset;
615 cc_offset = 1 + ido * (1 + l1);
616 cc -= cc_offset;
617
618 /* Function Body */
619 for (k = 1; k <= l1; ++k) {
620 cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
621 ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
622 cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
623 ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
624 ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
625 ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
626 ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
627 ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
628 ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
629 /* printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); */
630 }
631 if (ido == 1) {
632 return;
633 }
634 idp2 = ido + 2;
635 for (k = 1; k <= l1; ++k) {
636 for (i = 3; i <= ido; i += 2) {
637 ic = idp2 - i;
638 dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
639 dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
640 dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
641 dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
642 VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
643 VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
644 VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
645 VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
646 cr2 = VADD(dr2, dr5);
647 ci5 = VSUB(dr5, dr2);
648 cr5 = VSUB(di2, di5);
649 ci2 = VADD(di2, di5);
650 cr3 = VADD(dr3, dr4);
651 ci4 = VSUB(dr4, dr3);
652 cr4 = VSUB(di3, di4);
653 ci3 = VADD(di3, di4);
654 ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
655 ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
656 tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
657 ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
658 tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
659 ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
660 tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
661 ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
662 tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
663 ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
664 ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
665 ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
666 ch_ref(i, 3, k) = VADD(ti2, ti5);
667 ch_ref(ic, 2, k) = VSUB(ti5, ti2);
668 ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
669 ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
670 ch_ref(i, 5, k) = VADD(ti3, ti4);
671 ch_ref(ic, 4, k) = VSUB(ti4, ti3);
672 }
673 }
674#undef cc_ref
675#undef ch_ref
676} /* radf5 */
677
678static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
679 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
680{
681 static const float tr11 = .309016994374947f;
682 static const float ti11 = .951056516295154f;
683 static const float tr12 = -.809016994374947f;
684 static const float ti12 = .587785252292473f;
685
686 int cc_offset, ch_offset;
687
688 /* Local variables */
689 int i, k, ic;
690 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
691 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
692 int idp2;
693
694#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
695#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
696
697 /* Parameter adjustments */
698 ch_offset = 1 + ido * (1 + l1);
699 ch -= ch_offset;
700 cc_offset = 1 + ido * 6;
701 cc -= cc_offset;
702
703 /* Function Body */
704 for (k = 1; k <= l1; ++k) {
705 ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
706 ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
707 tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
708 tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
709 ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
710 cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
711 cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
712 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
713 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
714 ch_ref(1, k, 2) = VSUB(cr2, ci5);
715 ch_ref(1, k, 3) = VSUB(cr3, ci4);
716 ch_ref(1, k, 4) = VADD(cr3, ci4);
717 ch_ref(1, k, 5) = VADD(cr2, ci5);
718 }
719 if (ido == 1) {
720 return;
721 }
722 idp2 = ido + 2;
723 for (k = 1; k <= l1; ++k) {
724 for (i = 3; i <= ido; i += 2) {
725 ic = idp2 - i;
726 ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
727 ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
728 ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
729 ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
730 tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
731 tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
732 tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
733 tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
734 ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
735 ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
736 cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
737 ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
738 cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
739 ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
740 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
741 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
742 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
743 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
744 dr3 = VSUB(cr3, ci4);
745 dr4 = VADD(cr3, ci4);
746 di3 = VADD(ci3, cr4);
747 di4 = VSUB(ci3, cr4);
748 dr5 = VADD(cr2, ci5);
749 dr2 = VSUB(cr2, ci5);
750 di5 = VSUB(ci2, cr5);
751 di2 = VADD(ci2, cr5);
752 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
753 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
754 VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
755 VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
756
757 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
758 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
759 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
760 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
761 }
762 }
763#undef cc_ref
764#undef ch_ref
765} /* radb5 */
766
767static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
768 const float *wa, const int *ifac) {
769 v4sf *in = (v4sf*)input_readonly;
770 v4sf *out = (in == work2 ? work1 : work2);
771 int nf = ifac[1], k1;
772 int l2 = n;
773 int iw = n-1;
774 assert(in != out && work1 != work2);
775 for (k1 = 1; k1 <= nf; ++k1) {
776 int kh = nf - k1;
777 int ip = ifac[kh + 2];
778 int l1 = l2 / ip;
779 int ido = n / l2;
780 iw -= (ip - 1)*ido;
781 switch (ip) {
782 case 5: {
783 int ix2 = iw + ido;
784 int ix3 = ix2 + ido;
785 int ix4 = ix3 + ido;
786 radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
787 } break;
788 case 4: {
789 int ix2 = iw + ido;
790 int ix3 = ix2 + ido;
791 radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
792 } break;
793 case 3: {
794 int ix2 = iw + ido;
795 radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
796 } break;
797 case 2:
798 radf2_ps(ido, l1, in, out, &wa[iw]);
799 break;
800 default:
801 assert(0);
802 break;
803 }
804 l2 = l1;
805 if (out == work2) {
806 out = work1; in = work2;
807 } else {
808 out = work2; in = work1;
809 }
810 }
811 return in; /* this is in fact the output .. */
812} /* rfftf1 */
813
814static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
815 const float *wa, const int *ifac) {
816 v4sf *in = (v4sf*)input_readonly;
817 v4sf *out = (in == work2 ? work1 : work2);
818 int nf = ifac[1], k1;
819 int l1 = 1;
820 int iw = 0;
821 assert(in != out);
822 for (k1=1; k1<=nf; k1++) {
823 int ip = ifac[k1 + 1];
824 int l2 = ip*l1;
825 int ido = n / l2;
826 switch (ip) {
827 case 5: {
828 int ix2 = iw + ido;
829 int ix3 = ix2 + ido;
830 int ix4 = ix3 + ido;
831 radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
832 } break;
833 case 4: {
834 int ix2 = iw + ido;
835 int ix3 = ix2 + ido;
836 radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
837 } break;
838 case 3: {
839 int ix2 = iw + ido;
840 radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
841 } break;
842 case 2:
843 radb2_ps(ido, l1, in, out, &wa[iw]);
844 break;
845 default:
846 assert(0);
847 break;
848 }
849 l1 = l2;
850 iw += (ip - 1)*ido;
851
852 if (out == work2) {
853 out = work1; in = work2;
854 } else {
855 out = work2; in = work1;
856 }
857 }
858 return in; /* this is in fact the output .. */
859}
860
861static int decompose(int n, int *ifac, const int *ntryh) {
862 int nl = n, nf = 0, i, j = 0;
863 for (j=0; ntryh[j]; ++j) {
864 int ntry = ntryh[j];
865 while (nl != 1) {
866 int nq = nl / ntry;
867 int nr = nl - ntry * nq;
868 if (nr == 0) {
869 ifac[2+nf++] = ntry;
870 nl = nq;
871 if (ntry == 2 && nf != 1) {
872 for (i = 2; i <= nf; ++i) {
873 int ib = nf - i + 2;
874 ifac[ib + 1] = ifac[ib];
875 }
876 ifac[2] = 2;
877 }
878 } else break;
879 }
880 }
881 ifac[0] = n;
882 ifac[1] = nf;
883 return nf;
884}
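/* Worked example (illustrative): with ntryh = {4,2,3,5,0}, decompose(240, ifac, ntryh) returns
   nf = 4 and fills ifac[0..5] = { 240, 4, 4, 4, 3, 5 }, i.e. 240 = 4*4*3*5; ifac[0] holds n,
   ifac[1] the number of factors, and the radix factors follow from ifac[2] on. A factor of 2
   found after other factors is moved to the front (the ntry == 2 branch above). */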
885
886
887
888static void rffti1_ps(int n, float *wa, int *ifac)
889{
890 static const int ntryh[] = { 4,2,3,5,0 };
891 int k1, j, ii;
892
893 int nf = decompose(n,ifac,ntryh);
894 float argh = (2*(float)M_PI) / n;
895 int is = 0;
896 int nfm1 = nf - 1;
897 int l1 = 1;
898 for (k1 = 1; k1 <= nfm1; k1++) {
899 int ip = ifac[k1 + 1];
900 int ld = 0;
901 int l2 = l1*ip;
902 int ido = n / l2;
903 int ipm = ip - 1;
904 for (j = 1; j <= ipm; ++j) {
905 float argld;
906 int i = is, fi=0;
907 ld += l1;
908 argld = ld*argh;
909 for (ii = 3; ii <= ido; ii += 2) {
910 i += 2;
911 fi += 1;
912 wa[i - 2] = FUNC_COS(fi*argld);
913 wa[i - 1] = FUNC_SIN(fi*argld);
914 }
915 is += ido;
916 }
917 l1 = l2;
918 }
919} /* rffti1 */
920
921static void cffti1_ps(int n, float *wa, int *ifac)
922{
923 static const int ntryh[] = { 5,3,4,2,0 };
924 int k1, j, ii;
925
926 int nf = decompose(n,ifac,ntryh);
927 float argh = (2*(float)M_PI) / n;
928 int i = 1;
929 int l1 = 1;
930 for (k1=1; k1<=nf; k1++) {
931 int ip = ifac[k1+1];
932 int ld = 0;
933 int l2 = l1*ip;
934 int ido = n / l2;
935 int idot = ido + ido + 2;
936 int ipm = ip - 1;
937 for (j=1; j<=ipm; j++) {
938 float argld;
939 int i1 = i, fi = 0;
940 wa[i-1] = 1;
941 wa[i] = 0;
942 ld += l1;
943 argld = ld*argh;
944 for (ii = 4; ii <= idot; ii += 2) {
945 i += 2;
946 fi += 1;
947 wa[i-1] = FUNC_COS(fi*argld);
948 wa[i] = FUNC_SIN(fi*argld);
949 }
950 if (ip > 5) {
951 wa[i1-1] = wa[i-1];
952 wa[i1] = wa[i];
953 }
954 }
955 l1 = l2;
956 }
957} /* cffti1 */
958
959
960static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
961 v4sf *in = (v4sf*)input_readonly;
962 v4sf *out = (in == work2 ? work1 : work2);
963 int nf = ifac[1], k1;
964 int l1 = 1;
965 int iw = 0;
966 assert(in != out && work1 != work2);
967 for (k1=2; k1<=nf+1; k1++) {
968 int ip = ifac[k1];
969 int l2 = ip*l1;
970 int ido = n / l2;
971 int idot = ido + ido;
972 switch (ip) {
973 case 5: {
974 int ix2 = iw + idot;
975 int ix3 = ix2 + idot;
976 int ix4 = ix3 + idot;
977 passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
978 } break;
979 case 4: {
980 int ix2 = iw + idot;
981 int ix3 = ix2 + idot;
982 passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
983 } break;
984 case 2: {
985 passf2_ps(idot, l1, in, out, &wa[iw], isign);
986 } break;
987 case 3: {
988 int ix2 = iw + idot;
989 passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
990 } break;
991 default:
992 assert(0);
993 }
994 l1 = l2;
995 iw += (ip - 1)*idot;
996 if (out == work2) {
997 out = work1; in = work2;
998 } else {
999 out = work2; in = work1;
1000 }
1001 }
1002
1003 return in; /* this is in fact the output .. */
1004}
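/* Note: like rfftf1_ps/rfftb1_ps above, cfftf1_ps ping-pongs between work1 and work2 and
   returns whichever buffer ends up holding the result; callers compare the returned pointer
   against their own buffers (see the "== buff[0] ? 0 : 1" tests below) to find the output. */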
1005
1006
1007struct SETUP_STRUCT {
1008 int N;
1009 int Ncvec; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1010 int ifac[15];
1011 pffft_transform_t transform;
1012 v4sf *data; /* allocated room for twiddle coefs */
1013 float *e; /* points into 'data', N/4*3 elements */
1014 float *twiddle; /* points into 'data', N/4 elements */
1015};
1016
1017SETUP_STRUCT *FUNC_NEW_SETUP(int N, pffft_transform_t transform) {
1018 SETUP_STRUCT *s = (SETUP_STRUCT*)malloc(sizeof(SETUP_STRUCT));
1019 int k, m;
1020 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1021 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1022 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1023 if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1024 if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1025 /* assert((N % 32) == 0); */
1026 s->N = N;
1027 s->transform = transform;
1028 /* nb of complex simd vectors */
1029 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1030 s->data = (v4sf*)FUNC_ALIGNED_MALLOC(2*s->Ncvec * sizeof(v4sf));
1031 s->e = (float*)s->data;
1032 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1033
1034 if (transform == PFFFT_REAL) {
1035 for (k=0; k < s->Ncvec; ++k) {
1036 int i = k/SIMD_SZ;
1037 int j = k%SIMD_SZ;
1038 for (m=0; m < SIMD_SZ-1; ++m) {
1039 float A = -2*(float)M_PI*(m+1)*k / N;
1040 s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = FUNC_COS(A);
1041 s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = FUNC_SIN(A);
1042 }
1043 }
1044 rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1045 } else {
1046 for (k=0; k < s->Ncvec; ++k) {
1047 int i = k/SIMD_SZ;
1048 int j = k%SIMD_SZ;
1049 for (m=0; m < SIMD_SZ-1; ++m) {
1050 float A = -2*(float)M_PI*(m+1)*k / N;
1051 s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = FUNC_COS(A);
1052 s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = FUNC_SIN(A);
1053 }
1054 }
1055 cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1056 }
1057
1058 /* check that N is decomposable with allowed prime factors */
1059 for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1060 if (m != N/SIMD_SZ) {
1061 FUNC_DESTROY(s); s = 0;
1062 }
1063
1064 return s;
1065}
1066
1067
1068void FUNC_DESTROY(SETUP_STRUCT *s) {
1069 FUNC_ALIGNED_FREE(s->data);
1070 free(s);
1071}
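/* Typical lifecycle, as an illustrative sketch (the FUNC_* names are macro-mapped onto the
   public API by the file that includes this implementation):
       SETUP_STRUCT *s = FUNC_NEW_SETUP(1024, PFFFT_REAL);
       ... run forward/backward transforms using this setup ...
       FUNC_DESTROY(s);
   FUNC_NEW_SETUP returns 0 when N cannot be decomposed into the supported factors 2, 3 and 5. */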
1072
1073#if ( SIMD_SZ == 4 ) /* !defined(PFFFT_SIMD_DISABLE) */
1074
1075/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1076static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1077 v4sf g0, g1;
1078 int k;
1079 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1080
1081 *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */
1082 for (k=1; k < N; ++k) {
1083 v4sf h0, h1;
1084 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1085 *--out = VSWAPHL(g1, h0);
1086 *--out = VSWAPHL(h0, h1);
1087 g1 = h1;
1088 }
1089 *--out = VSWAPHL(g1, g0);
1090}
1091
1092static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1093 v4sf g0, g1, h0, h1;
1094 int k;
1095 g0 = g1 = in[0]; ++in;
1096 for (k=1; k < N; ++k) {
1097 h0 = *in++; h1 = *in++;
1098 g1 = VSWAPHL(g1, h0);
1099 h0 = VSWAPHL(h0, h1);
1100 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1101 g1 = h1;
1102 }
1103 h0 = *in++; h1 = g0;
1104 g1 = VSWAPHL(g1, h0);
1105 h0 = VSWAPHL(h0, h1);
1106 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1107}
1108
1109void FUNC_ZREORDER(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
1110 int k, N = setup->N, Ncvec = setup->Ncvec;
1111 const v4sf *vin = (const v4sf*)in;
1112 v4sf *vout = (v4sf*)out;
1113 assert(in != out);
1114 if (setup->transform == PFFFT_REAL) {
1115 int k, dk = N/32;
1116 if (direction == PFFFT_FORWARD) {
1117 for (k=0; k < dk; ++k) {
1118 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1119 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1120 }
1121 reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1122 reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1123 } else {
1124 for (k=0; k < dk; ++k) {
1125 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1126 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1127 }
1128 unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1129 unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1130 }
1131 } else {
1132 if (direction == PFFFT_FORWARD) {
1133 for (k=0; k < Ncvec; ++k) {
1134 int kk = (k/4) + (k%4)*(Ncvec/4);
1135 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1136 }
1137 } else {
1138 for (k=0; k < Ncvec; ++k) {
1139 int kk = (k/4) + (k%4)*(Ncvec/4);
1140 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1141 }
1142 }
1143 }
1144}
1145
1146void FUNC_CPLX_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1147 int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1148 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1149 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1150 assert(in != out);
1151 for (k=0; k < dk; ++k) {
1152 r0 = in[8*k+0]; i0 = in[8*k+1];
1153 r1 = in[8*k+2]; i1 = in[8*k+3];
1154 r2 = in[8*k+4]; i2 = in[8*k+5];
1155 r3 = in[8*k+6]; i3 = in[8*k+7];
1156 VTRANSPOSE4(r0,r1,r2,r3);
1157 VTRANSPOSE4(i0,i1,i2,i3);
1158 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1159 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1160 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1161
1162 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1163 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1164 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1165 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1166
1167 /*
1168 transformation for each column is:
1169
1170 [1 1 1 1 0 0 0 0] [r0]
1171 [1 0 -1 0 0 -1 0 1] [r1]
1172 [1 -1 1 -1 0 0 0 0] [r2]
1173 [1 0 -1 0 0 1 0 -1] [r3]
1174 [0 0 0 0 1 1 1 1] * [i0]
1175 [0 1 0 -1 1 0 -1 0] [i1]
1176 [0 0 0 0 1 -1 1 -1] [i2]
1177 [0 -1 0 1 1 0 -1 0] [i3]
1178 */
1179
1180 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1181 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1182 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1183 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1184
1185 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1186 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1187 }
1188}
1189
1190void FUNC_CPLX_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1191 int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1192 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1193 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1194 assert(in != out);
1195 for (k=0; k < dk; ++k) {
1196 r0 = in[8*k+0]; i0 = in[8*k+1];
1197 r1 = in[8*k+2]; i1 = in[8*k+3];
1198 r2 = in[8*k+4]; i2 = in[8*k+5];
1199 r3 = in[8*k+6]; i3 = in[8*k+7];
1200
1201 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1202 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1203 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1204 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1205
1206 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1207 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1208 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1209 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1210
1211 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1212 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1213 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1214
1215 VTRANSPOSE4(r0,r1,r2,r3);
1216 VTRANSPOSE4(i0,i1,i2,i3);
1217
1218 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1219 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1220 }
1221}
1222
1223
1224static ALWAYS_INLINE(void) FUNC_REAL_FINALIZE_4X4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1225 const v4sf *e, v4sf *out) {
1226 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1227 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1228 r0 = *in0; i0 = *in1;
1229 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1230 VTRANSPOSE4(r0,r1,r2,r3);
1231 VTRANSPOSE4(i0,i1,i2,i3);
1232
1233 /*
1234 transformation for each column is:
1235
1236 [1 1 1 1 0 0 0 0] [r0]
1237 [1 0 -1 0 0 -1 0 1] [r1]
1238 [1 0 -1 0 0 1 0 -1] [r2]
1239 [1 -1 1 -1 0 0 0 0] [r3]
1240 [0 0 0 0 1 1 1 1] * [i0]
1241 [0 -1 0 1 -1 0 1 0] [i1]
1242 [0 -1 0 1 1 0 -1 0] [i2]
1243 [0 0 0 0 -1 1 -1 1] [i3]
1244 */
1245
1246 /* cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
1247 /* cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
1248
1249 VCPLXMUL(r1,i1,e[0],e[1]);
1250 VCPLXMUL(r2,i2,e[2],e[3]);
1251 VCPLXMUL(r3,i3,e[4],e[5]);
1252
1253 /* cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
1254 /* cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
1255
1256 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1257 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1258 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1259 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1260
1261 r0 = VADD(sr0, sr1);
1262 r3 = VSUB(sr0, sr1);
1263 i0 = VADD(si0, si1);
1264 i3 = VSUB(si1, si0);
1265 r1 = VADD(dr0, di1);
1266 r2 = VSUB(dr0, di1);
1267 i1 = VSUB(dr1, di0);
1268 i2 = VADD(dr1, di0);
1269
1270 *out++ = r0;
1271 *out++ = i0;
1272 *out++ = r1;
1273 *out++ = i1;
1274 *out++ = r2;
1275 *out++ = i2;
1276 *out++ = r3;
1277 *out++ = i3;
1278
1279}
1280
1281static NEVER_INLINE(void) FUNC_REAL_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1282 int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1283 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1284
1285 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1286 v4sf save = in[7], zero=VZERO();
1287 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1288 static const float s = (float)M_SQRT2/2;
1289
1290 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1291 assert(in != out);
1292 FUNC_REAL_FINALIZE_4X4(&zero, &zero, in+1, e, out);
1293
1294 /*
1295 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1296
1297 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1298 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1299 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1300 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1301 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1302 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1303 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1304 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1305 */
1306
1307 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1308 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1309 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1310 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1311 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1312 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1313 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1314 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1315
1316 for (k=1; k < dk; ++k) {
1317 v4sf save_next = in[8*k+7];
1318 FUNC_REAL_FINALIZE_4X4(&save, &in[8*k+0], in + 8*k+1,
1319 e + k*6, out + k*8);
1320 save = save_next;
1321 }
1322
1323}
1324
1325static ALWAYS_INLINE(void) FUNC_REAL_PREPROCESS_4X4(const v4sf *in,
1326 const v4sf *e, v4sf *out, int first) {
1327 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1328 /*
1329 transformation for each column is:
1330
1331 [1 1 1 1 0 0 0 0] [r0]
1332 [1 0 0 -1 0 -1 -1 0] [r1]
1333 [1 -1 -1 1 0 0 0 0] [r2]
1334 [1 0 0 -1 0 1 1 0] [r3]
1335 [0 0 0 0 1 -1 1 -1] * [i0]
1336 [0 -1 1 0 1 0 0 1] [i1]
1337 [0 0 0 0 1 1 -1 -1] [i2]
1338 [0 1 -1 0 1 0 0 1] [i3]
1339 */
1340
1341 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1342 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1343 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1344 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1345
1346 r0 = VADD(sr0, sr1);
1347 r2 = VSUB(sr0, sr1);
1348 r1 = VSUB(dr0, si1);
1349 r3 = VADD(dr0, si1);
1350 i0 = VSUB(di0, di1);
1351 i2 = VADD(di0, di1);
1352 i1 = VSUB(si0, dr1);
1353 i3 = VADD(si0, dr1);
1354
1355 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1356 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1357 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1358
1359 VTRANSPOSE4(r0,r1,r2,r3);
1360 VTRANSPOSE4(i0,i1,i2,i3);
1361
1362 if (!first) {
1363 *out++ = r0;
1364 *out++ = i0;
1365 }
1366 *out++ = r1;
1367 *out++ = i1;
1368 *out++ = r2;
1369 *out++ = i2;
1370 *out++ = r3;
1371 *out++ = i3;
1372}
1373
1374static NEVER_INLINE(void) FUNC_REAL_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1375 int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1376 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1377
1378 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1379 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1380 static const float s = (float)M_SQRT2;
1381 assert(in != out);
1382 for (k=0; k < 4; ++k) {
1383 Xr.f[k] = ((float*)in)[8*k];
1384 Xi.f[k] = ((float*)in)[8*k+4];
1385 }
1386
1387 FUNC_REAL_PREPROCESS_4X4(in, e, out+1, 1); /* will write only 6 values */
1388
1389 /*
1390 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1391
1392 [cr0] [1 0 2 0 1 0 0 0]
1393 [cr1] [1 0 0 0 -1 0 -2 0]
1394 [cr2] [1 0 -2 0 1 0 0 0]
1395 [cr3] [1 0 0 0 -1 0 2 0]
1396 [ci0] [0 2 0 2 0 0 0 0]
1397 [ci1] [0 s 0 -s 0 -s 0 -s]
1398 [ci2] [0 0 0 0 0 -2 0 2]
1399 [ci3] [0 -s 0 s 0 -s 0 -s]
1400 */
1401 for (k=1; k < dk; ++k) {
1402 FUNC_REAL_PREPROCESS_4X4(in+8*k, e + k*6, out-1+k*8, 0);
1403 }
1404
1405 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1406 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1407 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1408 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1409 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1410 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1411 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1412 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1413}
1414
1415
1416void FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT *setup, const float *finput, float *foutput, v4sf *scratch,
1417 pffft_direction_t direction, int ordered) {
1418 int k, Ncvec = setup->Ncvec;
1419 int nf_odd = (setup->ifac[1] & 1);
1420
1421 /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
1422 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1423 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1424
1425 const v4sf *vinput = (const v4sf*)finput;
1426 v4sf *voutput = (v4sf*)foutput;
1427 v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
1428 int ib = (nf_odd ^ ordered ? 1 : 0);
1429
1430 assert(VALIGNED(finput) && VALIGNED(foutput));
1431
1432 /* assert(finput != foutput); */
1433 if (direction == PFFFT_FORWARD) {
1434 ib = !ib;
1435 if (setup->transform == PFFFT_REAL) {
1436 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1437 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1438 FUNC_REAL_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1439 } else {
1440 v4sf *tmp = buff[ib];
1441 for (k=0; k < Ncvec; ++k) {
1442 UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1443 }
1444 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1445 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1446 FUNC_CPLX_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1447 }
1448 if (ordered) {
1449 FUNC_ZREORDER(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1450 } else ib = !ib;
1451 } else {
1452 if (vinput == buff[ib]) {
1453 ib = !ib; /* may happen when finput == foutput */
1454 }
1455 if (ordered) {
1456 FUNC_ZREORDER(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1457 vinput = buff[ib]; ib = !ib;
1458 }
1459 if (setup->transform == PFFFT_REAL) {
1460 FUNC_REAL_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1461 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1462 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1463 } else {
1464 FUNC_CPLX_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1465 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1466 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1467 for (k=0; k < Ncvec; ++k) {
1468 INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1469 }
1470 }
1471 }
1472
1473 if (buff[ib] != voutput) {
1474 /* extra copy required -- this situation should only happen when finput == foutput */
1475 assert(finput==foutput);
1476 for (k=0; k < Ncvec; ++k) {
1477 v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1478 voutput[2*k] = a; voutput[2*k+1] = b;
1479 }
1480 ib = !ib;
1481 }
1482 assert(buff[ib] == voutput);
1483}
1484
1485void FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
1486 int Ncvec = s->Ncvec;
1487 const v4sf * RESTRICT va = (const v4sf*)a;
1488 const v4sf * RESTRICT vb = (const v4sf*)b;
1489 v4sf * RESTRICT vab = (v4sf*)ab;
1490
1491#ifdef __arm__
1492 __builtin_prefetch(va);
1493 __builtin_prefetch(vb);
1494 __builtin_prefetch(vab);
1495 __builtin_prefetch(va+2);
1496 __builtin_prefetch(vb+2);
1497 __builtin_prefetch(vab+2);
1498 __builtin_prefetch(va+4);
1499 __builtin_prefetch(vb+4);
1500 __builtin_prefetch(vab+4);
1501 __builtin_prefetch(va+6);
1502 __builtin_prefetch(vb+6);
1503 __builtin_prefetch(vab+6);
1504# ifndef __clang__
1505# define ZCONVOLVE_USING_INLINE_NEON_ASM
1506# endif
1507#endif
1508
1509 float ar, ai, br, bi, abr, abi;
1510#ifndef ZCONVOLVE_USING_INLINE_ASM
1511 v4sf vscal = LD_PS1(scaling);
1512 int i;
1513#endif
1514
1515 assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1516 ar = ((v4sf_union*)va)[0].f[0];
1517 ai = ((v4sf_union*)va)[1].f[0];
1518 br = ((v4sf_union*)vb)[0].f[0];
1519 bi = ((v4sf_union*)vb)[1].f[0];
1520 abr = ((v4sf_union*)vab)[0].f[0];
1521 abi = ((v4sf_union*)vab)[1].f[0];
1522
1523#ifdef ZCONVOLVE_USING_INLINE_ASM
1524 /* inline asm version, unfortunately miscompiled by clang 3.2,
1525 * at least on ubuntu.. so this will be restricted to gcc */
1526 const float *a_ = a, *b_ = b; float *ab_ = ab;
1527 int N = Ncvec;
1528 asm volatile("mov r8, %2 \n"
1529 "vdup.f32 q15, %4 \n"
1530 "1: \n"
1531 "pld [%0,#64] \n"
1532 "pld [%1,#64] \n"
1533 "pld [%2,#64] \n"
1534 "pld [%0,#96] \n"
1535 "pld [%1,#96] \n"
1536 "pld [%2,#96] \n"
1537 "vld1.f32 {q0,q1}, [%0,:128]! \n"
1538 "vld1.f32 {q4,q5}, [%1,:128]! \n"
1539 "vld1.f32 {q2,q3}, [%0,:128]! \n"
1540 "vld1.f32 {q6,q7}, [%1,:128]! \n"
1541 "vld1.f32 {q8,q9}, [r8,:128]! \n"
1542
1543 "vmul.f32 q10, q0, q4 \n"
1544 "vmul.f32 q11, q0, q5 \n"
1545 "vmul.f32 q12, q2, q6 \n"
1546 "vmul.f32 q13, q2, q7 \n"
1547 "vmls.f32 q10, q1, q5 \n"
1548 "vmla.f32 q11, q1, q4 \n"
1549 "vld1.f32 {q0,q1}, [r8,:128]! \n"
1550 "vmls.f32 q12, q3, q7 \n"
1551 "vmla.f32 q13, q3, q6 \n"
1552 "vmla.f32 q8, q10, q15 \n"
1553 "vmla.f32 q9, q11, q15 \n"
1554 "vmla.f32 q0, q12, q15 \n"
1555 "vmla.f32 q1, q13, q15 \n"
1556 "vst1.f32 {q8,q9},[%2,:128]! \n"
1557 "vst1.f32 {q0,q1},[%2,:128]! \n"
1558 "subs %3, #2 \n"
1559 "bne 1b \n"
1560 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1561#else
1562 /* default routine, works fine for non-arm cpus with current compilers */
1563 for (i=0; i < Ncvec; i += 2) {
1564 v4sf ar, ai, br, bi;
1565 ar = va[2*i+0]; ai = va[2*i+1];
1566 br = vb[2*i+0]; bi = vb[2*i+1];
1567 VCPLXMUL(ar, ai, br, bi);
1568 vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1569 vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1570 ar = va[2*i+2]; ai = va[2*i+3];
1571 br = vb[2*i+2]; bi = vb[2*i+3];
1572 VCPLXMUL(ar, ai, br, bi);
1573 vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1574 vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1575 }
1576#endif
1577 if (s->transform == PFFFT_REAL) {
1578 ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1579 ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1580 }
1581}
1582
void FUNC_ZCONVOLVE_NO_ACCU(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
  v4sf vscal = LD_PS1(scaling);
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;
  float sar, sai, sbr, sbi;
  const int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */
  int k; /* was i -- but always used "2*i" - except at for() */

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#  define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  sar = ((v4sf_union*)va)[0].f[0];
  sai = ((v4sf_union*)va)[1].f[0];
  sbr = ((v4sf_union*)vb)[0].f[0];
  sbi = ((v4sf_union*)vb)[1].f[0];

  /* default routine, works fine for non-arm cpus with current compilers */
  for (k=0; k < NcvecMulTwo; k += 4) {
    v4sf var, vai, vbr, vbi;
    var = va[k+0]; vai = va[k+1];
    vbr = vb[k+0]; vbi = vb[k+1];
    VCPLXMUL(var, vai, vbr, vbi);
    vab[k+0] = VMUL(var, vscal);
    vab[k+1] = VMUL(vai, vscal);
    var = va[k+2]; vai = va[k+3];
    vbr = vb[k+2]; vbi = vb[k+3];
    VCPLXMUL(var, vai, vbr, vbi);
    vab[k+2] = VMUL(var, vscal);
    vab[k+3] = VMUL(vai, vscal);
  }

  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling;
    ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling;
  }
}
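
/* A minimal usage sketch of frequency-domain convolution with the public
 * pffft API (see pffft.h); buffer names are hypothetical, error handling is
 * omitted, and the no-accumulate entry point is assumed to be exported as
 * pffft_zconvolve_no_accu() as in this pffft fork:
 *
 *   PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
 *   float *fa  = pffft_aligned_malloc(N*sizeof(float));
 *   float *fb  = pffft_aligned_malloc(N*sizeof(float));
 *   float *fab = pffft_aligned_malloc(N*sizeof(float));
 *   float *wrk = pffft_aligned_malloc(N*sizeof(float));
 *
 *   pffft_transform(s, a, fa, wrk, PFFFT_FORWARD);
 *   pffft_transform(s, b, fb, wrk, PFFFT_FORWARD);
 *   pffft_zconvolve_no_accu(s, fa, fb, fab, 1.0f/N);
 *   pffft_transform(s, fab, ab, wrk, PFFFT_BACKWARD);
 *
 * The zconvolve routines work directly on the unordered z-domain output of
 * pffft_transform(), so no pffft_zreorder() call is needed here; the 1/N
 * factor compensates for the unnormalized forward+backward transform pair.
 */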


#else /* #if ( SIMD_SZ == 4 ) && !defined(PFFFT_SIMD_DISABLE) */

/* standard routines using scalar floats, without any SIMD stuff. */

#define pffft_zreorder_nosimd FUNC_ZREORDER
void pffft_zreorder_nosimd(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
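
/* Worked example of the forward reordering above for a real transform of
 * length N = 8 (rk/ik denote the real/imaginary parts of frequency bin k):
 *
 *   fftpack order (in)   : { r0, r1, i1, r2, i2, r3, i3, r4 }
 *   canonical order (out): { r0, r4, r1, i1, r2, i2, r3, i3 }
 *
 * i.e. the purely real DC (r0) and Nyquist (r4) bins are packed into the
 * first two slots, matching the ordered-output layout described in pffft.h;
 * the backward branch undoes exactly this permutation.
 */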

#define pffft_transform_internal_nosimd FUNC_TRANSFORM_INTERNAL
void pffft_transform_internal_nosimd(SETUP_STRUCT *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      FUNC_ZREORDER(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; /* may happen when finput == foutput */
    }
    if (ordered) {
      FUNC_ZREORDER(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    /* extra copy required -- this situation should happen only when input == output */
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd FUNC_ZCONVOLVE_ACCUMULATE
void pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */
  int k; /* was i -- but always used "2*i" - except at for() */

  if (s->transform == PFFFT_REAL) {
    /* take care of the fftpack ordering */
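    /* In the packed real-FFT spectrum the DC bin (first float) and the
     * Nyquist bin (last float) are purely real, so they are multiplied and
     * accumulated as plain scalars; the complex loop below then covers the
     * remaining bins only. */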
    ab[0] += a[0]*b[0]*scaling;
    ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
    ++ab; ++a; ++b; NcvecMulTwo -= 2;
  }
  for (k=0; k < NcvecMulTwo; k += 2) {
    float ar, ai, br, bi;
    ar = a[k+0]; ai = a[k+1];
    br = b[k+0]; bi = b[k+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[k+0] += ar*scaling;
    ab[k+1] += ai*scaling;
  }
}

#define pffft_zconvolve_no_accu_nosimd FUNC_ZCONVOLVE_NO_ACCU
void pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
                                    float *ab, float scaling) {
  int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */
  int k; /* was i -- but always used "2*i" - except at for() */

  if (s->transform == PFFFT_REAL) {
    /* take care of the fftpack ordering: assign (do not accumulate) the
     * purely real DC and Nyquist bins, consistent with the SIMD no-accu
     * variant and with the complex loop below */
    ab[0] = a[0]*b[0]*scaling;
    ab[NcvecMulTwo-1] = a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
    ++ab; ++a; ++b; NcvecMulTwo -= 2;
  }
  for (k=0; k < NcvecMulTwo; k += 2) {
    float ar, ai, br, bi;
    ar = a[k+0]; ai = a[k+1];
    br = b[k+0]; bi = b[k+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[k+0] = ar*scaling;
    ab[k+1] = ai*scaling;
  }
}


#endif /* #if ( SIMD_SZ == 4 ) && !defined(PFFFT_SIMD_DISABLE) */


void FUNC_TRANSFORM_UNORDRD(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 0);
}

void FUNC_TRANSFORM_ORDERED(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 1);
}
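
/* The unordered variant keeps the spectrum in the internal z-domain layout,
 * which is all the zconvolve routines above need and is the cheaper option;
 * the ordered variant additionally runs FUNC_ZREORDER so callers get the
 * canonical interleaved bin order documented in pffft.h. */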


#if ( SIMD_SZ == 4 )

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void FUNC_VALIDATE_SIMD_A() {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}


static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )
{
  if ( !( fabsf( result - ref ) < 0.01F ) )
  {
    fprintf(stderr, "%s: assert for %s at %s(%d)\n expected %f value %f\n", functxt, vartxt, f, lineNo, ref, result);
    ++(*numErrs);
  }
}

static void pffft_assert4( vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3,
                           float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo )
{
  pffft_assert1( v0, a, "[0]", functxt, numErrs, f, lineNo );
  pffft_assert1( v1, b, "[1]", functxt, numErrs, f, lineNo );
  pffft_assert1( v2, c, "[2]", functxt, numErrs, f, lineNo );
  pffft_assert1( v3, d, "[3]", functxt, numErrs, f, lineNo );
}

#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT ) pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ )


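/* Self-test of the SIMD wrapper macros: each block below pushes small known
 * patterns through one macro and checks every lane against its scalar
 * reference value via PFFFT_ASSERT4().  Pass a FILE* (e.g. stderr) as DbgOut
 * to log all inputs and outputs, or NULL to run silently; the return value
 * is the number of mismatches, 0 meaning all macros behave as expected. */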
int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
{
  int numErrs = 0;

  {
    v4sf_union C;
    int k;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: { }\n" );
    }
    C.v = VZERO();
    if (DbgOut) {
      fprintf(DbgOut, "C = VZERO() => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( C, 0.0F, 0.0F, 0.0F, 0.0F, "VZERO() Out C" );
  }

  {
    v4sf_union C;
    float a = 42.0F;
    int k;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: a = {\n" );
      fprintf(DbgOut, " Inp a: %f\n", a );
      fprintf(DbgOut, "}\n" );
    }
    C.v = LD_PS1(a);
    if (DbgOut) {
      fprintf(DbgOut, "C = LD_PS1(a) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( C, 42.0F, 42.0F, 42.0F, 42.0F, "LD_PS1() Out C" );
  }

  {
    v4sf_union C;
    float a[16];
    int numAligned = 0, numUnaligned = 0;
    int k;
    const char * pUn;
    for ( k = 0; k < 16; ++k ) a[k] = k+1;

    for ( k = 0; k + 3 < 16; ++k )
    {
      const float * ptr = &a[k];
      if (DbgOut)
        fprintf(DbgOut, "\ninput: a = [ %f, %f, %f, %f ]\n", ptr[0], ptr[1], ptr[2], ptr[3] );
      if ( VALIGNED(ptr) )
      {
        C.v = VLOAD_ALIGNED( ptr );
        pUn = "";
        ++numAligned;
      }
      else
      {
        C.v = VLOAD_UNALIGNED( ptr );
        pUn = "UN";
        ++numUnaligned;
      }
      if (DbgOut) {
        fprintf(DbgOut, "C = VLOAD_%sALIGNED(&a[%d]) => {\n", pUn, k );
        fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
        fprintf(DbgOut, "}\n" );
      }
      /* PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" ); */

      if ( numAligned >= 1 && numUnaligned >= 4 )
        break;
    }
    if ( numAligned < 1 ) {
      fprintf(stderr, "VALIGNED() should have found at least 1 occurrence!\n");
      ++numErrs;
    }
    if ( numUnaligned < 4 ) {
      fprintf(stderr, "!VALIGNED() should have found at least 4 occurrences!\n");
      ++numErrs;
    }
  }

  {
    v4sf_union A, B, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VADD(A.v, B.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VADD(A,B) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VADD(): Inp A" );
    PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VADD(): Inp B" );
    PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" );
  }

  {
    v4sf_union A, B, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 20 + 2*k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VSUB(A.v, B.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VSUB(A,B) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 21.0F, 23.0F, 25.0F, 27.0F, "VSUB(): Inp A" );
    PFFFT_ASSERT4( B, 11.0F, 12.0F, 13.0F, 14.0F, "VSUB(): Inp B" );
    PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VSUB(): Out C" );
  }

  {
    v4sf_union A, B, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VMUL(A.v, B.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VMUL(A,B) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMUL(): Inp A" );
    PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMUL(): Inp B" );
    PFFFT_ASSERT4( C, 11.0F, 24.0F, 39.0F, 56.0F, "VMUL(): Out C" );
  }

  {
    v4sf_union A, B, C, D;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 10 + k;
    for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B,C = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    D.v = VMADD(A.v, B.v, C.v);
    if (DbgOut) {
      fprintf(DbgOut, "D = VMADD(A,B,C) => {\n" );
      fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMADD(): Inp A" );
    PFFFT_ASSERT4( B, 1.0F, 2.0F, 3.0F, 4.0F, "VMADD(): Inp B" );
    PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VMADD(): Inp C" );
    PFFFT_ASSERT4( D, 21.0F, 35.0F, 51.0F, 69.0F, "VMADD(): Out D" );
  }

  {
    v4sf_union A, B, C, D;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;
    for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    INTERLEAVE2(A.v, B.v, C.v, D.v);
    if (DbgOut) {
      fprintf(DbgOut, "INTERLEAVE2(A,B, => C,D) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "INTERLEAVE2() Inp A" );
    PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "INTERLEAVE2() Inp B" );
    PFFFT_ASSERT4( C, 11.0F, 21.0F, 12.0F, 22.0F, "INTERLEAVE2() Out C" );
    PFFFT_ASSERT4( D, 13.0F, 23.0F, 14.0F, 24.0F, "INTERLEAVE2() Out D" );
  }

  {
    v4sf_union A, B, C, D;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;
    for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    UNINTERLEAVE2(A.v, B.v, C.v, D.v);
    if (DbgOut) {
      fprintf(DbgOut, "UNINTERLEAVE2(A,B, => C,D) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "UNINTERLEAVE2() Inp A" );
    PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "UNINTERLEAVE2() Inp B" );
    PFFFT_ASSERT4( C, 11.0F, 13.0F, 21.0F, 23.0F, "UNINTERLEAVE2() Out C" );
    PFFFT_ASSERT4( D, 12.0F, 14.0F, 22.0F, 24.0F, "UNINTERLEAVE2() Out D" );
  }

  {
    v4sf_union A, B, C, D;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;
    for ( k = 0; k < 4; ++k ) D.f[k] = 40 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B,C,D = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, " Inp C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, " Inp D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    VTRANSPOSE4(A.v, B.v, C.v, D.v);
    if (DbgOut) {
      fprintf(DbgOut, "VTRANSPOSE4(A,B,C,D) => {\n" );
      fprintf(DbgOut, " Out A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Out B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, " Out D: %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 21.0F, 31.0F, 41.0F, "VTRANSPOSE4(): Out A" );
    PFFFT_ASSERT4( B, 12.0F, 22.0F, 32.0F, 42.0F, "VTRANSPOSE4(): Out B" );
    PFFFT_ASSERT4( C, 13.0F, 23.0F, 33.0F, 43.0F, "VTRANSPOSE4(): Out C" );
    PFFFT_ASSERT4( D, 14.0F, 24.0F, 34.0F, 44.0F, "VTRANSPOSE4(): Out D" );
  }

  {
    v4sf_union A, B, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) B.f[k] = 20 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A,B = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, " Inp B: %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VSWAPHL(A.v, B.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VSWAPHL(A,B) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VSWAPHL(): Inp A" );
    PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VSWAPHL(): Inp B" );
    PFFFT_ASSERT4( C, 21.0F, 22.0F, 13.0F, 14.0F, "VSWAPHL(): Out C" );
  }

  {
    v4sf_union A, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;
    for ( k = 0; k < 4; ++k ) C.f[k] = 30 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VREV_S(A.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VREV_S(A) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_S(): Inp A" );
    PFFFT_ASSERT4( C, 14.0F, 13.0F, 12.0F, 11.0F, "VREV_S(): Out C" );
  }

  {
    v4sf_union A, C;
    int k;
    for ( k = 0; k < 4; ++k ) A.f[k] = 10 + k+1;

    if (DbgOut) {
      fprintf(DbgOut, "\ninput: A = {\n" );
      fprintf(DbgOut, " Inp A: %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    C.v = VREV_C(A.v);
    if (DbgOut) {
      fprintf(DbgOut, "C = VREV_C(A) => {\n" );
      fprintf(DbgOut, " Out C: %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
      fprintf(DbgOut, "}\n" );
    }
    PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_C(): Inp A" );
    PFFFT_ASSERT4( C, 13.0F, 14.0F, 11.0F, 12.0F, "VREV_C(): Out C" );
  }

  return numErrs;
}

#else /* if ( SIMD_SZ == 4 ) */

void FUNC_VALIDATE_SIMD_A()
{
}

int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
{
  (void)DbgOut;
  return -1;
}

#endif /* end if ( SIMD_SZ == 4 ) */
