/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
49 PFFFT : a Pretty Fast FFT.
50
51 This file is largely based on the original FFTPACK implementation, modified in
52 order to take advantage of SIMD instructions of modern CPUs.
53*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
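/*
  Minimal usage sketch (illustrative only, not part of the library -- see
  pffft.h for the authoritative API documentation). Buffers handed to the
  transform functions must be suitably aligned, e.g. obtained from
  pffft_aligned_malloc:

    #include "pffft.h"

    void example(void) {
      int N = 1024;                      // real transforms require N >= 32 here
      PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
      float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
      float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
      float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
      // ... fill 'in' with N samples ...
      pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
      // ... use the spectrum in 'out' (PFFFT_BACKWARD for the inverse) ...
      pffft_aligned_free(work);
      pffft_aligned_free(out);
      pffft_aligned_free(in);
      pffft_destroy_setup(s);
    }
*/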
59
60#include "pffft.h"
61#include <stdlib.h>
62#include <stdio.h>
63#include <math.h>
64#include <assert.h>
65
66/* detect compiler flavour */
67#if defined(_MSC_VER)
68# define COMPILER_MSVC
69#elif defined(__GNUC__)
70# define COMPILER_GCC
71#endif
72
73#if defined(COMPILER_GCC)
74# define ALWAYS_INLINE(return_type) return_type __attribute__ ((always_inline))
75# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
76# define RESTRICT __restrict
77# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
78#elif defined(COMPILER_MSVC)
79# define ALWAYS_INLINE(return_type) __forceinline return_type
80# define NEVER_INLINE(return_type) __declspec(noinline) return_type
81# define RESTRICT __restrict
82# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
83#endif
84
85
86/*
87 vector support macros: the rest of the code is independent of
88 SSE/Altivec/NEON -- adding support for other platforms with 4-element
89 vectors should be limited to these macros
90*/
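/*
  Concretely, a port to another 4-float SIMD instruction set only needs to
  provide the v4sf typedef plus the macros defined in the per-architecture
  blocks below: SIMD_SZ, VZERO, VMUL, VADD, VMADD, VSUB, LD_PS1, INTERLEAVE2,
  UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED. The scalar fallback at the
  end of this block shows the semantics of the arithmetic macros, and
  validate_pffft_simd() below documents (and checks) the expected behaviour
  of the shuffling ones.
*/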
91
92
93// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
94//#define PFFFT_SIMD_DISABLE
95
96/*
97 Altivec support macros
98*/
99#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
100typedef vector float v4sf;
101# define SIMD_SZ 4
102# define VZERO() ((vector float) vec_splat_u8(0))
103# define VMUL(a,b) vec_madd(a,b, VZERO())
104# define VADD(a,b) vec_add(a,b)
105# define VMADD(a,b,c) vec_madd(a,b,c)
106# define VSUB(a,b) vec_sub(a,b)
107inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
108# define LD_PS1(p) ld_ps1(&p)
109# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
110# define UNINTERLEAVE2(in1, in2, out1, out2) { \
111 vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
112 vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
113 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
114 }
115# define VTRANSPOSE4(x0,x1,x2,x3) { \
116 v4sf y0 = vec_mergeh(x0, x2); \
117 v4sf y1 = vec_mergel(x0, x2); \
118 v4sf y2 = vec_mergeh(x1, x3); \
119 v4sf y3 = vec_mergel(x1, x3); \
120 x0 = vec_mergeh(y0, y2); \
121 x1 = vec_mergel(y0, y2); \
122 x2 = vec_mergeh(y1, y3); \
123 x3 = vec_mergel(y1, y3); \
124 }
125# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
126# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
127
128/*
129 SSE1 support macros
130*/
131#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
132
133#include <xmmintrin.h>
134typedef __m128 v4sf;
135# define SIMD_SZ 4 // 4 floats per simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to do some work if you want to enable AVX with its 256-bit vectors.
136# define VZERO() _mm_setzero_ps()
137# define VMUL(a,b) _mm_mul_ps(a,b)
138# define VADD(a,b) _mm_add_ps(a,b)
139# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
140# define VSUB(a,b) _mm_sub_ps(a,b)
141# define LD_PS1(p) _mm_set1_ps(p)
142# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
143# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
144# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
145# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
146# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
147
148/*
149 ARM NEON support macros
150*/
151#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
152# include <arm_neon.h>
153typedef float32x4_t v4sf;
154# define SIMD_SZ 4
155# define VZERO() vdupq_n_f32(0)
156# define VMUL(a,b) vmulq_f32(a,b)
157# define VADD(a,b) vaddq_f32(a,b)
158# define VMADD(a,b,c) vmlaq_f32(c,a,b)
159# define VSUB(a,b) vsubq_f32(a,b)
160# define LD_PS1(p) vld1q_dup_f32(&(p))
161# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
162# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
163# define VTRANSPOSE4_(x0,x1,x2,x3) { \
164 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
165 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
166 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
167 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
168 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
169 }
170// marginally faster version
171# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
172# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
173# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
174#else
175# if !defined(PFFFT_SIMD_DISABLE)
176# warning "building with simd disabled !\n";
177# define PFFFT_SIMD_DISABLE // fallback to scalar code
178# endif
179#endif
180
181// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
182#ifdef PFFFT_SIMD_DISABLE
183typedef float v4sf;
184# define SIMD_SZ 1
185# define VZERO() 0.f
186# define VMUL(a,b) ((a)*(b))
187# define VADD(a,b) ((a)+(b))
188# define VMADD(a,b,c) ((a)*(b)+(c))
189# define VSUB(a,b) ((a)-(b))
190# define LD_PS1(p) (p)
191# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
192#endif
193
194// shortcuts for complex multiplications
195#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
196#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
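/*
  These expand to the usual complex products, lane by lane. With a = ar + i*ai
  and b = br + i*bi (right-hand sides below use the original ar, ai):
    VCPLXMUL:      a*b       ->  ar = ar*br - ai*bi,   ai = ai*br + ar*bi
    VCPLXMULCONJ:  a*conj(b) ->  ar = ar*br + ai*bi,   ai = ai*br - ar*bi
*/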
197
198#if !defined(PFFFT_SIMD_DISABLE)
199typedef union v4sf_union {
200 v4sf v;
201 float f[4];
202} v4sf_union;
203
204#include <string.h>
205
206#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
207
208/* detect bugs with the vector support macros */
209void validate_pffft_simd() {
210 float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
211 v4sf_union a0, a1, a2, a3, t, u;
212 memcpy(a0.f, f, 4*sizeof(float));
213 memcpy(a1.f, f+4, 4*sizeof(float));
214 memcpy(a2.f, f+8, 4*sizeof(float));
215 memcpy(a3.f, f+12, 4*sizeof(float));
216
217 t = a0; u = a1; t.v = VZERO();
218 printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
219 t.v = VADD(a1.v, a2.v);
220 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
221 t.v = VMUL(a1.v, a2.v);
222 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
223 t.v = VMADD(a1.v, a2.v,a0.v);
224 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
225
226 INTERLEAVE2(a1.v,a2.v,t.v,u.v);
227 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
228 assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
229 UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
230 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
231 assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
232
233 t.v=LD_PS1(f[15]);
234 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
235 assertv4(t, 15, 15, 15, 15);
236 t.v = VSWAPHL(a1.v, a2.v);
237 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
238 assertv4(t, 8, 9, 6, 7);
239 VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
240 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
241 a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
242 a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
243 assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
244}
245#endif //!PFFFT_SIMD_DISABLE
246
247/* SSE and co like 16-bytes aligned pointers */
248#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
249void *pffft_aligned_malloc(size_t nb_bytes) {
250 void *p0, *p;
251 if (!(p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT))) return (void *) 0;
252 p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
253 *((void **) p - 1) = p0;
254 return p;
255}
256
257void pffft_aligned_free(void *p) {
258 if (p) free(*((void **) p - 1));
259}
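/*
  pffft_aligned_malloc over-allocates by MALLOC_V4SF_ALIGNMENT bytes, moves
  the returned address to a 64-byte boundary inside the over-allocated block,
  and stashes the original malloc() pointer in the word just before the
  aligned block so that pffft_aligned_free can recover and free it. A pointer
  obtained from pffft_aligned_malloc must therefore be released with
  pffft_aligned_free, never with free(). Illustrative use:

    float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
    // buf is 64-byte aligned, which satisfies the VALIGNED() checks below
    pffft_aligned_free(buf);
*/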
260
261int pffft_simd_size() { return SIMD_SZ; }
262
263/*
264 passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
265*/
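/*
  Each radix-2 pass computes, for every pair of complex inputs a and b:
     out_low  = a + b
     out_high = (a - b) * w
  where w is the twiddle factor taken from wa1; fsign flips the sign of the
  imaginary part of w to select the forward or backward transform.
*/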
266static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
267 int k, i;
268 int l1ido = l1*ido;
269 if (ido <= 2) {
270 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
271 ch[0] = VADD(cc[0], cc[ido+0]);
272 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
273 ch[1] = VADD(cc[1], cc[ido+1]);
274 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
275 }
276 } else {
277 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
278 for (i=0; i<ido-1; i+=2) {
279 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
280 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
281 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
282 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
283 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
284 VCPLXMUL(tr2, ti2, wr, wi);
285 ch[i+l1ido] = tr2;
286 ch[i+l1ido+1] = ti2;
287 }
288 }
289 }
290}
291
292/*
293 passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
294*/
295static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
296 const float *wa1, const float *wa2, float fsign) {
297 static const float taur = -0.5f;
298 float taui = 0.866025403784439f*fsign;
299 int i, k;
300 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
301 int l1ido = l1*ido;
302 float wr1, wi1, wr2, wi2;
303 assert(ido > 2);
304 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
305 for (i=0; i<ido-1; i+=2) {
306 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
307 cr2 = VADD(cc[i], VMUL(LD_PS1(taur),tr2));
308 ch[i] = VADD(cc[i], tr2);
309 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
310 ci2 = VADD(cc[i +1], VMUL(LD_PS1(taur),ti2));
311 ch[i+1] = VADD(cc[i+1], ti2);
312 cr3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido], cc[i+2*ido]));
313 ci3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido+1], cc[i+2*ido+1]));
314 dr2 = VSUB(cr2, ci3);
315 dr3 = VADD(cr2, ci3);
316 di2 = VADD(ci2, cr3);
317 di3 = VSUB(ci2, cr3);
318 wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
319 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
320 ch[i+l1ido] = dr2;
321 ch[i+l1ido + 1] = di2;
322 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
323 ch[i+2*l1ido] = dr3;
324 ch[i+2*l1ido+1] = di3;
325 }
326 }
327} /* passf3 */
328
329static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
330 const float *wa1, const float *wa2, const float *wa3, float fsign) {
331 /* isign == -1 for forward transform and +1 for backward transform */
332
333 int i, k;
334 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
335 int l1ido = l1*ido;
336 if (ido == 2) {
337 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
338 tr1 = VSUB(cc[0], cc[2*ido + 0]);
339 tr2 = VADD(cc[0], cc[2*ido + 0]);
340 ti1 = VSUB(cc[1], cc[2*ido + 1]);
341 ti2 = VADD(cc[1], cc[2*ido + 1]);
342 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
343 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
344 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
345 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
346
347 ch[0*l1ido + 0] = VADD(tr2, tr3);
348 ch[0*l1ido + 1] = VADD(ti2, ti3);
349 ch[1*l1ido + 0] = VADD(tr1, tr4);
350 ch[1*l1ido + 1] = VADD(ti1, ti4);
351 ch[2*l1ido + 0] = VSUB(tr2, tr3);
352 ch[2*l1ido + 1] = VSUB(ti2, ti3);
353 ch[3*l1ido + 0] = VSUB(tr1, tr4);
354 ch[3*l1ido + 1] = VSUB(ti1, ti4);
355 }
356 } else {
357 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
358 for (i=0; i<ido-1; i+=2) {
359 float wr1, wi1, wr2, wi2, wr3, wi3;
360 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
361 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
362 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
363 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
364 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
365 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
366 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
367 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
368
369 ch[i] = VADD(tr2, tr3);
370 cr3 = VSUB(tr2, tr3);
371 ch[i + 1] = VADD(ti2, ti3);
372 ci3 = VSUB(ti2, ti3);
373
374 cr2 = VADD(tr1, tr4);
375 cr4 = VSUB(tr1, tr4);
376 ci2 = VADD(ti1, ti4);
377 ci4 = VSUB(ti1, ti4);
378 wr1=wa1[i], wi1=fsign*wa1[i+1];
379 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
380 wr2=wa2[i], wi2=fsign*wa2[i+1];
381 ch[i + l1ido] = cr2;
382 ch[i + l1ido + 1] = ci2;
383
384 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
385 wr3=wa3[i], wi3=fsign*wa3[i+1];
386 ch[i + 2*l1ido] = cr3;
387 ch[i + 2*l1ido + 1] = ci3;
388
389 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
390 ch[i + 3*l1ido] = cr4;
391 ch[i + 3*l1ido + 1] = ci4;
392 }
393 }
394 }
395} /* passf4 */
396
397static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
398 static const float minus_one = -1.f;
399 int i, k, l1ido = l1*ido;
400 for (k=0; k < l1ido; k += ido) {
401 v4sf a = cc[k], b = cc[k + l1ido];
402 ch[2*k] = VADD(a, b);
403 ch[2*(k+ido)-1] = VSUB(a, b);
404 }
405 if (ido < 2) return;
406 if (ido != 2) {
407 for (k=0; k < l1ido; k += ido) {
408 for (i=2; i<ido; i+=2) {
409 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
410 v4sf br = cc[i - 1 + k], bi = cc[i + k];
411 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
412 ch[i + 2*k] = VADD(bi, ti2);
413 ch[2*(k+ido) - i] = VSUB(ti2, bi);
414 ch[i - 1 + 2*k] = VADD(br, tr2);
415 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
416 }
417 }
418 if (ido % 2 == 1) return;
419 }
420 for (k=0; k < l1ido; k += ido) {
421 ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]);
422 ch[2*k + ido-1] = cc[k + ido-1];
423 }
424} /* radf2 */
425
426
427static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
428 static const float minus_two=-2;
429 int i, k, l1ido = l1*ido;
430 v4sf a,b,c,d, tr2, ti2;
431 for (k=0; k < l1ido; k += ido) {
432 a = cc[2*k]; b = cc[2*(k+ido) - 1];
433 ch[k] = VADD(a, b);
434 ch[k + l1ido] =VSUB(a, b);
435 }
436 if (ido < 2) return;
437 if (ido != 2) {
438 for (k = 0; k < l1ido; k += ido) {
439 for (i = 2; i < ido; i += 2) {
440 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
441 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
442 ch[i-1 + k] = VADD(a, b);
443 tr2 = VSUB(a, b);
444 ch[i+0 + k] = VSUB(c, d);
445 ti2 = VADD(c, d);
446 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
447 ch[i-1 + k + l1ido] = tr2;
448 ch[i+0 + k + l1ido] = ti2;
449 }
450 }
451 if (ido % 2 == 1) return;
452 }
453 for (k = 0; k < l1ido; k += ido) {
454 a = cc[2*k + ido-1]; b = cc[2*k + ido];
455 ch[k + ido-1] = VADD(a,a);
456 ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b);
457 }
458} /* radb2 */
459
460static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
461 const float *wa1, const float *wa2) {
462 static const float taur = -0.5f;
463 static const float taui = 0.866025403784439f;
464 int i, k, ic;
465 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
466 for (k=0; k<l1; k++) {
467 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
468 ch[3*k*ido] = VADD(cc[k*ido], cr2);
469 ch[(3*k+2)*ido] = VMUL(LD_PS1(taui), VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
470 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(LD_PS1(taur), cr2));
471 }
472 if (ido == 1) return;
473 for (k=0; k<l1; k++) {
474 for (i=2; i<ido; i+=2) {
475 ic = ido - i;
476 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
477 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
478 VCPLXMULCONJ(dr2, di2, wr1, wi1);
479
480 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
481 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
482 VCPLXMULCONJ(dr3, di3, wr2, wi2);
483
484 cr2 = VADD(dr2, dr3);
485 ci2 = VADD(di2, di3);
486 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
487 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
488 tr2 = VADD(cc[i - 1 + k*ido], VMUL(LD_PS1(taur), cr2));
489 ti2 = VADD(cc[i + k*ido], VMUL(LD_PS1(taur), ci2));
490 tr3 = VMUL(LD_PS1(taui), VSUB(di2, di3));
491 ti3 = VMUL(LD_PS1(taui), VSUB(dr3, dr2));
492 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
493 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
494 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
495 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
496 }
497 }
498} /* radf3 */
499
500
501static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
502 const float *wa1, const float *wa2)
503{
504 static const float taur = -0.5f;
505 static const float taui = 0.866025403784439f;
506 static const float taui_2 = 0.866025403784439f*2;
507 int i, k, ic;
508 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
509 for (k=0; k<l1; k++) {
510 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
511 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
512 ch[k*ido] = VADD(cc[3*k*ido], tr2);
513 ci3 = VMUL(LD_PS1(taui_2), cc[(3*k + 2)*ido]);
514 ch[(k + l1)*ido] = VSUB(cr2, ci3);
515 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
516 }
517 if (ido == 1) return;
518 for (k=0; k<l1; k++) {
519 for (i=2; i<ido; i+=2) {
520 ic = ido - i;
521 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
522 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
523 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
524 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
525 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
526 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
527 cr3 = VMUL(LD_PS1(taui), VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
528 ci3 = VMUL(LD_PS1(taui), VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
529 dr2 = VSUB(cr2, ci3);
530 dr3 = VADD(cr2, ci3);
531 di2 = VADD(ci2, cr3);
532 di3 = VSUB(ci2, cr3);
533 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
534 ch[i - 1 + (k + l1)*ido] = dr2;
535 ch[i + (k + l1)*ido] = di2;
536 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
537 ch[i - 1 + (k + 2*l1)*ido] = dr3;
538 ch[i + (k + 2*l1)*ido] = di3;
539 }
540 }
541} /* radb3 */
542
543
544static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
545 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
546{
547 static const float minus_hsqt2 = (float)-0.7071067811865475;
548 int i, k, l1ido = l1*ido;
549 {
550 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
551 v4sf * RESTRICT ch_ = ch;
552 while (cc < cc_end) {
553 // this loop represents between 25% and 40% of total radf4_ps cost !
554 v4sf a0 = cc[0], a1 = cc[l1ido];
555 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
556 v4sf tr1 = VADD(a1, a3);
557 v4sf tr2 = VADD(a0, a2);
558 ch[2*ido-1] = VSUB(a0, a2);
559 ch[2*ido ] = VSUB(a3, a1);
560 ch[0 ] = VADD(tr1, tr2);
561 ch[4*ido-1] = VSUB(tr2, tr1);
562 cc += ido; ch += 4*ido;
563 }
564 cc = cc_; ch = ch_;
565 }
566 if (ido < 2) return;
567 if (ido != 2) {
568 for (k = 0; k < l1ido; k += ido) {
569 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
570 for (i=2; i<ido; i += 2, pc += 2) {
571 int ic = ido - i;
572 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
573 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
574
575 cr2 = pc[1*l1ido+0];
576 ci2 = pc[1*l1ido+1];
577 wr=LD_PS1(wa1[i - 2]);
578 wi=LD_PS1(wa1[i - 1]);
579 VCPLXMULCONJ(cr2,ci2,wr,wi);
580
581 cr3 = pc[2*l1ido+0];
582 ci3 = pc[2*l1ido+1];
583 wr = LD_PS1(wa2[i-2]);
584 wi = LD_PS1(wa2[i-1]);
585 VCPLXMULCONJ(cr3, ci3, wr, wi);
586
587 cr4 = pc[3*l1ido];
588 ci4 = pc[3*l1ido+1];
589 wr = LD_PS1(wa3[i-2]);
590 wi = LD_PS1(wa3[i-1]);
591 VCPLXMULCONJ(cr4, ci4, wr, wi);
592
593 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
594
595 tr1 = VADD(cr2,cr4);
596 tr4 = VSUB(cr4,cr2);
597 tr2 = VADD(pc[0],cr3);
598 tr3 = VSUB(pc[0],cr3);
599 ch[i - 1 + 4*k] = VADD(tr1,tr2);
600 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
601 ti1 = VADD(ci2,ci4);
602 ti4 = VSUB(ci2,ci4);
603 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
604 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
605 ti2 = VADD(pc[1],ci3);
606 ti3 = VSUB(pc[1],ci3);
607 ch[i + 4*k] = VADD(ti1, ti2);
608 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
609 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
610 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
611 }
612 }
613 if (ido % 2 == 1) return;
614 }
615 for (k=0; k<l1ido; k += ido) {
616 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
617 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
618 v4sf ti1 = VMUL(LD_PS1(minus_hsqt2), VADD(a, b));
619 v4sf tr1 = VMUL(LD_PS1(minus_hsqt2), VSUB(b, a));
620 ch[ido-1 + 4*k] = VADD(tr1, c);
621 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
622 ch[4*k + 1*ido] = VSUB(ti1, d);
623 ch[4*k + 3*ido] = VADD(ti1, d);
624 }
625} /* radf4 */
626
627
628static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
629 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
630{
631 static const float minus_sqrt2 = (float)-1.414213562373095;
632 static const float two = 2.f;
633 int i, k, l1ido = l1*ido;
634 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
635 {
636 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
637 v4sf *ch_ = ch;
638 while (ch < ch_end) {
639 v4sf a = cc[0], b = cc[4*ido-1];
640 v4sf c = cc[2*ido], d = cc[2*ido-1];
641 tr3 = VMUL(LD_PS1(two),d);
642 tr2 = VADD(a,b);
643 tr1 = VSUB(a,b);
644 tr4 = VMUL(LD_PS1(two),c);
645 ch[0*l1ido] = VADD(tr2, tr3);
646 ch[2*l1ido] = VSUB(tr2, tr3);
647 ch[1*l1ido] = VSUB(tr1, tr4);
648 ch[3*l1ido] = VADD(tr1, tr4);
649
650 cc += 4*ido; ch += ido;
651 }
652 cc = cc_; ch = ch_;
653 }
654 if (ido < 2) return;
655 if (ido != 2) {
656 for (k = 0; k < l1ido; k += ido) {
657 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
658 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
659 for (i = 2; i < ido; i += 2) {
660
661 tr1 = VSUB(pc[i], pc[4*ido - i]);
662 tr2 = VADD(pc[i], pc[4*ido - i]);
663 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
664 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
665 ph[0] = VADD(tr2, tr3);
666 cr3 = VSUB(tr2, tr3);
667
668 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
669 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
670 cr2 = VSUB(tr1, tr4);
671 cr4 = VADD(tr1, tr4);
672
673 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
674 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
675
676 ph[1] = VADD(ti2, ti3); ph += l1ido;
677 ci3 = VSUB(ti2, ti3);
678 ci2 = VADD(ti1, ti4);
679 ci4 = VSUB(ti1, ti4);
680 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
681 ph[0] = cr2;
682 ph[1] = ci2; ph += l1ido;
683 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
684 ph[0] = cr3;
685 ph[1] = ci3; ph += l1ido;
686 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
687 ph[0] = cr4;
688 ph[1] = ci4; ph = ph - 3*l1ido + 2;
689 }
690 }
691 if (ido % 2 == 1) return;
692 }
693 for (k=0; k < l1ido; k+=ido) {
694 int i0 = 4*k + ido;
695 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
696 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
697 tr1 = VSUB(c,d);
698 tr2 = VADD(c,d);
699 ti1 = VADD(b,a);
700 ti2 = VSUB(b,a);
701 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
702 ch[ido-1 + k + 1*l1ido] = VMUL(LD_PS1(minus_sqrt2), VSUB(ti1, tr1));
703 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
704 ch[ido-1 + k + 3*l1ido] = VMUL(LD_PS1(minus_sqrt2), VADD(ti1, tr1));
705 }
706} /* radb4 */
707
708static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
709 const float *wa, const int *ifac) {
710 v4sf *in = (v4sf*)input_readonly;
711 v4sf *out = (in == work2 ? work1 : work2);
712 int nf = ifac[1], k1;
713 int l2 = n;
714 int iw = n-1;
715 assert(in != out && work1 != work2);
716 for (k1 = 1; k1 <= nf; ++k1) {
717 int kh = nf - k1;
718 int ip = ifac[kh + 2];
719 int l1 = l2 / ip;
720 int ido = n / l2;
721 iw -= (ip - 1)*ido;
722 switch (ip) {
723 case 4: {
724 int ix2 = iw + ido;
725 int ix3 = ix2 + ido;
726 radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
727 } break;
728 case 3: {
729 int ix2 = iw + ido;
730 radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
731 } break;
732 case 2:
733 radf2_ps(ido, l1, in, out, &wa[iw]);
734 break;
735 default:
736 assert(0);
737 break;
738 }
739 l2 = l1;
740 if (out == work2) {
741 out = work1; in = work2;
742 } else {
743 out = work2; in = work1;
744 }
745 }
746 return in; /* this is in fact the output .. */
747} /* rfftf1 */
748
749static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
750 const float *wa, const int *ifac) {
751 v4sf *in = (v4sf*)input_readonly;
752 v4sf *out = (in == work2 ? work1 : work2);
753 int nf = ifac[1], k1;
754 int l1 = 1;
755 int iw = 0;
756 assert(in != out);
757 for (k1=1; k1<=nf; k1++) {
758 int ip = ifac[k1 + 1];
759 int l2 = ip*l1;
760 int ido = n / l2;
761 switch (ip) {
762 case 4: {
763 int ix2 = iw + ido;
764 int ix3 = ix2 + ido;
765 radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
766 } break;
767 case 3: {
768 int ix2 = iw + ido;
769 radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
770 } break;
771 case 2:
772 radb2_ps(ido, l1, in, out, &wa[iw]);
773 break;
774 default:
775 assert(0);
776 break;
777 }
778 l1 = l2;
779 iw += (ip - 1)*ido;
780
781 if (out == work2) {
782 out = work1; in = work2;
783 } else {
784 out = work2; in = work1;
785 }
786 }
787 return in; /* this is in fact the output .. */
788}
789
790static int decompose(int n, int *ifac, const int ntryh[3]) {
791 int nl = n, nf = 0, i, j = 0;
792 for (j=0; j < 3; ++j) {
793 int ntry = ntryh[j];
794 while (nl != 1) {
795 int nq = nl / ntry;
796 int nr = nl - ntry * nq;
797 if (nr == 0) {
798 ifac[2+nf++] = ntry;
799 nl = nq;
800 if (ntry == 2 && nf != 1) {
801 for (i = 2; i <= nf; ++i) {
802 int ib = nf - i + 2;
803 ifac[ib + 1] = ifac[ib];
804 }
805 ifac[2] = 2;
806 }
807 } else break;
808 }
809 }
810 ifac[0] = n;
811 ifac[1] = nf;
812 return nf;
813}
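/*
  decompose() factors n into a product of 4s, 2s and 3s (tried in the order
  given by ntryh), storing n in ifac[0], the number of factors in ifac[1] and
  the factors themselves from ifac[2] on; a single factor of 2, if present,
  is moved to the front. For example, decompose(96, ifac, {4,2,3}) yields
  ifac = { 96, 4,  2, 4, 4, 3 }   (96 = 2*4*4*3).
*/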
814
815
816
817static void rffti1_ps(int n, float *wa, int *ifac)
818{
819 static const int ntryh[3] = { 4,2,3 };
820 int k1, j, ii;
821
822 int nf = decompose(n,ifac,ntryh);
823 float argh = (2*M_PI) / n;
824 int is = 0;
825 int nfm1 = nf - 1;
826 int l1 = 1;
827 if (nfm1 == 0) return;
828 for (k1 = 1; k1 <= nfm1; k1++) {
829 int ip = ifac[k1 + 1];
830 int ld = 0;
831 int l2 = l1*ip;
832 int ido = n / l2;
833 int ipm = ip - 1;
834 for (j = 1; j <= ipm; ++j) {
835 float argld;
836 int i = is, fi=0;
837 ld += l1;
838 argld = ld*argh;
839 for (ii = 3; ii <= ido; ii += 2) {
840 i += 2;
841 fi += 1;
842 wa[i - 2] = cos(fi*argld);
843 wa[i - 1] = sin(fi*argld);
844 }
845 is += ido;
846 }
847 l1 = l2;
848 }
849} /* rffti1 */
850
851void cffti1_ps(int n, float *wa, int *ifac)
852{
853 static const int ntryh[3] = { 3,4,2 };
854 int k1, j, ii;
855
856 int nf = decompose(n,ifac,ntryh);
857 float argh = (2*M_PI)/(float)n;
858 int i = 1;
859 int l1 = 1;
860 for (k1=1; k1<=nf; k1++) {
861 int ip = ifac[k1+1];
862 int ld = 0;
863 int l2 = l1*ip;
864 int ido = n / l2;
865 int idot = ido + ido + 2;
866 int ipm = ip - 1;
867 for (j=1; j<=ipm; j++) {
868 float argld;
869 int i1 = i, fi = 0;
870 wa[i-1] = 1;
871 wa[i] = 0;
872 ld += l1;
873 argld = ld*argh;
874 for (ii = 4; ii <= idot; ii += 2) {
875 i += 2;
876 fi += 1;
877 wa[i-1] = cos(fi*argld);
878 wa[i] = sin(fi*argld);
879 }
880 if (ip > 5) {
881 wa[i1-1] = wa[i-1];
882 wa[i1] = wa[i];
883 }
884 }
885 l1 = l2;
886 }
887} /* cffti1 */
888
889
890v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
891 v4sf *in = (v4sf*)input_readonly;
892 v4sf *out = (in == work2 ? work1 : work2);
893 int nf = ifac[1], k1;
894 int l1 = 1;
895 int iw = 0;
896 assert(in != out && work1 != work2);
897 for (k1=2; k1<=nf+1; k1++) {
898 int ip = ifac[k1];
899 int l2 = ip*l1;
900 int ido = n / l2;
901 int idot = ido + ido;
902 switch (ip) {
903 case 4: {
904 int ix2 = iw + idot;
905 int ix3 = ix2 + idot;
906 passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
907 } break;
908 case 2: {
909 passf2_ps(idot, l1, in, out, &wa[iw], isign);
910 } break;
911 case 3: {
912 int ix2 = iw + idot;
913 passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
914 } break;
915 default:
916 assert(0);
917 }
918 l1 = l2;
919 iw += (ip - 1)*idot;
920 if (out == work2) {
921 out = work1; in = work2;
922 } else {
923 out = work2; in = work1;
924 }
925 }
926
927 return in; /* this is in fact the output .. */
928}
929
930
931struct PFFFT_Setup {
932 int N;
933 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
934 int ifac[15];
935 pffft_transform_t transform;
936 v4sf *data; // allocated room for twiddle coefs
937 float *e; // points into 'data' , N/4*3 elements
938 float *twiddle; // points into 'data', N/4 elements
939};
940
941PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
942 PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
943 int k, m;
944 if (transform == PFFFT_REAL) { assert(N >= 32); }
945 if (transform == PFFFT_COMPLEX) { assert(N >= 16); }
946 //assert((N % 32) == 0);
947 s->N = N;
948 s->transform = transform;
949 /* nb of complex simd vectors */
950 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
951 s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
952 s->e = (float*)s->data;
953 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
954
955 if (transform == PFFFT_REAL) {
956 for (k=0; k < s->Ncvec; ++k) {
957 int i = k/SIMD_SZ;
958 int j = k%SIMD_SZ;
959 for (m=0; m < SIMD_SZ-1; ++m) {
960 float A = -2*M_PI*(m+1)*k / N;
961 s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
962 s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
963 }
964 }
965 rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
966 } else {
967 for (k=0; k < s->Ncvec; ++k) {
968 int i = k/SIMD_SZ;
969 int j = k%SIMD_SZ;
970 for (m=0; m < SIMD_SZ-1; ++m) {
971 float A = -2*M_PI*(m+1)*k / N;
972 s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
973 s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
974 }
975 }
976 cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
977 }
978 return s;
979}
980
981
982void pffft_destroy_setup(PFFFT_Setup *s) {
983 pffft_aligned_free(s->data);
984 free(s);
985}
986
987#if !defined(PFFFT_SIMD_DISABLE)
988
989/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
990static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
991 v4sf g0, g1;
992 int k;
993 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
994
995 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
996 for (k=1; k < N; ++k) {
997 v4sf h0, h1;
998 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
999 *--out = VSWAPHL(g1, h0);
1000 *--out = VSWAPHL(h0, h1);
1001 g1 = h1;
1002 }
1003 *--out = VSWAPHL(g1, g0);
1004}
1005
1006static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1007 v4sf g0, g1, h0, h1;
1008 int k;
1009 g0 = g1 = in[0]; ++in;
1010 for (k=1; k < N; ++k) {
1011 h0 = *in++; h1 = *in++;
1012 g1 = VSWAPHL(g1, h0);
1013 h0 = VSWAPHL(h0, h1);
1014 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1015 g1 = h1;
1016 }
1017 h0 = *in++; h1 = g0;
1018 g1 = VSWAPHL(g1, h0);
1019 h0 = VSWAPHL(h0, h1);
1020 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1021}
1022
1023void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1024 int k, N = setup->N, Ncvec = setup->Ncvec;
1025 const v4sf *vin = (const v4sf*)in;
1026 v4sf *vout = (v4sf*)out;
1027 assert(in != out);
1028 if (setup->transform == PFFFT_REAL) {
1029 int k, dk = N/32;
1030 if (direction == PFFFT_FORWARD) {
1031 for (k=0; k < dk; ++k) {
1032 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1033 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1034 }
1035 reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1036 reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1037 } else {
1038 for (k=0; k < dk; ++k) {
1039 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1040 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1041 }
1042 unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1043 unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1044 }
1045 } else {
1046 if (direction == PFFFT_FORWARD) {
1047 for (k=0; k < Ncvec; ++k) {
1048 int kk = (k/4) + (k%4)*(Ncvec/4);
1049 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1050 }
1051 } else {
1052 for (k=0; k < Ncvec; ++k) {
1053 int kk = (k/4) + (k%4)*(Ncvec/4);
1054 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1055 }
1056 }
1057 }
1058}
1059
1060void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1061 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1062 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1063 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1064 assert(in != out);
1065 for (k=0; k < dk; ++k) {
1066 r0 = in[8*k+0]; i0 = in[8*k+1];
1067 r1 = in[8*k+2]; i1 = in[8*k+3];
1068 r2 = in[8*k+4]; i2 = in[8*k+5];
1069 r3 = in[8*k+6]; i3 = in[8*k+7];
1070 VTRANSPOSE4(r0,r1,r2,r3);
1071 VTRANSPOSE4(i0,i1,i2,i3);
1072 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1073 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1074 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1075
1076 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1077 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1078 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1079 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1080
1081 /*
1082 transformation for each column is:
1083
1084 [1 1 1 1 0 0 0 0] [r0]
1085 [1 0 -1 0 0 -1 0 1] [r1]
1086 [1 -1 1 -1 0 0 0 0] [r2]
1087 [1 0 -1 0 0 1 0 -1] [r3]
1088 [0 0 0 0 1 1 1 1] * [i0]
1089 [0 1 0 -1 1 0 -1 0] [i1]
1090 [0 0 0 0 1 -1 1 -1] [i2]
1091 [0 -1 0 1 1 0 -1 0] [i3]
1092 */
1093
1094 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1095 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1096 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1097 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1098
1099 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1100 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1101 }
1102}
1103
1104void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1105 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1106 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1107 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1108 assert(in != out);
1109 for (k=0; k < dk; ++k) {
1110 r0 = in[8*k+0]; i0 = in[8*k+1];
1111 r1 = in[8*k+2]; i1 = in[8*k+3];
1112 r2 = in[8*k+4]; i2 = in[8*k+5];
1113 r3 = in[8*k+6]; i3 = in[8*k+7];
1114
1115 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1116 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1117 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1118 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1119
1120 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1121 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1122 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1123 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1124
1125 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1126 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1127 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1128
1129 VTRANSPOSE4(r0,r1,r2,r3);
1130 VTRANSPOSE4(i0,i1,i2,i3);
1131
1132 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1133 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1134 }
1135}
1136
1137
1138ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1139 const v4sf *e, v4sf *out) {
1140 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1141 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1142 r0 = *in0; i0 = *in1;
1143 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1144 VTRANSPOSE4(r0,r1,r2,r3);
1145 VTRANSPOSE4(i0,i1,i2,i3);
1146
1147 /*
1148 transformation for each column is:
1149
1150 [1 1 1 1 0 0 0 0] [r0]
1151 [1 0 -1 0 0 -1 0 1] [r1]
1152 [1 0 -1 0 0 1 0 -1] [r2]
1153 [1 -1 1 -1 0 0 0 0] [r3]
1154 [0 0 0 0 1 1 1 1] * [i0]
1155 [0 -1 0 1 -1 0 1 0] [i1]
1156 [0 -1 0 1 1 0 -1 0] [i2]
1157 [0 0 0 0 -1 1 -1 1] [i3]
1158 */
1159
1160 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1161 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1162
1163 VCPLXMUL(r1,i1,e[0],e[1]);
1164 VCPLXMUL(r2,i2,e[2],e[3]);
1165 VCPLXMUL(r3,i3,e[4],e[5]);
1166
1167 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1168 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1169
1170 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1171 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1172 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1173 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1174
1175 r0 = VADD(sr0, sr1);
1176 r3 = VSUB(sr0, sr1);
1177 i0 = VADD(si0, si1);
1178 i3 = VSUB(si1, si0);
1179 r1 = VADD(dr0, di1);
1180 r2 = VSUB(dr0, di1);
1181 i1 = VSUB(dr1, di0);
1182 i2 = VADD(dr1, di0);
1183
1184 *out++ = r0;
1185 *out++ = i0;
1186 *out++ = r1;
1187 *out++ = i1;
1188 *out++ = r2;
1189 *out++ = i2;
1190 *out++ = r3;
1191 *out++ = i3;
1192
1193}
1194
1195static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1196 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1197 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1198
1199 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1200 v4sf save = in[7], zero=VZERO();
1201 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1202 static const float s = M_SQRT2/2;
1203
1204 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1205 assert(in != out);
1206 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1207
1208 /*
1209 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1210
1211 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1212 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1213 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1214 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1215 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1216 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1217 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1218 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1219 */
1220
1221 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1222 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1223 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1224 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1225 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1226 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1227 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1228 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1229
1230 for (k=1; k < dk; ++k) {
1231 v4sf save_next = in[8*k+7];
1232 pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
1233 e + k*6, out + k*8);
1234 save = save_next;
1235 }
1236
1237}
1238
1239ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
1240 const v4sf *e, v4sf *out, int first) {
1241 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1242 /*
1243 transformation for each column is:
1244
1245 [1 1 1 1 0 0 0 0] [r0]
1246 [1 0 0 -1 0 -1 -1 0] [r1]
1247 [1 -1 -1 1 0 0 0 0] [r2]
1248 [1 0 0 -1 0 1 1 0] [r3]
1249 [0 0 0 0 1 -1 1 -1] * [i0]
1250 [0 -1 1 0 1 0 0 1] [i1]
1251 [0 0 0 0 1 1 -1 -1] [i2]
1252 [0 1 -1 0 1 0 0 1] [i3]
1253 */
1254
1255 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1256 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1257 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1258 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1259
1260 r0 = VADD(sr0, sr1);
1261 r2 = VSUB(sr0, sr1);
1262 r1 = VSUB(dr0, si1);
1263 r3 = VADD(dr0, si1);
1264 i0 = VSUB(di0, di1);
1265 i2 = VADD(di0, di1);
1266 i1 = VSUB(si0, dr1);
1267 i3 = VADD(si0, dr1);
1268
1269 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1270 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1271 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1272
1273 VTRANSPOSE4(r0,r1,r2,r3);
1274 VTRANSPOSE4(i0,i1,i2,i3);
1275
1276 if (!first) {
1277 *out++ = r0;
1278 *out++ = i0;
1279 }
1280 *out++ = r1;
1281 *out++ = i1;
1282 *out++ = r2;
1283 *out++ = i2;
1284 *out++ = r3;
1285 *out++ = i3;
1286}
1287
1288static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1289 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1290 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1291
1292 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1293 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1294 static const float s = M_SQRT2;
1295 assert(in != out);
1296 for (k=0; k < 4; ++k) {
1297 Xr.f[k] = ((float*)in)[8*k];
1298 Xi.f[k] = ((float*)in)[8*k+4];
1299 }
1300
1301 pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1302
1303 /*
1304 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1305
1306 [cr0] [1 0 2 0 1 0 0 0]
1307 [cr1] [1 0 0 0 -1 0 -2 0]
1308 [cr2] [1 0 -2 0 1 0 0 0]
1309 [cr3] [1 0 0 0 -1 0 2 0]
1310 [ci0] [0 2 0 2 0 0 0 0]
1311 [ci1] [0 s 0 -s 0 -s 0 -s]
1312 [ci2] [0 0 0 0 0 -2 0 2]
1313 [ci3] [0 -s 0 s 0 -s 0 -s]
1314 */
1315 for (k=1; k < dk; ++k) {
1316 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1317 }
1318
1319 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1320 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1321 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1322 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1323 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1324 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1325 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1326 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1327}
1328
1329
1330void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
1331 pffft_direction_t direction, int ordered) {
1332 int k, Ncvec = setup->Ncvec;
1333 int nf_odd = (setup->ifac[1] & 1);
1334
1335 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1336 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1337 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1338 if (scratch == 0) scratch = scratch_on_stack;
1339
1340 const v4sf *vinput = (const v4sf*)finput;
1341 v4sf *voutput = (v4sf*)foutput;
1342 v4sf *buff[2] = { voutput, scratch };
1343 int ib = (nf_odd ^ ordered ? 1 : 0);
1344
1345 assert(VALIGNED(finput) && VALIGNED(foutput));
1346
1347 //assert(finput != foutput);
1348 if (direction == PFFFT_FORWARD) {
1349 ib = !ib;
1350 if (setup->transform == PFFFT_REAL) {
1351 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1352 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1353 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1354 } else {
1355 v4sf *tmp = buff[ib];
1356 for (k=0; k < Ncvec; ++k) {
1357 UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1358 }
1359 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1360 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1361 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1362 }
1363 if (ordered) {
1364 pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1365 } else ib = !ib;
1366 } else {
1367 if (vinput == buff[ib]) {
1368 ib = !ib; // may happen when finput == foutput
1369 }
1370 if (ordered) {
1371 pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1372 vinput = buff[ib]; ib = !ib;
1373 }
1374 if (setup->transform == PFFFT_REAL) {
1375 pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1376 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1377 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1378 } else {
1379 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1380 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1381 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1382 for (k=0; k < Ncvec; ++k) {
1383 INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1384 }
1385 }
1386 }
1387
1388 if (buff[ib] != voutput) {
1389 /* extra copy required -- this situation should only happen when finput == foutput */
1390 assert(finput==foutput);
1391 for (k=0; k < Ncvec; ++k) {
1392 v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1393 voutput[2*k] = a; voutput[2*k+1] = b;
1394 }
1395 ib = !ib;
1396 }
1397 assert(buff[ib] == voutput);
1398}
1399
1400void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
1401 int i, Ncvec = s->Ncvec;
1402 const v4sf * RESTRICT va = (const v4sf*)a;
1403 const v4sf * RESTRICT vb = (const v4sf*)b;
1404 v4sf * RESTRICT vab = (v4sf*)ab;
1405
1406#ifdef __arm__
1407 __builtin_prefetch(va);
1408 __builtin_prefetch(vb);
1409 __builtin_prefetch(vab);
1410 __builtin_prefetch(va+2);
1411 __builtin_prefetch(vb+2);
1412 __builtin_prefetch(vab+2);
1413 __builtin_prefetch(va+4);
1414 __builtin_prefetch(vb+4);
1415 __builtin_prefetch(vab+4);
1416 __builtin_prefetch(va+6);
1417 __builtin_prefetch(vb+6);
1418 __builtin_prefetch(vab+6);
1419#endif
1420
1421 float ar, ai, br, bi, abr, abi;
1422 v4sf vscal = LD_PS1(scaling);
1423
1424 assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1425 ar = ((v4sf_union*)va)[0].f[0];
1426 ai = ((v4sf_union*)va)[1].f[0];
1427 br = ((v4sf_union*)vb)[0].f[0];
1428 bi = ((v4sf_union*)vb)[1].f[0];
1429 abr = ((v4sf_union*)vab)[0].f[0];
1430 abi = ((v4sf_union*)vab)[1].f[0];
1431
1432#ifdef __arm__
1433# if 1 // inline asm version
1434 const float *a_ = a, *b_ = b; float *ab_ = ab;
1435 int N = Ncvec;
1436 asm volatile("mov r8, %2 \n"
1437 "vdup.f32 q15, %4 \n"
1438 "1: \n"
1439 "pld [%0,#64] \n"
1440 "pld [%1,#64] \n"
1441 "pld [%2,#64] \n"
1442 "pld [%0,#96] \n"
1443 "pld [%1,#96] \n"
1444 "pld [%2,#96] \n"
1445 "vld1.f32 {q0,q1}, [%0,:128]! \n"
1446 "vld1.f32 {q4,q5}, [%1,:128]! \n"
1447 "vld1.f32 {q2,q3}, [%0,:128]! \n"
1448 "vld1.f32 {q6,q7}, [%1,:128]! \n"
1449 "vld1.f32 {q8,q9}, [r8,:128]! \n"
1450
1451 "vmul.f32 q10, q0, q4 \n"
1452 "vmul.f32 q11, q0, q5 \n"
1453 "vmul.f32 q12, q2, q6 \n"
1454 "vmul.f32 q13, q2, q7 \n"
1455 "vmls.f32 q10, q1, q5 \n"
1456 "vmla.f32 q11, q1, q4 \n"
1457 "vld1.f32 {q0,q1}, [r8,:128]! \n"
1458 "vmls.f32 q12, q3, q7 \n"
1459 "vmla.f32 q13, q3, q6 \n"
1460 "vmla.f32 q8, q10, q15 \n"
1461 "vmla.f32 q9, q11, q15 \n"
1462 "vmla.f32 q0, q12, q15 \n"
1463 "vmla.f32 q1, q13, q15 \n"
1464 "vst1.f32 {q8,q9},[%2,:128]! \n"
1465 "vst1.f32 {q0,q1},[%2,:128]! \n"
1466 "subs %3, #2 \n"
1467 "bne 1b \n"
1468 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1469
1470# else // neon intrinsics version, 30% slower than the asm one with gcc 4.6
1471 v4sf a1r, a1i, b1r, b1i;
1472 v4sf a2r, a2i, b2r, b2i;
1473 v4sf ab1r, ab1i, ab2r, ab2i;
1474 for (i=0; i < Ncvec; i += 2) {
1475 __builtin_prefetch(va+8);
1476 __builtin_prefetch(va+10);
1477
1478 a1r = *va++; a1i = *va++;
1479 a2r = *va++; a2i = *va++;
1480 b1r = *vb++; b1i = *vb++;
1481 b2r = *vb++; b2i = *vb++;
1482 ab1r = vab[0]; ab1i = vab[1];
1483 ab2r = vab[2]; ab2i = vab[3];
1484
1485 v4sf z1r = VMUL(a1r, b1r);
1486 v4sf z2r = VMUL(a2r, b2r);
1487 v4sf z1i = VMUL(a1r, b1i);
1488 v4sf z2i = VMUL(a2r, b2i);
1489
1490 __builtin_prefetch(vb+4);
1491 __builtin_prefetch(vb+6);
1492
1493 z1r = vmlsq_f32(z1r, a1i, b1i);
1494 z2r = vmlsq_f32(z2r, a2i, b2i);
1495 z1i = vmlaq_f32(z1i, a1i, b1r);
1496 z2i = vmlaq_f32(z2i, a2i, b2r);
1497
1498 __builtin_prefetch(vab+4);
1499 __builtin_prefetch(vab+6);
1500
1501 ab1r = vmlaq_f32(ab1r, z1r, vscal);
1502 ab2r = vmlaq_f32(ab2r, z2r, vscal);
1503 ab1i = vmlaq_f32(ab1i, z1i, vscal);
1504 ab2i = vmlaq_f32(ab2i, z2i, vscal);
1505
1506 *vab++ = ab1r; *vab++ = ab1i;
1507 *vab++ = ab2r; *vab++ = ab2i;
1508 }
1509# endif
1510
1511#else // not ARM, no need to use a special routine
1512 for (i=0; i < Ncvec; i += 2) {
1513 v4sf ar, ai, br, bi;
1514 ar = va[2*i+0]; ai = va[2*i+1];
1515 br = vb[2*i+0]; bi = vb[2*i+1];
1516 VCPLXMUL(ar, ai, br, bi);
1517 vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1518 vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1519 ar = va[2*i+2]; ai = va[2*i+3];
1520 br = vb[2*i+2]; bi = vb[2*i+3];
1521 VCPLXMUL(ar, ai, br, bi);
1522 vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1523 vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1524 }
1525#endif
1526 if (s->transform == PFFFT_REAL) {
1527 ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1528 ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1529 }
1530}
1531
1532
1533#else // defined(PFFFT_SIMD_DISABLE)
1534
1535// standard routine using scalar floats, without SIMD stuff.
1536
1537#define pffft_zreorder_nosimd pffft_zreorder
1538void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1539 int k, N = setup->N;
1540 if (setup->transform == PFFFT_COMPLEX) {
1541 for (k=0; k < 2*N; ++k) out[k] = in[k];
1542 return;
1543 }
1544 else if (direction == PFFFT_FORWARD) {
1545 float x_N = in[N-1];
1546 for (k=N-1; k > 1; --k) out[k] = in[k-1];
1547 out[0] = in[0];
1548 out[1] = x_N;
1549 } else {
1550 float x_N = in[1];
1551 for (k=1; k < N-1; ++k) out[k] = in[k+1];
1552 out[0] = in[0];
1553 out[N-1] = x_N;
1554 }
1555}
1556
1557#define pffft_transform_internal_nosimd pffft_transform_internal
1558void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
1559 pffft_direction_t direction, int ordered) {
1560 int Ncvec = setup->Ncvec;
1561 int nf_odd = (setup->ifac[1] & 1);
1562
1563 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1564 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1565 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1566 if (scratch == 0) scratch = scratch_on_stack;
1567
1568 float *buff[2] = { output, scratch };
1569 int ib;
1570 if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
1571 ib = (nf_odd ^ ordered ? 1 : 0);
1572
1573 if (direction == PFFFT_FORWARD) {
1574 if (setup->transform == PFFFT_REAL) {
1575 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1576 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1577 } else {
1578 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1579 setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1580 }
1581 if (ordered) {
1582 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1583 }
1584 } else {
1585 if (input == buff[ib]) {
1586 ib = !ib; // may happen when finput == foutput
1587 }
1588 if (ordered) {
1589 pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
1590 input = buff[!ib];
1591 }
1592 if (setup->transform == PFFFT_REAL) {
1593 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1594 setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1595 } else {
1596 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1597 setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1598 }
1599 }
1600 if (buff[ib] != output) {
1601 int k;
1602 // extra copy required -- this situation should happen only when finput == foutput
1603 assert(input==output);
1604 for (k=0; k < Ncvec; ++k) {
1605 float a = buff[ib][2*k], b = buff[ib][2*k+1];
1606 output[2*k] = a; output[2*k+1] = b;
1607 }
1608 ib = !ib;
1609 }
1610 assert(buff[ib] == output);
1611}
1612
1613#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
1614void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
1615 float *ab, float scaling) {
1616 int i, Ncvec = s->Ncvec;
1617
1618 if (s->transform == PFFFT_REAL) {
1619 // take care of the fftpack ordering
1620 ab[0] += a[0]*b[0]*scaling;
1621 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
1622 ++ab; ++a; ++b; --Ncvec;
1623 }
1624 for (i=0; i < Ncvec; ++i) {
1625 float ar, ai, br, bi;
1626 ar = a[2*i+0]; ai = a[2*i+1];
1627 br = b[2*i+0]; bi = b[2*i+1];
1628 VCPLXMUL(ar, ai, br, bi);
1629 ab[2*i+0] += ar*scaling;
1630 ab[2*i+1] += ai*scaling;
1631 }
1632}
1633
1634#endif // defined(PFFFT_SIMD_DISABLE)
1635
1636void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1637 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
1638}
1639
1640void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1641 pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
1642}
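/*
  Sketch of a typical FFT convolution built from the functions above
  (illustrative only -- see pffft.h for the exact contract of each call).
  The spectra passed to pffft_zconvolve_accumulate are expected in the
  internal, unordered layout produced by pffft_transform(), and the
  transforms are unscaled, hence the 1.0f/N factor passed as 'scaling':

    // a, b, ab, work: N floats each, from pffft_aligned_malloc; ab zeroed
    pffft_transform(s, a, a, work, PFFFT_FORWARD);
    pffft_transform(s, b, b, work, PFFFT_FORWARD);
    pffft_zconvolve_accumulate(s, a, b, ab, 1.0f/N);
    pffft_transform(s, ab, ab, work, PFFFT_BACKWARD);  // ab now holds a (*) b
*/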