blob: 90a214ad0e617a7d258f2aa61a07739d6ef9d7c4 [file] [log] [blame]
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +00001/* Copyright (c) 2007-2008 CSIRO
2 Copyright (c) 2007-2008 Xiph.Org Foundation
3 Written by Jean-Marc Valin */
4/*
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 - Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 - Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*/
28
29/* This is a simple MDCT implementation that uses a N/4 complex FFT
30 to do most of the work. It should be relatively straightforward to
31 plug in pretty much and FFT here.
32
33 This replaces the Vorbis FFT (and uses the exact same API), which
34 was a bit too messy and that was ending up duplicating code
35 (might as well use the same FFT everywhere).
36
37 The algorithm is similar to (and inspired from) Fabrice Bellard's
38 MDCT implementation in FFMPEG, but has differences in signs, ordering
39 and scaling in many places.
40*/
41
42#ifndef SKIP_CONFIG_H
43#ifdef HAVE_CONFIG_H
44#include "config.h"
45#endif
46#endif
47
48#include "mdct.h"
49#include "kiss_fft.h"
50#include "_kiss_fft_guts.h"
51#include <math.h>
52#include "os_support.h"
53#include "mathops.h"
54#include "stack_alloc.h"
55
56#ifdef CUSTOM_MODES
57
58int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
59{
60 int i;
61 int N4;
62 kiss_twiddle_scalar *trig;
63#if defined(FIXED_POINT)
64 int N2=N>>1;
65#endif
66 l->n = N;
67 N4 = N>>2;
68 l->maxshift = maxshift;
69 for (i=0;i<=maxshift;i++)
70 {
71 if (i==0)
72 l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0);
73 else
74 l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
75#ifndef ENABLE_TI_DSPLIB55
76 if (l->kfft[i]==NULL)
77 return 0;
78#endif
79 }
80 l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
81 if (l->trig==NULL)
82 return 0;
83 /* We have enough points that sine isn't necessary */
84#if defined(FIXED_POINT)
85 for (i=0;i<=N4;i++)
86 trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
87#else
88 for (i=0;i<=N4;i++)
89 trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
90#endif
91 return 1;
92}
93
94void clt_mdct_clear(mdct_lookup *l)
95{
96 int i;
97 for (i=0;i<=l->maxshift;i++)
98 opus_fft_free(l->kfft[i]);
99 opus_free((kiss_twiddle_scalar*)l->trig);
100}
101
102#endif /* CUSTOM_MODES */
103
104/* Forward MDCT trashes the input array */
105void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
106 const opus_val16 *window, int overlap, int shift, int stride)
107{
108 int i;
109 int N, N2, N4;
110 kiss_twiddle_scalar sine;
111 VARDECL(kiss_fft_scalar, f);
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000112 VARDECL(kiss_fft_scalar, f2);
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000113 SAVE_STACK;
114 N = l->n;
115 N >>= shift;
116 N2 = N>>1;
117 N4 = N>>2;
118 ALLOC(f, N2, kiss_fft_scalar);
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000119 ALLOC(f2, N2, kiss_fft_scalar);
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000120 /* sin(x) ~= x here */
121#ifdef FIXED_POINT
122 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
123#else
124 sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
125#endif
126
127 /* Consider the input to be composed of four blocks: [a, b, c, d] */
128 /* Window, shuffle, fold */
129 {
130 /* Temp pointers to make it really clear to the compiler what we're doing */
131 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
132 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
133 kiss_fft_scalar * OPUS_RESTRICT yp = f;
134 const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
135 const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000136 for(i=0;i<((overlap+3)>>2);i++)
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000137 {
138 /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
139 *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
140 *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
141 xp1+=2;
142 xp2-=2;
143 wp1+=2;
144 wp2-=2;
145 }
146 wp1 = window;
147 wp2 = window+overlap-1;
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000148 for(;i<N4-((overlap+3)>>2);i++)
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000149 {
150 /* Real part arranged as a-bR, Imag part arranged as -c-dR */
151 *yp++ = *xp2;
152 *yp++ = *xp1;
153 xp1+=2;
154 xp2-=2;
155 }
156 for(;i<N4;i++)
157 {
158 /* Real part arranged as a-bR, Imag part arranged as -c-dR */
159 *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
160 *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
161 xp1+=2;
162 xp2-=2;
163 wp1+=2;
164 wp2-=2;
165 }
166 }
167 /* Pre-rotation */
168 {
169 kiss_fft_scalar * OPUS_RESTRICT yp = f;
170 const kiss_twiddle_scalar *t = &l->trig[0];
171 for(i=0;i<N4;i++)
172 {
173 kiss_fft_scalar re, im, yr, yi;
174 re = yp[0];
175 im = yp[1];
176 yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
177 yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
178 /* works because the cos is nearly one */
179 *yp++ = yr + S_MUL(yi,sine);
180 *yp++ = yi - S_MUL(yr,sine);
181 }
182 }
183
184 /* N/4 complex FFT, down-scales by 4/N */
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000185 opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000186
187 /* Post-rotate */
188 {
189 /* Temp pointers to make it really clear to the compiler what we're doing */
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000190 const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000191 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
192 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
193 const kiss_twiddle_scalar *t = &l->trig[0];
194 /* Temp pointers to make it really clear to the compiler what we're doing */
195 for(i=0;i<N4;i++)
196 {
197 kiss_fft_scalar yr, yi;
198 yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
199 yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
200 /* works because the cos is nearly one */
201 *yp1 = yr - S_MUL(yi,sine);
202 *yp2 = yi + S_MUL(yr,sine);;
203 fp += 2;
204 yp1 += 2*stride;
205 yp2 -= 2*stride;
206 }
207 }
208 RESTORE_STACK;
209}
210
211void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
212 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
213{
214 int i;
215 int N, N2, N4;
216 kiss_twiddle_scalar sine;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000217 VARDECL(kiss_fft_scalar, f2);
218 SAVE_STACK;
219 N = l->n;
220 N >>= shift;
221 N2 = N>>1;
222 N4 = N>>2;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000223 ALLOC(f2, N2, kiss_fft_scalar);
224 /* sin(x) ~= x here */
225#ifdef FIXED_POINT
226 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
227#else
228 sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
229#endif
230
231 /* Pre-rotate */
232 {
233 /* Temp pointers to make it really clear to the compiler what we're doing */
234 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
235 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
236 kiss_fft_scalar * OPUS_RESTRICT yp = f2;
237 const kiss_twiddle_scalar *t = &l->trig[0];
238 for(i=0;i<N4;i++)
239 {
240 kiss_fft_scalar yr, yi;
241 yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
242 yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
243 /* works because the cos is nearly one */
244 *yp++ = yr - S_MUL(yi,sine);
245 *yp++ = yi + S_MUL(yr,sine);
246 xp1+=2*stride;
247 xp2-=2*stride;
248 }
249 }
250
251 /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000252 opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000253
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000254 /* Post-rotate and de-shuffle from both ends of the buffer at once to make
255 it in-place. */
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000256 {
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000257 kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
258 kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000259 const kiss_twiddle_scalar *t = &l->trig[0];
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000260 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
261 middle pair will be computed twice. */
262 for(i=0;i<(N4+1)>>1;i++)
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000263 {
264 kiss_fft_scalar re, im, yr, yi;
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000265 kiss_twiddle_scalar t0, t1;
266 re = yp0[0];
267 im = yp0[1];
268 t0 = t[i<<shift];
269 t1 = t[(N4-i)<<shift];
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000270 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000271 yr = S_MUL(re,t0) - S_MUL(im,t1);
272 yi = S_MUL(im,t0) + S_MUL(re,t1);
273 re = yp1[0];
274 im = yp1[1];
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000275 /* works because the cos is nearly one */
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000276 yp0[0] = -(yr - S_MUL(yi,sine));
277 yp1[1] = yi + S_MUL(yr,sine);
278
279 t0 = t[(N4-i-1)<<shift];
280 t1 = t[(i+1)<<shift];
281 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
282 yr = S_MUL(re,t0) - S_MUL(im,t1);
283 yi = S_MUL(im,t0) + S_MUL(re,t1);
284 /* works because the cos is nearly one */
285 yp1[0] = -(yr - S_MUL(yi,sine));
286 yp0[1] = yi + S_MUL(yr,sine);
287 yp0 += 2;
288 yp1 -= 2;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000289 }
290 }
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000291
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000292 /* Mirror on both sides for TDAC */
293 {
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000294 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
295 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000296 const opus_val16 * OPUS_RESTRICT wp1 = window;
297 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000298
299 for(i = 0; i < overlap/2; i++)
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000300 {
tlegrand@chromium.orge3ea0492013-10-23 09:13:50 +0000301 kiss_fft_scalar x1, x2;
302 x1 = *xp1;
303 x2 = *yp1;
304 *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
305 *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
sergeyu@chromium.org885f2ff2012-10-17 22:31:52 +0000306 wp1++;
307 wp2--;
308 }
309 }
310 RESTORE_STACK;
311}