blob: 360ebcc8ddbeb7255693f92f619a4e71d837ed52 [file] [log] [blame]
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -08001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "kiss_fft.h"
33#include "celt.h"
34#include "modes.h"
35#include "arch.h"
36#include "quant_bands.h"
37#include <stdio.h>
38#include "analysis.h"
39#include "mlp.h"
40#include "stack_alloc.h"
41
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080042#ifndef M_PI
43#define M_PI 3.141592653
44#endif
45
46static const float dct_table[128] = {
47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63};
64
65static const float analysis_window[240] = {
66 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96};
97
98static const int tbands[NB_TBANDS+1] = {
99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100};
101
102static const int extra_bands[NB_TOT_BANDS+1] = {
103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104};
105
106/*static const float tweight[NB_TBANDS+1] = {
107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108};*/
109
110#define NB_TONAL_SKIP_BANDS 9
111
112#define cA 0.43157974f
113#define cB 0.67848403f
114#define cC 0.08595542f
115#define cE ((float)M_PI/2)
116static OPUS_INLINE float fast_atan2f(float y, float x) {
117 float x2, y2;
118 /* Should avoid underflow on the values we'll get */
119 if (ABS16(x)+ABS16(y)<1e-9f)
120 {
121 x*=1e12f;
122 y*=1e12f;
123 }
124 x2 = x*x;
125 y2 = y*y;
126 if(x2<y2){
127 float den = (y2 + cB*x2) * (y2 + cC*x2);
128 if (den!=0)
129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130 else
131 return (y<0 ? -cE : cE);
132 }else{
133 float den = (x2 + cB*y2) * (x2 + cC*y2);
134 if (den!=0)
135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136 else
137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138 }
139}
140
flimc91ee5b2016-01-26 14:33:44 +0100141void tonality_analysis_init(TonalityAnalysisState *tonal)
142{
143 /* Initialize reusable fields. */
144 tonal->arch = opus_select_arch();
145 /* Clear remaining fields. */
146 tonality_analysis_reset(tonal);
147}
148
149void tonality_analysis_reset(TonalityAnalysisState *tonal)
150{
151 /* Clear non-reusable fields. */
152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154}
155
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800156void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157{
158 int pos;
159 int curr_lookahead;
160 float psum;
161 int i;
162
163 pos = tonal->read_pos;
164 curr_lookahead = tonal->write_pos-tonal->read_pos;
165 if (curr_lookahead<0)
166 curr_lookahead += DETECT_SIZE;
167
168 if (len > 480 && pos != tonal->write_pos)
169 {
170 pos++;
171 if (pos==DETECT_SIZE)
172 pos=0;
173 }
174 if (pos == tonal->write_pos)
175 pos--;
176 if (pos<0)
177 pos = DETECT_SIZE-1;
178 OPUS_COPY(info_out, &tonal->info[pos], 1);
179 tonal->read_subframe += len/120;
180 while (tonal->read_subframe>=4)
181 {
182 tonal->read_subframe -= 4;
183 tonal->read_pos++;
184 }
185 if (tonal->read_pos>=DETECT_SIZE)
186 tonal->read_pos-=DETECT_SIZE;
187
188 /* Compensate for the delay in the features themselves.
189 FIXME: Need a better estimate the 10 I just made up */
190 curr_lookahead = IMAX(curr_lookahead-10, 0);
191
192 psum=0;
193 /* Summing the probability of transition patterns that involve music at
194 time (DETECT_SIZE-curr_lookahead-1) */
195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196 psum += tonal->pmusic[i];
197 for (;i<DETECT_SIZE;i++)
198 psum += tonal->pspeech[i];
199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201
202 info_out->music_prob = psum;
203}
204
flimc91ee5b2016-01-26 14:33:44 +0100205static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800206{
207 int i, b;
208 const kiss_fft_state *kfft;
209 VARDECL(kiss_fft_cpx, in);
210 VARDECL(kiss_fft_cpx, out);
211 int N = 480, N2=240;
212 float * OPUS_RESTRICT A = tonal->angle;
213 float * OPUS_RESTRICT dA = tonal->d_angle;
214 float * OPUS_RESTRICT d2A = tonal->d2_angle;
215 VARDECL(float, tonality);
216 VARDECL(float, noisiness);
217 float band_tonality[NB_TBANDS];
218 float logE[NB_TBANDS];
219 float BFCC[8];
220 float features[25];
221 float frame_tonality;
222 float max_frame_tonality;
223 /*float tw_sum=0;*/
224 float frame_noisiness;
225 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
226 float slope=0;
227 float frame_stationarity;
228 float relativeE;
229 float frame_probs[2];
230 float alpha, alphaE, alphaE2;
231 float frame_loudness;
232 float bandwidth_mask;
233 int bandwidth=0;
234 float maxE = 0;
235 float noise_floor;
236 int remaining;
237 AnalysisInfo *info;
238 SAVE_STACK;
239
240 tonal->last_transition++;
241 alpha = 1.f/IMIN(20, 1+tonal->count);
242 alphaE = 1.f/IMIN(50, 1+tonal->count);
243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244
245 if (tonal->count<4)
246 tonal->music_prob = .5;
247 kfft = celt_mode->mdct.kfft[0];
248 if (tonal->count==0)
249 tonal->mem_fill = 240;
250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252 {
253 tonal->mem_fill += len;
254 /* Don't have enough to update the analysis */
255 RESTORE_STACK;
256 return;
257 }
258 info = &tonal->info[tonal->write_pos++];
259 if (tonal->write_pos>=DETECT_SIZE)
260 tonal->write_pos-=DETECT_SIZE;
261
262 ALLOC(in, 480, kiss_fft_cpx);
263 ALLOC(out, 480, kiss_fft_cpx);
264 ALLOC(tonality, 240, float);
265 ALLOC(noisiness, 240, float);
266 for (i=0;i<N2;i++)
267 {
268 float w = analysis_window[i];
269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273 }
274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277 tonal->mem_fill = 240 + remaining;
flimc91ee5b2016-01-26 14:33:44 +0100278 opus_fft(kfft, in, out, tonal->arch);
279#ifndef FIXED_POINT
280 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281 if (celt_isnan(out[0].r))
282 {
283 info->valid = 0;
284 RESTORE_STACK;
285 return;
286 }
287#endif
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800288
289 for (i=1;i<N2;i++)
290 {
291 float X1r, X2r, X1i, X2i;
292 float angle, d_angle, d2_angle;
293 float angle2, d_angle2, d2_angle2;
294 float mod1, mod2, avg_mod;
295 X1r = (float)out[i].r+out[N-i].r;
296 X1i = (float)out[i].i-out[N-i].i;
297 X2r = (float)out[i].i+out[N-i].i;
298 X2i = (float)out[N-i].r-out[i].r;
299
300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301 d_angle = angle - A[i];
302 d2_angle = d_angle - dA[i];
303
304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305 d_angle2 = angle2 - angle;
306 d2_angle2 = d_angle2 - d_angle;
307
308 mod1 = d2_angle - (float)floor(.5+d2_angle);
309 noisiness[i] = ABS16(mod1);
310 mod1 *= mod1;
311 mod1 *= mod1;
312
313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314 noisiness[i] += ABS16(mod2);
315 mod2 *= mod2;
316 mod2 *= mod2;
317
318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
320
321 A[i] = angle2;
322 dA[i] = d_angle2;
323 d2A[i] = mod2;
324 }
325
326 frame_tonality = 0;
327 max_frame_tonality = 0;
328 /*tw_sum = 0;*/
329 info->activity = 0;
330 frame_noisiness = 0;
331 frame_stationarity = 0;
332 if (!tonal->count)
333 {
334 for (b=0;b<NB_TBANDS;b++)
335 {
336 tonal->lowE[b] = 1e10;
337 tonal->highE[b] = -1e10;
338 }
339 }
340 relativeE = 0;
341 frame_loudness = 0;
342 for (b=0;b<NB_TBANDS;b++)
343 {
344 float E=0, tE=0, nE=0;
345 float L1, L2;
346 float stationarity;
347 for (i=tbands[b];i<tbands[b+1];i++)
348 {
349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351#ifdef FIXED_POINT
352 /* FIXME: It's probably best to change the BFCC filter initial state instead */
353 binE *= 5.55e-17f;
354#endif
355 E += binE;
356 tE += binE*tonality[i];
357 nE += binE*2.f*(.5f-noisiness[i]);
358 }
flimc91ee5b2016-01-26 14:33:44 +0100359#ifndef FIXED_POINT
360 /* Check for extreme band energies that could cause NaNs later. */
361 if (!(E<1e9f) || celt_isnan(E))
362 {
363 info->valid = 0;
364 RESTORE_STACK;
365 return;
366 }
367#endif
368
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800369 tonal->E[tonal->E_count][b] = E;
370 frame_noisiness += nE/(1e-15f+E);
371
372 frame_loudness += (float)sqrt(E+1e-10f);
373 logE[b] = (float)log(E+1e-10f);
374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376 if (tonal->highE[b] < tonal->lowE[b]+1.f)
377 {
378 tonal->highE[b]+=.5f;
379 tonal->lowE[b]-=.5f;
380 }
381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
382
383 L1=L2=0;
384 for (i=0;i<NB_FRAMES;i++)
385 {
386 L1 += (float)sqrt(tonal->E[i][b]);
387 L2 += tonal->E[i][b];
388 }
389
390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391 stationarity *= stationarity;
392 stationarity *= stationarity;
393 frame_stationarity += stationarity;
394 /*band_tonality[b] = tE/(1e-15+E)*/;
395 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
396#if 0
397 if (b>=NB_TONAL_SKIP_BANDS)
398 {
399 frame_tonality += tweight[b]*band_tonality[b];
400 tw_sum += tweight[b];
401 }
402#else
403 frame_tonality += band_tonality[b];
404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406#endif
407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408 slope += band_tonality[b]*(b-8);
409 /*printf("%f %f ", band_tonality[b], stationarity);*/
410 tonal->prev_band_tonality[b] = band_tonality[b];
411 }
412
413 bandwidth_mask = 0;
414 bandwidth = 0;
415 maxE = 0;
416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417#ifdef FIXED_POINT
418 noise_floor *= 1<<(15+SIG_SHIFT);
419#endif
420 noise_floor *= noise_floor;
421 for (b=0;b<NB_TOT_BANDS;b++)
422 {
423 float E=0;
424 int band_start, band_end;
425 /* Keep a margin of 300 Hz for aliasing */
426 band_start = extra_bands[b];
427 band_end = extra_bands[b+1];
428 for (i=band_start;i<band_end;i++)
429 {
430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432 E += binE;
433 }
434 maxE = MAX32(maxE, E);
435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436 E = MAX32(E, tonal->meanE[b]);
437 /* Use a simple follower with 13 dB/Bark slope for spreading function */
438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439 /* Consider the band "active" only if all these conditions are met:
440 1) less than 10 dB below the simple follower
441 2) less than 90 dB below the peak band (maximal masking possible considering
442 both the ATH and the loudness-dependent slope of the spreading function)
443 3) above the PCM quantization noise floor
444 */
445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
446 bandwidth = b;
447 }
448 if (tonal->count<=2)
449 bandwidth = 20;
450 frame_loudness = 20*(float)log10(frame_loudness);
451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452 tonal->lowECount *= (1-alphaE);
453 if (frame_loudness < tonal->Etracker-30)
454 tonal->lowECount += alphaE;
455
456 for (i=0;i<8;i++)
457 {
458 float sum=0;
459 for (b=0;b<16;b++)
460 sum += dct_table[i*16+b]*logE[b];
461 BFCC[i] = sum;
462 }
463
464 frame_stationarity /= NB_TBANDS;
465 relativeE /= NB_TBANDS;
466 if (tonal->count<10)
467 relativeE = .5;
468 frame_noisiness /= NB_TBANDS;
469#if 1
470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471#else
472 info->activity = .5*(1+frame_noisiness-frame_stationarity);
473#endif
474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476 tonal->prev_tonality = frame_tonality;
477
478 slope /= 8*8;
479 info->tonality_slope = slope;
480
481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482 tonal->count++;
483 info->tonality = frame_tonality;
484
485 for (i=0;i<4;i++)
486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487
488 for (i=0;i<4;i++)
489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490
491 for (i=0;i<4;i++)
492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
493 for (i=0;i<3;i++)
494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495
496 if (tonal->count > 5)
497 {
498 for (i=0;i<9;i++)
499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
500 }
501
502 for (i=0;i<8;i++)
503 {
504 tonal->mem[i+24] = tonal->mem[i+16];
505 tonal->mem[i+16] = tonal->mem[i+8];
506 tonal->mem[i+8] = tonal->mem[i];
507 tonal->mem[i] = BFCC[i];
508 }
509 for (i=0;i<9;i++)
510 features[11+i] = (float)sqrt(tonal->std[i]);
511 features[20] = info->tonality;
512 features[21] = info->activity;
513 features[22] = frame_stationarity;
514 features[23] = info->tonality_slope;
515 features[24] = tonal->lowECount;
516
517#ifndef DISABLE_FLOAT_API
518 mlp_process(&net, features, frame_probs);
519 frame_probs[0] = .5f*(frame_probs[0]+1);
520 /* Curve fitting between the MLP probability and the actual probability */
521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522 /* Probability of active audio (as opposed to silence) */
523 frame_probs[1] = .5f*frame_probs[1]+.5f;
524 /* Consider that silence has a 50-50 probability. */
525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
528 {
529 /* Probability of state transition */
530 float tau;
531 /* Represents independence of the MLP probabilities, where
532 beta=1 means fully independent. */
533 float beta;
534 /* Denormalized probability of speech (p0) and music (p1) after update */
535 float p0, p1;
536 /* Probabilities for "all speech" and "all music" */
537 float s0, m0;
538 /* Probability sum for renormalisation */
539 float psum;
540 /* Instantaneous probability of speech and music, with beta pre-applied. */
541 float speech0;
542 float music0;
543
544 /* One transition every 3 minutes of active audio */
545 tau = .00005f*frame_probs[1];
546 beta = .05f;
547 if (1) {
548 /* Adapt beta based on how "unexpected" the new prob is */
549 float p, q;
550 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
551 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
552 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
553 }
554 /* p0 and p1 are the probabilities of speech and music at this frame
555 using only information from previous frame and applying the
556 state transition model */
557 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
558 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
559 /* We apply the current probability with exponent beta to work around
560 the fact that the probability estimates aren't independent. */
561 p0 *= (float)pow(1-frame_probs[0], beta);
562 p1 *= (float)pow(frame_probs[0], beta);
563 /* Normalise the probabilities to get the Marokv probability of music. */
564 tonal->music_prob = p1/(p0+p1);
565 info->music_prob = tonal->music_prob;
566
567 /* This chunk of code deals with delayed decision. */
568 psum=1e-20f;
569 /* Instantaneous probability of speech and music, with beta pre-applied. */
570 speech0 = (float)pow(1-frame_probs[0], beta);
571 music0 = (float)pow(frame_probs[0], beta);
572 if (tonal->count==1)
573 {
574 tonal->pspeech[0]=.5;
575 tonal->pmusic [0]=.5;
576 }
577 /* Updated probability of having only speech (s0) or only music (m0),
578 before considering the new observation. */
579 s0 = tonal->pspeech[0] + tonal->pspeech[1];
580 m0 = tonal->pmusic [0] + tonal->pmusic [1];
581 /* Updates s0 and m0 with instantaneous probability. */
582 tonal->pspeech[0] = s0*(1-tau)*speech0;
583 tonal->pmusic [0] = m0*(1-tau)*music0;
584 /* Propagate the transition probabilities */
585 for (i=1;i<DETECT_SIZE-1;i++)
586 {
587 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
588 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
589 }
590 /* Probability that the latest frame is speech, when all the previous ones were music. */
591 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
592 /* Probability that the latest frame is music, when all the previous ones were speech. */
593 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
594
595 /* Renormalise probabilities to 1 */
596 for (i=0;i<DETECT_SIZE;i++)
597 psum += tonal->pspeech[i] + tonal->pmusic[i];
598 psum = 1.f/psum;
599 for (i=0;i<DETECT_SIZE;i++)
600 {
601 tonal->pspeech[i] *= psum;
602 tonal->pmusic [i] *= psum;
603 }
604 psum = tonal->pmusic[0];
605 for (i=1;i<DETECT_SIZE;i++)
606 psum += tonal->pspeech[i];
607
608 /* Estimate our confidence in the speech/music decisions */
609 if (frame_probs[1]>.75)
610 {
611 if (tonal->music_prob>.9)
612 {
613 float adapt;
614 adapt = 1.f/(++tonal->music_confidence_count);
615 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
616 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
617 }
618 if (tonal->music_prob<.1)
619 {
620 float adapt;
621 adapt = 1.f/(++tonal->speech_confidence_count);
622 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
623 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
624 }
625 } else {
626 if (tonal->music_confidence_count==0)
627 tonal->music_confidence = .9f;
628 if (tonal->speech_confidence_count==0)
629 tonal->speech_confidence = .1f;
630 }
631 }
632 if (tonal->last_music != (tonal->music_prob>.5f))
633 tonal->last_transition=0;
634 tonal->last_music = tonal->music_prob>.5f;
635#else
636 info->music_prob = 0;
637#endif
638 /*for (i=0;i<25;i++)
639 printf("%f ", features[i]);
640 printf("\n");*/
641
642 info->bandwidth = bandwidth;
643 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
644 info->noisiness = frame_noisiness;
645 info->valid = 1;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800646 RESTORE_STACK;
647}
648
649void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
650 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
651 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
652{
653 int offset;
654 int pcm_len;
655
656 if (analysis_pcm != NULL)
657 {
658 /* Avoid overflow/wrap-around of the analysis buffer */
659 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
660
661 pcm_len = analysis_frame_size - analysis->analysis_offset;
662 offset = analysis->analysis_offset;
663 do {
flimc91ee5b2016-01-26 14:33:44 +0100664 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800665 offset += 480;
666 pcm_len -= 480;
667 } while (pcm_len>0);
668 analysis->analysis_offset = analysis_frame_size;
669
670 analysis->analysis_offset -= frame_size;
671 }
672
673 analysis_info->valid = 0;
674 tonality_get_info(analysis, analysis_info, frame_size);
675}